upload scripts

ffc9b258 · Monah Abou Alezz · ffc9b258 · ffc9b258 · ffc9b258 · ffc9b258
Commit ffc9b258 authored Mar 25, 2025 by Monah Abou Alezz
--- a/Bulk_RNAseq/DE.R
+++ b/Bulk_RNAseq/DE.R
+# ========================================================-
+# Title:         DE and functional enrichment analysis
+# Description:   Code to generate Fig1, SuppFig3, SuppFig4
+# Author:        Monah Abou Alezz
+# Date:          2025-03-06
+# ========================================================-
+suppressPackageStartupMessages(library(DESeq2))
+suppressPackageStartupMessages(library(org.Hs.eg.db))
+suppressPackageStartupMessages(library(tidyverse))
+suppressPackageStartupMessages(library(ggrepel))
+suppressPackageStartupMessages(library(pheatmap))
+suppressPackageStartupMessages(library(RColorBrewer))
+suppressPackageStartupMessages(library(clusterProfiler))
+suppressPackageStartupMessages(library(reshape2))
+# DE analysis -------------------------------------------------------------
+## Input featureCounts file
+fcount.file <- "data/GSE253820_featureCounts_results.txt"
+## Contrasts to perform
+contr_astro <- "Astro_AAV2:Astro_UT,Astro_AAV5:Astro_UT,Astro_AAV9:Astro_UT,Astro_Empty:Astro_UT"
+contr_neuro <- "Neuro_AAV1:Neuro_UT,Neuro_AAV2:Neuro_UT,Neuro_AAV5:Neuro_UT,Neuro_AAV6:Neuro_UT,Neuro_AAV9:Neuro_UT,Neuro_Spk:Neuro_UT,Neuro_Empty:Neuro_UT"
+organism.db <- org.Hs.eg.db
+comps <- c("contr_astro", "contr_neuro")
+for (t in comps) {
+  contr <- get(t)
+  contr.list <- unlist(strsplit(contr, ","))
+  ## Design Formula
+  formula.str <- "condition"
+  design.str <- gsub(",", "+", formula.str)
+  num_cond <- str_count(formula.str, ",") + 1
+  design.formula <- as.formula(paste("~", design.str, sep = ""))
+  ## Read featureCounts results
+  if (!file.exists(fcount.file)) {
+    stop("Error: featureCounts file not found.")
+  } else {
+    read.counts <- read.table(fcount.file, 
+                              header = TRUE, 
+                              check.names = FALSE) %>%
+      column_to_rownames(var = "Geneid") %>%
+      dplyr::select(-c(1:5))
+  }
+  if (t == "contr_astro") {
+    read.counts <- dplyr::select(read.counts, starts_with("BAM:Astro"))
+  } else {
+    read.counts <- dplyr::select(read.counts, starts_with("BAM:Neuro"))
+  }
+  orig.names <- names(read.counts)
+  ds <- list()
+  cond <- list()
+  for (i in seq(orig.names[1:length(orig.names)])) {
+    ds[i] <- unlist(strsplit(orig.names[i], ":"))[2]
+    cond[i] <- unlist(strsplit(orig.names[i], ":"))[3]
+  }
+  ds <- unlist(ds)
+  cond <- unlist(cond)
+  ## Samples - Conditions
+  sample_info <- as.data.frame(str_split_fixed(cond, ",", num_cond))
+  colnames(sample_info) <- str_split_fixed(formula.str, ",", num_cond)
+  rownames(sample_info) <- ds
+  colnames(read.counts) <- ds
+  sample_info$condition <- factor(sample_info$condition)
+  last_condition <- str_split_fixed(formula.str, ",", num_cond)[num_cond]
+  batch_cond <- str_split_fixed(formula.str, ",", num_cond)[num_cond - 1]
+  ## thresholds
+  adj.pval.thr <- 0.05
+  logfc.thr <- 0
+  ## DESeq2
+  DESeq.ds <- DESeqDataSetFromMatrix(countData = read.counts,
+                                     colData = sample_info,
+                                     design = design.formula)
+  ## Remove genes without any counts
+  DESeq.ds <- DESeq.ds[rowSums(counts(DESeq.ds)) > 0, ]
+  ## RunDGE
+  DESeq.ds <- DESeq(DESeq.ds)
+  ## obtain regularized log-transformed values
+  DESeq.rlog <- rlogTransformation(DESeq.ds, blind = TRUE)
+  assign(paste0("DESeq.rlog_",t), DESeq.rlog)
+  ## Contrasts
+  for (c in 1:length(contr.list)) {
+    ctrs <- unlist(strsplit(contr.list[c], ":"))
+    c1 <- ctrs[1]
+    c2 <- ctrs[2]
+    DGE.results <- results(DESeq.ds,
+                           pAdjustMethod = "BH",
+                           independentFiltering = TRUE,
+                           contrast = c(last_condition, c1, c2),
+                           alpha = adj.pval.thr)
+    # Change format
+    DGE.results.annot <- as.data.frame(DGE.results) %>% 
+      dplyr::select(log2FoldChange, lfcSE, baseMean, pvalue, padj)
+    colnames(DGE.results.annot) <- c("logFC", "lfcSE", "baseMean", "PValue", "FDR")
+    ## Add ENTREZ ID
+    DGE.results.annot$entrez <- mapIds(x = organism.db,
+                                       keys = row.names(DGE.results),
+                                       column = "ENTREZID",
+                                       keytype = "SYMBOL",
+                                       multiVals = "first")
+    ## Add description
+    DGE.results.annot$description <- mapIds(x = organism.db,
+                                            keys = row.names(DGE.results),
+                                            column = "GENENAME",
+                                            keytype = "SYMBOL",
+                                            multiVals = "first")
+    ## sort the results according to the adjusted p-value
+    DGE.results.sorted <- DGE.results.annot[order(DGE.results.annot$FDR), ]
+    DGE.results.sorted$gene <- row.names(DGE.results.sorted)
+    ## Subset for only significant genes
+    DGE.results.sig <- subset(DGE.results.sorted, 
+                              FDR < adj.pval.thr & abs(logFC) > logfc.thr)
+    assign(paste0("DGEgenes_",c1,"_vs_",c2), DGE.results.sorted)
+    assign(paste0("DGEgenes.sig_",c1,"_vs_",c2), DGE.results.sig)
+    ## volcano plots
+    data <- data.frame(gene = row.names(DGE.results.sorted), 
+                       pval = -log10(DGE.results.sorted$FDR), 
+                       lfc = DGE.results.sorted$logFC)
+    data <- na.omit(data)
+    data <- dplyr::mutate(
+      data,
+      color = dplyr::case_when(
+        data$lfc > logfc.thr & data$pval > -log10(adj.pval.thr) ~ "Overexpressed",
+        data$lfc < -logfc.thr & data$pval > -log10(adj.pval.thr) ~ "Underexpressed",
+        abs(data$lfc) < logfc.thr | data$pval < -log10(adj.pval.thr) ~ "NonSignificant"))
+    data <- data[order(data$pval, decreasing = TRUE), ]
+    highl <- rbind(head(subset(data, color == "Overexpressed"), 10), 
+                   head(subset(data, color == "Underexpressed"), 10))
+    title <- paste0(c1," vs. ", c2)
+    vol <- ggplot(data, 
+                  aes(x = lfc, y = pval, colour = color)) +
+      geom_point(size = 2, alpha = 0.8, na.rm = T) +
+      geom_hline(yintercept = -log10(adj.pval.thr), colour = "darkgray") +
+      geom_label_repel(data = highl, 
+                       aes(label = gene), 
+                       show.legend = FALSE, 
+                       size = 4,
+                       family = "Arial") +
+      xlab("log2 Fold Change") + 
+      ylab(expression(-log[10]("adjusted p-value"))) +
+      ggtitle(title) +
+      scale_color_manual(name = "Expression",
+                         values = c(Overexpressed = "red1", 
+                                    Underexpressed = "royalblue2",
+                                    NonSignificant = "gray14")) +
+      theme_bw() +
+      theme(legend.position = "right",
+            text=element_text(family="Arial", size=4*96/72),
+            axis.text.x = element_text(size = 14*96/72),
+            axis.text.y = element_text(size = 14*96/72),
+            axis.title.y = element_text(size = 16*96/72),
+            axis.title.x = element_text(size = 16*96/72),
+            legend.text = element_text(size = 12*96/72),
+            legend.title = element_text(size = 15*96/72),
+            plot.title = element_text(size = 16*96/72, hjust = 0.5))
+    assign(paste0("vol_",c1,"_vs_",c2), vol)
+  }
+}
+# Heatmaps ----------------------------------------------------------------
+degs_astro <- rbind(DGEgenes.sig_Astro_AAV2_vs_Astro_UT,
+                    DGEgenes.sig_Astro_AAV5_vs_Astro_UT,
+                    DGEgenes.sig_Astro_AAV9_vs_Astro_UT,
+                    DGEgenes.sig_Astro_Empty_vs_Astro_UT)
+counts_astro <- as.data.frame(assay(DESeq.rlog_contr_astro))[degs_astro$gene, ]
+counts_astro <- counts_astro %>% 
+  rename_with(~ str_remove(., "Astro_"), everything()) %>% 
+  select(c(13,14,15,16,10,11,12,7,8,9,4,5,6,1,2,3))
+astro_annot <- data.frame(Samples = c("UT", "UT", "UT", "UT",
+                                      "Empty", "Empty", "Empty",
+                                      "AAV9", "AAV9", "AAV9",
+                                      "AAV5", "AAV5", "AAV5", 
+                                      "AAV2", "AAV2", "AAV2"),
+                          row.names = colnames(counts_astro))
+astro_annot$Samples <- factor(astro_annot$Samples, 
+                              levels = c("UT", "Empty", "AAV9", "AAV5", "AAV2"))
+astro_hm <- pheatmap::pheatmap(counts_astro,
+                               scale = "row",
+                               color = colorRampPalette(rev(brewer.pal(n = 9, name ="RdYlBu")))(100),
+                               annotation_col = astro_annot,
+                               annotation_names_col = F,
+                               show_rownames = F,
+                               show_colnames = F,
+                               cluster_rows = T,
+                               cluster_cols = F,
+                               treeheight_row = 20,
+                               fontsize_col = 12*96/72,
+                               fontsize = 12*96/72,
+                               angle_col = 90)
+degs_neuro <- rbind(DGEgenes.sig_Neuro_AAV1_vs_Neuro_UT,
+                    DGEgenes.sig_Neuro_AAV2_vs_Neuro_UT,
+                    DGEgenes.sig_Neuro_AAV5_vs_Neuro_UT,
+                    DGEgenes.sig_Neuro_AAV6_vs_Neuro_UT,
+                    DGEgenes.sig_Neuro_AAV9_vs_Neuro_UT,
+                    DGEgenes.sig_Neuro_Empty_vs_Neuro_UT,
+                    DGEgenes.sig_Neuro_Spk_vs_Neuro_UT)
+counts_neuro <- as.data.frame(assay(DESeq.rlog_contr_neuro))[degs_neuro$gene, ]
+counts_neuro <- counts_neuro %>% 
+  rename_with(~ str_remove(., "Neuro_"), everything()) %>% 
+  select(c(22:24,16:18,13:15,7:9,19:21,1:6,10:12))
+neuro_annot <- data.frame(Samples = c("UT", "UT", "UT",
+                                      "Empty", "Empty", "Empty",
+                                      "AAV9", "AAV9", "AAV9",
+                                      "AAV5", "AAV5", "AAV5",
+                                      "Spk", "Spk", "Spk",
+                                      "AAV1", "AAV1", "AAV1",
+                                      "AAV2", "AAV2", "AAV2",
+                                      "AAV6", "AAV6", "AAV6"),
+                          row.names = colnames(counts_neuro))
+neuro_annot$Samples <- factor(neuro_annot$Samples, 
+                              levels = c("UT", "Empty", "AAV9", 
+                                         "AAV5","Spk","AAV1","AAV2", "AAV6"))
+neuro_hm <- pheatmap::pheatmap(counts_neuro,
+                               scale = "row",
+                               color = colorRampPalette(rev(brewer.pal(n = 9, 
+                                                                       name ="RdYlBu")))(100),
+                               annotation_col = neuro_annot,
+                               annotation_names_col = F,
+                               show_rownames = F,
+                               show_colnames = F,
+                               cluster_rows = T,
+                               cluster_cols = F,
+                               treeheight_row = 20,
+                               fontsize_col = 12*96/72,
+                               fontsize = 12*96/72,
+                               angle_col = 90)
+# GSEA analysis -----------------------------------------------------------
+h_gmt <- read.gmt("data/h.all.v7.2.symbols.gmt")
+for (i in ls()[grep("DGEgenes_", ls())]) {
+  gene_res <- get(i)
+  logfc_symbol <- as.vector(gene_res$logFC)
+  names(logfc_symbol) <- row.names(gene_res)
+  logfc_symbol <- sort(logfc_symbol, decreasing = TRUE)
+  contr.gsea <- GSEA(logfc_symbol, 
+                     TERM2GENE = h_gmt, 
+                     nPerm = 10000, 
+                     pvalueCutoff = 1)
+  contr.gsea.symb <- as.data.frame(contr.gsea)
+  assign(paste0(gsub("DGEgenes_", "GSEA_",i)), contr.gsea.symb)
+}
+type <- c("Astro", "Neuro")
+for (p in type) {
+  hallmark.full <- data.frame()
+  for (ds in ls()[grep(paste0("GSEA_", p), ls())]) {
+    ds.t <- get(ds)
+    ds.t.filt <- ds.t[,c("ID", "setSize", "enrichmentScore", "NES", "pvalue", "p.adjust", "qvalue"), drop = FALSE]
+    ds.t.filt$Dataset <- ds
+    if (nrow(hallmark.full) == 0) {
+      hallmark.full <- ds.t.filt
+    } else {
+      hallmark.full <- rbind(hallmark.full, ds.t.filt)
+    }
+  }
+  hallmark.full.filt <- hallmark.full[, c("ID", "NES", "Dataset", "qvalue")]
+  hallmark.full.filt$ID <- gsub(x = hallmark.full.filt$ID, "HALLMARK_", "")
+  if (p == "Astro") {
+    hallmark.full.filt$Dataset <- factor(hallmark.full.filt$Dataset, 
+                                         levels = c("GSEA_Astro_Empty_vs_Astro_UT",
+                                                    "GSEA_Astro_AAV9_vs_Astro_UT",
+                                                    "GSEA_Astro_AAV5_vs_Astro_UT",
+                                                    "GSEA_Astro_AAV2_vs_Astro_UT"))
+  } else {
+    hallmark.full.filt$Dataset <- factor(hallmark.full.filt$Dataset, 
+                                         levels = c("GSEA_Neuro_Empty_vs_Neuro_UT",
+                                                    "GSEA_Neuro_AAV9_vs_Neuro_UT",
+                                                    "GSEA_Neuro_Spk_vs_Neuro_UT",
+                                                    "GSEA_Neuro_AAV5_vs_Neuro_UT",
+                                                    "GSEA_Neuro_AAV1_vs_Neuro_UT",
+                                                    "GSEA_Neuro_AAV2_vs_Neuro_UT",
+                                                    "GSEA_Neuro_AAV6_vs_Neuro_UT"))
+  }
+  hallmark.full.filt$sig <- ifelse(
+    hallmark.full.filt$qvalue <= 0.05 & hallmark.full.filt$qvalue > 0.01, '*', 
+    ifelse(hallmark.full.filt$qvalue <= 0.01 & hallmark.full.filt$qvalue > 0.001, '**', 
+           ifelse(hallmark.full.filt$qvalue <= 0.001 & hallmark.full.filt$qvalue > 0.0001, '***', 
+                  ifelse(hallmark.full.filt$qvalue <= 0.0001, '****', ''))))
+  hallmark.order <- hallmark.full.filt %>% 
+    group_by(ID) %>% 
+    summarise(Pos = sum(NES))
+  hallmark.order.terms <- hallmark.order[order(hallmark.order$Pos, decreasing = FALSE), "ID", drop = FALSE]
+  hallmark.full.filt.tt <- reshape2::melt(reshape2::dcast(hallmark.full.filt, 
+                                                          ID ~ Dataset, 
+                                                          value.var="NES"), id.vars = c("ID"))
+  colnames(hallmark.full.filt.tt) <- c("ID", "Dataset", "NES")
+  hallmark.full.filt.tt$ID <- factor(hallmark.full.filt.tt$ID, levels = hallmark.order.terms$ID)
+  hallmark.full.filt.tt <- merge(hallmark.full.filt.tt, 
+                                 hallmark.full.filt[, c("ID", "Dataset", "sig")], 
+                                 by = c("ID", "Dataset"), 
+                                 all.x = TRUE)
+  if (p == "Astro") {
+    labels <- c("Empty", "AAV9", "AAV5", "AAV2")
+  } else {
+    labels <- c("Empty", "AAV9", "Spk100","AAV5","AAV1","AAV2","AAV6")
+  }
+  p.hallmark <- ggplot(hallmark.full.filt.tt, 
+                             aes(x = Dataset, y = ID)) +
+    geom_tile(aes(fill = NES), colour = "white") +
+    geom_text(aes(label = paste(sig)), size=4*96/72, fontface = "bold") +
+    scale_fill_gradientn(colours = colorRampPalette(rev(brewer.pal(11,"RdBu")))(100),
+                         limits = c(-3.5, 3.5),
+                         na.value = "white") +
+    ylab("") + 
+    xlab("") +
+    coord_fixed(ratio = 0.6) +
+    scale_x_discrete(labels = labels) +
+    theme_bw(base_size = 12) +
+    theme(axis.text.x = element_text(angle = 45, hjust = 1, face = "bold", size = 18*96/72),
+          axis.text.y = element_text(face = "bold", size = 12*96/72),
+          legend.text = element_text(face = "bold", size = 14*96/72),
+          legend.title = element_text(face = "bold", size = 12*96/72))
+  assign(paste0("p.hallmark_", p), p.hallmark)
+}
+# Pathways ----------------------------------------------------------------
+gi_astro <- data.frame(gene = c("CDKN1A","PHLDA3","BAX","FDXR","TGFA","INPP5D","TRIM22",   
+                                "GDF15", "APOBEC3C","IL1A","CXCL8","IL1B","TNFSF9","IL11", 
+                                "EDA2R", "TNFRSF10A","TNFRSF10B","TNFRSF10C","TNFRSF10D",
+                                "CCNA2","CDC20","NDC80","KIF2C","AURKB","MCM6",
+                                "CCNF", "LMNB1","MCM2","E2F1","CDC45", "MCM7",
+                                "DNA2", "CDCA5","CLSPN" ),
+                       terms = c(rep("P53 Pathway", 9),
+                                 rep("Inflammatory Response", 5),
+                                 rep("TNFA Signaling Via NFKB", 1),
+                                 rep("Apoptosis", 4),
+                                 rep("G2M Checkpoint", 14),
+                                 rep("DNA Repair", 1)))
+gi_neuro <- data.frame(gene = c("CDKN1A","FAS","BAX","FDXR","TGFA","INPP5D","TRIM22","GDF15",
+                                "CXCL8","IL1R1","IL11","FOSL1","EDA2R","TNFRSF12A","IL6R",
+                                "TNFRSF10C","TNFRSF10D","TNFRSF10A","CDC20","AURKB","MCM6",
+                                "MCM3","CDC25A","CDC6","CHEK1","E2F1","E2F2","MCM4","RACGAP1",
+                                "DLGAP5","NCAPH", "NUF2","CLSPN", "FANCE", "TF","TK1"),
+                       terms = c(rep("P53 Pathway", 8),
+                                 rep("Inflammatory Response", 3),
+                                 rep("TNFA Signaling Via NFKB", 2),
+                                 rep("Il6 Jak Stat3 Signaling", 2),
+                                 rep("Apoptosis", 3),
+                                 rep("G2M Checkpoint", 10),
+                                 rep("Mitotic Spindle", 4),
+                                 rep("DNA Repair", 2),
+                                 rep("Coagulation", 1),
+                                 rep("E2F Targets", 1)))
+astro <- data.frame()
+for (l in ls()[grep("DGEgenes_Astro", ls())]) {
+  deg <- get(l) %>% 
+    select(logFC)
+  colnames(deg) <- gsub("DGEgenes_Astro_","",l)
+  if (nrow(astro) == 0) {
+    astro <- deg
+  } else {
+    astro <- merge(astro, deg, by = "row.names", all = TRUE)
+    rownames(astro) <- astro$Row.names
+    astro$Row.names <- NULL
+  }
+}
+astro <- astro[c(row.names(astro) %in% gi_astro$gene), ]
+astro <- astro[match(gi_astro$gene, rownames(astro)), , drop = FALSE]
+astro_rowAnnot <- gi_astro %>% 
+  column_to_rownames(var = "gene") %>% 
+  rename(Pathway = 1)
+colours_row <- list('Pathway' = c('Inflammatory Response' = 'blue',
+                                  'TNFA Signaling Via NFKB'= 'darkgreen',
+                                  'Apoptosis' = 'green',
+                                  'G2M Checkpoint' = 'red',
+                                  'P53 Pathway' = 'orange',
+                                  'UV Response Up' = 'purple',
+                                  'DNA Repair' = 'darkslategray3'))
+rowAnn <- HeatmapAnnotation(df = astro_rowAnnot,
+                            which = 'row',
+                            show_annotation_name = FALSE,
+                            col = colours_row,
+                            annotation_legend_param = list(
+                              Pathway = list(
+                                title = "Pathways",
+                                title_gp = gpar(fontsize = 12*96/72,
+                                                fontface = "bold"), 
+                                labels_gp = gpar(fontsize = 14*96/72),
+                                grid_height = unit(0.7, "cm"))),
+                            annotation_width = unit(c(16, 16), 'cm'),
+                            width = unit(c(16, 16), 'cm'),
+                            gap = unit(1, 'mm'))
+astro_mat <- as.matrix(astro[,c(4,3,2,1)])
+hmap <- Heatmap(astro_mat,
+                name = "Z-score",
+                cluster_rows = F,
+                cluster_columns = FALSE,
+                show_row_dend = F,
+                show_column_dend = TRUE,
+                column_dend_height = unit(1.2*96/72, "cm"), 
+                row_dend_width = unit(1*96/72, "cm"),
+                row_names_side = "right",
+                row_dend_side = "left",
+                row_names_gp = gpar(fontsize = 12*96/72),
+                row_names_max_width = unit(6*96/72, "cm"),
+                show_row_names = TRUE,
+                row_title = NULL,
+                row_title_gp = gpar(fontsize = 16*96/72, fontface = "bold"),
+                show_column_names = T,
+                column_names_rot = 90,
+                column_names_gp = gpar(fontsize = 14*96/72),
+                column_title = NULL,
+                column_title_side = "bottom",
+                column_title_gp = gpar(fontsize = 16*96/72, fontface = "bold"),
+                column_dend_reorder = TRUE,
+                left_annotation = rowAnn,
+                heatmap_legend_param = list(
+                  title = "logFC",
+                  title_gp = gpar(fontsize = 12*96/72,
+                                  fontface = "bold"), 
+                  labels_gp = gpar(fontsize = 12*96/72),
+                  legend_height = unit(3, "cm")
+                ),
+                width = unit(9*96/72, "cm"), 
+                height = unit(16*96/72, "cm"),
+                use_raster = TRUE,
+                raster_quality = 5)
+draw(hmap, heatmap_legend_side="right", annotation_legend_side="right")
+neuro <- data.frame()
+for (n in ls()[grep("DGEgenes_Neuro", ls())]) {
+  deg <- get(n) 
+  deg <- select(deg, logFC)
+  colnames(deg) <- gsub("DGEgenes_Neuro_","",n)
+  if (nrow(neuro) == 0) {
+    neuro <- deg
+  } else {
+    neuro <- merge(neuro, deg, by = "row.names", all = TRUE)
+    rownames(neuro) <- neuro$Row.names
+    neuro$Row.names <- NULL
+  }
+}
+neuro <- neuro[c(row.names(neuro) %in% gi_neuro$gene), ]
+neuro <- neuro[match(gi_neuro$gene, rownames(neuro)), , drop = FALSE]
+neuro_rowAnnot <- gi_neuro %>% 
+  column_to_rownames(var = "gene") %>% 
+  rename(Pathway = 1)
+colours_row <- list('Pathway' = c('Inflammatory Response' = 'blue', 
+                                  'Mitotic Spindle' = 'purple',
+                                  'TNFA Signaling Via NFKB'= 'darkgreen',
+                                  'Apoptosis' = 'green',
+                                  'Il6 Jak Stat3 Signaling' = 'pink',
+                                  'G2M Checkpoint' = 'red',
+                                  'E2F Targets' = 'gray60',
+                                  'P53 Pathway' = 'orange',
+                                  'Coagulation' = 'brown',
+                                  'DNA Repair' = 'darkslategray3'))
+rowAnn <- HeatmapAnnotation(df = neuro_rowAnnot,
+                            which = 'row',
+                            show_annotation_name = FALSE,
+                            col = colours_row,
+                            annotation_legend_param = list(
+                              Pathway = list(
+                                title = "Pathways",
+                                title_gp = gpar(fontsize = 12*96/72,
+                                                fontface = "bold"), 
+                                labels_gp = gpar(fontsize = 14*96/72),
+                                grid_height = unit(0.7, "cm"))),
+                            annotation_width = unit(c(16, 16), 'cm'),
+                            width = unit(c(16, 16), 'cm'),
+                            gap = unit(1, 'mm'))
+neuro_mat <- as.matrix(neuro[,c(6,5,7,3,1,2,4)])
+hmap <- Heatmap(neuro_mat,
+                name = "Z-score",
+                cluster_rows = F,
+                cluster_columns = FALSE,
+                show_row_dend = F,
+                show_column_dend = TRUE,
+                column_dend_height = unit(1.2*96/72, "cm"), 
+                row_dend_width = unit(1*96/72, "cm"),
+                row_names_side = "right",
+                row_dend_side = "left",
+                row_names_gp = gpar(fontsize = 12*96/72),
+                row_names_max_width = unit(6*96/72, "cm"),
+                show_row_names = TRUE,
+                row_title = NULL,
+                row_title_gp = gpar(fontsize = 16*96/72, fontface = "bold"),
+                show_column_names = T,
+                column_names_rot = 90,
+                column_names_gp = gpar(fontsize = 14*96/72),
+                column_title = NULL,
+                column_title_side = "bottom",
+                column_title_gp = gpar(fontsize = 16*96/72, fontface = "bold"),
+                column_dend_reorder = TRUE,
+                left_annotation = rowAnn,
+                heatmap_legend_param = list(
+                  title = "logFC",
+                  title_gp = gpar(fontsize = 12*96/72,
+                                  fontface = "bold"), 
+                  labels_gp = gpar(fontsize = 12*96/72),
+                  legend_height = unit(3, "cm")
+                ),
+                width = unit(9*96/72, "cm"), 
+                height = unit(16*96/72, "cm"),
+                use_raster = TRUE,
+                raster_quality = 5)
+draw(hmap, heatmap_legend_side="right", annotation_legend_side="right")
--- a/README.md
+++ b/README.md
+ostaverdera Natcom2025
+This repository includes essential scripts for producing final figures embedded in the following manuscript:  
+**Costa Verdera, et al.,** _AAV vectors trigger DNA damage response-dependent pro-inflammatory signalling in human iPSC-derived CNS models and mouse brain_ 
+PubMed: 
+DOI:  
+GEO:  GSE253824
+---
+The repository is divided in five folders, 1 bulk RNAseq analysis and 3 scRNAseq analyses and 1 snRNAseq analysis.  
+Below a brief description of the bulk and scRNAseq workflows adopted in this work.
+**Bulk RNAseq** analysis was performed using a standard pipeline that includes the follwing steps:
+1. Quality control by [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)
+2. Trimming of bad quality reads with [TrimGalore](https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/)<details><summary>Running command</summary>trim_galore --quality 20 --fastqc --length 25 --output_dir {outdir} --paired {input.r1} {inout.r2}</details>
+3. Alignment with [STAR](https://github.com/alexdobin/STAR)
+    <details><summary>Running command</summary>
+            "STAR " +
+            "--runThreadN {threads} " +
+            "--genomeDir {input.genome} " +
+            "--readFilesIn {params.trim_seq} " +
+            "--outSAMstrandField intronMotif " +
+            "--outFileNamePrefix {params.aln_seq_prefix} " +
+            "--outSAMtype BAM SortedByCoordinate " +
+            "--outSAMmultNmax 1 " +
+            "--outFilterMismatchNmax 10 " +
+            "--outReadsUnmapped Fastx " +
+            "--readFilesCommand zcat "
+    </details>
+4. Gene expression quantification with [featureCounts](https://academic.oup.com/bioinformatics/article/30/7/923/232889)
+    <details><summary>Running command</summary>
+            "featureCounts " +
+            "-a {input.annot} " +
+            "-o {output.fcount} " +
+            "-g gene_name " +
+            "-p -B -C " +
+            "-s {params.strand} " +
+            "--minOverlap 10 " +
+            "-T {threads} " +
+            "{input.bams} "
+    </details>
+5. Differential Expression analysis with [DESeq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html).  
+    For Differential Gene Expression analysis we followed the standard workflow provided by package.  
+6. Dowstream functional Analysis with [ClusterProfiler](https://bioconductor.org/packages/release/bioc/html/clusterProfiler.html).  
+    In order to retrieve functional annotation from DE analysis, we performed **G**ene **S**et **E**nrichment **A**nalysis by using the _GSEA_ function provided by the package.  
+    **GSEA** analysis was performed in particular using the Hallmark gene set from the MSigDB database (version 7.2)
+---
+**scRNAseq** analysis was performed using a standard pipeline that includes the following steps:
+scRNAseq analysis was performed with [Seurat](https://satijalab.org/seurat/). Below are the main steps of the basic data analysis workflow that start from a minimal object after loading of 10X data to markers identification:  
+1. Quality control and filtering
+2. Cell cycle scoring
+3. Normalization (default seurat settings)
+4. Scaling (with following variables to regress out: percent.mt + nCount_RNA and CC.Difference calculated as show in [vignette](https://satijalab.org/seurat/articles/cell_cycle_vignette.html#alternate-workflow-1))
+5. Dimensionality reduction: PCA
+6. Clustering
+7. Markers identification
+- [7.1] Clusters related markers
+- [7.2] Intracluster differential expression analysis according to comparison of interest
+---
--- a/scRNAseq_mixed/seurat_analysis.R
+++ b/scRNAseq_mixed/seurat_analysis.R
+# ========================================================-
+# Title:         Seurat analysis of mixed cultures scRNAseq
+# Description:   Code to generate Fig3, SuppFig7
+# Author:        Monah Abou Alezz
+# Date:          2025-03-06
+# ========================================================-
+suppressPackageStartupMessages(library(Seurat))
+suppressPackageStartupMessages(library(tidyverse))
+suppressPackageStartupMessages(library(reshape2))
+suppressPackageStartupMessages(library(ggrepel))
+suppressPackageStartupMessages(library(clusterProfiler))
+suppressPackageStartupMessages(library(RColorBrewer))
+## load seurat data
+h48_final <- readRDS("data/mixed_h48_final.rds")
+d4_final <- readRDS("data/mixed_D4_final.rds")
+Idents(h48_final) <- "RNA_snn_h.orig.ident_res.1.2"
+Idents(d4_final) <- "RNA_snn_h.orig.ident_res.1.2"
+h48_final@meta.data[['orig.ident']] <- recode_factor(
+  h48_final@meta.data[["orig.ident"]],
+  "Sample3" = "AAV9",
+  "Sample5" = "Spk",
+  "Sample7" = "UT")
+h48_final@meta.data[["RNA_snn_h.orig.ident_res.1.2"]] <- recode_factor(
+  h48_final@meta.data[["RNA_snn_h.orig.ident_res.1.2"]], 
+  "0" = "Undifferentiated", 
+  "1" = "Astrocytes", 
+  "2" = "Astrocytes", 
+  "3" = "Astrocytes", 
+  "4" = "OPC", 
+  "5" = "Oligodendrocytes", 
+  "6" = "Astrocytes", 
+  "7" = "Astrocytes", 
+  "8" = "Oligodendrocytes",
+  "9" = "Immature.Neurons",
+  "10" = "Oligodendrocytes",
+  "11" = "Immature.Neurons",
+  "12" = "Immature.Astrocytes",
+  "13" = "Immature.Astrocytes",
+  "14" = "Astrocytes",
+  "15" = "Neurons",
+  "16" = "Oligodendrocytes",
+  "17" = "OPC")
+h48_final$RNA_snn_h.orig.ident_res.1.2 <- factor(
+  h48_final$RNA_snn_h.orig.ident_res.1.2,
+  levels = c("Astrocytes",
+             "Neurons",
+             "Oligodendrocytes",
+             "Immature.Astrocytes",
+             "Immature.Neurons",
+             "OPC",
+             "Undifferentiated"))
+d4_final@meta.data[['orig.ident']] <- recode_factor(
+  d4_final@meta.data[["orig.ident"]],
+  "Sample2" = "AAV2",
+  "Sample4" = "AAV9",
+  "Sample6" = "Spk",
+  "Sample8" = "UT")
+d4_final@meta.data[["RNA_snn_h.orig.ident_res.1.2"]] <- recode_factor(
+  d4_final@meta.data[["RNA_snn_h.orig.ident_res.1.2"]], 
+  "0" = "Undifferentiated", 
+  "1" = "Oligodendrocytes", 
+  "2" = "Neurons", 
+  "3" = "Astrocytes", 
+  "4" = "Immature.Neurons", 
+  "5" = "OPC", 
+  "6" = "Astrocytes", 
+  "7" = "Oligodendrocytes", 
+  "8" = "Oligodendrocytes",
+  "9" = "Immature.Neurons",
+  "10" = "Oligodendrocytes",
+  "11" = "Oligodendrocytes",
+  "12" = "Astrocytes",
+  "13" = "Immature.Astrocytes",
+  "14" = "OPC",
+  "15" = "Astrocytes",
+  "16" = "Astrocytes")
+d4_final$RNA_snn_h.orig.ident_res.1.2 <- factor(
+  d4_final$RNA_snn_h.orig.ident_res.1.2,
+  levels = c("Astrocytes",
+             "Neurons",
+             "Oligodendrocytes",
+             "Immature.Astrocytes",
+             "Immature.Neurons",
+             "OPC",
+             "Undifferentiated"))
+ggplotColours <- function(n = 6, h = c(0, 360) + 15){
+  if ((diff(h) %% 360) < 1) h[2] <- h[2] - 360/n
+  hcl(h = (seq(h[1], h[2], length = n)), c = 100, l = 65)
+}
+## umap
+DimPlot(object = h48_final,
+        reduction = "umap.harmony.orig.ident",
+        group.by = "RNA_snn_h.orig.ident_res.1.2",
+        label = T,
+        repel = T,
+        pt.size = 0.2,
+        label.size = 6) +
+  scale_color_manual(labels=c("Astrocytes",
+                              "Neurons",
+                              "Oligodendrocytes",
+                              "Immature Astrocytes",
+                              "Immature Neurons",
+                              "OPC",
+                              "Undifferentiated"),
+                     values = ggplotColours(n = 7))
+## donut plot
+ggplot_object_clusters_labels <- melt(table(h48_final$RNA_snn_h.orig.ident_res.1.2))
+colnames(ggplot_object_clusters_labels) <- c("Cell_type", "Count")
+totalB <- ggplot_object_clusters_labels %>% mutate(percentage = Count/sum(Count))
+totalB$percentage <- totalB$percentage*100
+totalB$fraction = totalB$Count / sum(totalB$Count)
+totalB$ymax <- cumsum(totalB$fraction)
+totalB$ymin <- c(0, head(totalB$ymax, n=-1))
+totalB$labelPosition <- (totalB$ymax + totalB$ymin) / 2
+totalB$Samples <- c("Astrocytes", "Neurons", "Oligodendrocytes",
+                    "Immature\nAstrocytes", "Immature\nNeurons",
+                    "OPC", "Undifferentiated")
+totalB$label <- paste0(totalB$Cell_type, ":","\n",round(totalB$percentage, 2), "%")
+totalB$label2 <- c(paste0(totalB$Samples[1], ":","\n",round(totalB$percentage[1], 2), "%"),
+                   paste0(totalB$Samples[2], ":","\n",round(totalB$percentage[2], 2), "%"),
+                   paste0(totalB$Samples[3], ":","\n",round(totalB$percentage[3], 2), "%"),
+                   paste0(totalB$Samples[4], ": ",round(totalB$percentage[4], 2), "%"),
+                   paste0(totalB$Samples[5], ": ",round(totalB$percentage[5], 2), "%"),
+                   paste0(totalB$Samples[6], ":","\n",round(totalB$percentage[6], 2), "%"),
+                   paste0(totalB$Samples[7], ":","\n",round(totalB$percentage[7], 2), "%"))
+ggplot(totalB, aes(ymax=ymax, ymin=ymin, xmax=4, xmin=3, fill=Cell_type)) +
+  geom_rect() +
+  geom_label_repel(x=3.5, aes(y=labelPosition, label=label), size=10) +
+  coord_polar(theta="y") +
+  xlim(c(2, 4)) +
+  theme_void() +
+  theme(legend.position = "none")
+## barplot
+PrctCellExpringGene <- function(object, genes, group.by = "all"){
+  if(group.by == "all"){
+    prct = unlist(lapply(genes,calc_helper, object=object))
+    result = data.frame(Markers = genes, Cell_proportion = prct)
+    return(result)
+  }
+  else{        
+    list = SplitObject(object, group.by)
+    factors = names(list)
+    results = lapply(list, PrctCellExpringGene, genes=genes)
+    for(i in 1:length(factors)){
+      results[[i]]$Feature = factors[i]
+    }
+    combined = do.call("rbind", results)
+    return(combined)
+  }
+}
+calc_helper <- function(object,genes){
+  counts = object[['RNA']]@counts
+  ncells = ncol(counts)
+  if(genes %in% row.names(counts)){
+    sum(counts[genes,]>0)/ncells
+  }else{return(NA)}
+}
+aav9 <- subset(h48_final, orig.ident=="AAV9")
+gfp_percentage_cell_type <- PrctCellExpringGene(aav9,
+                                                genes ="GFP",
+                                                group.by = "RNA_snn_h.orig.ident_res.1.2")
+gfp_percentage_cell_type$Percentage <- gfp_percentage_cell_type$Cell_proportion*100
+gfp_percentage_cell_type <- gfp_percentage_cell_type[,c(4,3)]
+colnames(gfp_percentage_cell_type) <- c("Percentage", "Cell_Types")
+gfp_percentage_cell_type <- gfp_percentage_cell_type[c(7,1,6,4,5,3,2),]
+gfp_percentage_cell_type$Cell_Types <- factor(gfp_percentage_cell_type$Cell_Types,
+                                              levels = c("Immature.Neurons",
+                                                         "Neurons",
+                                                         "Immature.Astrocytes",
+                                                         "Astrocytes",
+                                                         "OPC",
+                                                         "Oligodendrocytes",
+                                                         "Undifferentiated"))
+custom_colors <- c("Immature.Neurons" = "#00B6EB",
+                   "Neurons" = "#C49A00",
+                   "Immature.Astrocytes" = "#00C094",
+                   "Astrocytes" = "#F8766D",
+                   "OPC" = "#A58AFF",
+                   "Oligodendrocytes" = "#53B400",
+                   "Undifferentiated" = "#FB61D7")
+ggplot(data=gfp_percentage_cell_type, 
+            aes(x=Cell_Types, 
+                y=Percentage, 
+                fill = Cell_Types)) +
+  geom_bar(stat="identity", 
+           width = 0.7,
+           position=position_dodge()) + # width of bars
+  scale_fill_manual(values = custom_colors) +
+  ylab("% GFP mRNA positive") +
+  theme_bw() +
+  theme(
+    text = element_text(family = "Arial"),
+    axis.title.x = element_blank(),
+    axis.title.y = element_text(size=20*96/72),
+    axis.text.x = element_text(size=16*96/72, angle = 45, vjust = 0.9, hjust = 0.9),
+    axis.text.y = element_text(size=18*96/72),
+    legend.position = "none",
+    aspect.ratio = 1.1)
+h48_aav9_ut <- subset(h48_final, orig.ident == "AAV9" | orig.ident == "UT") 
+d4_aav9_ut <- subset(d4_final, orig.ident == "AAV9" | orig.ident == "UT")
+DimPlot(object = h48_aav9_ut,
+        reduction = "umap.harmony.orig.ident",
+        group.by = "RNA_snn_h.orig.ident_res.1.2",
+        split.by = "orig.ident",
+        label = F,
+        repel = T,
+        pt.size = 0.2,
+        label.size = 6,
+        ncol = 2) + 
+  scale_color_manual(labels=c("Astrocytes",
+                              "Neurons",
+                              "Oligodendrocytes",
+                              "Immature Astrocytes",
+                              "Immature Neurons",
+                              "OPC",
+                              "Undifferentiated"),
+                     values = ggplotColours(n = 7))
+DimPlot(object = d4_aav9_ut,
+        reduction = "umap.harmony.orig.ident",
+        group.by = "RNA_snn_h.orig.ident_res.1.2",
+        split.by = "orig.ident",
+        label = F,
+        repel = T,
+        pt.size = 0.2,
+        label.size = 6,
+        ncol = 2) + 
+  scale_color_manual(labels=c("Astrocytes",
+                              "Neurons",
+                              "Oligodendrocytes",
+                              "Immature Astrocytes",
+                              "Immature Neurons",
+                              "OPC",
+                              "Undifferentiated"),
+                     values = ggplotColours(n = 7))
+samples <- c("AAV9", "UT")
+tp <- c("h48","d4")
+for (t in tp) {
+  obj_t <- get(paste0(t,"_final"))
+  for (i in samples) {
+    obj_i <- subset(obj_t, orig.ident == i)
+    ggplot_object_clusters_labels <- melt(table(obj_i$RNA_snn_h.orig.ident_res.1.2))
+    colnames(ggplot_object_clusters_labels) <- c("Cell_type", "Count")
+    totalB <- ggplot_object_clusters_labels %>% mutate(percentage = Count/sum(Count))
+    totalB$percentage <- totalB$percentage*100
+    totalB$fraction = totalB$Count / sum(totalB$Count)
+    totalB$ymax <- cumsum(totalB$fraction)
+    totalB$ymin <- c(0, head(totalB$ymax, n=-1))
+    totalB$labelPosition <- (totalB$ymax + totalB$ymin) / 2
+    totalB$Samples <- c("Astrocytes", "Neurons", "Oligodendrocytes",
+                        "Immature\nAstrocytes", "Immature\nNeurons",
+                        "OPC", "Undifferentiated")
+    totalB$label <- paste0(totalB$Cell_type, ":","\n",round(totalB$percentage, 2), "%")
+    totalB$label2 <- c(paste0(totalB$Samples[1], ":","\n",round(totalB$percentage[1], 2), "%"),
+                       paste0(totalB$Samples[2], ":","\n",round(totalB$percentage[2], 2), "%"),
+                       paste0(totalB$Samples[3], ":","\n",round(totalB$percentage[3], 2), "%"),
+                       paste0(totalB$Samples[4], ": ",round(totalB$percentage[4], 2), "%"),
+                       paste0(totalB$Samples[5], ": ",round(totalB$percentage[5], 2), "%"),
+                       paste0(totalB$Samples[6], ":","\n",round(totalB$percentage[6], 2), "%"),
+                       paste0(totalB$Samples[7], ":","\n",round(totalB$percentage[7], 2), "%"))
+    donut_i <- ggplot(totalB, aes(ymax=ymax, ymin=ymin, xmax=4, xmin=3, fill=Cell_type)) +
+      geom_rect() +
+      geom_label_repel(x=3.5, aes(y=labelPosition, label=label), size=10) +
+      coord_polar(theta="y") +
+      xlim(c(2, 4)) +
+      ggtitle(paste0(i)) +
+      theme_void() +
+      theme(legend.position = "none",
+            plot.title = element_text(hjust = 0.5, size = 50, family = "Arial"),
+            plot.title.position = "plot")
+    assign(paste0("donut_",t,"_", i), donut_i)
+  }
+}
+donut_h48_final <- (donut_h48_AAV9 | donut_h48_UT)
+donut_d4_final <- (donut_d4_AAV9 | donut_d4_UT)
+## markers
+for (t in tp) {
+  obj_t <- get(paste0(t,"_final"))
+  neurons <- subset(
+    obj_t, 
+    RNA_snn_h.orig.ident_res.1.2=="Neurons" | RNA_snn_h.orig.ident_res.1.2=="Immature.Neurons")
+  astro <- subset(
+    obj_t, 
+    RNA_snn_h.orig.ident_res.1.2=="Astrocytes" | RNA_snn_h.orig.ident_res.1.2=="Immature.Astrocytes")
+  oligo <- subset(
+    obj_t, 
+    RNA_snn_h.orig.ident_res.1.2=="Oligodendrocytes" | RNA_snn_h.orig.ident_res.1.2=="OPC")
+  assign(paste0("neurons_",t,"_obj_x_markers"), neurons)
+  assign(paste0("astro_",t,"_obj_x_markers"), astro)
+  assign(paste0("oligo_",t,"_obj_x_markers"), oligo)
+}
+for (k in ls()[grep("_obj_x_markers", ls())]) {
+  cells <- get(k)
+  degs <- FindMarkers(object = cells, 
+                      ident.1 = "AAV9", 
+                      ident.2 = "UT", 
+                      group.by = "orig.ident",
+                      only.pos = FALSE, 
+                      min.pct = 0, 
+                      test.use = "wilcox", 
+                      logfc.threshold = 0,
+                      min.cells.group = 0)
+  assign(paste0(k,"_aav9_vs_ut_degs"), degs)
+}
+## gsea
+h_gmt <- read.gmt("data/h.all.v7.2.symbols.gmt")
+for (i in ls()[grep("aav9_vs_ut_degs", ls())]) {
+  gene_res <- get(i)
+  logfc_symbol <- as.vector(gene_res$avg_log2FC)
+  names(logfc_symbol) <- row.names(gene_res)
+  logfc_symbol <- sort(logfc_symbol, decreasing = TRUE)
+  contr.gsea <- GSEA(logfc_symbol, 
+                     TERM2GENE = h_gmt, 
+                     nPerm = 10000, 
+                     pvalueCutoff = 1)
+  contr.gsea.symb <- as.data.frame(contr.gsea)
+  assign(paste0("GSEA_",i), contr.gsea.symb)
+}
+## GSEA heatmaps
+type <- c("astro", "neurons", "oligo")
+for (p in type) {
+  hallmark.full <- data.frame()
+  for (ds in ls()[grep(paste0("GSEA_", p), ls())]) {
+    ds.t <- get(ds)
+    ds.t.filt <- ds.t[,c("ID", "setSize", "enrichmentScore", "NES", "pvalue", "p.adjust", "qvalues"), drop = FALSE]
+    ds.t.filt$Dataset <- ds
+    if (nrow(hallmark.full) == 0) {
+      hallmark.full <- ds.t.filt
+    } else {
+      hallmark.full <- rbind(hallmark.full, ds.t.filt)
+    }
+  }
+  hallmark.full.filt <- hallmark.full[, c("ID", "NES", "Dataset", "qvalues")]
+  hallmark.full.filt$ID <- gsub(x = hallmark.full.filt$ID, "HALLMARK_", "")
+  hallmark.full.filt$sig <- ifelse(
+    hallmark.full.filt$qvalues <= 0.05 & hallmark.full.filt$qvalues > 0.01, '*', 
+    ifelse(hallmark.full.filt$qvalues <= 0.01 & hallmark.full.filt$qvalues > 0.001, '**', 
+           ifelse(hallmark.full.filt$qvalues <= 0.001 & hallmark.full.filt$qvalues > 0.0001, '***', 
+                  ifelse(hallmark.full.filt$qvalues <= 0.0001, '****', ''))))
+  hallmark.order <- hallmark.full.filt %>% 
+    group_by(ID) %>% 
+    summarise(Pos = sum(NES))
+  hallmark.order.terms <- hallmark.order[order(hallmark.order$Pos, decreasing = FALSE), "ID", drop = FALSE]
+  hallmark.full.filt.tt <- reshape2::melt(reshape2::dcast(hallmark.full.filt, 
+                                                          ID ~ Dataset, 
+                                                          value.var="NES"), id.vars = c("ID"))
+  colnames(hallmark.full.filt.tt) <- c("ID", "Dataset", "NES")
+  hallmark.full.filt.tt$ID <- factor(hallmark.full.filt.tt$ID, levels = hallmark.order.terms$ID)
+  hallmark.full.filt.tt <- merge(hallmark.full.filt.tt, 
+                                 hallmark.full.filt[, c("ID", "Dataset", "sig")], 
+                                 by = c("ID", "Dataset"), 
+                                 all.x = TRUE)
+  labels <- c("Day4", "Day2")
+  p.hallmark <- ggplot(hallmark.full.filt.tt, 
+                       aes(x = Dataset, y = ID)) +
+    geom_tile(aes(fill = NES), colour = "white") +
+    geom_text(aes(label = paste(sig)), size=4*96/72, fontface = "bold") +
+    scale_fill_gradientn(colours = colorRampPalette(rev(brewer.pal(11,"RdBu")))(100),
+                         limits = c(-3.5, 3.5),
+                         na.value = "white") +
+    ylab("") + 
+    xlab("") +
+    coord_fixed(ratio = 0.6) +
+    scale_x_discrete(labels = labels) +
+    theme_bw(base_size = 12) +
+    theme(axis.text.x = element_text(angle = 45, hjust = 1, face = "bold", size = 18*96/72),
+          axis.text.y = element_text(face = "bold", size = 12*96/72),
+          legend.text = element_text(face = "bold", size = 14*96/72),
+          legend.title = element_text(face = "bold", size = 12*96/72))
+  assign(paste0("p.hallmark_", p), p.hallmark)
+}
--- a/scRNAseq_organoids/seurat_analysis.R
+++ b/scRNAseq_organoids/seurat_analysis.R
+# ========================================================-
+# Title:         Seurat analysis of organoids scRNAseq
+# Description:   Code to generate Fig4, SuppFig8, SuppFig9
+# Author:        Monah Abou Alezz
+# Date:          2025-03-06
+# ========================================================-
+suppressPackageStartupMessages(library(Seurat))
+suppressPackageStartupMessages(library(tidyverse))
+suppressPackageStartupMessages(library(reshape2))
+suppressPackageStartupMessages(library(ggrepel))
+suppressPackageStartupMessages(library(clusterProfiler))
+suppressPackageStartupMessages(library(RColorBrewer))
+## load seurat data
+h48_final <- readRDS("data/organoids_h48_final.rds")
+d5_final <- readRDS("data/organoids_D5_final.rds")
+Idents(h48_final) <- "RNA_snn_h.orig.ident_res.0.6"
+Idents(d5_final) <- "RNA_snn_h.orig.ident_res.0.6"
+h48_final@meta.data[['orig.ident']] <- recode_factor(
+  h48_final@meta.data[["orig.ident"]],
+  "Sample1" = "AAV2",
+  "Sample3" = "AAV9",
+  "Sample5" = "Spk",
+  "Sample7" = "UT")
+h48_final@meta.data[["RNA_snn_h.orig.ident_res.0.6"]] <- recode_factor(
+  h48_final@meta.data[["RNA_snn_h.orig.ident_res.0.6"]], 
+  "0" = "Astrocytes", 
+  "1" = "Immature.Astrocytes", 
+  "2" = "Undifferentiated", 
+  "3" = "Neurons", 
+  "4" = "Immature.Neurons", 
+  "5" = "OPC", 
+  "6" = "Astrocytes", 
+  "7" = "Oligodendrocytes", 
+  "8" = "Undifferentiated",
+  "9" = "Astrocytes",
+  "10" = "Undifferentiated",
+  "11" = "Undifferentiated")
+h48_final$RNA_snn_h.orig.ident_res.0.6 <- factor(
+  h48_final$RNA_snn_h.orig.ident_res.0.6,
+  levels = c("Astrocytes",
+             "Neurons",
+             "Oligodendrocytes",
+             "Immature.Astrocytes",
+             "Immature.Neurons",
+             "OPC",
+             "Undifferentiated"))
+d5_final@meta.data[['orig.ident']] <- recode_factor(
+  d5_final@meta.data[["orig.ident"]],
+  "Sample2" = "AAV2",
+  "Sample4" = "AAV9",
+  "Sample6" = "Spk",
+  "Sample8" = "UT")
+d5_final@meta.data[["RNA_snn_h.orig.ident_res.0.6"]] <- recode_factor(
+  d5_final@meta.data[["RNA_snn_h.orig.ident_res.0.6"]], 
+  "0" = "Astrocytes", 
+  "1" = "Immature.Neurons", 
+  "2" = "Undifferentiated", 
+  "3" = "Neurons", 
+  "4" = "Oligodendrocytes", 
+  "5" = "Undifferentiated", 
+  "6" = "Undifferentiated", 
+  "7" = "Immature.Astrocytes", 
+  "8" = "OPC",
+  "9" = "Immature.Neurons",
+  "10" = "Astrocytes",
+  "11" = "Undifferentiated",
+  "12" = "Astrocytes",
+  "13" = "Neurons")
+d5_final$RNA_snn_h.orig.ident_res.0.6 <- factor(
+  d5_final$RNA_snn_h.orig.ident_res.0.6,
+  levels = c("Astrocytes",
+             "Neurons",
+             "Oligodendrocytes",
+             "Immature.Astrocytes",
+             "Immature.Neurons",
+             "OPC",
+             "Undifferentiated"))
+ggplotColours <- function(n = 6, h = c(0, 360) + 15){
+  if ((diff(h) %% 360) < 1) h[2] <- h[2] - 360/n
+  hcl(h = (seq(h[1], h[2], length = n)), c = 100, l = 65)
+}
+## umap
+DimPlot(object = h48_final,
+        reduction = "umap.harmony.orig.ident",
+        group.by = "RNA_snn_h.orig.ident_res.0.6",
+        label = T,
+        repel = T,
+        pt.size = 0.2,
+        label.size = 6) +
+  scale_color_manual(labels=c("Astrocytes",
+                              "Neurons",
+                              "Oligodendrocytes",
+                              "Immature Astrocytes",
+                              "Immature Neurons",
+                              "OPC",
+                              "Undifferentiated"),
+                     values = ggplotColours(n = 7))
+## donut plot
+ggplot_object_clusters_labels <- melt(table(h48_final$RNA_snn_h.orig.ident_res.0.6))
+colnames(ggplot_object_clusters_labels) <- c("Cell_type", "Count")
+totalB <- ggplot_object_clusters_labels %>% mutate(percentage = Count/sum(Count))
+totalB$percentage <- totalB$percentage*100
+totalB$fraction = totalB$Count / sum(totalB$Count)
+totalB$ymax <- cumsum(totalB$fraction)
+totalB$ymin <- c(0, head(totalB$ymax, n=-1))
+totalB$labelPosition <- (totalB$ymax + totalB$ymin) / 2
+totalB$Samples <- c("Astrocytes", "Neurons", "Oligodendrocytes",
+                    "Immature\nAstrocytes", "Immature\nNeurons",
+                    "OPC", "Undifferentiated")
+totalB$label <- paste0(totalB$Cell_type, ":","\n",round(totalB$percentage, 2), "%")
+totalB$label2 <- c(paste0(totalB$Samples[1], ":","\n",round(totalB$percentage[1], 2), "%"),
+                   paste0(totalB$Samples[2], ":","\n",round(totalB$percentage[2], 2), "%"),
+                   paste0(totalB$Samples[3], ":","\n",round(totalB$percentage[3], 2), "%"),
+                   paste0(totalB$Samples[4], ": ",round(totalB$percentage[4], 2), "%"),
+                   paste0(totalB$Samples[5], ": ",round(totalB$percentage[5], 2), "%"),
+                   paste0(totalB$Samples[6], ":","\n",round(totalB$percentage[6], 2), "%"),
+                   paste0(totalB$Samples[7], ":","\n",round(totalB$percentage[7], 2), "%"))
+ggplot(totalB, aes(ymax=ymax, ymin=ymin, xmax=4, xmin=3, fill=Cell_type)) +
+  geom_rect() +
+  geom_label_repel(x=3.5, aes(y=labelPosition, label=label), size=10) +
+  coord_polar(theta="y") +
+  xlim(c(2, 4)) +
+  theme_void() +
+  theme(legend.position = "none")
+## barplot
+PrctCellExpringGene <- function(object, genes, group.by = "all"){
+  if(group.by == "all"){
+    prct = unlist(lapply(genes,calc_helper, object=object))
+    result = data.frame(Markers = genes, Cell_proportion = prct)
+    return(result)
+  }
+  else{        
+    list = SplitObject(object, group.by)
+    factors = names(list)
+    results = lapply(list, PrctCellExpringGene, genes=genes)
+    for(i in 1:length(factors)){
+      results[[i]]$Feature = factors[i]
+    }
+    combined = do.call("rbind", results)
+    return(combined)
+  }
+}
+calc_helper <- function(object,genes){
+  counts = object[['RNA']]@counts
+  ncells = ncol(counts)
+  if(genes %in% row.names(counts)){
+    sum(counts[genes,]>0)/ncells
+  }else{return(NA)}
+}
+aav9 <- subset(h48_final, orig.ident=="AAV9")
+gfp_percentage_cell_type <- PrctCellExpringGene(aav9,
+                                                genes ="GFP",
+                                                group.by = "RNA_snn_h.orig.ident_res.0.6")
+gfp_percentage_cell_type$Percentage <- gfp_percentage_cell_type$Cell_proportion*100
+gfp_percentage_cell_type <- gfp_percentage_cell_type[,c(4,3)]
+colnames(gfp_percentage_cell_type) <- c("Percentage", "Cell_Types")
+gfp_percentage_cell_type <- gfp_percentage_cell_type[c(7,1,6,4,5,3,2),]
+gfp_percentage_cell_type$Cell_Types <- factor(gfp_percentage_cell_type$Cell_Types,
+                                              levels = c("Immature.Neurons",
+                                                         "Neurons",
+                                                         "Immature.Astrocytes",
+                                                         "Astrocytes",
+                                                         "OPC",
+                                                         "Oligodendrocytes",
+                                                         "Undifferentiated"))
+custom_colors <- c("Immature.Neurons" = "#00B6EB",
+                   "Neurons" = "#C49A00",
+                   "Immature.Astrocytes" = "#00C094",
+                   "Astrocytes" = "#F8766D",
+                   "OPC" = "#A58AFF",
+                   "Oligodendrocytes" = "#53B400",
+                   "Undifferentiated" = "#FB61D7")
+ggplot(data=gfp_percentage_cell_type, 
+       aes(x=Cell_Types, 
+           y=Percentage, 
+           fill = Cell_Types)) +
+  geom_bar(stat="identity", 
+           width = 0.7,
+           position=position_dodge()) + # width of bars
+  scale_fill_manual(values = custom_colors) +
+  ylab("% GFP mRNA positive") +
+  theme_bw() +
+  theme(
+    text = element_text(family = "Arial"),
+    axis.title.x = element_blank(),
+    axis.title.y = element_text(size=20*96/72),
+    axis.text.x = element_text(size=16*96/72, angle = 45, vjust = 0.9, hjust = 0.9),
+    axis.text.y = element_text(size=18*96/72),
+    legend.position = "none",
+    aspect.ratio = 1.1)
+h48_aav9_ut <- subset(h48_final, orig.ident == "AAV9" | orig.ident == "UT") 
+d4_aav9_ut <- subset(d4_final, orig.ident == "AAV9" | orig.ident == "UT")
+DimPlot(object = h48_aav9_ut,
+        reduction = "umap.harmony.orig.ident",
+        group.by = "RNA_snn_h.orig.ident_res.1.2",
+        split.by = "orig.ident",
+        label = F,
+        repel = T,
+        pt.size = 0.2,
+        label.size = 6,
+        ncol = 2) + 
+  scale_color_manual(labels=c("Astrocytes",
+                              "Neurons",
+                              "Oligodendrocytes",
+                              "Immature Astrocytes",
+                              "Immature Neurons",
+                              "OPC",
+                              "Undifferentiated"),
+                     values = ggplotColours(n = 7))
+DimPlot(object = d4_aav9_ut,
+        reduction = "umap.harmony.orig.ident",
+        group.by = "RNA_snn_h.orig.ident_res.1.2",
+        split.by = "orig.ident",
+        label = F,
+        repel = T,
+        pt.size = 0.2,
+        label.size = 6,
+        ncol = 2) + 
+  scale_color_manual(labels=c("Astrocytes",
+                              "Neurons",
+                              "Oligodendrocytes",
+                              "Immature Astrocytes",
+                              "Immature Neurons",
+                              "OPC",
+                              "Undifferentiated"),
+                     values = ggplotColours(n = 7))
+samples <- c("AAV9", "UT")
+tp <- c("h48","d4")
+for (t in tp) {
+  obj_t <- get(paste0(t,"_final"))
+  for (i in samples) {
+    obj_i <- subset(obj_t, orig.ident == i)
+    ggplot_object_clusters_labels <- melt(table(obj_i$RNA_snn_h.orig.ident_res.1.2))
+    colnames(ggplot_object_clusters_labels) <- c("Cell_type", "Count")
+    totalB <- ggplot_object_clusters_labels %>% mutate(percentage = Count/sum(Count))
+    totalB$percentage <- totalB$percentage*100
+    totalB$fraction = totalB$Count / sum(totalB$Count)
+    totalB$ymax <- cumsum(totalB$fraction)
+    totalB$ymin <- c(0, head(totalB$ymax, n=-1))
+    totalB$labelPosition <- (totalB$ymax + totalB$ymin) / 2
+    totalB$Samples <- c("Astrocytes", "Neurons", "Oligodendrocytes",
+                        "Immature\nAstrocytes", "Immature\nNeurons",
+                        "OPC", "Undifferentiated")
+    totalB$label <- paste0(totalB$Cell_type, ":","\n",round(totalB$percentage, 2), "%")
+    totalB$label2 <- c(paste0(totalB$Samples[1], ":","\n",round(totalB$percentage[1], 2), "%"),
+                       paste0(totalB$Samples[2], ":","\n",round(totalB$percentage[2], 2), "%"),
+                       paste0(totalB$Samples[3], ":","\n",round(totalB$percentage[3], 2), "%"),
+                       paste0(totalB$Samples[4], ": ",round(totalB$percentage[4], 2), "%"),
+                       paste0(totalB$Samples[5], ": ",round(totalB$percentage[5], 2), "%"),
+                       paste0(totalB$Samples[6], ":","\n",round(totalB$percentage[6], 2), "%"),
+                       paste0(totalB$Samples[7], ":","\n",round(totalB$percentage[7], 2), "%"))
+    donut_i <- ggplot(totalB, aes(ymax=ymax, ymin=ymin, xmax=4, xmin=3, fill=Cell_type)) +
+      geom_rect() +
+      geom_label_repel(x=3.5, aes(y=labelPosition, label=label), size=10) +
+      coord_polar(theta="y") +
+      xlim(c(2, 4)) +
+      ggtitle(paste0(i)) +
+      theme_void() +
+      theme(legend.position = "none",
+            plot.title = element_text(hjust = 0.5, size = 50, family = "Arial"),
+            plot.title.position = "plot")
+    assign(paste0("donut_",t,"_", i), donut_i)
+  }
+}
+donut_h48_final <- (donut_h48_AAV9 | donut_h48_UT)
+donut_d4_final <- (donut_d4_AAV9 | donut_d4_UT)
+## markers
+for (t in tp) {
+  obj_t <- get(paste0(t,"_final"))
+  neurons <- subset(
+    obj_t, 
+    RNA_snn_h.orig.ident_res.1.2=="Neurons" | RNA_snn_h.orig.ident_res.1.2=="Immature.Neurons")
+  astro <- subset(
+    obj_t, 
+    RNA_snn_h.orig.ident_res.1.2=="Astrocytes" | RNA_snn_h.orig.ident_res.1.2=="Immature.Astrocytes")
+  oligo <- subset(
+    obj_t, 
+    RNA_snn_h.orig.ident_res.1.2=="Oligodendrocytes" | RNA_snn_h.orig.ident_res.1.2=="OPC")
+  assign(paste0("neurons_",t,"_obj_x_markers"), neurons)
+  assign(paste0("astro_",t,"_obj_x_markers"), astro)
+  assign(paste0("oligo_",t,"_obj_x_markers"), oligo)
+}
+for (k in ls()[grep("_obj_x_markers", ls())]) {
+  cells <- get(k)
+  degs <- FindMarkers(object = cells, 
+                      ident.1 = "AAV9", 
+                      ident.2 = "UT", 
+                      group.by = "orig.ident",
+                      only.pos = FALSE, 
+                      min.pct = 0, 
+                      test.use = "wilcox", 
+                      logfc.threshold = 0,
+                      min.cells.group = 0)
+  assign(paste0(k,"_aav9_vs_ut_degs"), degs)
+}
+## gsea
+h_gmt <- read.gmt("data/h.all.v7.2.symbols.gmt")
+for (i in ls()[grep("aav9_vs_ut_degs", ls())]) {
+  gene_res <- get(i)
+  logfc_symbol <- as.vector(gene_res$avg_log2FC)
+  names(logfc_symbol) <- row.names(gene_res)
+  logfc_symbol <- sort(logfc_symbol, decreasing = TRUE)
+  contr.gsea <- GSEA(logfc_symbol, 
+                     TERM2GENE = h_gmt, 
+                     nPerm = 10000, 
+                     pvalueCutoff = 1)
+  contr.gsea.symb <- as.data.frame(contr.gsea)
+  assign(paste0("GSEA_",i), contr.gsea.symb)
+}
+## GSEA heatmaps
+type <- c("astro", "neurons", "oligo")
+for (p in type) {
+  hallmark.full <- data.frame()
+  for (ds in ls()[grep(paste0("GSEA_", p), ls())]) {
+    ds.t <- get(ds)
+    ds.t.filt <- ds.t[,c("ID", "setSize", "enrichmentScore", "NES", "pvalue", "p.adjust", "qvalues"), drop = FALSE]
+    ds.t.filt$Dataset <- ds
+    if (nrow(hallmark.full) == 0) {
+      hallmark.full <- ds.t.filt
+    } else {
+      hallmark.full <- rbind(hallmark.full, ds.t.filt)
+    }
+  }
+  hallmark.full.filt <- hallmark.full[, c("ID", "NES", "Dataset", "qvalues")]
+  hallmark.full.filt$ID <- gsub(x = hallmark.full.filt$ID, "HALLMARK_", "")
+  hallmark.full.filt$sig <- ifelse(
+    hallmark.full.filt$qvalues <= 0.05 & hallmark.full.filt$qvalues > 0.01, '*', 
+    ifelse(hallmark.full.filt$qvalues <= 0.01 & hallmark.full.filt$qvalues > 0.001, '**', 
+           ifelse(hallmark.full.filt$qvalues <= 0.001 & hallmark.full.filt$qvalues > 0.0001, '***', 
+                  ifelse(hallmark.full.filt$qvalues <= 0.0001, '****', ''))))
+  hallmark.order <- hallmark.full.filt %>% 
+    group_by(ID) %>% 
+    summarise(Pos = sum(NES))
+  hallmark.order.terms <- hallmark.order[order(hallmark.order$Pos, decreasing = FALSE), "ID", drop = FALSE]
+  hallmark.full.filt.tt <- reshape2::melt(reshape2::dcast(hallmark.full.filt, 
+                                                          ID ~ Dataset, 
+                                                          value.var="NES"), id.vars = c("ID"))
+  colnames(hallmark.full.filt.tt) <- c("ID", "Dataset", "NES")
+  hallmark.full.filt.tt$ID <- factor(hallmark.full.filt.tt$ID, levels = hallmark.order.terms$ID)
+  hallmark.full.filt.tt <- merge(hallmark.full.filt.tt, 
+                                 hallmark.full.filt[, c("ID", "Dataset", "sig")], 
+                                 by = c("ID", "Dataset"), 
+                                 all.x = TRUE)
+  labels <- c("Day4", "Day2")
+  p.hallmark <- ggplot(hallmark.full.filt.tt, 
+                       aes(x = Dataset, y = ID)) +
+    geom_tile(aes(fill = NES), colour = "white") +
+    geom_text(aes(label = paste(sig)), size=4*96/72, fontface = "bold") +
+    scale_fill_gradientn(colours = colorRampPalette(rev(brewer.pal(11,"RdBu")))(100),
+                         limits = c(-3.5, 3.5),
+                         na.value = "white") +
+    ylab("") + 
+    xlab("") +
+    coord_fixed(ratio = 0.6) +
+    scale_x_discrete(labels = labels) +
+    theme_bw(base_size = 12) +
+    theme(axis.text.x = element_text(angle = 45, hjust = 1, face = "bold", size = 18*96/72),
+          axis.text.y = element_text(face = "bold", size = 12*96/72),
+          legend.text = element_text(face = "bold", size = 14*96/72),
+          legend.title = element_text(face = "bold", size = 12*96/72))
+  assign(paste0("p.hallmark_", p), p.hallmark)
+}
--- a/scRNAseq_transgene/seurat_analysis.R
+++ b/scRNAseq_transgene/seurat_analysis.R
+# ========================================================-
+# Title:         Seurat analysis of scRNAseq analysis from cells with transgenes
+# Description:   Code to generate Fig5, SuppFig15
+# Author:        Monah Abou Alezz
+# Date:          2025-03-06
+# ========================================================-
+suppressPackageStartupMessages(library(Seurat))
+suppressPackageStartupMessages(library(tidyverse))
+suppressPackageStartupMessages(library(reshape2))
+suppressPackageStartupMessages(library(ggrepel))
+suppressPackageStartupMessages(library(clusterProfiler))
+suppressPackageStartupMessages(library(RColorBrewer))
+## load seurat data
+h48_final <- readRDS("data/organoids_h48_final.rds")
+d5_final <- readRDS("data/organoids_D5_final.rds")
+Idents(h48_final) <- "RNA_snn_h.orig.ident_res.0.6"
+Idents(d5_final) <- "RNA_snn_h.orig.ident_res.0.6"
+h48_final@meta.data[['orig.ident']] <- recode_factor(
+  h48_final@meta.data[["orig.ident"]],
+  "Sample1" = "AAV2",
+  "Sample3" = "AAV9",
+  "Sample5" = "Spk",
+  "Sample7" = "UT")
+h48_final@meta.data[["RNA_snn_h.orig.ident_res.0.6"]] <- recode_factor(
+  h48_final@meta.data[["RNA_snn_h.orig.ident_res.0.6"]], 
+  "0" = "Astrocytes", 
+  "1" = "Immature.Astrocytes", 
+  "2" = "Undifferentiated", 
+  "3" = "Neurons", 
+  "4" = "Immature.Neurons", 
+  "5" = "OPC", 
+  "6" = "Astrocytes", 
+  "7" = "Oligodendrocytes", 
+  "8" = "Undifferentiated",
+  "9" = "Astrocytes",
+  "10" = "Undifferentiated",
+  "11" = "Undifferentiated")
+h48_final$RNA_snn_h.orig.ident_res.0.6 <- factor(
+  h48_final$RNA_snn_h.orig.ident_res.0.6,
+  levels = c("Astrocytes",
+             "Neurons",
+             "Oligodendrocytes",
+             "Immature.Astrocytes",
+             "Immature.Neurons",
+             "OPC",
+             "Undifferentiated"))
+d5_final@meta.data[['orig.ident']] <- recode_factor(
+  d5_final@meta.data[["orig.ident"]],
+  "Sample2" = "AAV2",
+  "Sample4" = "AAV9",
+  "Sample6" = "Spk",
+  "Sample8" = "UT")
+d5_final@meta.data[["RNA_snn_h.orig.ident_res.0.6"]] <- recode_factor(
+  d5_final@meta.data[["RNA_snn_h.orig.ident_res.0.6"]], 
+  "0" = "Astrocytes", 
+  "1" = "Immature.Neurons", 
+  "2" = "Undifferentiated", 
+  "3" = "Neurons", 
+  "4" = "Oligodendrocytes", 
+  "5" = "Undifferentiated", 
+  "6" = "Undifferentiated", 
+  "7" = "Immature.Astrocytes", 
+  "8" = "OPC",
+  "9" = "Immature.Neurons",
+  "10" = "Astrocytes",
+  "11" = "Undifferentiated",
+  "12" = "Astrocytes",
+  "13" = "Neurons")
+d5_final$RNA_snn_h.orig.ident_res.0.6 <- factor(
+  d5_final$RNA_snn_h.orig.ident_res.0.6,
+  levels = c("Astrocytes",
+             "Neurons",
+             "Oligodendrocytes",
+             "Immature.Astrocytes",
+             "Immature.Neurons",
+             "OPC",
+             "Undifferentiated"))
+ggplotColours <- function(n = 6, h = c(0, 360) + 15){
+  if ((diff(h) %% 360) < 1) h[2] <- h[2] - 360/n
+  hcl(h = (seq(h[1], h[2], length = n)), c = 100, l = 65)
+}
+## umap
+DimPlot(object = h48_final,
+        reduction = "umap.harmony.orig.ident",
+        group.by = "RNA_snn_h.orig.ident_res.0.6",
+        label = T,
+        repel = T,
+        pt.size = 0.2,
+        label.size = 6) +
+  scale_color_manual(labels=c("Astrocytes",
+                              "Neurons",
+                              "Oligodendrocytes",
+                              "Immature Astrocytes",
+                              "Immature Neurons",
+                              "OPC",
+                              "Undifferentiated"),
+                     values = ggplotColours(n = 7))
+## donut plot
+ggplot_object_clusters_labels <- melt(table(h48_final$RNA_snn_h.orig.ident_res.0.6))
+colnames(ggplot_object_clusters_labels) <- c("Cell_type", "Count")
+totalB <- ggplot_object_clusters_labels %>% mutate(percentage = Count/sum(Count))
+totalB$percentage <- totalB$percentage*100
+totalB$fraction = totalB$Count / sum(totalB$Count)
+totalB$ymax <- cumsum(totalB$fraction)
+totalB$ymin <- c(0, head(totalB$ymax, n=-1))
+totalB$labelPosition <- (totalB$ymax + totalB$ymin) / 2
+totalB$Samples <- c("Astrocytes", "Neurons", "Oligodendrocytes",
+                    "Immature\nAstrocytes", "Immature\nNeurons",
+                    "OPC", "Undifferentiated")
+totalB$label <- paste0(totalB$Cell_type, ":","\n",round(totalB$percentage, 2), "%")
+totalB$label2 <- c(paste0(totalB$Samples[1], ":","\n",round(totalB$percentage[1], 2), "%"),
+                   paste0(totalB$Samples[2], ":","\n",round(totalB$percentage[2], 2), "%"),
+                   paste0(totalB$Samples[3], ":","\n",round(totalB$percentage[3], 2), "%"),
+                   paste0(totalB$Samples[4], ": ",round(totalB$percentage[4], 2), "%"),
+                   paste0(totalB$Samples[5], ": ",round(totalB$percentage[5], 2), "%"),
+                   paste0(totalB$Samples[6], ":","\n",round(totalB$percentage[6], 2), "%"),
+                   paste0(totalB$Samples[7], ":","\n",round(totalB$percentage[7], 2), "%"))
+ggplot(totalB, aes(ymax=ymax, ymin=ymin, xmax=4, xmin=3, fill=Cell_type)) +
+  geom_rect() +
+  geom_label_repel(x=3.5, aes(y=labelPosition, label=label), size=10) +
+  coord_polar(theta="y") +
+  xlim(c(2, 4)) +
+  theme_void() +
+  theme(legend.position = "none")
+## barplot
+PrctCellExpringGene <- function(object, genes, group.by = "all"){
+  if(group.by == "all"){
+    prct = unlist(lapply(genes,calc_helper, object=object))
+    result = data.frame(Markers = genes, Cell_proportion = prct)
+    return(result)
+  }
+  else{        
+    list = SplitObject(object, group.by)
+    factors = names(list)
+    results = lapply(list, PrctCellExpringGene, genes=genes)
+    for(i in 1:length(factors)){
+      results[[i]]$Feature = factors[i]
+    }
+    combined = do.call("rbind", results)
+    return(combined)
+  }
+}
+calc_helper <- function(object,genes){
+  counts = object[['RNA']]@counts
+  ncells = ncol(counts)
+  if(genes %in% row.names(counts)){
+    sum(counts[genes,]>0)/ncells
+  }else{return(NA)}
+}
+aav9 <- subset(h48_final, orig.ident=="AAV9")
+gfp_percentage_cell_type <- PrctCellExpringGene(aav9,
+                                                genes ="GFP",
+                                                group.by = "RNA_snn_h.orig.ident_res.0.6")
+gfp_percentage_cell_type$Percentage <- gfp_percentage_cell_type$Cell_proportion*100
+gfp_percentage_cell_type <- gfp_percentage_cell_type[,c(4,3)]
+colnames(gfp_percentage_cell_type) <- c("Percentage", "Cell_Types")
+gfp_percentage_cell_type <- gfp_percentage_cell_type[c(7,1,6,4,5,3,2),]
+gfp_percentage_cell_type$Cell_Types <- factor(gfp_percentage_cell_type$Cell_Types,
+                                              levels = c("Immature.Neurons",
+                                                         "Neurons",
+                                                         "Immature.Astrocytes",
+                                                         "Astrocytes",
+                                                         "OPC",
+                                                         "Oligodendrocytes",
+                                                         "Undifferentiated"))
+custom_colors <- c("Immature.Neurons" = "#00B6EB",
+                   "Neurons" = "#C49A00",
+                   "Immature.Astrocytes" = "#00C094",
+                   "Astrocytes" = "#F8766D",
+                   "OPC" = "#A58AFF",
+                   "Oligodendrocytes" = "#53B400",
+                   "Undifferentiated" = "#FB61D7")
+ggplot(data=gfp_percentage_cell_type, 
+       aes(x=Cell_Types, 
+           y=Percentage, 
+           fill = Cell_Types)) +
+  geom_bar(stat="identity", 
+           width = 0.7,
+           position=position_dodge()) + # width of bars
+  scale_fill_manual(values = custom_colors) +
+  ylab("% GFP mRNA positive") +
+  theme_bw() +
+  theme(
+    text = element_text(family = "Arial"),
+    axis.title.x = element_blank(),
+    axis.title.y = element_text(size=20*96/72),
+    axis.text.x = element_text(size=16*96/72, angle = 45, vjust = 0.9, hjust = 0.9),
+    axis.text.y = element_text(size=18*96/72),
+    legend.position = "none",
+    aspect.ratio = 1.1)
+h48_aav9_ut <- subset(h48_final, orig.ident == "AAV9" | orig.ident == "UT") 
+d4_aav9_ut <- subset(d4_final, orig.ident == "AAV9" | orig.ident == "UT")
+DimPlot(object = h48_aav9_ut,
+        reduction = "umap.harmony.orig.ident",
+        group.by = "RNA_snn_h.orig.ident_res.1.2",
+        split.by = "orig.ident",
+        label = F,
+        repel = T,
+        pt.size = 0.2,
+        label.size = 6,
+        ncol = 2) + 
+  scale_color_manual(labels=c("Astrocytes",
+                              "Neurons",
+                              "Oligodendrocytes",
+                              "Immature Astrocytes",
+                              "Immature Neurons",
+                              "OPC",
+                              "Undifferentiated"),
+                     values = ggplotColours(n = 7))
+DimPlot(object = d4_aav9_ut,
+        reduction = "umap.harmony.orig.ident",
+        group.by = "RNA_snn_h.orig.ident_res.1.2",
+        split.by = "orig.ident",
+        label = F,
+        repel = T,
+        pt.size = 0.2,
+        label.size = 6,
+        ncol = 2) + 
+  scale_color_manual(labels=c("Astrocytes",
+                              "Neurons",
+                              "Oligodendrocytes",
+                              "Immature Astrocytes",
+                              "Immature Neurons",
+                              "OPC",
+                              "Undifferentiated"),
+                     values = ggplotColours(n = 7))
+samples <- c("AAV9", "UT")
+tp <- c("h48","d4")
+for (t in tp) {
+  obj_t <- get(paste0(t,"_final"))
+  for (i in samples) {
+    obj_i <- subset(obj_t, orig.ident == i)
+    ggplot_object_clusters_labels <- melt(table(obj_i$RNA_snn_h.orig.ident_res.1.2))
+    colnames(ggplot_object_clusters_labels) <- c("Cell_type", "Count")
+    totalB <- ggplot_object_clusters_labels %>% mutate(percentage = Count/sum(Count))
+    totalB$percentage <- totalB$percentage*100
+    totalB$fraction = totalB$Count / sum(totalB$Count)
+    totalB$ymax <- cumsum(totalB$fraction)
+    totalB$ymin <- c(0, head(totalB$ymax, n=-1))
+    totalB$labelPosition <- (totalB$ymax + totalB$ymin) / 2
+    totalB$Samples <- c("Astrocytes", "Neurons", "Oligodendrocytes",
+                        "Immature\nAstrocytes", "Immature\nNeurons",
+                        "OPC", "Undifferentiated")
+    totalB$label <- paste0(totalB$Cell_type, ":","\n",round(totalB$percentage, 2), "%")
+    totalB$label2 <- c(paste0(totalB$Samples[1], ":","\n",round(totalB$percentage[1], 2), "%"),
+                       paste0(totalB$Samples[2], ":","\n",round(totalB$percentage[2], 2), "%"),
+                       paste0(totalB$Samples[3], ":","\n",round(totalB$percentage[3], 2), "%"),
+                       paste0(totalB$Samples[4], ": ",round(totalB$percentage[4], 2), "%"),
+                       paste0(totalB$Samples[5], ": ",round(totalB$percentage[5], 2), "%"),
+                       paste0(totalB$Samples[6], ":","\n",round(totalB$percentage[6], 2), "%"),
+                       paste0(totalB$Samples[7], ":","\n",round(totalB$percentage[7], 2), "%"))
+    donut_i <- ggplot(totalB, aes(ymax=ymax, ymin=ymin, xmax=4, xmin=3, fill=Cell_type)) +
+      geom_rect() +
+      geom_label_repel(x=3.5, aes(y=labelPosition, label=label), size=10) +
+      coord_polar(theta="y") +
+      xlim(c(2, 4)) +
+      ggtitle(paste0(i)) +
+      theme_void() +
+      theme(legend.position = "none",
+            plot.title = element_text(hjust = 0.5, size = 50, family = "Arial"),
+            plot.title.position = "plot")
+    assign(paste0("donut_",t,"_", i), donut_i)
+  }
+}
+donut_h48_final <- (donut_h48_AAV9 | donut_h48_UT)
+donut_d4_final <- (donut_d4_AAV9 | donut_d4_UT)
+## markers
+for (t in tp) {
+  obj_t <- get(paste0(t,"_final"))
+  neurons <- subset(
+    obj_t, 
+    RNA_snn_h.orig.ident_res.1.2=="Neurons" | RNA_snn_h.orig.ident_res.1.2=="Immature.Neurons")
+  astro <- subset(
+    obj_t, 
+    RNA_snn_h.orig.ident_res.1.2=="Astrocytes" | RNA_snn_h.orig.ident_res.1.2=="Immature.Astrocytes")
+  oligo <- subset(
+    obj_t, 
+    RNA_snn_h.orig.ident_res.1.2=="Oligodendrocytes" | RNA_snn_h.orig.ident_res.1.2=="OPC")
+  assign(paste0("neurons_",t,"_obj_x_markers"), neurons)
+  assign(paste0("astro_",t,"_obj_x_markers"), astro)
+  assign(paste0("oligo_",t,"_obj_x_markers"), oligo)
+}
+for (k in ls()[grep("_obj_x_markers", ls())]) {
+  cells <- get(k)
+  degs <- FindMarkers(object = cells, 
+                      ident.1 = "AAV9", 
+                      ident.2 = "UT", 
+                      group.by = "orig.ident",
+                      only.pos = FALSE, 
+                      min.pct = 0, 
+                      test.use = "wilcox", 
+                      logfc.threshold = 0,
+                      min.cells.group = 0)
+  assign(paste0(k,"_aav9_vs_ut_degs"), degs)
+}
+## gsea
+h_gmt <- read.gmt("data/h.all.v7.2.symbols.gmt")
+for (i in ls()[grep("aav9_vs_ut_degs", ls())]) {
+  gene_res <- get(i)
+  logfc_symbol <- as.vector(gene_res$avg_log2FC)
+  names(logfc_symbol) <- row.names(gene_res)
+  logfc_symbol <- sort(logfc_symbol, decreasing = TRUE)
+  contr.gsea <- GSEA(logfc_symbol, 
+                     TERM2GENE = h_gmt, 
+                     nPerm = 10000, 
+                     pvalueCutoff = 1)
+  contr.gsea.symb <- as.data.frame(contr.gsea)
+  assign(paste0("GSEA_",i), contr.gsea.symb)
+}
+## GSEA heatmaps
+type <- c("astro", "neurons", "oligo")
+for (p in type) {
+  hallmark.full <- data.frame()
+  for (ds in ls()[grep(paste0("GSEA_", p), ls())]) {
+    ds.t <- get(ds)
+    ds.t.filt <- ds.t[,c("ID", "setSize", "enrichmentScore", "NES", "pvalue", "p.adjust", "qvalues"), drop = FALSE]
+    ds.t.filt$Dataset <- ds
+    if (nrow(hallmark.full) == 0) {
+      hallmark.full <- ds.t.filt
+    } else {
+      hallmark.full <- rbind(hallmark.full, ds.t.filt)
+    }
+  }
+  hallmark.full.filt <- hallmark.full[, c("ID", "NES", "Dataset", "qvalues")]
+  hallmark.full.filt$ID <- gsub(x = hallmark.full.filt$ID, "HALLMARK_", "")
+  hallmark.full.filt$sig <- ifelse(
+    hallmark.full.filt$qvalues <= 0.05 & hallmark.full.filt$qvalues > 0.01, '*', 
+    ifelse(hallmark.full.filt$qvalues <= 0.01 & hallmark.full.filt$qvalues > 0.001, '**', 
+           ifelse(hallmark.full.filt$qvalues <= 0.001 & hallmark.full.filt$qvalues > 0.0001, '***', 
+                  ifelse(hallmark.full.filt$qvalues <= 0.0001, '****', ''))))
+  hallmark.order <- hallmark.full.filt %>% 
+    group_by(ID) %>% 
+    summarise(Pos = sum(NES))
+  hallmark.order.terms <- hallmark.order[order(hallmark.order$Pos, decreasing = FALSE), "ID", drop = FALSE]
+  hallmark.full.filt.tt <- reshape2::melt(reshape2::dcast(hallmark.full.filt, 
+                                                          ID ~ Dataset, 
+                                                          value.var="NES"), id.vars = c("ID"))
+  colnames(hallmark.full.filt.tt) <- c("ID", "Dataset", "NES")
+  hallmark.full.filt.tt$ID <- factor(hallmark.full.filt.tt$ID, levels = hallmark.order.terms$ID)
+  hallmark.full.filt.tt <- merge(hallmark.full.filt.tt, 
+                                 hallmark.full.filt[, c("ID", "Dataset", "sig")], 
+                                 by = c("ID", "Dataset"), 
+                                 all.x = TRUE)
+  labels <- c("Day4", "Day2")
+  p.hallmark <- ggplot(hallmark.full.filt.tt, 
+                       aes(x = Dataset, y = ID)) +
+    geom_tile(aes(fill = NES), colour = "white") +
+    geom_text(aes(label = paste(sig)), size=4*96/72, fontface = "bold") +
+    scale_fill_gradientn(colours = colorRampPalette(rev(brewer.pal(11,"RdBu")))(100),
+                         limits = c(-3.5, 3.5),
+                         na.value = "white") +
+    ylab("") + 
+    xlab("") +
+    coord_fixed(ratio = 0.6) +
+    scale_x_discrete(labels = labels) +
+    theme_bw(base_size = 12) +
+    theme(axis.text.x = element_text(angle = 45, hjust = 1, face = "bold", size = 18*96/72),
+          axis.text.y = element_text(face = "bold", size = 12*96/72),
+          legend.text = element_text(face = "bold", size = 14*96/72),
+          legend.title = element_text(face = "bold", size = 12*96/72))
+  assign(paste0("p.hallmark_", p), p.hallmark)
+}
--- a/snRNAseq/GSVA_combined.R
+++ b/snRNAseq/GSVA_combined.R
+# ========================================================-
+# Title:         GSVA analysis on scRNAseq and snRNAseq samples
+# Description:   Code to generate SuppFig13
+# Author:        Monah Abou Alezz
+# Date:          2025-03-06
+# ========================================================-
+suppressPackageStartupMessages(library(Seurat))
+suppressPackageStartupMessages(library(tidyverse))
+suppressPackageStartupMessages(library(openxlsx))
+suppressPackageStartupMessages(library(patchwork))
+suppressPackageStartupMessages(library(pheatmap))
+suppressPackageStartupMessages(library(GSVA))
+suppressPackageStartupMessages(library(purrr))
+suppressPackageStartupMessages(library(presto))
+suppressPackageStartupMessages(library(DESeq2))
+suppressPackageStartupMessages(library(cogena))
+#### loading the hallmark gmt file as a list using the gmt2list function from cogena
+gs_human <- gmt2list("~/databases/GSEA/GSEA_human_v7.2/h.all.v7.2.symbols.gmt")
+gs_mouse <- gmt2list("~/databases/GSEA/GSEA_mouse_v7.2/h.all.v7.2.symbols_mouse.gmt")
+## loading datasets
+mouse.combined <- readRDS("data/mouse_striatum_snRNA_AAV9.rds")
+unique(mouse.combined$Anno)
+mouse.combined <- subset(mouse.combined, Anno == "Astrocytes" |
+                                        Anno == "Mature_Oligos" |
+                                        Anno == "Immature_Oligos" |
+                                        Anno == "Excitatory" |
+                                        Anno == "Inhibitory")
+mouse.combined@meta.data[['Anno']] <- recode_factor(
+  mouse.combined@meta.data[["Anno"]],
+  "Astrocytes" = "Astrocytes",
+  "Mature_Oligos" = "Oligodendrocytes",
+  "Immature_Oligos" = "Oligodendrocytes",
+  "Excitatory" = "Neurons",
+  "Inhibitory" = "Neurons")
+mouse.combined$Anno <- droplevels(mouse.combined$Anno)
+unique(mouse.combined$Anno)
+mouse.combined$CellType <- mouse.combined$Anno
+d4_1707 <- readRDS("data/mixed_D4_final.rds")
+d4_1707@meta.data[['orig.ident']] <- recode_factor(
+  d4_1707@meta.data[["orig.ident"]],
+  "Sample2" = "AAV2",
+  "Sample4" = "d4_AAV9",
+  "Sample6" = "Spk",
+  "Sample8" = "d4_UT")
+d4_1707@meta.data[["RNA_snn_h.orig.ident_res.1.2"]] <- recode_factor(
+  d4_1707@meta.data[["RNA_snn_h.orig.ident_res.1.2"]], 
+  "0" = "Undifferentiated", 
+  "1" = "d4_Oligodendrocytes", 
+  "2" = "d4_Neurons", 
+  "3" = "d4_Astrocytes", 
+  "4" = "d4_Neurons", 
+  "5" = "d4_Oligodendrocytes", 
+  "6" = "d4_Astrocytes", 
+  "7" = "d4_Oligodendrocytes", 
+  "8" = "d4_Oligodendrocytes",
+  "9" = "d4_Neurons",
+  "10" = "d4_Oligodendrocytes",
+  "11" = "d4_Oligodendrocytes",
+  "12" = "d4_Astrocytes",
+  "13" = "d4_Astrocytes",
+  "14" = "d4_Oligodendrocytes",
+  "15" = "d4_Astrocytes",
+  "16" = "d4_Astrocytes")
+Idents(d4_1707) <- "RNA_snn_h.orig.ident_res.1.2"
+d4_1707$RNA_snn_h.orig.ident_res.1.2 <- factor(d4_1707$RNA_snn_h.orig.ident_res.1.2,
+                                               levels = c("d4_Astrocytes",
+                                                          "d4_Neurons",
+                                                          "d4_Oligodendrocytes",
+                                                          "Undifferentiated"))
+d4_1707 <- subset(d4_1707, RNA_snn_h.orig.ident_res.1.2 != "Undifferentiated")
+d4_1707$RNA_snn_h.orig.ident_res.1.2 <- droplevels(d4_1707$RNA_snn_h.orig.ident_res.1.2)
+unique(d4_1707$RNA_snn_h.orig.ident_res.1.2)
+d4_1707 <- subset(d4_1707, orig.ident == "d4_AAV9" | orig.ident == "d4_UT")
+d4_1707$orig.ident <- droplevels(d4_1707$orig.ident)
+d4_1707$CellType <- d4_1707$RNA_snn_h.orig.ident_res.1.2
+d5_1743 <- readRDS("data/organoids_D5_final.rds")
+d5_1743@meta.data[['orig.ident']] <- recode_factor(
+  d5_1743@meta.data[["orig.ident"]],
+  "Sample2" = "AAV2",
+  "Sample4" = "d5_AAV9",
+  "Sample6" = "Spk",
+  "Sample8" = "d5_UT")
+d5_1743@meta.data[["RNA_snn_h.orig.ident_res.0.6"]] <- recode_factor(
+  d5_1743@meta.data[["RNA_snn_h.orig.ident_res.0.6"]], 
+  "0" = "d5_Astrocytes", 
+  "1" = "d5_Neurons", 
+  "2" = "Undifferentiated", 
+  "3" = "d5_Neurons", 
+  "4" = "d5_Oligodendrocytes", 
+  "5" = "Pericytes", 
+  "6" = "Pericytes", 
+  "7" = "d5_Astrocytes", 
+  "8" = "d5_Oligodendrocytes",
+  "9" = "d5_Neurons",
+  "10" = "d5_Astrocytes",
+  "11" = "Pericytes",
+  "12" = "d5_Astrocytes",
+  "13" = "d5_Neurons")
+d5_1743 <- subset(d5_1743, RNA_snn_h.orig.ident_res.0.6 != "Pericytes")
+Idents(d5_1743) <- "RNA_snn_h.orig.ident_res.0.6"
+d5_1743$RNA_snn_h.orig.ident_res.0.6 <- factor(d5_1743$RNA_snn_h.orig.ident_res.0.6,
+                                               levels = c("d5_Astrocytes",
+                                                          "d5_Neurons",
+                                                          "d5_Oligodendrocytes",
+                                                          "Undifferentiated"))
+d5_1743 <- subset(d5_1743, RNA_snn_h.orig.ident_res.0.6 != "Undifferentiated")
+d5_1743$RNA_snn_h.orig.ident_res.0.6 <- droplevels(d5_1743$RNA_snn_h.orig.ident_res.0.6)
+unique(d5_1743$RNA_snn_h.orig.ident_res.0.6)
+d5_1743 <- subset(d5_1743, orig.ident == "d5_AAV9" | orig.ident == "d5_UT")
+d5_1743$orig.ident <- droplevels(d5_1743$orig.ident)
+d5_1743$CellType <- d5_1743$RNA_snn_h.orig.ident_res.0.6
+## create a pseudobulk and normalize
+datasets_pseudo <- c("mouse.combined", "d4_1707", "d5_1743")
+for (i in datasets_pseudo) {
+  print(paste0("Analyzing... ",i))
+  obj_i <- get(i)
+  data_collapsed <- presto::collapse_counts(obj_i@assays$RNA@counts, 
+                                            obj_i@meta.data, 
+                                            c('CellType', 'orig.ident'))
+  meta_data<- data_collapsed$meta_data
+  mat <- data_collapsed$counts_mat
+  colnames(mat)<- paste(meta_data$CellType, meta_data$orig.ident, sep="_")
+  rownames(meta_data) <- paste(meta_data$CellType, meta_data$orig.ident, sep="_")
+  pseudo_bulk_obj <- list(mat = mat, meta_data = meta_data)
+  assign(paste0(i, "_mat"), mat)
+  assign(paste0(i,"_pseduo_bulk_obj"), pseudo_bulk_obj)
+  DESeq.ds <- DESeq2::DESeqDataSetFromMatrix(countData = pseudo_bulk_obj$mat,
+                                             colData = pseudo_bulk_obj$meta_data,
+                                             design = ~orig.ident)
+  DESeq.ds <- DESeq.ds[rowSums(counts(DESeq.ds)) > 0, ]
+  DESeq.ds <- DESeq(DESeq.ds)
+  DESeq.rlog <- rlogTransformation(DESeq.ds, blind = TRUE)
+  counts <- assay(DESeq.rlog)
+  assign(paste0(i,"_counts"), counts)
+}
+## converting matrices into expressionsets
+d4_1707_counts_es <- ExpressionSet(assayData = as.matrix(d4_1707_counts), 
+                                   annotation = "org.Hs.eg.db")
+d5_1743_counts_es <- ExpressionSet(assayData = as.matrix(d5_1743_counts), 
+                                   annotation = "org.Hs.eg.db")
+mouse_counts_es <- ExpressionSet(assayData = as.matrix(mouse.combined_counts),
+                                 annotation = "org.Mm.eg.db")
+gsva_d4_1707_counts <- gsva(as.matrix(d4_1707_counts), gs_human, min.sz=5, max.sz=500)
+gsva_d5_1743_counts <- gsva(as.matrix(d5_1743_counts), gs_human, min.sz=5, max.sz=500)
+gsva_mouse.combined_counts <- gsva(as.matrix(mouse.combined_counts), 
+                                   gs_mouse, min.sz=5, max.sz=500)
+gsva_counts_combined <- cbind(gsva_d4_1707_counts, gsva_d5_1743_counts, gsva_mouse.combined_counts)
+p <- pheatmap(gsva_counts_combined)