# ========================================================- # Title: DE and functional enrichment analysis # Description: Code to generate Fig1, SuppFig3, SuppFig4 # Author: Monah Abou Alezz # Date: 2025-03-06 # ========================================================- suppressPackageStartupMessages(library(DESeq2)) suppressPackageStartupMessages(library(org.Hs.eg.db)) suppressPackageStartupMessages(library(tidyverse)) suppressPackageStartupMessages(library(ggrepel)) suppressPackageStartupMessages(library(pheatmap)) suppressPackageStartupMessages(library(RColorBrewer)) suppressPackageStartupMessages(library(clusterProfiler)) suppressPackageStartupMessages(library(reshape2)) # DE analysis ------------------------------------------------------------- ## Input featureCounts file fcount.file <- "data/GSE253820_featureCounts_results.txt" ## Contrasts to perform contr_astro <- "Astro_AAV2:Astro_UT,Astro_AAV5:Astro_UT,Astro_AAV9:Astro_UT,Astro_Empty:Astro_UT" contr_neuro <- "Neuro_AAV1:Neuro_UT,Neuro_AAV2:Neuro_UT,Neuro_AAV5:Neuro_UT,Neuro_AAV6:Neuro_UT,Neuro_AAV9:Neuro_UT,Neuro_Spk:Neuro_UT,Neuro_Empty:Neuro_UT" organism.db <- org.Hs.eg.db comps <- c("contr_astro", "contr_neuro") for (t in comps) { contr <- get(t) contr.list <- unlist(strsplit(contr, ",")) ## Design Formula formula.str <- "condition" design.str <- gsub(",", "+", formula.str) num_cond <- str_count(formula.str, ",") + 1 design.formula <- as.formula(paste("~", design.str, sep = "")) ## Read featureCounts results if (!file.exists(fcount.file)) { stop("Error: featureCounts file not found.") } else { read.counts <- read.table(fcount.file, header = TRUE, check.names = FALSE) %>% column_to_rownames(var = "Geneid") %>% dplyr::select(-c(1:5)) } if (t == "contr_astro") { read.counts <- dplyr::select(read.counts, starts_with("BAM:Astro")) } else { read.counts <- dplyr::select(read.counts, starts_with("BAM:Neuro")) } orig.names <- names(read.counts) ds <- list() cond <- list() for (i in seq(orig.names[1:length(orig.names)])) { ds[i] <- unlist(strsplit(orig.names[i], ":"))[2] cond[i] <- unlist(strsplit(orig.names[i], ":"))[3] } ds <- unlist(ds) cond <- unlist(cond) ## Samples - Conditions sample_info <- as.data.frame(str_split_fixed(cond, ",", num_cond)) colnames(sample_info) <- str_split_fixed(formula.str, ",", num_cond) rownames(sample_info) <- ds colnames(read.counts) <- ds sample_info$condition <- factor(sample_info$condition) last_condition <- str_split_fixed(formula.str, ",", num_cond)[num_cond] batch_cond <- str_split_fixed(formula.str, ",", num_cond)[num_cond - 1] ## thresholds adj.pval.thr <- 0.05 logfc.thr <- 0 ## DESeq2 DESeq.ds <- DESeqDataSetFromMatrix(countData = read.counts, colData = sample_info, design = design.formula) ## Remove genes without any counts DESeq.ds <- DESeq.ds[rowSums(counts(DESeq.ds)) > 0, ] ## RunDGE DESeq.ds <- DESeq(DESeq.ds) ## obtain regularized log-transformed values DESeq.rlog <- rlogTransformation(DESeq.ds, blind = TRUE) assign(paste0("DESeq.rlog_",t), DESeq.rlog) ## Contrasts for (c in 1:length(contr.list)) { ctrs <- unlist(strsplit(contr.list[c], ":")) c1 <- ctrs[1] c2 <- ctrs[2] DGE.results <- results(DESeq.ds, pAdjustMethod = "BH", independentFiltering = TRUE, contrast = c(last_condition, c1, c2), alpha = adj.pval.thr) # Change format DGE.results.annot <- as.data.frame(DGE.results) %>% dplyr::select(log2FoldChange, lfcSE, baseMean, pvalue, padj) colnames(DGE.results.annot) <- c("logFC", "lfcSE", "baseMean", "PValue", "FDR") ## Add ENTREZ ID DGE.results.annot$entrez <- mapIds(x = organism.db, keys = row.names(DGE.results), column = "ENTREZID", keytype = "SYMBOL", multiVals = "first") ## Add description DGE.results.annot$description <- mapIds(x = organism.db, keys = row.names(DGE.results), column = "GENENAME", keytype = "SYMBOL", multiVals = "first") ## sort the results according to the adjusted p-value DGE.results.sorted <- DGE.results.annot[order(DGE.results.annot$FDR), ] DGE.results.sorted$gene <- row.names(DGE.results.sorted) ## Subset for only significant genes DGE.results.sig <- subset(DGE.results.sorted, FDR < adj.pval.thr & abs(logFC) > logfc.thr) assign(paste0("DGEgenes_",c1,"_vs_",c2), DGE.results.sorted) assign(paste0("DGEgenes.sig_",c1,"_vs_",c2), DGE.results.sig) ## volcano plots data <- data.frame(gene = row.names(DGE.results.sorted), pval = -log10(DGE.results.sorted$FDR), lfc = DGE.results.sorted$logFC) data <- na.omit(data) data <- dplyr::mutate( data, color = dplyr::case_when( data$lfc > logfc.thr & data$pval > -log10(adj.pval.thr) ~ "Overexpressed", data$lfc < -logfc.thr & data$pval > -log10(adj.pval.thr) ~ "Underexpressed", abs(data$lfc) < logfc.thr | data$pval < -log10(adj.pval.thr) ~ "NonSignificant")) data <- data[order(data$pval, decreasing = TRUE), ] highl <- rbind(head(subset(data, color == "Overexpressed"), 10), head(subset(data, color == "Underexpressed"), 10)) title <- paste0(c1," vs. ", c2) vol <- ggplot(data, aes(x = lfc, y = pval, colour = color)) + geom_point(size = 2, alpha = 0.8, na.rm = T) + geom_hline(yintercept = -log10(adj.pval.thr), colour = "darkgray") + geom_label_repel(data = highl, aes(label = gene), show.legend = FALSE, size = 4, family = "Arial") + xlab("log2 Fold Change") + ylab(expression(-log[10]("adjusted p-value"))) + ggtitle(title) + scale_color_manual(name = "Expression", values = c(Overexpressed = "red1", Underexpressed = "royalblue2", NonSignificant = "gray14")) + theme_bw() + theme(legend.position = "right", text=element_text(family="Arial", size=4*96/72), axis.text.x = element_text(size = 14*96/72), axis.text.y = element_text(size = 14*96/72), axis.title.y = element_text(size = 16*96/72), axis.title.x = element_text(size = 16*96/72), legend.text = element_text(size = 12*96/72), legend.title = element_text(size = 15*96/72), plot.title = element_text(size = 16*96/72, hjust = 0.5)) assign(paste0("vol_",c1,"_vs_",c2), vol) } } # Heatmaps ---------------------------------------------------------------- degs_astro <- rbind(DGEgenes.sig_Astro_AAV2_vs_Astro_UT, DGEgenes.sig_Astro_AAV5_vs_Astro_UT, DGEgenes.sig_Astro_AAV9_vs_Astro_UT, DGEgenes.sig_Astro_Empty_vs_Astro_UT) counts_astro <- as.data.frame(assay(DESeq.rlog_contr_astro))[degs_astro$gene, ] counts_astro <- counts_astro %>% rename_with(~ str_remove(., "Astro_"), everything()) %>% select(c(13,14,15,16,10,11,12,7,8,9,4,5,6,1,2,3)) astro_annot <- data.frame(Samples = c("UT", "UT", "UT", "UT", "Empty", "Empty", "Empty", "AAV9", "AAV9", "AAV9", "AAV5", "AAV5", "AAV5", "AAV2", "AAV2", "AAV2"), row.names = colnames(counts_astro)) astro_annot$Samples <- factor(astro_annot$Samples, levels = c("UT", "Empty", "AAV9", "AAV5", "AAV2")) astro_hm <- pheatmap::pheatmap(counts_astro, scale = "row", color = colorRampPalette(rev(brewer.pal(n = 9, name ="RdYlBu")))(100), annotation_col = astro_annot, annotation_names_col = F, show_rownames = F, show_colnames = F, cluster_rows = T, cluster_cols = F, treeheight_row = 20, fontsize_col = 12*96/72, fontsize = 12*96/72, angle_col = 90) degs_neuro <- rbind(DGEgenes.sig_Neuro_AAV1_vs_Neuro_UT, DGEgenes.sig_Neuro_AAV2_vs_Neuro_UT, DGEgenes.sig_Neuro_AAV5_vs_Neuro_UT, DGEgenes.sig_Neuro_AAV6_vs_Neuro_UT, DGEgenes.sig_Neuro_AAV9_vs_Neuro_UT, DGEgenes.sig_Neuro_Empty_vs_Neuro_UT, DGEgenes.sig_Neuro_Spk_vs_Neuro_UT) counts_neuro <- as.data.frame(assay(DESeq.rlog_contr_neuro))[degs_neuro$gene, ] counts_neuro <- counts_neuro %>% rename_with(~ str_remove(., "Neuro_"), everything()) %>% select(c(22:24,16:18,13:15,7:9,19:21,1:6,10:12)) neuro_annot <- data.frame(Samples = c("UT", "UT", "UT", "Empty", "Empty", "Empty", "AAV9", "AAV9", "AAV9", "AAV5", "AAV5", "AAV5", "Spk", "Spk", "Spk", "AAV1", "AAV1", "AAV1", "AAV2", "AAV2", "AAV2", "AAV6", "AAV6", "AAV6"), row.names = colnames(counts_neuro)) neuro_annot$Samples <- factor(neuro_annot$Samples, levels = c("UT", "Empty", "AAV9", "AAV5","Spk","AAV1","AAV2", "AAV6")) neuro_hm <- pheatmap::pheatmap(counts_neuro, scale = "row", color = colorRampPalette(rev(brewer.pal(n = 9, name ="RdYlBu")))(100), annotation_col = neuro_annot, annotation_names_col = F, show_rownames = F, show_colnames = F, cluster_rows = T, cluster_cols = F, treeheight_row = 20, fontsize_col = 12*96/72, fontsize = 12*96/72, angle_col = 90) # GSEA analysis ----------------------------------------------------------- h_gmt <- read.gmt("data/h.all.v7.2.symbols.gmt") for (i in ls()[grep("DGEgenes_", ls())]) { gene_res <- get(i) logfc_symbol <- as.vector(gene_res$logFC) names(logfc_symbol) <- row.names(gene_res) logfc_symbol <- sort(logfc_symbol, decreasing = TRUE) contr.gsea <- GSEA(logfc_symbol, TERM2GENE = h_gmt, nPerm = 10000, pvalueCutoff = 1) contr.gsea.symb <- as.data.frame(contr.gsea) assign(paste0(gsub("DGEgenes_", "GSEA_",i)), contr.gsea.symb) } type <- c("Astro", "Neuro") for (p in type) { hallmark.full <- data.frame() for (ds in ls()[grep(paste0("GSEA_", p), ls())]) { ds.t <- get(ds) ds.t.filt <- ds.t[,c("ID", "setSize", "enrichmentScore", "NES", "pvalue", "p.adjust", "qvalue"), drop = FALSE] ds.t.filt$Dataset <- ds if (nrow(hallmark.full) == 0) { hallmark.full <- ds.t.filt } else { hallmark.full <- rbind(hallmark.full, ds.t.filt) } } hallmark.full.filt <- hallmark.full[, c("ID", "NES", "Dataset", "qvalue")] hallmark.full.filt$ID <- gsub(x = hallmark.full.filt$ID, "HALLMARK_", "") if (p == "Astro") { hallmark.full.filt$Dataset <- factor(hallmark.full.filt$Dataset, levels = c("GSEA_Astro_Empty_vs_Astro_UT", "GSEA_Astro_AAV9_vs_Astro_UT", "GSEA_Astro_AAV5_vs_Astro_UT", "GSEA_Astro_AAV2_vs_Astro_UT")) } else { hallmark.full.filt$Dataset <- factor(hallmark.full.filt$Dataset, levels = c("GSEA_Neuro_Empty_vs_Neuro_UT", "GSEA_Neuro_AAV9_vs_Neuro_UT", "GSEA_Neuro_Spk_vs_Neuro_UT", "GSEA_Neuro_AAV5_vs_Neuro_UT", "GSEA_Neuro_AAV1_vs_Neuro_UT", "GSEA_Neuro_AAV2_vs_Neuro_UT", "GSEA_Neuro_AAV6_vs_Neuro_UT")) } hallmark.full.filt$sig <- ifelse( hallmark.full.filt$qvalue <= 0.05 & hallmark.full.filt$qvalue > 0.01, '*', ifelse(hallmark.full.filt$qvalue <= 0.01 & hallmark.full.filt$qvalue > 0.001, '**', ifelse(hallmark.full.filt$qvalue <= 0.001 & hallmark.full.filt$qvalue > 0.0001, '***', ifelse(hallmark.full.filt$qvalue <= 0.0001, '****', '')))) hallmark.order <- hallmark.full.filt %>% group_by(ID) %>% summarise(Pos = sum(NES)) hallmark.order.terms <- hallmark.order[order(hallmark.order$Pos, decreasing = FALSE), "ID", drop = FALSE] hallmark.full.filt.tt <- reshape2::melt(reshape2::dcast(hallmark.full.filt, ID ~ Dataset, value.var="NES"), id.vars = c("ID")) colnames(hallmark.full.filt.tt) <- c("ID", "Dataset", "NES") hallmark.full.filt.tt$ID <- factor(hallmark.full.filt.tt$ID, levels = hallmark.order.terms$ID) hallmark.full.filt.tt <- merge(hallmark.full.filt.tt, hallmark.full.filt[, c("ID", "Dataset", "sig")], by = c("ID", "Dataset"), all.x = TRUE) if (p == "Astro") { labels <- c("Empty", "AAV9", "AAV5", "AAV2") } else { labels <- c("Empty", "AAV9", "Spk100","AAV5","AAV1","AAV2","AAV6") } p.hallmark <- ggplot(hallmark.full.filt.tt, aes(x = Dataset, y = ID)) + geom_tile(aes(fill = NES), colour = "white") + geom_text(aes(label = paste(sig)), size=4*96/72, fontface = "bold") + scale_fill_gradientn(colours = colorRampPalette(rev(brewer.pal(11,"RdBu")))(100), limits = c(-3.5, 3.5), na.value = "white") + ylab("") + xlab("") + coord_fixed(ratio = 0.6) + scale_x_discrete(labels = labels) + theme_bw(base_size = 12) + theme(axis.text.x = element_text(angle = 45, hjust = 1, face = "bold", size = 18*96/72), axis.text.y = element_text(face = "bold", size = 12*96/72), legend.text = element_text(face = "bold", size = 14*96/72), legend.title = element_text(face = "bold", size = 12*96/72)) assign(paste0("p.hallmark_", p), p.hallmark) } # Pathways ---------------------------------------------------------------- gi_astro <- data.frame(gene = c("CDKN1A","PHLDA3","BAX","FDXR","TGFA","INPP5D","TRIM22", "GDF15", "APOBEC3C","IL1A","CXCL8","IL1B","TNFSF9","IL11", "EDA2R", "TNFRSF10A","TNFRSF10B","TNFRSF10C","TNFRSF10D", "CCNA2","CDC20","NDC80","KIF2C","AURKB","MCM6", "CCNF", "LMNB1","MCM2","E2F1","CDC45", "MCM7", "DNA2", "CDCA5","CLSPN" ), terms = c(rep("P53 Pathway", 9), rep("Inflammatory Response", 5), rep("TNFA Signaling Via NFKB", 1), rep("Apoptosis", 4), rep("G2M Checkpoint", 14), rep("DNA Repair", 1))) gi_neuro <- data.frame(gene = c("CDKN1A","FAS","BAX","FDXR","TGFA","INPP5D","TRIM22","GDF15", "CXCL8","IL1R1","IL11","FOSL1","EDA2R","TNFRSF12A","IL6R", "TNFRSF10C","TNFRSF10D","TNFRSF10A","CDC20","AURKB","MCM6", "MCM3","CDC25A","CDC6","CHEK1","E2F1","E2F2","MCM4","RACGAP1", "DLGAP5","NCAPH", "NUF2","CLSPN", "FANCE", "TF","TK1"), terms = c(rep("P53 Pathway", 8), rep("Inflammatory Response", 3), rep("TNFA Signaling Via NFKB", 2), rep("Il6 Jak Stat3 Signaling", 2), rep("Apoptosis", 3), rep("G2M Checkpoint", 10), rep("Mitotic Spindle", 4), rep("DNA Repair", 2), rep("Coagulation", 1), rep("E2F Targets", 1))) astro <- data.frame() for (l in ls()[grep("DGEgenes_Astro", ls())]) { deg <- get(l) %>% select(logFC) colnames(deg) <- gsub("DGEgenes_Astro_","",l) if (nrow(astro) == 0) { astro <- deg } else { astro <- merge(astro, deg, by = "row.names", all = TRUE) rownames(astro) <- astro$Row.names astro$Row.names <- NULL } } astro <- astro[c(row.names(astro) %in% gi_astro$gene), ] astro <- astro[match(gi_astro$gene, rownames(astro)), , drop = FALSE] astro_rowAnnot <- gi_astro %>% column_to_rownames(var = "gene") %>% rename(Pathway = 1) colours_row <- list('Pathway' = c('Inflammatory Response' = 'blue', 'TNFA Signaling Via NFKB'= 'darkgreen', 'Apoptosis' = 'green', 'G2M Checkpoint' = 'red', 'P53 Pathway' = 'orange', 'UV Response Up' = 'purple', 'DNA Repair' = 'darkslategray3')) rowAnn <- HeatmapAnnotation(df = astro_rowAnnot, which = 'row', show_annotation_name = FALSE, col = colours_row, annotation_legend_param = list( Pathway = list( title = "Pathways", title_gp = gpar(fontsize = 12*96/72, fontface = "bold"), labels_gp = gpar(fontsize = 14*96/72), grid_height = unit(0.7, "cm"))), annotation_width = unit(c(16, 16), 'cm'), width = unit(c(16, 16), 'cm'), gap = unit(1, 'mm')) astro_mat <- as.matrix(astro[,c(4,3,2,1)]) hmap <- Heatmap(astro_mat, name = "Z-score", cluster_rows = F, cluster_columns = FALSE, show_row_dend = F, show_column_dend = TRUE, column_dend_height = unit(1.2*96/72, "cm"), row_dend_width = unit(1*96/72, "cm"), row_names_side = "right", row_dend_side = "left", row_names_gp = gpar(fontsize = 12*96/72), row_names_max_width = unit(6*96/72, "cm"), show_row_names = TRUE, row_title = NULL, row_title_gp = gpar(fontsize = 16*96/72, fontface = "bold"), show_column_names = T, column_names_rot = 90, column_names_gp = gpar(fontsize = 14*96/72), column_title = NULL, column_title_side = "bottom", column_title_gp = gpar(fontsize = 16*96/72, fontface = "bold"), column_dend_reorder = TRUE, left_annotation = rowAnn, heatmap_legend_param = list( title = "logFC", title_gp = gpar(fontsize = 12*96/72, fontface = "bold"), labels_gp = gpar(fontsize = 12*96/72), legend_height = unit(3, "cm") ), width = unit(9*96/72, "cm"), height = unit(16*96/72, "cm"), use_raster = TRUE, raster_quality = 5) draw(hmap, heatmap_legend_side="right", annotation_legend_side="right") neuro <- data.frame() for (n in ls()[grep("DGEgenes_Neuro", ls())]) { deg <- get(n) deg <- select(deg, logFC) colnames(deg) <- gsub("DGEgenes_Neuro_","",n) if (nrow(neuro) == 0) { neuro <- deg } else { neuro <- merge(neuro, deg, by = "row.names", all = TRUE) rownames(neuro) <- neuro$Row.names neuro$Row.names <- NULL } } neuro <- neuro[c(row.names(neuro) %in% gi_neuro$gene), ] neuro <- neuro[match(gi_neuro$gene, rownames(neuro)), , drop = FALSE] neuro_rowAnnot <- gi_neuro %>% column_to_rownames(var = "gene") %>% rename(Pathway = 1) colours_row <- list('Pathway' = c('Inflammatory Response' = 'blue', 'Mitotic Spindle' = 'purple', 'TNFA Signaling Via NFKB'= 'darkgreen', 'Apoptosis' = 'green', 'Il6 Jak Stat3 Signaling' = 'pink', 'G2M Checkpoint' = 'red', 'E2F Targets' = 'gray60', 'P53 Pathway' = 'orange', 'Coagulation' = 'brown', 'DNA Repair' = 'darkslategray3')) rowAnn <- HeatmapAnnotation(df = neuro_rowAnnot, which = 'row', show_annotation_name = FALSE, col = colours_row, annotation_legend_param = list( Pathway = list( title = "Pathways", title_gp = gpar(fontsize = 12*96/72, fontface = "bold"), labels_gp = gpar(fontsize = 14*96/72), grid_height = unit(0.7, "cm"))), annotation_width = unit(c(16, 16), 'cm'), width = unit(c(16, 16), 'cm'), gap = unit(1, 'mm')) neuro_mat <- as.matrix(neuro[,c(6,5,7,3,1,2,4)]) hmap <- Heatmap(neuro_mat, name = "Z-score", cluster_rows = F, cluster_columns = FALSE, show_row_dend = F, show_column_dend = TRUE, column_dend_height = unit(1.2*96/72, "cm"), row_dend_width = unit(1*96/72, "cm"), row_names_side = "right", row_dend_side = "left", row_names_gp = gpar(fontsize = 12*96/72), row_names_max_width = unit(6*96/72, "cm"), show_row_names = TRUE, row_title = NULL, row_title_gp = gpar(fontsize = 16*96/72, fontface = "bold"), show_column_names = T, column_names_rot = 90, column_names_gp = gpar(fontsize = 14*96/72), column_title = NULL, column_title_side = "bottom", column_title_gp = gpar(fontsize = 16*96/72, fontface = "bold"), column_dend_reorder = TRUE, left_annotation = rowAnn, heatmap_legend_param = list( title = "logFC", title_gp = gpar(fontsize = 12*96/72, fontface = "bold"), labels_gp = gpar(fontsize = 12*96/72), legend_height = unit(3, "cm") ), width = unit(9*96/72, "cm"), height = unit(16*96/72, "cm"), use_raster = TRUE, raster_quality = 5) draw(hmap, heatmap_legend_side="right", annotation_legend_side="right")