suppressPackageStartupMessages(library(DESeq2)) suppressPackageStartupMessages(library(stringr)) suppressPackageStartupMessages(library(tidyverse)) suppressPackageStartupMessages(library(pheatmap)) suppressPackageStartupMessages(library(RColorBrewer)) suppressPackageStartupMessages(library(openxlsx)) suppressPackageStartupMessages(library(org.Hs.eg.db)) suppressPackageStartupMessages(library(clusterProfiler)) ## Input featureCounts file fcount.file <- "Scarfo_HEC2023/BulkRNAseq_1/featureCounts_results_corrected.txt" ## Contrasts to perform contr <- "ACE_pos:ACE_neg" contr.list <- unlist(strsplit(contr, ",")) ## Design Formula formula.str <- "donor,condition" design.str <- gsub(",", "+", formula.str) num_cond <- str_count(formula.str, ",") + 1 design.formula <- as.formula(paste("~", design.str, sep = "")) ## Read featureCounts results if (!file.exists(fcount.file)) { stop("Error: featureCounts file not found.") } else { read.counts <- read.table(fcount.file, header = TRUE, check.names = FALSE) %>% column_to_rownames(var = "Geneid") %>% select(-c(1:5)) } orig.names <- names(read.counts) ds <- list() cond <- list() for (i in seq(orig.names[1:length(orig.names)])) { ds[i] <- unlist(strsplit(orig.names[i], ":"))[2] cond[i] <- unlist(strsplit(orig.names[i], ":"))[3] } ds <- unlist(ds) cond <- unlist(cond) ## Samples - Conditions sample_info <- as.data.frame(str_split_fixed(cond, ",", num_cond)) colnames(sample_info) <- str_split_fixed(formula.str, ",", num_cond) rownames(sample_info) <- ds colnames(read.counts) <- ds sample_info$condition <- factor(sample_info$condition) last_condition <- str_split_fixed(formula.str, ",", num_cond)[num_cond] batch_cond <- str_split_fixed(formula.str, ",", num_cond)[num_cond - 1] ## thresholds adj.pval.thr <- 0.05 logfc.thr <- 0 ## DESeq2 DESeq.ds <- DESeqDataSetFromMatrix(countData = read.counts, colData = sample_info, design = design.formula) ## Remove genes without any counts DESeq.ds <- DESeq.ds[rowSums(counts(DESeq.ds)) > 0, ] ## RunDGE DESeq.ds <- DESeq(DESeq.ds) ## obtain regularized log-transformed values DESeq.rlog <- rlogTransformation(DESeq.ds, blind = TRUE) assay(DESeq.rlog) <- limma::removeBatchEffect(assay(DESeq.rlog), DESeq.rlog[[batch_cond]]) ctrs <- unlist(strsplit(contr.list[1], ":")) c1 <- ctrs[1] c2 <- ctrs[2] DGE.results <- results(DESeq.ds, pAdjustMethod = "BH", independentFiltering = TRUE, contrast = c(last_condition, c1, c2), alpha = adj.pval.thr) # Change format DGE.results.annot <- as.data.frame(DGE.results) %>% dplyr::select(log2FoldChange, lfcSE, baseMean, pvalue, padj) colnames(DGE.results.annot) <- c("logFC", "lfcSE", "baseMean", "PValue", "FDR") ## sort the results according to the adjusted p-value DGE.results.sorted <- DGE.results.annot[order(DGE.results.annot$FDR), ] ## Subset for only significant genes DGE.results.sig <- subset(DGE.results.sorted, FDR < adj.pval.thr & abs(logFC) > logfc.thr) DGEgenes <- rownames(DGE.results.sig) # Fig1B --------------------------------------------------------------------- plotPCA(DESeq.rlog, intgroup = last_condition, ntop = 500) + theme_bw() + ggtitle(label = "Principal Component Analysis (PCA)", subtitle = "Top 500 DEG (with batch correction)") + geom_point(size = 2) + #coord_fixed(ratio=3) + geom_text(aes(label = rownames(DESeq.rlog@colData)), vjust = 0, nudge_y = 0.5, show.legend = FALSE) # Fig1C ---------------------------------------------------------------- ## DEGs s_info <- sample_info[sample_info[[last_condition]] == c1 | sample_info[[last_condition]] == c2,, drop = FALSE] hm.mat_DGEgenes <- assay(DESeq.rlog)[DGEgenes, ] hm.mat_DGEgenes.filt <- hm.mat_DGEgenes[, row.names(s_info)] pheatmap::pheatmap(hm.mat_DGEgenes.filt, color = colorRampPalette(rev(brewer.pal(n = 9, name ="RdYlBu")))(100), fontsize_row = 5, show_rownames = FALSE, scale = "row", angle_col = 90, cluster_rows = TRUE, annotation_col = s_info, cluster_cols = TRUE) # Fig1D ---------------------------------------------------------------- ## surface genes if(!file.exists("Scarfo_HEC2023/BulkRNAseq_1/table_S3_surfaceome.xlsx")){ download.file("http://wlab.ethz.ch/surfaceome/table_S3_surfaceome.xlsx", destfile = "Scarfo_HEC2023/BulkRNAseq_1/table_S3_surfaceome.xlsx") } sg <- read.xlsx("Scarfo_HEC2023/BulkRNAseq_1/table_S3_surfaceome.xlsx", sheet = "in silico surfaceome only", colNames = T, startRow = 2) sig_sg <- DGE.results.sig[c(row.names(DGE.results.sig) %in% sg$UniProt.gene), ] markers <- c("CD34","CDH5","PECAM1","TEK","NR2F2","FLRT2","BMX","GJA5","DLL4","CXCR4","HEY2","MYB","GFI1","CD44") DESeq.rlog_mat <- assay(DESeq.rlog) DESeq.rlog_mat <- DESeq.rlog_mat[c(row.names(DESeq.rlog_mat) %in% markers), ] DESeq.rlog_mat <- DESeq.rlog_mat[markers,] # Specify colors ann_colors = list( condition = c(ACE_neg = "gold3", ACE_pos = "turquoise3"), donor = c(E1 = "#E76BF3", E2 = "#F8766D", E3 = "#529EFF", E4 = "springgreen3") ) pheatmap::pheatmap(DESeq.rlog_mat, color = colorRampPalette(rev(brewer.pal(n = 9, name ="RdYlBu")))(100), annotation_col = s_info, annotation_names_row = FALSE, annotation_colors = ann_colors, show_rownames = TRUE, show_colnames = TRUE, cluster_rows = FALSE, angle_col = 45, fontsize = 14, fontsize_row = 14, fontsize_col = 18, cellheight = 20, cellwidth = 30) # Fig1E -------------------------------------------- gmt.obj <- read.gmt("Scarfo_HEC2023/BulkRNAseq_1/c5.all.v7.2.symbols.gmt") organism.db <- org.Hs.eg.db # Remove genes with no FDR DGE.results.annot <- DGE.results.annot[!is.na(DGE.results.annot$FDR), ] gene_univ <- row.names(DGE.results.annot) # get positive and negative gene names gene.res.top <- subset(DGE.results.annot, FDR < adj.pval.thr & abs(logFC) > logfc.thr) gene.res.top.pos <- subset(gene.res.top, logFC > 0) gene.res.top.neg <- subset(gene.res.top, logFC < 0) ## Over Representation Analysis comp <- c("gene.res.top.pos","gene.res.top.neg") for (i in comp) { print(paste0("Analyzing...",i," genes")) obj_i <- get(i) contr.enricher <- enricher(row.names(obj_i), universe = gene_univ, pvalueCutoff = adj.pval.thr, TERM2GENE = gmt.obj) contr.enricher.res <- as.data.frame(contr.enricher) assign(paste("contr.enricher",i,sep = "."), contr.enricher.res) } ## barplots terms.pos <- data.frame( ID = c("GO_LEUKOCYTE_MIGRATION","GO_AMEBOIDAL_TYPE_CELL_MIGRATION","GO_TISSUE_MIGRATION","GO_POSITIVE_REGULATION_OF_EPITHELIAL_CELL_MIGRATION","GO_REGULATION_OF_EPITHELIAL_CELL_MIGRATION","GO_ENDOTHELIAL_CELL_MIGRATION","GO_REGULATION_OF_ENDOTHELIAL_CELL_MIGRATION","GO_COLLAGEN_CONTAINING_EXTRACELLULAR_MATRIX","GO_EXTRACELLULAR_MATRIX","GO_CELL_CELL_JUNCTION","GO_NEGATIVE_REGULATION_OF_CELL_ADHESION","GO_REGULATION_OF_CELL_CELL_ADHESION","GO_LEUKOCYTE_CELL_CELL_ADHESION","GO_EXTRACELLULAR_MATRIX_BINDING"), Type = c(rep("Migration", 7), rep("Adhesion", 7))) terms.pos <- left_join(terms.pos,contr.enricher.gene.res.top.pos,by="ID") terms.neg <- data.frame( ID = c("GO_POSITIVE_REGULATION_OF_CELL_CYCLE_PROCESS","GO_POSITIVE_REGULATION_OF_CELL_CYCLE","GO_POSITIVE_REGULATION_OF_CELL_CYCLE_G2_M_PHASE_TRANSITION","GO_POSITIVE_REGULATION_OF_CELL_CYCLE_PHASE_TRANSITION","GO_MITOTIC_CELL_CYCLE_CHECKPOINT","GO_MITOTIC_NUCLEAR_DIVISION","GO_POSITIVE_REGULATION_OF_MITOTIC_CELL_CYCLE"), Type = "Cell Cycle") terms.neg <- left_join(terms.neg,contr.enricher.gene.res.top.neg,by="ID") terms.to.plot <- rbind(terms.pos, terms.neg) terms.to.plot <- na.omit(terms.to.plot) parsed.terms <- as.data.frame(str_split_fixed(terms.to.plot$ID, "GO_", 2)) renamed.terms <- as.data.frame(str_replace_all(parsed.terms$V2, "_", " ")) colnames(renamed.terms) <- "id" renamed.terms$Terms <- str_to_title(renamed.terms$id) terms.to.plot <- cbind(terms.to.plot,renamed.terms) terms.to.plot <- terms.to.plot[,c(12,10,7,2)] terms.to.plot$Type <- factor(terms.to.plot$Type, levels = c("Cell Cycle", "Migration", "Adhesion")) terms.to.plot$Terms <- factor(terms.to.plot$Terms, levels = terms.to.plot$Terms) terms.to.plot <- terms.to.plot %>% mutate(color = ifelse(Type == "Cell Cycle", "gold3", "turquoise3"), regulated = ifelse(Type == "Cell Cycle", "Upregulated in ACEneg", "Upregulated in ACE+")) terms.to.plot$regulated <- factor(terms.to.plot$regulated, levels = c("Upregulated in ACEneg","Upregulated in ACE+")) terms.to.plot$log_pvalue <- -log10(terms.to.plot$p.adjust) ggplot(data=terms.to.plot, aes(x=Terms, y=log_pvalue, fill=regulated, xmin = min(log_pvalue), xmax = max(log_pvalue))) + geom_bar(stat="identity", width = 0.7) + scale_fill_manual(values = c("gold3", "turquoise3"))+ theme_bw() + coord_flip() + ylab(expression(-log[10]("adjusted p-value"))) + xlab("GO Terms") + #scale_x_discrete(limits = GO_terms_all$Term) + facet_grid(Type ~., scales = "free_y", space = "free_y") + theme( axis.title.x = element_text(colour="black",family = "Arial", size=16), axis.text.x = element_text(colour="black",family = "Arial", size=12), axis.title.y = element_text(colour="black",family = "Arial", size=16), axis.text.y = element_text(color="black",family = "Arial", size=12), legend.text = element_text(colour="black",family = "Arial", size=12), legend.title = element_text(colour="white",family = "Arial", size=12), strip.background = element_rect(colour="black", fill="white"), strip.text = element_text(family = "Arial", size=12)) ## GSEA DGE.results.annot.sorted <- DGE.results.annot[order(DGE.results.annot$logFC, decreasing = TRUE), ] # LogFC with Symbols gene.res.logfc.symbol <- as.vector(DGE.results.annot.sorted$logFC) names(gene.res.logfc.symbol) <- row.names(DGE.results.annot.sorted) contr.gsea <- GSEA(gene.res.logfc.symbol, TERM2GENE = gmt.obj, nPerm = 10000, pvalueCutoff = adj.pval.thr) contr.gsea.res <- as.data.frame(contr.gsea)