Initial commit

839b34cc · Monah Abou Alezz · 839b34cc · 839b34cc · 839b34cc · 839b34cc
Commit 839b34cc authored Jul 26, 2023 by Monah Abou Alezz
--- a/BulkRNAseq_1/ED_Figure1_plots.R
+++ b/BulkRNAseq_1/ED_Figure1_plots.R
+suppressPackageStartupMessages(library(DESeq2))
+suppressPackageStartupMessages(library(stringr))
+suppressPackageStartupMessages(library(tidyverse))
+suppressPackageStartupMessages(library(ggplot2))
+suppressPackageStartupMessages(library(ggrepel))
+suppressPackageStartupMessages(library(pheatmap))
+suppressPackageStartupMessages(library(RColorBrewer))
+## Input featureCounts file
+fcount.file <- "Scarfo_HEC2023/BulkRNAseq_1/featureCounts_results_corrected.txt"
+## Contrasts to perform
+contr <- "ACE_pos:ACE_neg"
+contr.list <- unlist(strsplit(contr, ","))
+## Design Formula
+formula.str <- "donor,condition"
+design.str <- gsub(",", "+", formula.str)
+num_cond <- str_count(formula.str, ",") + 1
+design.formula <- as.formula(paste("~", design.str, sep = ""))
+## Read featureCounts results
+if (!file.exists(fcount.file)) {
+  stop("Error: featureCounts file not found.")
+} else {
+  read.counts <- read.table(fcount.file, 
+                            header = TRUE, 
+                            check.names = FALSE) %>%
+    column_to_rownames(var = "Geneid") %>%
+    select(-c(1:5)) 
+}
+orig.names <- names(read.counts)
+ds <- list()
+cond <- list()
+for (i in seq(orig.names[1:length(orig.names)])) {
+  ds[i] <- unlist(strsplit(orig.names[i], ":"))[2]
+  cond[i] <- unlist(strsplit(orig.names[i], ":"))[3]
+}
+ds <- unlist(ds)
+cond <- unlist(cond)
+## Samples - Conditions
+sample_info <- as.data.frame(str_split_fixed(cond, ",", num_cond))
+colnames(sample_info) <- str_split_fixed(formula.str, ",", num_cond)
+rownames(sample_info) <- ds
+colnames(read.counts) <- ds
+sample_info$condition <- factor(sample_info$condition)
+last_condition <- str_split_fixed(formula.str, ",", num_cond)[num_cond]
+batch_cond <- str_split_fixed(formula.str, ",", num_cond)[num_cond - 1]
+## thresholds
+adj.pval.thr <- 0.05
+logfc.thr <- 0
+## DESeq2
+DESeq.ds <- DESeqDataSetFromMatrix(countData = read.counts,
+                                   colData = sample_info,
+                                   design = design.formula)
+## Remove genes without any counts
+DESeq.ds <- DESeq.ds[rowSums(counts(DESeq.ds)) > 0, ]
+## RunDGE
+DESeq.ds <- DESeq(DESeq.ds)
+## obtain regularized log-transformed values
+DESeq.rlog <- rlogTransformation(DESeq.ds, blind = TRUE)
+assay(DESeq.rlog) <- limma::removeBatchEffect(assay(DESeq.rlog), DESeq.rlog[[batch_cond]])
+ctrs <- unlist(strsplit(contr.list[1], ":"))
+c1 <- ctrs[1]
+c2 <- ctrs[2]
+DGE.results <- results(DESeq.ds,
+                       pAdjustMethod = "BH",
+                       independentFiltering = TRUE,
+                       contrast = c(last_condition, c1, c2),
+                       alpha = adj.pval.thr)
+# Change format
+DGE.results.annot <- as.data.frame(DGE.results) %>% 
+  dplyr::select(log2FoldChange, lfcSE, baseMean, pvalue, padj)
+colnames(DGE.results.annot) <- c("logFC", "lfcSE", "baseMean", "PValue", "FDR")
+## sort the results according to the adjusted p-value
+DGE.results.sorted <- DGE.results.annot[order(DGE.results.annot$FDR), ]
+## Subset for only significant genes
+DGE.results.sig <- subset(DGE.results.sorted, 
+                          FDR < adj.pval.thr & abs(logFC) > logfc.thr)
+DGEgenes <- rownames(DGE.results.sig)
+# ED_Fig1D ----------------------------------------------------------------
+data <- data.frame(gene = row.names(DGE.results.sorted), 
+                   pval = -log10(DGE.results.sorted$FDR), 
+                   lfc = DGE.results.sorted$logFC)
+data <- na.omit(data)
+data <- mutate(data, color = case_when(data$lfc > logfc.thr & data$pval > -log10(adj.pval.thr) ~ "Overexpressed",data$lfc < -logfc.thr & data$pval > -log10(adj.pval.thr) ~ "Underexpressed",abs(data$lfc) < logfc.thr | data$pval < -log10(adj.pval.thr) ~ "NonSignificant"))
+data$color <- factor(data$color, levels = c("Overexpressed", "Underexpressed", "NonSignificant"))
+data <- data[order(data$pval, decreasing = TRUE),]
+highl <- head(subset(data, color != "NonSignificant"), 20)
+ggplot(data, aes(x = lfc, y = pval, colour = color)) +
+  theme_bw() +
+  theme(legend.position = "right",
+        text=element_text(size=17*96/72),
+        axis.text.x = element_text(size = 17*96/72, 
+                                   color = "black"),
+        axis.text.y = element_text(size = 17*96/72,
+                                   color = "black"),
+        axis.title.y = element_text(colour = "black", size = 19*96/72),
+        axis.title.x = element_text(colour = "black", size = 19*96/72),
+        legend.text = element_text(colour = "black", size = 19*96/72),
+        legend.title = element_text(colour = "black", size = 19*96/72)) +
+  xlab("log2 Fold Change") + 
+  ylab(expression(-log[10]("adjusted p-value"))) +
+  geom_hline(yintercept = -log10(adj.pval.thr), colour = "darkgray") +
+  geom_point(size = 3, alpha = 0.8, na.rm = T) +
+  scale_color_manual(name = "Expression",
+                     values = c(Overexpressed = "red",
+                                Underexpressed = "royalblue3",
+                                NonSignificant = "darkgray")) +
+  geom_label_repel(data = highl, 
+                   aes(label = gene), 
+                   show.legend = FALSE, 
+                   size = 5)
+# ED_Fig1E --------------------------------------------------------
+DESeq.ds_assay_corr <- limma::removeBatchEffect(assay(DESeq.ds), DESeq.ds[[batch_cond]])
+sampleDists_corr <- dist(t(DESeq.ds_assay_corr))
+sampleDistMatrix_corr <- as.matrix(sampleDists_corr)
+rownames(sampleDistMatrix_corr) <- paste(rownames(sampleDistMatrix_corr), DESeq.ds[[last_condition]], sep = " ")
+colnames(sampleDistMatrix_corr) <- NULL
+colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
+pheatmap::pheatmap(sampleDistMatrix_corr,
+         clustering_distance_rows=sampleDists_corr,
+         clustering_distance_cols=sampleDists_corr,
+         col = colors)
--- a/BulkRNAseq_1/ED_Figure2_plots.R
+++ b/BulkRNAseq_1/ED_Figure2_plots.R
+suppressPackageStartupMessages(library(DESeq2))
+suppressPackageStartupMessages(library(stringr))
+suppressPackageStartupMessages(library(tidyverse))
+suppressPackageStartupMessages(library(ggplot2))
+suppressPackageStartupMessages(library(ggrepel))
+suppressPackageStartupMessages(library(openxlsx))
+suppressPackageStartupMessages(library(RColorBrewer))
+## Input featureCounts file
+fcount.file <- "Scarfo_HEC2023/BulkRNAseq_1/featureCounts_results_corrected.txt"
+## Contrasts to perform
+contr <- "ACE_pos:ACE_neg"
+contr.list <- unlist(strsplit(contr, ","))
+## Design Formula
+formula.str <- "donor,condition"
+design.str <- gsub(",", "+", formula.str)
+num_cond <- str_count(formula.str, ",") + 1
+design.formula <- as.formula(paste("~", design.str, sep = ""))
+## Read featureCounts results
+if (!file.exists(fcount.file)) {
+  stop("Error: featureCounts file not found.")
+} else {
+  read.counts <- read.table(fcount.file, 
+                            header = TRUE, 
+                            check.names = FALSE) %>%
+    column_to_rownames(var = "Geneid") %>%
+    select(-c(1:5)) 
+}
+orig.names <- names(read.counts)
+ds <- list()
+cond <- list()
+for (i in seq(orig.names[1:length(orig.names)])) {
+  ds[i] <- unlist(strsplit(orig.names[i], ":"))[2]
+  cond[i] <- unlist(strsplit(orig.names[i], ":"))[3]
+}
+ds <- unlist(ds)
+cond <- unlist(cond)
+## Samples - Conditions
+sample_info <- as.data.frame(str_split_fixed(cond, ",", num_cond))
+colnames(sample_info) <- str_split_fixed(formula.str, ",", num_cond)
+rownames(sample_info) <- ds
+colnames(read.counts) <- ds
+sample_info$condition <- factor(sample_info$condition)
+last_condition <- str_split_fixed(formula.str, ",", num_cond)[num_cond]
+batch_cond <- str_split_fixed(formula.str, ",", num_cond)[num_cond - 1]
+## thresholds
+adj.pval.thr <- 0.05
+logfc.thr <- 0
+## DESeq2 
+DESeq.ds <- DESeqDataSetFromMatrix(countData = read.counts,
+                                   colData = sample_info,
+                                   design = design.formula)
+## Remove genes without any counts
+DESeq.ds <- DESeq.ds[rowSums(counts(DESeq.ds)) > 0, ]
+## RunDGE
+DESeq.ds <- DESeq(DESeq.ds)
+## obtain regularized log-transformed values
+DESeq.rlog <- rlogTransformation(DESeq.ds, blind = TRUE)
+assay(DESeq.rlog) <- limma::removeBatchEffect(assay(DESeq.rlog), DESeq.rlog[[batch_cond]])
+ctrs <- unlist(strsplit(contr.list[1], ":"))
+c1 <- ctrs[1]
+c2 <- ctrs[2]
+DGE.results <- results(DESeq.ds,
+                       pAdjustMethod = "BH",
+                       independentFiltering = TRUE,
+                       contrast = c(last_condition, c1, c2),
+                       alpha = adj.pval.thr)
+# Change format
+DGE.results.annot <- as.data.frame(DGE.results) %>% 
+  dplyr::select(log2FoldChange, lfcSE, baseMean, pvalue, padj)
+colnames(DGE.results.annot) <- c("logFC", "lfcSE", "baseMean", "PValue", "FDR")
+## sort the results according to the adjusted p-value
+DGE.results.sorted <- DGE.results.annot[order(DGE.results.annot$FDR), ]
+## Subset for only significant genes
+DGE.results.sig <- subset(DGE.results.sorted, 
+                          FDR < adj.pval.thr & abs(logFC) > logfc.thr)
+DGEgenes <- rownames(DGE.results.sig)
+# ED_Fig2A ----------------------------------------------------------------
+## surface genes
+if(!file.exists("Scarfo_HEC2023/BulkRNAseq_1/table_S3_surfaceome.xlsx")){
+  download.file("http://wlab.ethz.ch/surfaceome/table_S3_surfaceome.xlsx", 
+                destfile = "Scarfo_HEC2023/BulkRNAseq_1/table_S3_surfaceome.xlsx")  
+}
+sg <- read.xlsx("Scarfo_HEC2023/BulkRNAseq_1/table_S3_surfaceome.xlsx", 
+                sheet = "in silico surfaceome only",
+                colNames = T,
+                startRow = 2)
+sig_sg <- DGE.results.sig[c(row.names(DGE.results.sig) %in% sg$UniProt.gene), ]
+top10_sig_sg <- sig_sg[c(1:10),]
+data <- data.frame(gene = row.names(DGE.results.sorted), 
+                   pval = -log10(DGE.results.sorted$FDR), 
+                   lfc = DGE.results.sorted$logFC)
+data <- na.omit(data)
+data <- mutate(data, color = case_when(data$lfc > logfc.thr & data$pval > -log10(adj.pval.thr) ~ "Overexpressed",data$lfc < -logfc.thr & data$pval > -log10(adj.pval.thr) ~ "Underexpressed",abs(data$lfc) < logfc.thr | data$pval < -log10(adj.pval.thr) ~ "NonSignificant"))
+data$color <- factor(data$color, levels = c("Overexpressed", "Underexpressed", "NonSignificant"))
+data <- data[order(data$pval, decreasing = TRUE),]
+highl <- data[c(data$gene %in% row.names(top10_sig_sg)), ]
+ggplot(data, aes(x = lfc, y = pval, colour = color)) +
+  theme_bw() +
+  theme(legend.position = "right",
+        text=element_text(size=17*96/72),
+        axis.text.x = element_text(size = 17*96/72, 
+                                   color = "black"),
+        axis.text.y = element_text(size = 17*96/72,
+                                   color = "black"),
+        axis.title.y = element_text(colour = "black", size = 19*96/72),
+        axis.title.x = element_text(colour = "black", size = 19*96/72),
+        legend.text = element_text(colour = "black", size = 19*96/72),
+        legend.title = element_text(colour = "black", size = 19*96/72)) +
+  xlab("log2 Fold Change") + 
+  ylab(expression(-log[10]("adjusted p-value"))) +
+  geom_hline(yintercept = -log10(adj.pval.thr), colour = "darkgray") +
+  geom_point(size = 3, alpha = 0.8, na.rm = T) +
+  scale_color_manual(name = "Expression",
+                     values = c(Overexpressed = "red",
+                                Underexpressed = "royalblue3",
+                                NonSignificant = "darkgray")) +
+  geom_label_repel(data = highl, 
+                   aes(label = gene), 
+                   show.legend = FALSE, 
+                   size = 5)
--- a/BulkRNAseq_1/Figure1_plots.R
+++ b/BulkRNAseq_1/Figure1_plots.R
+suppressPackageStartupMessages(library(DESeq2))
+suppressPackageStartupMessages(library(stringr))
+suppressPackageStartupMessages(library(tidyverse))
+suppressPackageStartupMessages(library(pheatmap))
+suppressPackageStartupMessages(library(RColorBrewer))
+suppressPackageStartupMessages(library(openxlsx))
+suppressPackageStartupMessages(library(org.Hs.eg.db))
+suppressPackageStartupMessages(library(clusterProfiler))
+## Input featureCounts file
+fcount.file <- "Scarfo_HEC2023/BulkRNAseq_1/featureCounts_results_corrected.txt"
+## Contrasts to perform
+contr <- "ACE_pos:ACE_neg"
+contr.list <- unlist(strsplit(contr, ","))
+## Design Formula
+formula.str <- "donor,condition"
+design.str <- gsub(",", "+", formula.str)
+num_cond <- str_count(formula.str, ",") + 1
+design.formula <- as.formula(paste("~", design.str, sep = ""))
+## Read featureCounts results
+if (!file.exists(fcount.file)) {
+  stop("Error: featureCounts file not found.")
+} else {
+  read.counts <- read.table(fcount.file, 
+                            header = TRUE, 
+                            check.names = FALSE) %>%
+    column_to_rownames(var = "Geneid") %>%
+    select(-c(1:5)) 
+}
+orig.names <- names(read.counts)
+ds <- list()
+cond <- list()
+for (i in seq(orig.names[1:length(orig.names)])) {
+  ds[i] <- unlist(strsplit(orig.names[i], ":"))[2]
+  cond[i] <- unlist(strsplit(orig.names[i], ":"))[3]
+}
+ds <- unlist(ds)
+cond <- unlist(cond)
+## Samples - Conditions
+sample_info <- as.data.frame(str_split_fixed(cond, ",", num_cond))
+colnames(sample_info) <- str_split_fixed(formula.str, ",", num_cond)
+rownames(sample_info) <- ds
+colnames(read.counts) <- ds
+sample_info$condition <- factor(sample_info$condition)
+last_condition <- str_split_fixed(formula.str, ",", num_cond)[num_cond]
+batch_cond <- str_split_fixed(formula.str, ",", num_cond)[num_cond - 1]
+## thresholds
+adj.pval.thr <- 0.05
+logfc.thr <- 0
+## DESeq2 
+DESeq.ds <- DESeqDataSetFromMatrix(countData = read.counts,
+                                   colData = sample_info,
+                                   design = design.formula)
+## Remove genes without any counts
+DESeq.ds <- DESeq.ds[rowSums(counts(DESeq.ds)) > 0, ]
+## RunDGE
+DESeq.ds <- DESeq(DESeq.ds)
+## obtain regularized log-transformed values
+DESeq.rlog <- rlogTransformation(DESeq.ds, blind = TRUE)
+assay(DESeq.rlog) <- limma::removeBatchEffect(assay(DESeq.rlog), DESeq.rlog[[batch_cond]])
+ctrs <- unlist(strsplit(contr.list[1], ":"))
+c1 <- ctrs[1]
+c2 <- ctrs[2]
+DGE.results <- results(DESeq.ds,
+                       pAdjustMethod = "BH",
+                       independentFiltering = TRUE,
+                       contrast = c(last_condition, c1, c2),
+                       alpha = adj.pval.thr)
+# Change format
+DGE.results.annot <- as.data.frame(DGE.results) %>% 
+  dplyr::select(log2FoldChange, lfcSE, baseMean, pvalue, padj)
+colnames(DGE.results.annot) <- c("logFC", "lfcSE", "baseMean", "PValue", "FDR")
+## sort the results according to the adjusted p-value
+DGE.results.sorted <- DGE.results.annot[order(DGE.results.annot$FDR), ]
+## Subset for only significant genes
+DGE.results.sig <- subset(DGE.results.sorted, 
+                          FDR < adj.pval.thr & abs(logFC) > logfc.thr)
+DGEgenes <- rownames(DGE.results.sig)
+# Fig1B ---------------------------------------------------------------------
+plotPCA(DESeq.rlog, intgroup = last_condition, ntop = 500) +
+  theme_bw() +
+  ggtitle(label = "Principal Component Analysis (PCA)",
+          subtitle = "Top 500 DEG (with batch correction)") +
+  geom_point(size = 2) +
+  #coord_fixed(ratio=3) +
+  geom_text(aes(label = rownames(DESeq.rlog@colData)), vjust = 0, nudge_y = 0.5,
+            show.legend = FALSE)
+# Fig1C ----------------------------------------------------------------
+## DEGs
+s_info <- sample_info[sample_info[[last_condition]] == c1 | sample_info[[last_condition]] == c2,, drop = FALSE]
+hm.mat_DGEgenes <- assay(DESeq.rlog)[DGEgenes, ]
+hm.mat_DGEgenes.filt <- hm.mat_DGEgenes[, row.names(s_info)]
+pheatmap::pheatmap(hm.mat_DGEgenes.filt,
+                   color = colorRampPalette(rev(brewer.pal(n = 9, name ="RdYlBu")))(100),
+                   fontsize_row = 5,
+                   show_rownames = FALSE,
+                   scale = "row",
+                   angle_col = 90,
+                   cluster_rows = TRUE,
+                   annotation_col = s_info,
+                   cluster_cols = TRUE)
+# Fig1D ----------------------------------------------------------------
+## surface genes
+if(!file.exists("Scarfo_HEC2023/BulkRNAseq_1/table_S3_surfaceome.xlsx")){
+  download.file("http://wlab.ethz.ch/surfaceome/table_S3_surfaceome.xlsx", 
+                destfile = "Scarfo_HEC2023/BulkRNAseq_1/table_S3_surfaceome.xlsx")  
+}
+sg <- read.xlsx("Scarfo_HEC2023/BulkRNAseq_1/table_S3_surfaceome.xlsx", 
+                sheet = "in silico surfaceome only",
+                colNames = T,
+                startRow = 2)
+sig_sg <- DGE.results.sig[c(row.names(DGE.results.sig) %in% sg$UniProt.gene), ]
+markers <- c("CD34","CDH5","PECAM1","TEK","NR2F2","FLRT2","BMX","GJA5","DLL4","CXCR4","HEY2","MYB","GFI1","CD44")
+DESeq.rlog_mat <- assay(DESeq.rlog)
+DESeq.rlog_mat <- DESeq.rlog_mat[c(row.names(DESeq.rlog_mat) %in% markers), ]
+DESeq.rlog_mat <- DESeq.rlog_mat[markers,]
+# Specify colors
+ann_colors = list(
+  condition = c(ACE_neg = "gold3",
+                ACE_pos = "turquoise3"),
+  donor = c(E1 = "#E76BF3",
+            E2 = "#F8766D",
+            E3 = "#529EFF",
+            E4 = "springgreen3")
+)
+pheatmap::pheatmap(DESeq.rlog_mat,
+                   color = colorRampPalette(rev(brewer.pal(n = 9, name ="RdYlBu")))(100),
+                   annotation_col = s_info,
+                   annotation_names_row = FALSE,
+                   annotation_colors = ann_colors,
+                   show_rownames = TRUE,
+                   show_colnames = TRUE,
+                   cluster_rows = FALSE,
+                   angle_col = 45,
+                   fontsize = 14,
+                   fontsize_row = 14,
+                   fontsize_col = 18,
+                   cellheight = 20,
+                   cellwidth = 30)
+# Fig1E --------------------------------------------
+gmt.obj <- read.gmt("Scarfo_HEC2023/BulkRNAseq_1/c5.all.v7.2.symbols.gmt")
+organism.db <- org.Hs.eg.db
+# Remove genes with no FDR
+DGE.results.annot <- DGE.results.annot[!is.na(DGE.results.annot$FDR), ]
+gene_univ <- row.names(DGE.results.annot)
+# get positive and negative gene names
+gene.res.top <- subset(DGE.results.annot, FDR < adj.pval.thr  & abs(logFC) > logfc.thr)
+gene.res.top.pos <- subset(gene.res.top, logFC > 0)
+gene.res.top.neg <- subset(gene.res.top, logFC < 0)
+## Over Representation Analysis
+comp <- c("gene.res.top.pos","gene.res.top.neg")
+for (i in comp) {
+  print(paste0("Analyzing...",i," genes"))
+  obj_i <- get(i)
+  contr.enricher <- enricher(row.names(obj_i),
+                             universe = gene_univ,
+                             pvalueCutoff = adj.pval.thr,
+                             TERM2GENE = gmt.obj)
+  contr.enricher.res <- as.data.frame(contr.enricher)
+  assign(paste("contr.enricher",i,sep = "."), contr.enricher.res)
+}
+## barplots
+terms.pos <- data.frame(
+  ID = c("GO_LEUKOCYTE_MIGRATION","GO_AMEBOIDAL_TYPE_CELL_MIGRATION","GO_TISSUE_MIGRATION","GO_POSITIVE_REGULATION_OF_EPITHELIAL_CELL_MIGRATION","GO_REGULATION_OF_EPITHELIAL_CELL_MIGRATION","GO_ENDOTHELIAL_CELL_MIGRATION","GO_REGULATION_OF_ENDOTHELIAL_CELL_MIGRATION","GO_COLLAGEN_CONTAINING_EXTRACELLULAR_MATRIX","GO_EXTRACELLULAR_MATRIX","GO_CELL_CELL_JUNCTION","GO_NEGATIVE_REGULATION_OF_CELL_ADHESION","GO_REGULATION_OF_CELL_CELL_ADHESION","GO_LEUKOCYTE_CELL_CELL_ADHESION","GO_EXTRACELLULAR_MATRIX_BINDING"),
+  Type = c(rep("Migration", 7), rep("Adhesion", 7)))
+terms.pos <- left_join(terms.pos,contr.enricher.gene.res.top.pos,by="ID")
+terms.neg <- data.frame(
+  ID = c("GO_POSITIVE_REGULATION_OF_CELL_CYCLE_PROCESS","GO_POSITIVE_REGULATION_OF_CELL_CYCLE","GO_POSITIVE_REGULATION_OF_CELL_CYCLE_G2_M_PHASE_TRANSITION","GO_POSITIVE_REGULATION_OF_CELL_CYCLE_PHASE_TRANSITION","GO_MITOTIC_CELL_CYCLE_CHECKPOINT","GO_MITOTIC_NUCLEAR_DIVISION","GO_POSITIVE_REGULATION_OF_MITOTIC_CELL_CYCLE"),
+  Type = "Cell Cycle")
+terms.neg <- left_join(terms.neg,contr.enricher.gene.res.top.neg,by="ID")
+terms.to.plot <- rbind(terms.pos, terms.neg)
+terms.to.plot <- na.omit(terms.to.plot)
+parsed.terms <- as.data.frame(str_split_fixed(terms.to.plot$ID, "GO_", 2))
+renamed.terms <- as.data.frame(str_replace_all(parsed.terms$V2, "_", " "))
+colnames(renamed.terms) <- "id"
+renamed.terms$Terms <- str_to_title(renamed.terms$id)
+terms.to.plot <- cbind(terms.to.plot,renamed.terms)
+terms.to.plot <- terms.to.plot[,c(12,10,7,2)]
+terms.to.plot$Type <- factor(terms.to.plot$Type, levels = c("Cell Cycle", "Migration", "Adhesion"))
+terms.to.plot$Terms <- factor(terms.to.plot$Terms, levels = terms.to.plot$Terms)
+terms.to.plot <- terms.to.plot %>%
+  mutate(color = ifelse(Type == "Cell Cycle", "gold3", "turquoise3"),
+         regulated = ifelse(Type == "Cell Cycle", "Upregulated in ACEneg", "Upregulated in ACE+"))
+terms.to.plot$regulated <- factor(terms.to.plot$regulated, 
+                                  levels = c("Upregulated in ACEneg","Upregulated in ACE+"))
+terms.to.plot$log_pvalue <- -log10(terms.to.plot$p.adjust)
+ggplot(data=terms.to.plot, aes(x=Terms, 
+                               y=log_pvalue,
+                               fill=regulated,
+                               xmin = min(log_pvalue),
+                               xmax = max(log_pvalue))) +
+  geom_bar(stat="identity", width = 0.7) +
+  scale_fill_manual(values = c("gold3", "turquoise3"))+
+  theme_bw() +
+  coord_flip() +
+  ylab(expression(-log[10]("adjusted p-value"))) +
+  xlab("GO Terms") +
+  #scale_x_discrete(limits = GO_terms_all$Term) +
+  facet_grid(Type ~., scales = "free_y", space = "free_y") +
+  theme(
+    axis.title.x = element_text(colour="black",family = "Arial", size=16),
+    axis.text.x = element_text(colour="black",family = "Arial", size=12),
+    axis.title.y = element_text(colour="black",family = "Arial", size=16),
+    axis.text.y = element_text(color="black",family = "Arial", size=12),
+    legend.text = element_text(colour="black",family = "Arial", size=12),
+    legend.title = element_text(colour="white",family = "Arial", size=12),
+    strip.background = element_rect(colour="black",
+                                    fill="white"),
+    strip.text = element_text(family = "Arial", size=12))
+## GSEA
+DGE.results.annot.sorted <- DGE.results.annot[order(DGE.results.annot$logFC, decreasing = TRUE), ]
+# LogFC with Symbols
+gene.res.logfc.symbol <- as.vector(DGE.results.annot.sorted$logFC)
+names(gene.res.logfc.symbol) <- row.names(DGE.results.annot.sorted)
+contr.gsea <- GSEA(gene.res.logfc.symbol, 
+                   TERM2GENE = gmt.obj,
+                   nPerm = 10000, 
+                   pvalueCutoff = adj.pval.thr)
+contr.gsea.res <- as.data.frame(contr.gsea)
--- a/BulkRNAseq_1/Figure2_plots.R
+++ b/BulkRNAseq_1/Figure2_plots.R
+suppressPackageStartupMessages(library(DESeq2))
+suppressPackageStartupMessages(library(tidyverse))
+suppressPackageStartupMessages(library(stringr))
+suppressPackageStartupMessages(library(pheatmap))
+suppressPackageStartupMessages(library(openxlsx))
+suppressPackageStartupMessages(library(RColorBrewer))
+## Input featureCounts file
+fcount.file <- "Scarfo_HEC2023/BulkRNAseq_1/featureCounts_results_corrected.txt"
+## Contrasts to perform
+contr <- "ACE_pos:ACE_neg"
+contr.list <- unlist(strsplit(contr, ","))
+## Design Formula
+formula.str <- "donor,condition"
+design.str <- gsub(",", "+", formula.str)
+num_cond <- str_count(formula.str, ",") + 1
+design.formula <- as.formula(paste("~", design.str, sep = ""))
+## Read featureCounts results
+if (!file.exists(fcount.file)) {
+  stop("Error: featureCounts file not found.")
+} else {
+  read.counts <- read.table(fcount.file, 
+                            header = TRUE, 
+                            check.names = FALSE) %>%
+    column_to_rownames(var = "Geneid") %>%
+    select(-c(1:5)) 
+}
+orig.names <- names(read.counts)
+ds <- list()
+cond <- list()
+for (i in seq(orig.names[1:length(orig.names)])) {
+  ds[i] <- unlist(strsplit(orig.names[i], ":"))[2]
+  cond[i] <- unlist(strsplit(orig.names[i], ":"))[3]
+}
+ds <- unlist(ds)
+cond <- unlist(cond)
+## Samples - Conditions
+sample_info <- as.data.frame(str_split_fixed(cond, ",", num_cond))
+colnames(sample_info) <- str_split_fixed(formula.str, ",", num_cond)
+rownames(sample_info) <- ds
+colnames(read.counts) <- ds
+sample_info$condition <- factor(sample_info$condition)
+last_condition <- str_split_fixed(formula.str, ",", num_cond)[num_cond]
+batch_cond <- str_split_fixed(formula.str, ",", num_cond)[num_cond - 1]
+## thresholds
+adj.pval.thr <- 0.05
+logfc.thr <- 0
+## DESeq2
+DESeq.ds <- DESeqDataSetFromMatrix(countData = read.counts,
+                                   colData = sample_info,
+                                   design = design.formula)
+## Remove genes without any counts
+DESeq.ds <- DESeq.ds[rowSums(counts(DESeq.ds)) > 0, ]
+## RunDGE
+DESeq.ds <- DESeq(DESeq.ds)
+## obtain regularized log-transformed values
+DESeq.rlog <- rlogTransformation(DESeq.ds, blind = TRUE)
+assay(DESeq.rlog) <- limma::removeBatchEffect(assay(DESeq.rlog), DESeq.rlog[[batch_cond]])
+ctrs <- unlist(strsplit(contr.list[1], ":"))
+c1 <- ctrs[1]
+c2 <- ctrs[2]
+DGE.results <- results(DESeq.ds,
+                       pAdjustMethod = "BH",
+                       independentFiltering = TRUE,
+                       contrast = c(last_condition, c1, c2),
+                       alpha = adj.pval.thr)
+# Change format
+DGE.results.annot <- as.data.frame(DGE.results) %>% 
+  dplyr::select(log2FoldChange, lfcSE, baseMean, pvalue, padj)
+colnames(DGE.results.annot) <- c("logFC", "lfcSE", "baseMean", "PValue", "FDR")
+## sort the results according to the adjusted p-value
+DGE.results.sorted <- DGE.results.annot[order(DGE.results.annot$FDR), ]
+## Subset for only significant genes
+DGE.results.sig <- subset(DGE.results.sorted, 
+                          FDR < adj.pval.thr & abs(logFC) > logfc.thr)
+DGEgenes <- rownames(DGE.results.sig)
+# Fig2A ----------------------------------------------------------------
+## surface genes
+if(!file.exists("Scarfo_HEC2023/BulkRNAseq_1/table_S3_surfaceome.xlsx")){
+  download.file("http://wlab.ethz.ch/surfaceome/table_S3_surfaceome.xlsx", 
+                destfile = "Scarfo_HEC2023/BulkRNAseq_1/table_S3_surfaceome.xlsx")  
+}
+sg <- read.xlsx("Scarfo_HEC2023/BulkRNAseq_1/table_S3_surfaceome.xlsx", 
+                sheet = "in silico surfaceome only",
+                colNames = T,
+                startRow = 2)
+sig_sg <- DGE.results.sig[c(row.names(DGE.results.sig) %in% sg$UniProt.gene), ]
+top10_sig_sg <- sig_sg[c(1:10),]
+s_info <- sample_info[sample_info[[last_condition]] == c1 | sample_info[[last_condition]] == c2,, drop = FALSE]
+hm.mat_DGEgenes <- assay(DESeq.rlog)[row.names(top10_sig_sg), ]
+hm.mat_DGEgenes.filt <- hm.mat_DGEgenes[, row.names(s_info)]
+pheatmap::pheatmap(hm.mat_DGEgenes.filt,
+                   scale = "row",
+                   color = colorRampPalette(rev(brewer.pal(n = 9, name ="RdYlBu")))(100),
+                   annotation_col = s_info,
+                   cluster_rows = TRUE,
+                   cluster_cols = TRUE,
+                   show_rownames = TRUE,
+                   show_colnames = TRUE,
+                   angle_col = 90,
+                   fontsize = 15)
--- a/BulkRNAseq_1/c5.all.v7.2.symbols.gmt
+++ b/BulkRNAseq_1/c5.all.v7.2.symbols.gmt
--- a/BulkRNAseq_1/featureCounts_results_corrected.txt
+++ b/BulkRNAseq_1/featureCounts_results_corrected.txt
--- a/BulkRNAseq_1/table_S3_surfaceome.xlsx
+++ b/BulkRNAseq_1/table_S3_surfaceome.xlsx
--- a/BulkRNAseq_2/ED_Figure5_plots.R
+++ b/BulkRNAseq_2/ED_Figure5_plots.R
+suppressPackageStartupMessages(library(DESeq2))
+suppressPackageStartupMessages(library(stringr))
+suppressPackageStartupMessages(library(tidyverse))
+suppressPackageStartupMessages(library(dplyr))
+suppressPackageStartupMessages(library(pheatmap))
+suppressPackageStartupMessages(library(RColorBrewer))
+## Input featureCounts file
+fcount.file <- "Scarfo_HEC2023/BulkRNAseq_2/featureCounts_results_corrected.txt"
+## Contrasts to perform
+contr <- "DLL4_plus:CD32_plus"
+contr.list <- unlist(strsplit(contr, ","))
+## Design Formula
+formula.str <- "condition"
+design.str <- gsub(",", "+", formula.str)
+num_cond <- str_count(formula.str, ",") + 1
+design.formula <- as.formula(paste("~", design.str, sep = ""))
+## Read featureCounts results
+if (!file.exists(fcount.file)) {
+  stop("Error: featureCounts file not found.")
+} else {
+  read.counts <- read.table(fcount.file, 
+                            header = TRUE, 
+                            check.names = FALSE) %>% 
+    column_to_rownames(var = "Geneid") %>% 
+    select(-c(1:5))
+}
+orig.names <- names(read.counts)
+ds <- list()
+cond <- list()
+for (i in seq(orig.names[1:length(orig.names)])) {
+  ds[i] <- unlist(strsplit(orig.names[i], ":"))[2]
+  cond[i] <- unlist(strsplit(orig.names[i], ":"))[3]
+}
+ds <- unlist(ds)
+cond <- unlist(cond)
+## Samples - Conditions
+sample_info <- as.data.frame(str_split_fixed(cond, ",", num_cond))
+colnames(sample_info) <- str_split_fixed(formula.str, ",", num_cond)
+rownames(sample_info) <- ds
+colnames(read.counts) <- ds
+sample_info$condition <- factor(sample_info$condition)
+last_condition <- str_split_fixed(formula.str, ",", num_cond)[num_cond]
+## thresholds
+adj.pval.thr <- 0.05
+logfc.thr <- 0
+## DESeq2 
+DESeq.ds <- DESeqDataSetFromMatrix(countData = read.counts,
+                                   colData = sample_info,
+                                   design = design.formula)
+## Remove genes without any counts
+DESeq.ds <- DESeq.ds[rowSums(counts(DESeq.ds)) > 0, ]
+## RunDGE
+DESeq.ds <- DESeq(DESeq.ds)
+## obtain regularized log-transformed values
+DESeq.rlog <- rlogTransformation(DESeq.ds, blind = TRUE)
+ctrs <- unlist(strsplit(contr.list[1], ":"))
+c1 <- ctrs[1]
+c2 <- ctrs[2]
+DGE.results <- results(DESeq.ds,
+                       pAdjustMethod = "BH",
+                       independentFiltering = TRUE,
+                       contrast = c(last_condition, c1, c2),
+                       alpha = adj.pval.thr)
+# Change format
+DGE.results.annot <- as.data.frame(DGE.results) %>% 
+  dplyr::select(log2FoldChange, lfcSE, baseMean, pvalue, padj)
+colnames(DGE.results.annot) <- c("logFC", "lfcSE", "baseMean", "PValue", "FDR")
+## sort the results according to the adjusted p-value
+DGE.results.sorted <- DGE.results.annot[order(DGE.results.annot$FDR), ]
+## Subset for only significant genes
+DGE.results.sig <- subset(DGE.results.sorted, 
+                          FDR < adj.pval.thr & abs(logFC) > logfc.thr)
+DGEgenes <- rownames(DGE.results.sig)
+# ED_Fig5A -----------------------------------------------------------------
+## scorecard
+EHT_scorecard = c("CDH5","RUNX1","PTPRC", "SPN","CD44","ITGA2B","HLF","GFI1","MLLT3","HOXA9","MKI67","NRP2","NR2F2","GJA4","HEY1","DLL4","CXCR4","SOX17","SMAD6","GJA5","HES4","MECOM","NID2","PRND","ESM1","PDGFB","PALMD","TMEM100","EDN1","LTBP4","HEY2","SULF1","IL33","CYP26B1","ADGRG6","COL23A1","GATA6","BMX","TMCC3","DKK1","AGTR2","FBN2","ELN","ALDH1A1","NKX2-3","PROCR","GATA3","GBP4","MYCN","KCNK17","MYB","STAT5A","SMIM24","RAB27B","SPINK2")
+hm.mat_DGEgenes <- assay(DESeq.rlog)[DGEgenes, ]
+counts_eht <- hm.mat_DGEgenes[c(rownames(hm.mat_DGEgenes) %in% EHT_scorecard), ]
+eht_sig <- counts_eht[c(rownames(counts_eht) %in% row.names(DGE.results.sig)), ]
+eht_sig <- eht_sig[,c(1,3,6,2,5,8)]
+annot <- data.frame(row.names = colnames(eht_sig),
+                    Sample = c(rep("DLL4+", 3), rep("CD32+", 3)),
+                    Donor = rep(c("RS227","RS242","RS243"), 2))
+annot_colors <- list(Sample=c("DLL4+" ="blueviolet",
+                              "CD32+" = "gold1"),
+                     Donor=c("RS227"="darkslategray3",
+                             "RS242"="brown3",
+                             "RS243"="limegreen"))
+pheatmap::pheatmap(eht_sig,
+                   scale = "row",
+                   annotation_col = annot,
+                   annotation_colors = annot_colors,
+                   cluster_rows = TRUE,
+                   cluster_cols = TRUE,
+                   show_rownames = TRUE,
+                   show_colnames = TRUE,
+                   angle_col = 90,
+                   legend = TRUE,
+                   legend_breaks = c(-1.5,0,1.5), # legend breaks to be shown
+                   legend_labels = c(-1.5,0,1.5), # legend labels to be shown
+                   treeheight_row = 25*96/72, # size of row dendogram
+                   treeheight_col = 15*96/72, # size of column dendogram
+                   fontsize = 10*96/72,
+                   fontsize_row = 8*96/72,
+                   fontsize_col = 10*96/72,
+                   cellwidth = 20*96/72,
+                   cellheight = 8*96/72,
+                   border_color = NA
+)
--- a/BulkRNAseq_2/featureCounts_results_corrected.txt
+++ b/BulkRNAseq_2/featureCounts_results_corrected.txt
--- a/README.md
+++ b/README.md
+# Scarfo_HEC2023
+This repository includes essential scripts for producing final tables and figures embedded in the following manuscript:  
+**Scarfò et al**,_The Fc receptor CD32 identifies human hemogenic endothelial cells irreversibly bound to the endothelial-to-hematopoietic transition_ 
+PubMed:  
+DOI:  
+GEO: GSE223223
+---
+The repository is divided in three folders, 2 bulk RNAseq analyses and 1 scRNAseq analysis.  
+Below a brief description of the bulk and scRNAseq workflows adopted in this work.
+**Bulk RNAseq** analysis was performed using a standard pipeline that includes the follwing steps:
+1. Quality control by [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)
+2. Trimming of bad quality reads with [TrimGalore](https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/)<details><summary>Running command</summary>trim_galore --quality 20 --fastqc --length 25 --output_dir {outdir} --paired {input.r1} {inout.r2}</details>
+3. Alignment with [STAR](https://github.com/alexdobin/STAR)
+    <details><summary>Running command</summary>
+            "STAR " +
+            "--runThreadN {threads} " +
+            "--genomeDir {input.genome} " +
+            "--readFilesIn {params.trim_seq} " +
+            "--outSAMstrandField intronMotif " +
+            "--outFileNamePrefix {params.aln_seq_prefix} " +
+            "--outSAMtype BAM SortedByCoordinate " +
+            "--outSAMmultNmax 1 " +
+            "--outFilterMismatchNmax 10 " +
+            "--outReadsUnmapped Fastx " +
+            "--readFilesCommand zcat "
+    </details>
+4. Gene expression quantification with [featureCounts](https://academic.oup.com/bioinformatics/article/30/7/923/232889)
+    <details><summary>Running command</summary>
+            "featureCounts " +
+            "-a {input.annot} " +
+            "-o {output.fcount} " +
+            "-g gene_name " +
+            "-p -B -C " +
+            "-s {params.strand} " +
+            "--minOverlap 10 " +
+            "-T {threads} " +
+            "{input.bams} "
+    </details>
+5. Differential Expression analysis with [DESeq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html).  
+    For Differential Gene Expression analysis we followed the standard workflow provided by package.  
+6. Dowstream functional Analysis with [ClusterProfiler](https://bioconductor.org/packages/release/bioc/html/clusterProfiler.html).  
+    In order to retrieve functional annotation from DE analysis, we performed **O**ver **R**epresentation **A**nalysis by using the _enrichr_ function provided by the package.  
+    **ORA** analysis was performed in particular using the C5 gene set from the MSigDB database (version 7.2)
+---
+**scRNAseq** analysis was performed using a standard pipeline that includes the following steps:
+scRNAseq analysis was performed with [Seurat](https://satijalab.org/seurat/). Below are the main steps of the basic data analysis workflow that start from a minimal object after loading of 10X data to markers identification:  
+1. Quality control and filtering
+2. Cell cycle scoring
+3. Normalization (default seurat settings)
+4. Scaling (with following variables to regress out: percent.mt + nCount_RNA and CC.Difference calculated as show in [vignette](https://satijalab.org/seurat/articles/cell_cycle_vignette.html#alternate-workflow-1))
+4. Dimensionality reduction: PCA
+5. Clustering
+6. Markers identification
+- [6.1] Clusters related markers
+- [6.2] Intracluster differential expression analysis according to comparison of interest
+Pseudotime analysis was performed using [Monocle3](http://cole-trapnell-lab.github.io/monocle3/)
+Input files for scRNAseq analysis are available in the following [link](https://www.dropbox.com/sh/83dxrxqer8cl081/AAC8zALRuYRGh1mEe4lZuaHZa?dl=0)
+---
+An interactive querying and exploration of the scRNAseq dataset is available at:
+http://bioinfotiget.it/he/
+---
--- a/scRNAseq/ED_Figure5_plots.R
+++ b/scRNAseq/ED_Figure5_plots.R
+suppressPackageStartupMessages(library(Seurat))
+suppressPackageStartupMessages(library(monocle3))
+suppressPackageStartupMessages(library(ggplot2))
+suppressPackageStartupMessages(library(dplyr))
+# ED_Fig5B ----------------------------------------------------------------
+## load scRNAseq data from public dataset GSE162950
+load("Scarfo_HEC2023/scRNAseq/seurat_object.Rdata")
+EHT_scorecard <- c("CXCR4","SOX17","GJA5","MECOM","HOXA9","SPN","CD44","ITGA2B","HLF","GFI1","MLLT3","KCNK17","MYB","STAT5A","SMIM24","RAB27B","SPINK2")
+## Select HE cells
+sample <- SetIdent(sample, value = sample@meta.data$seurat_clusters)
+index1 = GetAssayData(sample, slot="counts")["RUNX1",]>0
+index2 = GetAssayData(sample, slot="counts")["CDH5",]>0
+index3 = GetAssayData(sample, slot="counts")["PTPRC",]==0
+index4 = GetAssayData(sample, slot="counts")["FCGR2B",]==0
+bars = colnames(sample)[Idents(sample) %in% c(0,3)]
+bars1 = colnames(sample)[index1 & index2 & index3 & index4]
+bars2 = intersect(bars, bars1)
+## Establish identity for FCGR2B- HE cells
+Idents(sample, cells=bars2) = "FCGR2B=0"
+index5 = GetAssayData(sample, slot="counts")["FCGR2B",]>0
+bars3 = colnames(sample)[index1 & index2 & index3 & index5]
+bars4 = intersect(bars, bars3)
+## Establish identity for FCGR2B+ HE cells
+Idents(sample, cells=bars4) = "FCGR2B_exp"
+sample$CellType <- Idents(sample)
+sample_sub <- subset(sample, CellType=="FCGR2B_exp" | CellType=="FCGR2B=0")
+DotPlot(sample_sub, 
+             features=EHT_scorecard, 
+             cols=c("grey90","red3"),
+             dot.scale = 10) +
+  coord_flip()
+# ED_Fig5C ------------------------------------------------------------------
+## load Seurat data
+sample <- readRDS("Scarfo_HEC2023/scRNAseq/WNTd_HE.rds")
+sample@meta.data[["Cell.Annotations"]] <- recode_factor(sample@meta.data[["RNA_snn_res.0.6"]],
+                                                           "0" = "HECs", 
+                                                           "1" = "HECs", 
+                                                           "2" = "HECs", 
+                                                           "3" = "Venous cells", 
+                                                           "4" = "Venous cells", 
+                                                           "5" = "Arterial ECs", 
+                                                           "6" = "Arterial ECs", 
+                                                           "7" = "Venous cells", 
+                                                           "8" = "ECs (M phase)",
+                                                           "9" = "Venous cells",
+                                                           "10" = "EndoMT cells",
+                                                           "11" = "HECs",
+                                                           "12" = "Arterial ECs",
+                                                           "13" = "Lymphatic ECs",
+                                                           "14" = "Arterial ECs",
+                                                           "15" = "Allantois/Placenta",
+                                                           "16" = "HECs",
+                                                           "17" = "HECs",
+                                                           "18" = "EndoMT cells",
+                                                           "19" = "SM cells",
+                                                           "20" = "EndoMT cells",
+                                                           "21" = "EndoMT cells")
+DimPlot(object = sample,
+        reduction = "umap",
+        group.by = "Cell.Annotations",
+        label = F,
+        repel = T,
+        pt.size = 0.8,
+        label.size = 6,
+        cols = c('HECs' = "#377EB8", 
+                 'Lymphatic ECs' = "#4DAF4A",
+                 'Arterial ECs' = "#E41A1C",
+                 'Venous cells' = "#FF7F00",
+                 'EndoMT cells' = "#E6AB02",
+                 'Allantois/Placenta' = "#984EA3",
+                 'SM cells' = "#A6761D",
+                 "ECs (M phase)" = "#A50026"))
+# ED_Fig5D - ED_Fig5E ------------------------------------------------------------------
+FeaturePlot(object = sample,
+            features = c("RUNX1", "FCGR2B"),
+            cols = c("grey90","blue"),
+            order = T,
+            pt.size = 0.1)
+# ED_Fig5F ------------------------------------------------------------------
+runx1 <- readRDS("Scarfo_HEC2023/scRNAseq/RUNX1_clusters.rds")
+## Monocle3
+Idents(runx1) <- "RNA_snn_res.0.6"
+start = WhichCells(runx1,idents = "0")
+expression_matrix <- runx1@assays[["RNA"]]@counts
+cell_metadata <- runx1@meta.data
+cell_metadata$orig.ident <- factor(cell_metadata$orig.ident, levels = unique(cell_metadata$orig.ident))
+gene_annotation <- data.frame("gene_short_name" = rownames(runx1))
+rownames(gene_annotation) <- gene_annotation$gene_short_name
+cds <- new_cell_data_set(expression_data = expression_matrix, 
+                         cell_metadata = cell_metadata, 
+                         gene_metadata = gene_annotation)
+cds <- preprocess_cds(cds, num_dim = 50)
+cds <- reduce_dimension(cds)
+## Construct and assign the made up partition
+recreate.partition <- c(rep(1, length(cds@colData@rownames)))
+names(recreate.partition) <- cds@colData@rownames
+recreate.partition <- as.factor(recreate.partition)
+cds@clusters@listData[["UMAP"]][["partitions"]] <- recreate.partition
+## Assign the cluster info
+list_cluster <- runx1@meta.data[["RNA_snn_res.0.6"]]
+names(list_cluster) <- runx1@assays[["RNA"]]@data@Dimnames[[2]]
+cds@clusters@listData[["UMAP"]][["clusters"]] <- list_cluster
+cds@clusters@listData[["UMAP"]][["louvain_res"]] <- "NA"
+## Assign UMAP coordinate
+reducedDims(cds)[["UMAP"]] <- runx1@reductions[["umap"]]@cell.embeddings
+### Reset colnames and rownames (consequence of UMAP replacement)
+rownames(cds@principal_graph_aux[['UMAP']]$dp_mst) <- NULL
+## Learn Graph
+cds <- learn_graph(cds = cds,use_partition = T,learn_graph_control=list(ncenter=220,minimal_branch_len=15),verbose = T)
+cds <- order_cells(cds, root_cells = start)
+plot_cells(cds,
+           color_cells_by = "pseudotime",
+           label_groups_by_cluster=FALSE,
+           label_leaves=T,
+           label_branch_points=T,
+           group_label_size = 8,
+           graph_label_size = 3,
+           cell_size = 1,
+           trajectory_graph_segment_size = 1)
+# ED_Fig5G ------------------------------------------------------------------
+cds <- estimate_size_factors(cds)
+gi <- c("H19","KCNK17","RUNX1", "MYB", "SPN")
+cds_gi <- cds[rowData(cds)$gene_short_name %in% gi,]
+plot_genes_in_pseudotime(cds_subset = cds_gi, 
+                         ncol = 3,
+                         color_cells_by = "RNA_snn_res.0.6", 
+                         min_expr = NULL,
+                         cell_size = 1)
--- a/scRNAseq/Fig5_plots.R
+++ b/scRNAseq/Fig5_plots.R
+suppressPackageStartupMessages(library(Seurat))
+suppressPackageStartupMessages(library(monocle3))
+suppressPackageStartupMessages(library(dplyr))
+suppressPackageStartupMessages(library(reshape2))
+suppressPackageStartupMessages(library(ggplot2))
+suppressPackageStartupMessages(library(patchwork))
+suppressPackageStartupMessages(library(clusterProfiler))
+## load Seurat data
+sample <- readRDS("Scarfo_HEC2023/scRNAseq/WNTd_HE.rds")
+# Fig5A ------------------------------------------------------------------
+DimPlot(object = sample,
+        reduction = "umap",
+        group.by = "RNA_snn_res.0.6",
+        label = T,
+        repel = T,
+        pt.size = 0.8,
+        label.size = 8,
+        cols = c("0" = "#F8766D",
+                 "1" = "#B79F00",
+                 "2" = "#00BA38",
+                 "3" = "#EC8239",
+                 "4"= "#00B81B",
+                 "5"= "#7C96FF",
+                 "6"= "#00C085",
+                 "7" = "#00C1A7",
+                 "8" = "#00BFC4",
+                 "9" = "#EF67EB",
+                 "10" = "#00B2F3",
+                 "11"= "#00BFC4",
+                 "12"= "#00A6FF",
+                 "13"= "#DB8E00",
+                 "14"= "#FD61D3",
+                 "15"= "#FF63B6",
+                 "16"= "#619CFF",
+                 "17"= "#F564E3",
+                 "18"= "#00BADE",
+                 "19"= "#B385FF",
+                 "20"= "#D874FD",
+                 "21"= "#FF6B94")) +
+  ggtitle("") +
+  xlab("UMAP1") +
+  ylab("UMAP2") +
+  theme(legend.text = element_text(size = 16*96/72),
+        axis.text.x = element_text(color = "black", family = "Arial",size = 14*96/72),
+        axis.text.y = element_text(color = "black", family = "Arial",size = 14*96/72),
+        axis.title.x = element_text(color = "black", family = "Arial",size = 16*96/72),
+        axis.title.y = element_text(color = "black", family = "Arial",size = 16*96/72),
+        aspect.ratio = 1,
+        panel.background = element_blank()) +
+  guides(color = guide_legend(override.aes = list(size=4)))
+## Seurat markers
+markers <- FindAllMarkers(sample, 
+                          only.pos = T, 
+                          min.pct = 0.1, 
+                          logfc.threshold = 0.25)
+## define RUNX1 clusters
+runx1.clusters <- subset(markers, markers$gene=="RUNX1")
+runx1.clusters <- droplevels(runx1.clusters$cluster) 
+## subset RUNX1 clusters into new object
+runx1 <- subset(sample, idents = runx1.clusters)
+DimPlot(object = runx1,
+        reduction = "umap",
+        group.by = "RNA_snn_res.0.6",
+        label = T,
+        repel = T,
+        pt.size = 0.8,
+        label.size = 8)
+# Fig5B ----------------------------------------------------------------
+## Monocle3
+Idents(runx1) <- "RNA_snn_res.0.6"
+start = WhichCells(runx1,idents = "0")
+expression_matrix <- runx1@assays[["RNA"]]@counts
+cell_metadata <- runx1@meta.data
+cell_metadata$orig.ident <- factor(cell_metadata$orig.ident, levels = unique(cell_metadata$orig.ident))
+gene_annotation <- data.frame("gene_short_name" = rownames(runx1))
+rownames(gene_annotation) <- gene_annotation$gene_short_name
+cds <- new_cell_data_set(expression_data = expression_matrix, 
+                         cell_metadata = cell_metadata, 
+                         gene_metadata = gene_annotation)
+cds <- preprocess_cds(cds, num_dim = 50)
+cds <- reduce_dimension(cds)
+## Construct and assign the made up partition
+recreate.partition <- c(rep(1, length(cds@colData@rownames)))
+names(recreate.partition) <- cds@colData@rownames
+recreate.partition <- as.factor(recreate.partition)
+cds@clusters@listData[["UMAP"]][["partitions"]] <- recreate.partition
+## Assign the cluster info
+list_cluster <- runx1@meta.data[["RNA_snn_res.0.6"]]
+names(list_cluster) <- runx1@assays[["RNA"]]@data@Dimnames[[2]]
+cds@clusters@listData[["UMAP"]][["clusters"]] <- list_cluster
+cds@clusters@listData[["UMAP"]][["louvain_res"]] <- "NA"
+## Assign UMAP coordinate
+reducedDims(cds)[["UMAP"]] <- runx1@reductions[["umap"]]@cell.embeddings
+### Reset colnames and rownames (consequence of UMAP replacement)
+rownames(cds@principal_graph_aux[['UMAP']]$dp_mst) <- NULL
+## Learn Graph
+cds <- learn_graph(cds = cds,use_partition = T,learn_graph_control=list(ncenter=220,minimal_branch_len=15),verbose = T)
+plot_cells(cds,
+           color_cells_by = "RNA_snn_res.0.6",
+           label_groups_by_cluster=FALSE,
+           label_leaves=F,
+           label_branch_points=F,
+           graph_label_size=1.5,
+           group_label_size = 6,
+           cell_size = 1)
+# Fig5C -------------------------------------------------------------------
+## differential markers
+degs_clu11_vs_clu0_1_2 <- FindMarkers(object = runx1, ident.1 = 11, ident.2 = c(0,1,2), only.pos = FALSE, min.pct = 0, test.use = "wilcox", logfc.threshold = 0, min.cells.group = 0)
+degs_clu11_vs_clu16_17 <- FindMarkers(object = runx1, ident.1 = 11, ident.2 = c(16,17), only.pos = FALSE, min.pct = 0, test.use = "wilcox", logfc.threshold = 0, min.cells.group = 0)
+## GSEA
+adj.pval.thr <- 0.05
+gmt.obj <- read.gmt("Scarfo_HEC2023/BulkRNAseq_1/c5.all.v7.2.symbols.gmt")
+comp <- c("clu11_vs_clu0_1_2", "clu11_vs_clu16_17")
+for (i in comp) {
+  degs <- get(paste("degs",i,sep="_"))
+  gene.res.sorted <- degs[order(degs$avg_logFC, decreasing = TRUE), ]
+  # LogFC with Symbols
+  gene.res.logfc.symbol <- as.vector(gene.res.sorted$avg_logFC)
+  names(gene.res.logfc.symbol) <- row.names(gene.res.sorted)
+  contr.gsea <- GSEA(gene.res.logfc.symbol, 
+                     TERM2GENE = gmt.obj,
+                     nPerm = 10000, 
+                     pvalueCutoff = adj.pval.thr)
+  contr.gsea.res <- as.data.frame(contr.gsea)
+  assign(paste0("contr.gsea.res_",i), contr.gsea.res)
+  if (i == "clu11_vs_clu0_1_2") {
+    terms <- c("GO_CELL_CELL_JUNCTION_ORGANIZATION",
+               "GO_ACTIN_CYTOSKELETON",
+               "GO_EXTRACELLULAR_MATRIX_STRUCTURAL_CONSTITUENT",
+               "GO_STRUCTURAL_CONSTITUENT_OF_RIBOSOME",
+               "GO_RIBOSOME",
+               "GO_TRANSLATIONAL_INITIATION")
+  } else {
+    terms <- c("GO_MITOTIC_NUCLEAR_DIVISION",
+               "GO_CELL_DIVISION",
+               "GO_MITOTIC_CELL_CYCLE",
+               "GO_REGULATION_OF_NOTCH_SIGNALING_PATHWAY",
+               "GO_CYTOSOLIC_RIBOSOME",
+               "GO_ENDOTHELIAL_CELL_MIGRATION")
+  }
+  ti <- contr.gsea.res[c(contr.gsea.res$Description %in% terms), ]
+  ti <- mutate(ti, color = case_when(ti$NES > 0 ~ "Upregulated",
+                                     ti$NES < 0 ~ "Downregulated"))
+  ti <- arrange(ti, NES)
+  ti$Description <- factor(ti$Description, levels = ti$Description)
+  p <- ggplot(data=ti, aes(x=NES,  y=Description, fill = color)) +
+    geom_bar(stat="identity", width = 0.7) +
+    scale_fill_manual(values = c("Upregulated" = "red","Downregulated" = "blue"),name = "") +
+    ylab("Terms") +
+    ggtitle(paste(i)) +
+    theme_bw() +
+    theme(
+      axis.title.x = element_text(colour="black",family = "Arial", size=12*96/72),
+      axis.text.x = element_text(colour="black",family = "Arial", size=10*96/72),
+      axis.title.y = element_text(colour="black",family = "Arial", size=12*96/72),
+      axis.text.y = element_text(colour="black",family = "Arial", size=8*96/72),
+      aspect.ratio = 0.8)
+  assign(paste0("p_",i), p)
+}
+# Fig5D -------------------------------------------------------------------
+runx1@meta.data[["Phase"]] <- recode_factor(runx1@meta.data[["Phase"]],
+                                            "G1" = "G1", 
+                                            "S" = "S", 
+                                            "G2M" = "G2/M")
+## grouping clusters 0,1,2 and cluster 16,17
+runx1@meta.data[["RNA_snn_res.0.6"]] <- recode_factor(runx1@meta.data[["RNA_snn_res.0.6"]],
+                                                          "0" = "0,1,2", 
+                                                          "1" = "0,1,2", 
+                                                          "2" = "0,1,2",
+                                                          "11" = "11",
+                                                          "16" = "16,17",
+                                                          "17" = "16,17")
+cc_umap <- DimPlot(object = runx1,
+                   reduction = "umap",
+                   group.by = "Phase",
+                   split.by = "RNA_snn_res.0.6",
+                   label = F,
+                   repel = T,
+                   pt.size = 0.8,
+                   label.size = 8,
+                   cols = c("G1" = "#E6AB02", 
+                            "S" = "#7570B3", 
+                            "G2/M" = "#66A61E")) +
+  ggtitle("") +
+  xlab("UMAP1") +
+  ylab("UMAP2") +
+  theme(legend.text = element_text(size = 12*96/72),
+        axis.text.x = element_text(color = "black", family = "Arial",size = 10*96/72),
+        axis.text.y = element_text(color = "black", family = "Arial",size = 10*96/72),
+        axis.title.x = element_text(color = "black", family = "Arial",size = 12*96/72),
+        axis.title.y = element_text(color = "black", family = "Arial",size = 12*96/72),
+        aspect.ratio = 1,
+        plot.margin = margin(0, 0, 0, 0, "pt"),
+        panel.background = element_blank()) +
+  guides(color = guide_legend(override.aes = list(size=4)))
+## average cell counts per cluster
+ggp_obj_clu <- melt(table(runx1$Phase, runx1$RNA_snn_res.0.6))
+colnames(ggp_obj_clu) <- c("Phase","Cluster", "Count")
+cc.clusters <- c("0,1,2","11","16,17")
+ggp_obj_clu <- ggp_obj_clu[c(ggp_obj_clu$Cluster %in% cc.clusters), ]
+for (i in cc.clusters) {
+  totalB <- subset(ggp_obj_clu, ggp_obj_clu$Cluster==i)
+  totalB <- totalB %>% 
+    mutate(percentage = Count/sum(Count))
+  totalB$percentage <- totalB$percentage*100
+  totalB$fraction = totalB$Count / sum(totalB$Count)
+  # Compute the cumulative percentages
+  totalB$ymax <- cumsum(totalB$fraction)
+  totalB$ymin <- c(0, head(totalB$ymax, n=-1))
+  totalB$labelPosition <- (totalB$ymax + totalB$ymin) / 2
+  # Compute a good label
+  totalB$label <- paste0(totalB$Phase, ":","\n",round(totalB$percentage, 2), "%")
+  assign(paste0("totalB_clu_", i), totalB)
+  p <- ggplot(totalB, aes(ymax=ymax, ymin=ymin, xmax=4, xmin=3, fill=Phase)) +
+    geom_rect() +
+    geom_text(x=5, aes(y=labelPosition, label=label), size=5) +
+    #scale_fill_brewer(palette = "Set1") +
+    scale_fill_manual(values = c("G1" = "#E6AB02", 
+                                 "S" = "#7570B3", 
+                                 "G2/M" = "#66A61E")) +
+    coord_polar(theta="y") +
+    xlim(c(2, 5)) +
+    theme_void() +
+    theme(plot.margin = margin(0, 0, 0, 0, "pt")) +
+    theme(legend.position = "none")
+  assign(paste0("p_clu_",i), p)
+}
+p_final <- cc_umap / (`p_clu_0,1,2` | p_clu_11 | `p_clu_16,17`)
+# Fig5E -------------------------------------------------------------------
+cds <- order_cells(cds, root_cells = start)
+cds <- estimate_size_factors(cds)
+gi <- c("FCGR2B","HES1","HEY1", "HEY2")
+cds_gi <- cds[rowData(cds)$gene_short_name %in% gi,]
+plot_genes_in_pseudotime(cds_subset = cds_gi, 
+                         ncol = 2,
+                         color_cells_by = "RNA_snn_res.0.6", 
+                         min_expr = NULL,
+                         cell_size = 1)
--- a/scRNAseq/c5.all.v7.2.symbols.gmt
+++ b/scRNAseq/c5.all.v7.2.symbols.gmt