Commit 839b34cc authored by Monah Abou Alezz's avatar Monah Abou Alezz
Browse files

Initial commit

parents
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(ggrepel))
suppressPackageStartupMessages(library(pheatmap))
suppressPackageStartupMessages(library(RColorBrewer))
## Input featureCounts file
fcount.file <- "Scarfo_HEC2023/BulkRNAseq_1/featureCounts_results_corrected.txt"
## Contrasts to perform
contr <- "ACE_pos:ACE_neg"
contr.list <- unlist(strsplit(contr, ","))
## Design Formula
formula.str <- "donor,condition"
design.str <- gsub(",", "+", formula.str)
num_cond <- str_count(formula.str, ",") + 1
design.formula <- as.formula(paste("~", design.str, sep = ""))
## Read featureCounts results
if (!file.exists(fcount.file)) {
stop("Error: featureCounts file not found.")
} else {
read.counts <- read.table(fcount.file,
header = TRUE,
check.names = FALSE) %>%
column_to_rownames(var = "Geneid") %>%
select(-c(1:5))
}
orig.names <- names(read.counts)
ds <- list()
cond <- list()
for (i in seq(orig.names[1:length(orig.names)])) {
ds[i] <- unlist(strsplit(orig.names[i], ":"))[2]
cond[i] <- unlist(strsplit(orig.names[i], ":"))[3]
}
ds <- unlist(ds)
cond <- unlist(cond)
## Samples - Conditions
sample_info <- as.data.frame(str_split_fixed(cond, ",", num_cond))
colnames(sample_info) <- str_split_fixed(formula.str, ",", num_cond)
rownames(sample_info) <- ds
colnames(read.counts) <- ds
sample_info$condition <- factor(sample_info$condition)
last_condition <- str_split_fixed(formula.str, ",", num_cond)[num_cond]
batch_cond <- str_split_fixed(formula.str, ",", num_cond)[num_cond - 1]
## thresholds
adj.pval.thr <- 0.05
logfc.thr <- 0
## DESeq2
DESeq.ds <- DESeqDataSetFromMatrix(countData = read.counts,
colData = sample_info,
design = design.formula)
## Remove genes without any counts
DESeq.ds <- DESeq.ds[rowSums(counts(DESeq.ds)) > 0, ]
## RunDGE
DESeq.ds <- DESeq(DESeq.ds)
## obtain regularized log-transformed values
DESeq.rlog <- rlogTransformation(DESeq.ds, blind = TRUE)
assay(DESeq.rlog) <- limma::removeBatchEffect(assay(DESeq.rlog), DESeq.rlog[[batch_cond]])
ctrs <- unlist(strsplit(contr.list[1], ":"))
c1 <- ctrs[1]
c2 <- ctrs[2]
DGE.results <- results(DESeq.ds,
pAdjustMethod = "BH",
independentFiltering = TRUE,
contrast = c(last_condition, c1, c2),
alpha = adj.pval.thr)
# Change format
DGE.results.annot <- as.data.frame(DGE.results) %>%
dplyr::select(log2FoldChange, lfcSE, baseMean, pvalue, padj)
colnames(DGE.results.annot) <- c("logFC", "lfcSE", "baseMean", "PValue", "FDR")
## sort the results according to the adjusted p-value
DGE.results.sorted <- DGE.results.annot[order(DGE.results.annot$FDR), ]
## Subset for only significant genes
DGE.results.sig <- subset(DGE.results.sorted,
FDR < adj.pval.thr & abs(logFC) > logfc.thr)
DGEgenes <- rownames(DGE.results.sig)
# ED_Fig1D ----------------------------------------------------------------
data <- data.frame(gene = row.names(DGE.results.sorted),
pval = -log10(DGE.results.sorted$FDR),
lfc = DGE.results.sorted$logFC)
data <- na.omit(data)
data <- mutate(data, color = case_when(data$lfc > logfc.thr & data$pval > -log10(adj.pval.thr) ~ "Overexpressed",data$lfc < -logfc.thr & data$pval > -log10(adj.pval.thr) ~ "Underexpressed",abs(data$lfc) < logfc.thr | data$pval < -log10(adj.pval.thr) ~ "NonSignificant"))
data$color <- factor(data$color, levels = c("Overexpressed", "Underexpressed", "NonSignificant"))
data <- data[order(data$pval, decreasing = TRUE),]
highl <- head(subset(data, color != "NonSignificant"), 20)
ggplot(data, aes(x = lfc, y = pval, colour = color)) +
theme_bw() +
theme(legend.position = "right",
text=element_text(size=17*96/72),
axis.text.x = element_text(size = 17*96/72,
color = "black"),
axis.text.y = element_text(size = 17*96/72,
color = "black"),
axis.title.y = element_text(colour = "black", size = 19*96/72),
axis.title.x = element_text(colour = "black", size = 19*96/72),
legend.text = element_text(colour = "black", size = 19*96/72),
legend.title = element_text(colour = "black", size = 19*96/72)) +
xlab("log2 Fold Change") +
ylab(expression(-log[10]("adjusted p-value"))) +
geom_hline(yintercept = -log10(adj.pval.thr), colour = "darkgray") +
geom_point(size = 3, alpha = 0.8, na.rm = T) +
scale_color_manual(name = "Expression",
values = c(Overexpressed = "red",
Underexpressed = "royalblue3",
NonSignificant = "darkgray")) +
geom_label_repel(data = highl,
aes(label = gene),
show.legend = FALSE,
size = 5)
# ED_Fig1E --------------------------------------------------------
DESeq.ds_assay_corr <- limma::removeBatchEffect(assay(DESeq.ds), DESeq.ds[[batch_cond]])
sampleDists_corr <- dist(t(DESeq.ds_assay_corr))
sampleDistMatrix_corr <- as.matrix(sampleDists_corr)
rownames(sampleDistMatrix_corr) <- paste(rownames(sampleDistMatrix_corr), DESeq.ds[[last_condition]], sep = " ")
colnames(sampleDistMatrix_corr) <- NULL
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
pheatmap::pheatmap(sampleDistMatrix_corr,
clustering_distance_rows=sampleDists_corr,
clustering_distance_cols=sampleDists_corr,
col = colors)
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(ggrepel))
suppressPackageStartupMessages(library(openxlsx))
suppressPackageStartupMessages(library(RColorBrewer))
## Input featureCounts file
fcount.file <- "Scarfo_HEC2023/BulkRNAseq_1/featureCounts_results_corrected.txt"
## Contrasts to perform
contr <- "ACE_pos:ACE_neg"
contr.list <- unlist(strsplit(contr, ","))
## Design Formula
formula.str <- "donor,condition"
design.str <- gsub(",", "+", formula.str)
num_cond <- str_count(formula.str, ",") + 1
design.formula <- as.formula(paste("~", design.str, sep = ""))
## Read featureCounts results
if (!file.exists(fcount.file)) {
stop("Error: featureCounts file not found.")
} else {
read.counts <- read.table(fcount.file,
header = TRUE,
check.names = FALSE) %>%
column_to_rownames(var = "Geneid") %>%
select(-c(1:5))
}
orig.names <- names(read.counts)
ds <- list()
cond <- list()
for (i in seq(orig.names[1:length(orig.names)])) {
ds[i] <- unlist(strsplit(orig.names[i], ":"))[2]
cond[i] <- unlist(strsplit(orig.names[i], ":"))[3]
}
ds <- unlist(ds)
cond <- unlist(cond)
## Samples - Conditions
sample_info <- as.data.frame(str_split_fixed(cond, ",", num_cond))
colnames(sample_info) <- str_split_fixed(formula.str, ",", num_cond)
rownames(sample_info) <- ds
colnames(read.counts) <- ds
sample_info$condition <- factor(sample_info$condition)
last_condition <- str_split_fixed(formula.str, ",", num_cond)[num_cond]
batch_cond <- str_split_fixed(formula.str, ",", num_cond)[num_cond - 1]
## thresholds
adj.pval.thr <- 0.05
logfc.thr <- 0
## DESeq2
DESeq.ds <- DESeqDataSetFromMatrix(countData = read.counts,
colData = sample_info,
design = design.formula)
## Remove genes without any counts
DESeq.ds <- DESeq.ds[rowSums(counts(DESeq.ds)) > 0, ]
## RunDGE
DESeq.ds <- DESeq(DESeq.ds)
## obtain regularized log-transformed values
DESeq.rlog <- rlogTransformation(DESeq.ds, blind = TRUE)
assay(DESeq.rlog) <- limma::removeBatchEffect(assay(DESeq.rlog), DESeq.rlog[[batch_cond]])
ctrs <- unlist(strsplit(contr.list[1], ":"))
c1 <- ctrs[1]
c2 <- ctrs[2]
DGE.results <- results(DESeq.ds,
pAdjustMethod = "BH",
independentFiltering = TRUE,
contrast = c(last_condition, c1, c2),
alpha = adj.pval.thr)
# Change format
DGE.results.annot <- as.data.frame(DGE.results) %>%
dplyr::select(log2FoldChange, lfcSE, baseMean, pvalue, padj)
colnames(DGE.results.annot) <- c("logFC", "lfcSE", "baseMean", "PValue", "FDR")
## sort the results according to the adjusted p-value
DGE.results.sorted <- DGE.results.annot[order(DGE.results.annot$FDR), ]
## Subset for only significant genes
DGE.results.sig <- subset(DGE.results.sorted,
FDR < adj.pval.thr & abs(logFC) > logfc.thr)
DGEgenes <- rownames(DGE.results.sig)
# ED_Fig2A ----------------------------------------------------------------
## surface genes
if(!file.exists("Scarfo_HEC2023/BulkRNAseq_1/table_S3_surfaceome.xlsx")){
download.file("http://wlab.ethz.ch/surfaceome/table_S3_surfaceome.xlsx",
destfile = "Scarfo_HEC2023/BulkRNAseq_1/table_S3_surfaceome.xlsx")
}
sg <- read.xlsx("Scarfo_HEC2023/BulkRNAseq_1/table_S3_surfaceome.xlsx",
sheet = "in silico surfaceome only",
colNames = T,
startRow = 2)
sig_sg <- DGE.results.sig[c(row.names(DGE.results.sig) %in% sg$UniProt.gene), ]
top10_sig_sg <- sig_sg[c(1:10),]
data <- data.frame(gene = row.names(DGE.results.sorted),
pval = -log10(DGE.results.sorted$FDR),
lfc = DGE.results.sorted$logFC)
data <- na.omit(data)
data <- mutate(data, color = case_when(data$lfc > logfc.thr & data$pval > -log10(adj.pval.thr) ~ "Overexpressed",data$lfc < -logfc.thr & data$pval > -log10(adj.pval.thr) ~ "Underexpressed",abs(data$lfc) < logfc.thr | data$pval < -log10(adj.pval.thr) ~ "NonSignificant"))
data$color <- factor(data$color, levels = c("Overexpressed", "Underexpressed", "NonSignificant"))
data <- data[order(data$pval, decreasing = TRUE),]
highl <- data[c(data$gene %in% row.names(top10_sig_sg)), ]
ggplot(data, aes(x = lfc, y = pval, colour = color)) +
theme_bw() +
theme(legend.position = "right",
text=element_text(size=17*96/72),
axis.text.x = element_text(size = 17*96/72,
color = "black"),
axis.text.y = element_text(size = 17*96/72,
color = "black"),
axis.title.y = element_text(colour = "black", size = 19*96/72),
axis.title.x = element_text(colour = "black", size = 19*96/72),
legend.text = element_text(colour = "black", size = 19*96/72),
legend.title = element_text(colour = "black", size = 19*96/72)) +
xlab("log2 Fold Change") +
ylab(expression(-log[10]("adjusted p-value"))) +
geom_hline(yintercept = -log10(adj.pval.thr), colour = "darkgray") +
geom_point(size = 3, alpha = 0.8, na.rm = T) +
scale_color_manual(name = "Expression",
values = c(Overexpressed = "red",
Underexpressed = "royalblue3",
NonSignificant = "darkgray")) +
geom_label_repel(data = highl,
aes(label = gene),
show.legend = FALSE,
size = 5)
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(pheatmap))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(openxlsx))
suppressPackageStartupMessages(library(org.Hs.eg.db))
suppressPackageStartupMessages(library(clusterProfiler))
## Input featureCounts file
fcount.file <- "Scarfo_HEC2023/BulkRNAseq_1/featureCounts_results_corrected.txt"
## Contrasts to perform
contr <- "ACE_pos:ACE_neg"
contr.list <- unlist(strsplit(contr, ","))
## Design Formula
formula.str <- "donor,condition"
design.str <- gsub(",", "+", formula.str)
num_cond <- str_count(formula.str, ",") + 1
design.formula <- as.formula(paste("~", design.str, sep = ""))
## Read featureCounts results
if (!file.exists(fcount.file)) {
stop("Error: featureCounts file not found.")
} else {
read.counts <- read.table(fcount.file,
header = TRUE,
check.names = FALSE) %>%
column_to_rownames(var = "Geneid") %>%
select(-c(1:5))
}
orig.names <- names(read.counts)
ds <- list()
cond <- list()
for (i in seq(orig.names[1:length(orig.names)])) {
ds[i] <- unlist(strsplit(orig.names[i], ":"))[2]
cond[i] <- unlist(strsplit(orig.names[i], ":"))[3]
}
ds <- unlist(ds)
cond <- unlist(cond)
## Samples - Conditions
sample_info <- as.data.frame(str_split_fixed(cond, ",", num_cond))
colnames(sample_info) <- str_split_fixed(formula.str, ",", num_cond)
rownames(sample_info) <- ds
colnames(read.counts) <- ds
sample_info$condition <- factor(sample_info$condition)
last_condition <- str_split_fixed(formula.str, ",", num_cond)[num_cond]
batch_cond <- str_split_fixed(formula.str, ",", num_cond)[num_cond - 1]
## thresholds
adj.pval.thr <- 0.05
logfc.thr <- 0
## DESeq2
DESeq.ds <- DESeqDataSetFromMatrix(countData = read.counts,
colData = sample_info,
design = design.formula)
## Remove genes without any counts
DESeq.ds <- DESeq.ds[rowSums(counts(DESeq.ds)) > 0, ]
## RunDGE
DESeq.ds <- DESeq(DESeq.ds)
## obtain regularized log-transformed values
DESeq.rlog <- rlogTransformation(DESeq.ds, blind = TRUE)
assay(DESeq.rlog) <- limma::removeBatchEffect(assay(DESeq.rlog), DESeq.rlog[[batch_cond]])
ctrs <- unlist(strsplit(contr.list[1], ":"))
c1 <- ctrs[1]
c2 <- ctrs[2]
DGE.results <- results(DESeq.ds,
pAdjustMethod = "BH",
independentFiltering = TRUE,
contrast = c(last_condition, c1, c2),
alpha = adj.pval.thr)
# Change format
DGE.results.annot <- as.data.frame(DGE.results) %>%
dplyr::select(log2FoldChange, lfcSE, baseMean, pvalue, padj)
colnames(DGE.results.annot) <- c("logFC", "lfcSE", "baseMean", "PValue", "FDR")
## sort the results according to the adjusted p-value
DGE.results.sorted <- DGE.results.annot[order(DGE.results.annot$FDR), ]
## Subset for only significant genes
DGE.results.sig <- subset(DGE.results.sorted,
FDR < adj.pval.thr & abs(logFC) > logfc.thr)
DGEgenes <- rownames(DGE.results.sig)
# Fig1B ---------------------------------------------------------------------
plotPCA(DESeq.rlog, intgroup = last_condition, ntop = 500) +
theme_bw() +
ggtitle(label = "Principal Component Analysis (PCA)",
subtitle = "Top 500 DEG (with batch correction)") +
geom_point(size = 2) +
#coord_fixed(ratio=3) +
geom_text(aes(label = rownames(DESeq.rlog@colData)), vjust = 0, nudge_y = 0.5,
show.legend = FALSE)
# Fig1C ----------------------------------------------------------------
## DEGs
s_info <- sample_info[sample_info[[last_condition]] == c1 | sample_info[[last_condition]] == c2,, drop = FALSE]
hm.mat_DGEgenes <- assay(DESeq.rlog)[DGEgenes, ]
hm.mat_DGEgenes.filt <- hm.mat_DGEgenes[, row.names(s_info)]
pheatmap::pheatmap(hm.mat_DGEgenes.filt,
color = colorRampPalette(rev(brewer.pal(n = 9, name ="RdYlBu")))(100),
fontsize_row = 5,
show_rownames = FALSE,
scale = "row",
angle_col = 90,
cluster_rows = TRUE,
annotation_col = s_info,
cluster_cols = TRUE)
# Fig1D ----------------------------------------------------------------
## surface genes
if(!file.exists("Scarfo_HEC2023/BulkRNAseq_1/table_S3_surfaceome.xlsx")){
download.file("http://wlab.ethz.ch/surfaceome/table_S3_surfaceome.xlsx",
destfile = "Scarfo_HEC2023/BulkRNAseq_1/table_S3_surfaceome.xlsx")
}
sg <- read.xlsx("Scarfo_HEC2023/BulkRNAseq_1/table_S3_surfaceome.xlsx",
sheet = "in silico surfaceome only",
colNames = T,
startRow = 2)
sig_sg <- DGE.results.sig[c(row.names(DGE.results.sig) %in% sg$UniProt.gene), ]
markers <- c("CD34","CDH5","PECAM1","TEK","NR2F2","FLRT2","BMX","GJA5","DLL4","CXCR4","HEY2","MYB","GFI1","CD44")
DESeq.rlog_mat <- assay(DESeq.rlog)
DESeq.rlog_mat <- DESeq.rlog_mat[c(row.names(DESeq.rlog_mat) %in% markers), ]
DESeq.rlog_mat <- DESeq.rlog_mat[markers,]
# Specify colors
ann_colors = list(
condition = c(ACE_neg = "gold3",
ACE_pos = "turquoise3"),
donor = c(E1 = "#E76BF3",
E2 = "#F8766D",
E3 = "#529EFF",
E4 = "springgreen3")
)
pheatmap::pheatmap(DESeq.rlog_mat,
color = colorRampPalette(rev(brewer.pal(n = 9, name ="RdYlBu")))(100),
annotation_col = s_info,
annotation_names_row = FALSE,
annotation_colors = ann_colors,
show_rownames = TRUE,
show_colnames = TRUE,
cluster_rows = FALSE,
angle_col = 45,
fontsize = 14,
fontsize_row = 14,
fontsize_col = 18,
cellheight = 20,
cellwidth = 30)
# Fig1E --------------------------------------------
gmt.obj <- read.gmt("Scarfo_HEC2023/BulkRNAseq_1/c5.all.v7.2.symbols.gmt")
organism.db <- org.Hs.eg.db
# Remove genes with no FDR
DGE.results.annot <- DGE.results.annot[!is.na(DGE.results.annot$FDR), ]
gene_univ <- row.names(DGE.results.annot)
# get positive and negative gene names
gene.res.top <- subset(DGE.results.annot, FDR < adj.pval.thr & abs(logFC) > logfc.thr)
gene.res.top.pos <- subset(gene.res.top, logFC > 0)
gene.res.top.neg <- subset(gene.res.top, logFC < 0)
## Over Representation Analysis
comp <- c("gene.res.top.pos","gene.res.top.neg")
for (i in comp) {
print(paste0("Analyzing...",i," genes"))
obj_i <- get(i)
contr.enricher <- enricher(row.names(obj_i),
universe = gene_univ,
pvalueCutoff = adj.pval.thr,
TERM2GENE = gmt.obj)
contr.enricher.res <- as.data.frame(contr.enricher)
assign(paste("contr.enricher",i,sep = "."), contr.enricher.res)
}
## barplots
terms.pos <- data.frame(
ID = c("GO_LEUKOCYTE_MIGRATION","GO_AMEBOIDAL_TYPE_CELL_MIGRATION","GO_TISSUE_MIGRATION","GO_POSITIVE_REGULATION_OF_EPITHELIAL_CELL_MIGRATION","GO_REGULATION_OF_EPITHELIAL_CELL_MIGRATION","GO_ENDOTHELIAL_CELL_MIGRATION","GO_REGULATION_OF_ENDOTHELIAL_CELL_MIGRATION","GO_COLLAGEN_CONTAINING_EXTRACELLULAR_MATRIX","GO_EXTRACELLULAR_MATRIX","GO_CELL_CELL_JUNCTION","GO_NEGATIVE_REGULATION_OF_CELL_ADHESION","GO_REGULATION_OF_CELL_CELL_ADHESION","GO_LEUKOCYTE_CELL_CELL_ADHESION","GO_EXTRACELLULAR_MATRIX_BINDING"),
Type = c(rep("Migration", 7), rep("Adhesion", 7)))
terms.pos <- left_join(terms.pos,contr.enricher.gene.res.top.pos,by="ID")
terms.neg <- data.frame(
ID = c("GO_POSITIVE_REGULATION_OF_CELL_CYCLE_PROCESS","GO_POSITIVE_REGULATION_OF_CELL_CYCLE","GO_POSITIVE_REGULATION_OF_CELL_CYCLE_G2_M_PHASE_TRANSITION","GO_POSITIVE_REGULATION_OF_CELL_CYCLE_PHASE_TRANSITION","GO_MITOTIC_CELL_CYCLE_CHECKPOINT","GO_MITOTIC_NUCLEAR_DIVISION","GO_POSITIVE_REGULATION_OF_MITOTIC_CELL_CYCLE"),
Type = "Cell Cycle")
terms.neg <- left_join(terms.neg,contr.enricher.gene.res.top.neg,by="ID")
terms.to.plot <- rbind(terms.pos, terms.neg)
terms.to.plot <- na.omit(terms.to.plot)
parsed.terms <- as.data.frame(str_split_fixed(terms.to.plot$ID, "GO_", 2))
renamed.terms <- as.data.frame(str_replace_all(parsed.terms$V2, "_", " "))
colnames(renamed.terms) <- "id"
renamed.terms$Terms <- str_to_title(renamed.terms$id)
terms.to.plot <- cbind(terms.to.plot,renamed.terms)
terms.to.plot <- terms.to.plot[,c(12,10,7,2)]
terms.to.plot$Type <- factor(terms.to.plot$Type, levels = c("Cell Cycle", "Migration", "Adhesion"))
terms.to.plot$Terms <- factor(terms.to.plot$Terms, levels = terms.to.plot$Terms)
terms.to.plot <- terms.to.plot %>%
mutate(color = ifelse(Type == "Cell Cycle", "gold3", "turquoise3"),
regulated = ifelse(Type == "Cell Cycle", "Upregulated in ACEneg", "Upregulated in ACE+"))
terms.to.plot$regulated <- factor(terms.to.plot$regulated,
levels = c("Upregulated in ACEneg","Upregulated in ACE+"))
terms.to.plot$log_pvalue <- -log10(terms.to.plot$p.adjust)
ggplot(data=terms.to.plot, aes(x=Terms,
y=log_pvalue,
fill=regulated,
xmin = min(log_pvalue),
xmax = max(log_pvalue))) +
geom_bar(stat="identity", width = 0.7) +
scale_fill_manual(values = c("gold3", "turquoise3"))+
theme_bw() +
coord_flip() +
ylab(expression(-log[10]("adjusted p-value"))) +
xlab("GO Terms") +
#scale_x_discrete(limits = GO_terms_all$Term) +
facet_grid(Type ~., scales = "free_y", space = "free_y") +
theme(
axis.title.x = element_text(colour="black",family = "Arial", size=16),
axis.text.x = element_text(colour="black",family = "Arial", size=12),
axis.title.y = element_text(colour="black",family = "Arial", size=16),
axis.text.y = element_text(color="black",family = "Arial", size=12),
legend.text = element_text(colour="black",family = "Arial", size=12),
legend.title = element_text(colour="white",family = "Arial", size=12),
strip.background = element_rect(colour="black",
fill="white"),
strip.text = element_text(family = "Arial", size=12))
## GSEA
DGE.results.annot.sorted <- DGE.results.annot[order(DGE.results.annot$logFC, decreasing = TRUE), ]
# LogFC with Symbols
gene.res.logfc.symbol <- as.vector(DGE.results.annot.sorted$logFC)
names(gene.res.logfc.symbol) <- row.names(DGE.results.annot.sorted)
contr.gsea <- GSEA(gene.res.logfc.symbol,
TERM2GENE = gmt.obj,
nPerm = 10000,
pvalueCutoff = adj.pval.thr)
contr.gsea.res <- as.data.frame(contr.gsea)
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(pheatmap))
suppressPackageStartupMessages(library(openxlsx))
suppressPackageStartupMessages(library(RColorBrewer))
## Input featureCounts file
fcount.file <- "Scarfo_HEC2023/BulkRNAseq_1/featureCounts_results_corrected.txt"
## Contrasts to perform
contr <- "ACE_pos:ACE_neg"
contr.list <- unlist(strsplit(contr, ","))
## Design Formula
formula.str <- "donor,condition"
design.str <- gsub(",", "+", formula.str)
num_cond <- str_count(formula.str, ",") + 1
design.formula <- as.formula(paste("~", design.str, sep = ""))
## Read featureCounts results
if (!file.exists(fcount.file)) {
stop("Error: featureCounts file not found.")
} else {
read.counts <- read.table(fcount.file,
header = TRUE,
check.names = FALSE) %>%
column_to_rownames(var = "Geneid") %>%
select(-c(1:5))
}
orig.names <- names(read.counts)
ds <- list()
cond <- list()
for (i in seq(orig.names[1:length(orig.names)])) {
ds[i] <- unlist(strsplit(orig.names[i], ":"))[2]
cond[i] <- unlist(strsplit(orig.names[i], ":"))[3]
}
ds <- unlist(ds)
cond <- unlist(cond)
## Samples - Conditions
sample_info <- as.data.frame(str_split_fixed(cond, ",", num_cond))
colnames(sample_info) <- str_split_fixed(formula.str, ",", num_cond)
rownames(sample_info) <- ds
colnames(read.counts) <- ds
sample_info$condition <- factor(sample_info$condition)
last_condition <- str_split_fixed(formula.str, ",", num_cond)[num_cond]
batch_cond <- str_split_fixed(formula.str, ",", num_cond)[num_cond - 1]
## thresholds
adj.pval.thr <- 0.05
logfc.thr <- 0
## DESeq2
DESeq.ds <- DESeqDataSetFromMatrix(countData = read.counts,
colData = sample_info,
design = design.formula)
## Remove genes without any counts
DESeq.ds <- DESeq.ds[rowSums(counts(DESeq.ds)) > 0, ]
## RunDGE
DESeq.ds <- DESeq(DESeq.ds)
## obtain regularized log-transformed values
DESeq.rlog <- rlogTransformation(DESeq.ds, blind = TRUE)
assay(DESeq.rlog) <- limma::removeBatchEffect(assay(DESeq.rlog), DESeq.rlog[[batch_cond]])
ctrs <- unlist(strsplit(contr.list[1], ":"))
c1 <- ctrs[1]
c2 <- ctrs[2]
DGE.results <- results(DESeq.ds,
pAdjustMethod = "BH",
independentFiltering = TRUE,
contrast = c(last_condition, c1, c2),
alpha = adj.pval.thr)
# Change format
DGE.results.annot <- as.data.frame(DGE.results) %>%
dplyr::select(log2FoldChange, lfcSE, baseMean, pvalue, padj)
colnames(DGE.results.annot) <- c("logFC", "lfcSE", "baseMean", "PValue", "FDR")
## sort the results according to the adjusted p-value
DGE.results.sorted <- DGE.results.annot[order(DGE.results.annot$FDR), ]
## Subset for only significant genes
DGE.results.sig <- subset(DGE.results.sorted,
FDR < adj.pval.thr & abs(logFC) > logfc.thr)
DGEgenes <- rownames(DGE.results.sig)
# Fig2A ----------------------------------------------------------------
## surface genes
if(!file.exists("Scarfo_HEC2023/BulkRNAseq_1/table_S3_surfaceome.xlsx")){
download.file("http://wlab.ethz.ch/surfaceome/table_S3_surfaceome.xlsx",
destfile = "Scarfo_HEC2023/BulkRNAseq_1/table_S3_surfaceome.xlsx")
}
sg <- read.xlsx("Scarfo_HEC2023/BulkRNAseq_1/table_S3_surfaceome.xlsx",
sheet = "in silico surfaceome only",
colNames = T,
startRow = 2)
sig_sg <- DGE.results.sig[c(row.names(DGE.results.sig) %in% sg$UniProt.gene), ]
top10_sig_sg <- sig_sg[c(1:10),]
s_info <- sample_info[sample_info[[last_condition]] == c1 | sample_info[[last_condition]] == c2,, drop = FALSE]
hm.mat_DGEgenes <- assay(DESeq.rlog)[row.names(top10_sig_sg), ]
hm.mat_DGEgenes.filt <- hm.mat_DGEgenes[, row.names(s_info)]
pheatmap::pheatmap(hm.mat_DGEgenes.filt,
scale = "row",
color = colorRampPalette(rev(brewer.pal(n = 9, name ="RdYlBu")))(100),
annotation_col = s_info,
cluster_rows = TRUE,
cluster_cols = TRUE,
show_rownames = TRUE,
show_colnames = TRUE,
angle_col = 90,
fontsize = 15)
This diff is collapsed.
This diff is collapsed.
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(pheatmap))
suppressPackageStartupMessages(library(RColorBrewer))
## Input featureCounts file
fcount.file <- "Scarfo_HEC2023/BulkRNAseq_2/featureCounts_results_corrected.txt"
## Contrasts to perform
contr <- "DLL4_plus:CD32_plus"
contr.list <- unlist(strsplit(contr, ","))
## Design Formula
formula.str <- "condition"
design.str <- gsub(",", "+", formula.str)
num_cond <- str_count(formula.str, ",") + 1
design.formula <- as.formula(paste("~", design.str, sep = ""))
## Read featureCounts results
if (!file.exists(fcount.file)) {
stop("Error: featureCounts file not found.")
} else {
read.counts <- read.table(fcount.file,
header = TRUE,
check.names = FALSE) %>%
column_to_rownames(var = "Geneid") %>%
select(-c(1:5))
}
orig.names <- names(read.counts)
ds <- list()
cond <- list()
for (i in seq(orig.names[1:length(orig.names)])) {
ds[i] <- unlist(strsplit(orig.names[i], ":"))[2]
cond[i] <- unlist(strsplit(orig.names[i], ":"))[3]
}
ds <- unlist(ds)
cond <- unlist(cond)
## Samples - Conditions
sample_info <- as.data.frame(str_split_fixed(cond, ",", num_cond))
colnames(sample_info) <- str_split_fixed(formula.str, ",", num_cond)
rownames(sample_info) <- ds
colnames(read.counts) <- ds
sample_info$condition <- factor(sample_info$condition)
last_condition <- str_split_fixed(formula.str, ",", num_cond)[num_cond]
## thresholds
adj.pval.thr <- 0.05
logfc.thr <- 0
## DESeq2
DESeq.ds <- DESeqDataSetFromMatrix(countData = read.counts,
colData = sample_info,
design = design.formula)
## Remove genes without any counts
DESeq.ds <- DESeq.ds[rowSums(counts(DESeq.ds)) > 0, ]
## RunDGE
DESeq.ds <- DESeq(DESeq.ds)
## obtain regularized log-transformed values
DESeq.rlog <- rlogTransformation(DESeq.ds, blind = TRUE)
ctrs <- unlist(strsplit(contr.list[1], ":"))
c1 <- ctrs[1]
c2 <- ctrs[2]
DGE.results <- results(DESeq.ds,
pAdjustMethod = "BH",
independentFiltering = TRUE,
contrast = c(last_condition, c1, c2),
alpha = adj.pval.thr)
# Change format
DGE.results.annot <- as.data.frame(DGE.results) %>%
dplyr::select(log2FoldChange, lfcSE, baseMean, pvalue, padj)
colnames(DGE.results.annot) <- c("logFC", "lfcSE", "baseMean", "PValue", "FDR")
## sort the results according to the adjusted p-value
DGE.results.sorted <- DGE.results.annot[order(DGE.results.annot$FDR), ]
## Subset for only significant genes
DGE.results.sig <- subset(DGE.results.sorted,
FDR < adj.pval.thr & abs(logFC) > logfc.thr)
DGEgenes <- rownames(DGE.results.sig)
# ED_Fig5A -----------------------------------------------------------------
## scorecard
EHT_scorecard = c("CDH5","RUNX1","PTPRC", "SPN","CD44","ITGA2B","HLF","GFI1","MLLT3","HOXA9","MKI67","NRP2","NR2F2","GJA4","HEY1","DLL4","CXCR4","SOX17","SMAD6","GJA5","HES4","MECOM","NID2","PRND","ESM1","PDGFB","PALMD","TMEM100","EDN1","LTBP4","HEY2","SULF1","IL33","CYP26B1","ADGRG6","COL23A1","GATA6","BMX","TMCC3","DKK1","AGTR2","FBN2","ELN","ALDH1A1","NKX2-3","PROCR","GATA3","GBP4","MYCN","KCNK17","MYB","STAT5A","SMIM24","RAB27B","SPINK2")
hm.mat_DGEgenes <- assay(DESeq.rlog)[DGEgenes, ]
counts_eht <- hm.mat_DGEgenes[c(rownames(hm.mat_DGEgenes) %in% EHT_scorecard), ]
eht_sig <- counts_eht[c(rownames(counts_eht) %in% row.names(DGE.results.sig)), ]
eht_sig <- eht_sig[,c(1,3,6,2,5,8)]
annot <- data.frame(row.names = colnames(eht_sig),
Sample = c(rep("DLL4+", 3), rep("CD32+", 3)),
Donor = rep(c("RS227","RS242","RS243"), 2))
annot_colors <- list(Sample=c("DLL4+" ="blueviolet",
"CD32+" = "gold1"),
Donor=c("RS227"="darkslategray3",
"RS242"="brown3",
"RS243"="limegreen"))
pheatmap::pheatmap(eht_sig,
scale = "row",
annotation_col = annot,
annotation_colors = annot_colors,
cluster_rows = TRUE,
cluster_cols = TRUE,
show_rownames = TRUE,
show_colnames = TRUE,
angle_col = 90,
legend = TRUE,
legend_breaks = c(-1.5,0,1.5), # legend breaks to be shown
legend_labels = c(-1.5,0,1.5), # legend labels to be shown
treeheight_row = 25*96/72, # size of row dendogram
treeheight_col = 15*96/72, # size of column dendogram
fontsize = 10*96/72,
fontsize_row = 8*96/72,
fontsize_col = 10*96/72,
cellwidth = 20*96/72,
cellheight = 8*96/72,
border_color = NA
)
This diff is collapsed.
# Scarfo_HEC2023
This repository includes essential scripts for producing final tables and figures embedded in the following manuscript:
**Scarfò et al**,_The Fc receptor CD32 identifies human hemogenic endothelial cells irreversibly bound to the endothelial-to-hematopoietic transition_
PubMed:
DOI:
GEO: GSE223223
---
The repository is divided in three folders, 2 bulk RNAseq analyses and 1 scRNAseq analysis.
Below a brief description of the bulk and scRNAseq workflows adopted in this work.
**Bulk RNAseq** analysis was performed using a standard pipeline that includes the follwing steps:
1. Quality control by [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)
2. Trimming of bad quality reads with [TrimGalore](https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/)<details><summary>Running command</summary>trim_galore --quality 20 --fastqc --length 25 --output_dir {outdir} --paired {input.r1} {inout.r2}</details>
3. Alignment with [STAR](https://github.com/alexdobin/STAR)
<details><summary>Running command</summary>
"STAR " +
"--runThreadN {threads} " +
"--genomeDir {input.genome} " +
"--readFilesIn {params.trim_seq} " +
"--outSAMstrandField intronMotif " +
"--outFileNamePrefix {params.aln_seq_prefix} " +
"--outSAMtype BAM SortedByCoordinate " +
"--outSAMmultNmax 1 " +
"--outFilterMismatchNmax 10 " +
"--outReadsUnmapped Fastx " +
"--readFilesCommand zcat "
</details>
4. Gene expression quantification with [featureCounts](https://academic.oup.com/bioinformatics/article/30/7/923/232889)
<details><summary>Running command</summary>
"featureCounts " +
"-a {input.annot} " +
"-o {output.fcount} " +
"-g gene_name " +
"-p -B -C " +
"-s {params.strand} " +
"--minOverlap 10 " +
"-T {threads} " +
"{input.bams} "
</details>
5. Differential Expression analysis with [DESeq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html).
For Differential Gene Expression analysis we followed the standard workflow provided by package.
6. Dowstream functional Analysis with [ClusterProfiler](https://bioconductor.org/packages/release/bioc/html/clusterProfiler.html).
In order to retrieve functional annotation from DE analysis, we performed **O**ver **R**epresentation **A**nalysis by using the _enrichr_ function provided by the package.
**ORA** analysis was performed in particular using the C5 gene set from the MSigDB database (version 7.2)
---
**scRNAseq** analysis was performed using a standard pipeline that includes the following steps:
scRNAseq analysis was performed with [Seurat](https://satijalab.org/seurat/). Below are the main steps of the basic data analysis workflow that start from a minimal object after loading of 10X data to markers identification:
1. Quality control and filtering
2. Cell cycle scoring
3. Normalization (default seurat settings)
4. Scaling (with following variables to regress out: percent.mt + nCount_RNA and CC.Difference calculated as show in [vignette](https://satijalab.org/seurat/articles/cell_cycle_vignette.html#alternate-workflow-1))
4. Dimensionality reduction: PCA
5. Clustering
6. Markers identification
- [6.1] Clusters related markers
- [6.2] Intracluster differential expression analysis according to comparison of interest
Pseudotime analysis was performed using [Monocle3](http://cole-trapnell-lab.github.io/monocle3/)
Input files for scRNAseq analysis are available in the following [link](https://www.dropbox.com/sh/83dxrxqer8cl081/AAC8zALRuYRGh1mEe4lZuaHZa?dl=0)
---
An interactive querying and exploration of the scRNAseq dataset is available at:
http://bioinfotiget.it/he/
---
suppressPackageStartupMessages(library(Seurat))
suppressPackageStartupMessages(library(monocle3))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(dplyr))
# ED_Fig5B ----------------------------------------------------------------
## load scRNAseq data from public dataset GSE162950
load("Scarfo_HEC2023/scRNAseq/seurat_object.Rdata")
EHT_scorecard <- c("CXCR4","SOX17","GJA5","MECOM","HOXA9","SPN","CD44","ITGA2B","HLF","GFI1","MLLT3","KCNK17","MYB","STAT5A","SMIM24","RAB27B","SPINK2")
## Select HE cells
sample <- SetIdent(sample, value = sample@meta.data$seurat_clusters)
index1 = GetAssayData(sample, slot="counts")["RUNX1",]>0
index2 = GetAssayData(sample, slot="counts")["CDH5",]>0
index3 = GetAssayData(sample, slot="counts")["PTPRC",]==0
index4 = GetAssayData(sample, slot="counts")["FCGR2B",]==0
bars = colnames(sample)[Idents(sample) %in% c(0,3)]
bars1 = colnames(sample)[index1 & index2 & index3 & index4]
bars2 = intersect(bars, bars1)
## Establish identity for FCGR2B- HE cells
Idents(sample, cells=bars2) = "FCGR2B=0"
index5 = GetAssayData(sample, slot="counts")["FCGR2B",]>0
bars3 = colnames(sample)[index1 & index2 & index3 & index5]
bars4 = intersect(bars, bars3)
## Establish identity for FCGR2B+ HE cells
Idents(sample, cells=bars4) = "FCGR2B_exp"
sample$CellType <- Idents(sample)
sample_sub <- subset(sample, CellType=="FCGR2B_exp" | CellType=="FCGR2B=0")
DotPlot(sample_sub,
features=EHT_scorecard,
cols=c("grey90","red3"),
dot.scale = 10) +
coord_flip()
# ED_Fig5C ------------------------------------------------------------------
## load Seurat data
sample <- readRDS("Scarfo_HEC2023/scRNAseq/WNTd_HE.rds")
sample@meta.data[["Cell.Annotations"]] <- recode_factor(sample@meta.data[["RNA_snn_res.0.6"]],
"0" = "HECs",
"1" = "HECs",
"2" = "HECs",
"3" = "Venous cells",
"4" = "Venous cells",
"5" = "Arterial ECs",
"6" = "Arterial ECs",
"7" = "Venous cells",
"8" = "ECs (M phase)",
"9" = "Venous cells",
"10" = "EndoMT cells",
"11" = "HECs",
"12" = "Arterial ECs",
"13" = "Lymphatic ECs",
"14" = "Arterial ECs",
"15" = "Allantois/Placenta",
"16" = "HECs",
"17" = "HECs",
"18" = "EndoMT cells",
"19" = "SM cells",
"20" = "EndoMT cells",
"21" = "EndoMT cells")
DimPlot(object = sample,
reduction = "umap",
group.by = "Cell.Annotations",
label = F,
repel = T,
pt.size = 0.8,
label.size = 6,
cols = c('HECs' = "#377EB8",
'Lymphatic ECs' = "#4DAF4A",
'Arterial ECs' = "#E41A1C",
'Venous cells' = "#FF7F00",
'EndoMT cells' = "#E6AB02",
'Allantois/Placenta' = "#984EA3",
'SM cells' = "#A6761D",
"ECs (M phase)" = "#A50026"))
# ED_Fig5D - ED_Fig5E ------------------------------------------------------------------
FeaturePlot(object = sample,
features = c("RUNX1", "FCGR2B"),
cols = c("grey90","blue"),
order = T,
pt.size = 0.1)
# ED_Fig5F ------------------------------------------------------------------
runx1 <- readRDS("Scarfo_HEC2023/scRNAseq/RUNX1_clusters.rds")
## Monocle3
Idents(runx1) <- "RNA_snn_res.0.6"
start = WhichCells(runx1,idents = "0")
expression_matrix <- runx1@assays[["RNA"]]@counts
cell_metadata <- runx1@meta.data
cell_metadata$orig.ident <- factor(cell_metadata$orig.ident, levels = unique(cell_metadata$orig.ident))
gene_annotation <- data.frame("gene_short_name" = rownames(runx1))
rownames(gene_annotation) <- gene_annotation$gene_short_name
cds <- new_cell_data_set(expression_data = expression_matrix,
cell_metadata = cell_metadata,
gene_metadata = gene_annotation)
cds <- preprocess_cds(cds, num_dim = 50)
cds <- reduce_dimension(cds)
## Construct and assign the made up partition
recreate.partition <- c(rep(1, length(cds@colData@rownames)))
names(recreate.partition) <- cds@colData@rownames
recreate.partition <- as.factor(recreate.partition)
cds@clusters@listData[["UMAP"]][["partitions"]] <- recreate.partition
## Assign the cluster info
list_cluster <- runx1@meta.data[["RNA_snn_res.0.6"]]
names(list_cluster) <- runx1@assays[["RNA"]]@data@Dimnames[[2]]
cds@clusters@listData[["UMAP"]][["clusters"]] <- list_cluster
cds@clusters@listData[["UMAP"]][["louvain_res"]] <- "NA"
## Assign UMAP coordinate
reducedDims(cds)[["UMAP"]] <- runx1@reductions[["umap"]]@cell.embeddings
### Reset colnames and rownames (consequence of UMAP replacement)
rownames(cds@principal_graph_aux[['UMAP']]$dp_mst) <- NULL
## Learn Graph
cds <- learn_graph(cds = cds,use_partition = T,learn_graph_control=list(ncenter=220,minimal_branch_len=15),verbose = T)
cds <- order_cells(cds, root_cells = start)
plot_cells(cds,
color_cells_by = "pseudotime",
label_groups_by_cluster=FALSE,
label_leaves=T,
label_branch_points=T,
group_label_size = 8,
graph_label_size = 3,
cell_size = 1,
trajectory_graph_segment_size = 1)
# ED_Fig5G ------------------------------------------------------------------
cds <- estimate_size_factors(cds)
gi <- c("H19","KCNK17","RUNX1", "MYB", "SPN")
cds_gi <- cds[rowData(cds)$gene_short_name %in% gi,]
plot_genes_in_pseudotime(cds_subset = cds_gi,
ncol = 3,
color_cells_by = "RNA_snn_res.0.6",
min_expr = NULL,
cell_size = 1)
suppressPackageStartupMessages(library(Seurat))
suppressPackageStartupMessages(library(monocle3))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(reshape2))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(patchwork))
suppressPackageStartupMessages(library(clusterProfiler))
## load Seurat data
sample <- readRDS("Scarfo_HEC2023/scRNAseq/WNTd_HE.rds")
# Fig5A ------------------------------------------------------------------
DimPlot(object = sample,
reduction = "umap",
group.by = "RNA_snn_res.0.6",
label = T,
repel = T,
pt.size = 0.8,
label.size = 8,
cols = c("0" = "#F8766D",
"1" = "#B79F00",
"2" = "#00BA38",
"3" = "#EC8239",
"4"= "#00B81B",
"5"= "#7C96FF",
"6"= "#00C085",
"7" = "#00C1A7",
"8" = "#00BFC4",
"9" = "#EF67EB",
"10" = "#00B2F3",
"11"= "#00BFC4",
"12"= "#00A6FF",
"13"= "#DB8E00",
"14"= "#FD61D3",
"15"= "#FF63B6",
"16"= "#619CFF",
"17"= "#F564E3",
"18"= "#00BADE",
"19"= "#B385FF",
"20"= "#D874FD",
"21"= "#FF6B94")) +
ggtitle("") +
xlab("UMAP1") +
ylab("UMAP2") +
theme(legend.text = element_text(size = 16*96/72),
axis.text.x = element_text(color = "black", family = "Arial",size = 14*96/72),
axis.text.y = element_text(color = "black", family = "Arial",size = 14*96/72),
axis.title.x = element_text(color = "black", family = "Arial",size = 16*96/72),
axis.title.y = element_text(color = "black", family = "Arial",size = 16*96/72),
aspect.ratio = 1,
panel.background = element_blank()) +
guides(color = guide_legend(override.aes = list(size=4)))
## Seurat markers
markers <- FindAllMarkers(sample,
only.pos = T,
min.pct = 0.1,
logfc.threshold = 0.25)
## define RUNX1 clusters
runx1.clusters <- subset(markers, markers$gene=="RUNX1")
runx1.clusters <- droplevels(runx1.clusters$cluster)
## subset RUNX1 clusters into new object
runx1 <- subset(sample, idents = runx1.clusters)
DimPlot(object = runx1,
reduction = "umap",
group.by = "RNA_snn_res.0.6",
label = T,
repel = T,
pt.size = 0.8,
label.size = 8)
# Fig5B ----------------------------------------------------------------
## Monocle3
Idents(runx1) <- "RNA_snn_res.0.6"
start = WhichCells(runx1,idents = "0")
expression_matrix <- runx1@assays[["RNA"]]@counts
cell_metadata <- runx1@meta.data
cell_metadata$orig.ident <- factor(cell_metadata$orig.ident, levels = unique(cell_metadata$orig.ident))
gene_annotation <- data.frame("gene_short_name" = rownames(runx1))
rownames(gene_annotation) <- gene_annotation$gene_short_name
cds <- new_cell_data_set(expression_data = expression_matrix,
cell_metadata = cell_metadata,
gene_metadata = gene_annotation)
cds <- preprocess_cds(cds, num_dim = 50)
cds <- reduce_dimension(cds)
## Construct and assign the made up partition
recreate.partition <- c(rep(1, length(cds@colData@rownames)))
names(recreate.partition) <- cds@colData@rownames
recreate.partition <- as.factor(recreate.partition)
cds@clusters@listData[["UMAP"]][["partitions"]] <- recreate.partition
## Assign the cluster info
list_cluster <- runx1@meta.data[["RNA_snn_res.0.6"]]
names(list_cluster) <- runx1@assays[["RNA"]]@data@Dimnames[[2]]
cds@clusters@listData[["UMAP"]][["clusters"]] <- list_cluster
cds@clusters@listData[["UMAP"]][["louvain_res"]] <- "NA"
## Assign UMAP coordinate
reducedDims(cds)[["UMAP"]] <- runx1@reductions[["umap"]]@cell.embeddings
### Reset colnames and rownames (consequence of UMAP replacement)
rownames(cds@principal_graph_aux[['UMAP']]$dp_mst) <- NULL
## Learn Graph
cds <- learn_graph(cds = cds,use_partition = T,learn_graph_control=list(ncenter=220,minimal_branch_len=15),verbose = T)
plot_cells(cds,
color_cells_by = "RNA_snn_res.0.6",
label_groups_by_cluster=FALSE,
label_leaves=F,
label_branch_points=F,
graph_label_size=1.5,
group_label_size = 6,
cell_size = 1)
# Fig5C -------------------------------------------------------------------
## differential markers
degs_clu11_vs_clu0_1_2 <- FindMarkers(object = runx1, ident.1 = 11, ident.2 = c(0,1,2), only.pos = FALSE, min.pct = 0, test.use = "wilcox", logfc.threshold = 0, min.cells.group = 0)
degs_clu11_vs_clu16_17 <- FindMarkers(object = runx1, ident.1 = 11, ident.2 = c(16,17), only.pos = FALSE, min.pct = 0, test.use = "wilcox", logfc.threshold = 0, min.cells.group = 0)
## GSEA
adj.pval.thr <- 0.05
gmt.obj <- read.gmt("Scarfo_HEC2023/BulkRNAseq_1/c5.all.v7.2.symbols.gmt")
comp <- c("clu11_vs_clu0_1_2", "clu11_vs_clu16_17")
for (i in comp) {
degs <- get(paste("degs",i,sep="_"))
gene.res.sorted <- degs[order(degs$avg_logFC, decreasing = TRUE), ]
# LogFC with Symbols
gene.res.logfc.symbol <- as.vector(gene.res.sorted$avg_logFC)
names(gene.res.logfc.symbol) <- row.names(gene.res.sorted)
contr.gsea <- GSEA(gene.res.logfc.symbol,
TERM2GENE = gmt.obj,
nPerm = 10000,
pvalueCutoff = adj.pval.thr)
contr.gsea.res <- as.data.frame(contr.gsea)
assign(paste0("contr.gsea.res_",i), contr.gsea.res)
if (i == "clu11_vs_clu0_1_2") {
terms <- c("GO_CELL_CELL_JUNCTION_ORGANIZATION",
"GO_ACTIN_CYTOSKELETON",
"GO_EXTRACELLULAR_MATRIX_STRUCTURAL_CONSTITUENT",
"GO_STRUCTURAL_CONSTITUENT_OF_RIBOSOME",
"GO_RIBOSOME",
"GO_TRANSLATIONAL_INITIATION")
} else {
terms <- c("GO_MITOTIC_NUCLEAR_DIVISION",
"GO_CELL_DIVISION",
"GO_MITOTIC_CELL_CYCLE",
"GO_REGULATION_OF_NOTCH_SIGNALING_PATHWAY",
"GO_CYTOSOLIC_RIBOSOME",
"GO_ENDOTHELIAL_CELL_MIGRATION")
}
ti <- contr.gsea.res[c(contr.gsea.res$Description %in% terms), ]
ti <- mutate(ti, color = case_when(ti$NES > 0 ~ "Upregulated",
ti$NES < 0 ~ "Downregulated"))
ti <- arrange(ti, NES)
ti$Description <- factor(ti$Description, levels = ti$Description)
p <- ggplot(data=ti, aes(x=NES, y=Description, fill = color)) +
geom_bar(stat="identity", width = 0.7) +
scale_fill_manual(values = c("Upregulated" = "red","Downregulated" = "blue"),name = "") +
ylab("Terms") +
ggtitle(paste(i)) +
theme_bw() +
theme(
axis.title.x = element_text(colour="black",family = "Arial", size=12*96/72),
axis.text.x = element_text(colour="black",family = "Arial", size=10*96/72),
axis.title.y = element_text(colour="black",family = "Arial", size=12*96/72),
axis.text.y = element_text(colour="black",family = "Arial", size=8*96/72),
aspect.ratio = 0.8)
assign(paste0("p_",i), p)
}
# Fig5D -------------------------------------------------------------------
runx1@meta.data[["Phase"]] <- recode_factor(runx1@meta.data[["Phase"]],
"G1" = "G1",
"S" = "S",
"G2M" = "G2/M")
## grouping clusters 0,1,2 and cluster 16,17
runx1@meta.data[["RNA_snn_res.0.6"]] <- recode_factor(runx1@meta.data[["RNA_snn_res.0.6"]],
"0" = "0,1,2",
"1" = "0,1,2",
"2" = "0,1,2",
"11" = "11",
"16" = "16,17",
"17" = "16,17")
cc_umap <- DimPlot(object = runx1,
reduction = "umap",
group.by = "Phase",
split.by = "RNA_snn_res.0.6",
label = F,
repel = T,
pt.size = 0.8,
label.size = 8,
cols = c("G1" = "#E6AB02",
"S" = "#7570B3",
"G2/M" = "#66A61E")) +
ggtitle("") +
xlab("UMAP1") +
ylab("UMAP2") +
theme(legend.text = element_text(size = 12*96/72),
axis.text.x = element_text(color = "black", family = "Arial",size = 10*96/72),
axis.text.y = element_text(color = "black", family = "Arial",size = 10*96/72),
axis.title.x = element_text(color = "black", family = "Arial",size = 12*96/72),
axis.title.y = element_text(color = "black", family = "Arial",size = 12*96/72),
aspect.ratio = 1,
plot.margin = margin(0, 0, 0, 0, "pt"),
panel.background = element_blank()) +
guides(color = guide_legend(override.aes = list(size=4)))
## average cell counts per cluster
ggp_obj_clu <- melt(table(runx1$Phase, runx1$RNA_snn_res.0.6))
colnames(ggp_obj_clu) <- c("Phase","Cluster", "Count")
cc.clusters <- c("0,1,2","11","16,17")
ggp_obj_clu <- ggp_obj_clu[c(ggp_obj_clu$Cluster %in% cc.clusters), ]
for (i in cc.clusters) {
totalB <- subset(ggp_obj_clu, ggp_obj_clu$Cluster==i)
totalB <- totalB %>%
mutate(percentage = Count/sum(Count))
totalB$percentage <- totalB$percentage*100
totalB$fraction = totalB$Count / sum(totalB$Count)
# Compute the cumulative percentages
totalB$ymax <- cumsum(totalB$fraction)
totalB$ymin <- c(0, head(totalB$ymax, n=-1))
totalB$labelPosition <- (totalB$ymax + totalB$ymin) / 2
# Compute a good label
totalB$label <- paste0(totalB$Phase, ":","\n",round(totalB$percentage, 2), "%")
assign(paste0("totalB_clu_", i), totalB)
p <- ggplot(totalB, aes(ymax=ymax, ymin=ymin, xmax=4, xmin=3, fill=Phase)) +
geom_rect() +
geom_text(x=5, aes(y=labelPosition, label=label), size=5) +
#scale_fill_brewer(palette = "Set1") +
scale_fill_manual(values = c("G1" = "#E6AB02",
"S" = "#7570B3",
"G2/M" = "#66A61E")) +
coord_polar(theta="y") +
xlim(c(2, 5)) +
theme_void() +
theme(plot.margin = margin(0, 0, 0, 0, "pt")) +
theme(legend.position = "none")
assign(paste0("p_clu_",i), p)
}
p_final <- cc_umap / (`p_clu_0,1,2` | p_clu_11 | `p_clu_16,17`)
# Fig5E -------------------------------------------------------------------
cds <- order_cells(cds, root_cells = start)
cds <- estimate_size_factors(cds)
gi <- c("FCGR2B","HES1","HEY1", "HEY2")
cds_gi <- cds[rowData(cds)$gene_short_name %in% gi,]
plot_genes_in_pseudotime(cds_subset = cds_gi,
ncol = 2,
color_cells_by = "RNA_snn_res.0.6",
min_expr = NULL,
cell_size = 1)
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment