Commit 839b34cc authored by Monah Abou Alezz's avatar Monah Abou Alezz
Browse files

Initial commit

parents
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(ggrepel))
suppressPackageStartupMessages(library(pheatmap))
suppressPackageStartupMessages(library(RColorBrewer))
## Input featureCounts file
fcount.file <- "Scarfo_HEC2023/BulkRNAseq_1/featureCounts_results_corrected.txt"
## Contrasts to perform
contr <- "ACE_pos:ACE_neg"
contr.list <- unlist(strsplit(contr, ","))
## Design Formula
formula.str <- "donor,condition"
design.str <- gsub(",", "+", formula.str)
num_cond <- str_count(formula.str, ",") + 1
design.formula <- as.formula(paste("~", design.str, sep = ""))
## Read featureCounts results
if (!file.exists(fcount.file)) {
stop("Error: featureCounts file not found.")
} else {
read.counts <- read.table(fcount.file,
header = TRUE,
check.names = FALSE) %>%
column_to_rownames(var = "Geneid") %>%
select(-c(1:5))
}
orig.names <- names(read.counts)
ds <- list()
cond <- list()
for (i in seq(orig.names[1:length(orig.names)])) {
ds[i] <- unlist(strsplit(orig.names[i], ":"))[2]
cond[i] <- unlist(strsplit(orig.names[i], ":"))[3]
}
ds <- unlist(ds)
cond <- unlist(cond)
## Samples - Conditions
sample_info <- as.data.frame(str_split_fixed(cond, ",", num_cond))
colnames(sample_info) <- str_split_fixed(formula.str, ",", num_cond)
rownames(sample_info) <- ds
colnames(read.counts) <- ds
sample_info$condition <- factor(sample_info$condition)
last_condition <- str_split_fixed(formula.str, ",", num_cond)[num_cond]
batch_cond <- str_split_fixed(formula.str, ",", num_cond)[num_cond - 1]
## thresholds
adj.pval.thr <- 0.05
logfc.thr <- 0
## DESeq2
DESeq.ds <- DESeqDataSetFromMatrix(countData = read.counts,
colData = sample_info,
design = design.formula)
## Remove genes without any counts
DESeq.ds <- DESeq.ds[rowSums(counts(DESeq.ds)) > 0, ]
## RunDGE
DESeq.ds <- DESeq(DESeq.ds)
## obtain regularized log-transformed values
DESeq.rlog <- rlogTransformation(DESeq.ds, blind = TRUE)
assay(DESeq.rlog) <- limma::removeBatchEffect(assay(DESeq.rlog), DESeq.rlog[[batch_cond]])
ctrs <- unlist(strsplit(contr.list[1], ":"))
c1 <- ctrs[1]
c2 <- ctrs[2]
DGE.results <- results(DESeq.ds,
pAdjustMethod = "BH",
independentFiltering = TRUE,
contrast = c(last_condition, c1, c2),
alpha = adj.pval.thr)
# Change format
DGE.results.annot <- as.data.frame(DGE.results) %>%
dplyr::select(log2FoldChange, lfcSE, baseMean, pvalue, padj)
colnames(DGE.results.annot) <- c("logFC", "lfcSE", "baseMean", "PValue", "FDR")
## sort the results according to the adjusted p-value
DGE.results.sorted <- DGE.results.annot[order(DGE.results.annot$FDR), ]
## Subset for only significant genes
DGE.results.sig <- subset(DGE.results.sorted,
FDR < adj.pval.thr & abs(logFC) > logfc.thr)
DGEgenes <- rownames(DGE.results.sig)
# ED_Fig1D ----------------------------------------------------------------
data <- data.frame(gene = row.names(DGE.results.sorted),
pval = -log10(DGE.results.sorted$FDR),
lfc = DGE.results.sorted$logFC)
data <- na.omit(data)
data <- mutate(data, color = case_when(data$lfc > logfc.thr & data$pval > -log10(adj.pval.thr) ~ "Overexpressed",data$lfc < -logfc.thr & data$pval > -log10(adj.pval.thr) ~ "Underexpressed",abs(data$lfc) < logfc.thr | data$pval < -log10(adj.pval.thr) ~ "NonSignificant"))
data$color <- factor(data$color, levels = c("Overexpressed", "Underexpressed", "NonSignificant"))
data <- data[order(data$pval, decreasing = TRUE),]
highl <- head(subset(data, color != "NonSignificant"), 20)
ggplot(data, aes(x = lfc, y = pval, colour = color)) +
theme_bw() +
theme(legend.position = "right",
text=element_text(size=17*96/72),
axis.text.x = element_text(size = 17*96/72,
color = "black"),
axis.text.y = element_text(size = 17*96/72,
color = "black"),
axis.title.y = element_text(colour = "black", size = 19*96/72),
axis.title.x = element_text(colour = "black", size = 19*96/72),
legend.text = element_text(colour = "black", size = 19*96/72),
legend.title = element_text(colour = "black", size = 19*96/72)) +
xlab("log2 Fold Change") +
ylab(expression(-log[10]("adjusted p-value"))) +
geom_hline(yintercept = -log10(adj.pval.thr), colour = "darkgray") +
geom_point(size = 3, alpha = 0.8, na.rm = T) +
scale_color_manual(name = "Expression",
values = c(Overexpressed = "red",
Underexpressed = "royalblue3",
NonSignificant = "darkgray")) +
geom_label_repel(data = highl,
aes(label = gene),
show.legend = FALSE,
size = 5)
# ED_Fig1E --------------------------------------------------------
DESeq.ds_assay_corr <- limma::removeBatchEffect(assay(DESeq.ds), DESeq.ds[[batch_cond]])
sampleDists_corr <- dist(t(DESeq.ds_assay_corr))
sampleDistMatrix_corr <- as.matrix(sampleDists_corr)
rownames(sampleDistMatrix_corr) <- paste(rownames(sampleDistMatrix_corr), DESeq.ds[[last_condition]], sep = " ")
colnames(sampleDistMatrix_corr) <- NULL
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
pheatmap::pheatmap(sampleDistMatrix_corr,
clustering_distance_rows=sampleDists_corr,
clustering_distance_cols=sampleDists_corr,
col = colors)
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(ggrepel))
suppressPackageStartupMessages(library(openxlsx))
suppressPackageStartupMessages(library(RColorBrewer))
## Input featureCounts file
fcount.file <- "Scarfo_HEC2023/BulkRNAseq_1/featureCounts_results_corrected.txt"
## Contrasts to perform
contr <- "ACE_pos:ACE_neg"
contr.list <- unlist(strsplit(contr, ","))
## Design Formula
formula.str <- "donor,condition"
design.str <- gsub(",", "+", formula.str)
num_cond <- str_count(formula.str, ",") + 1
design.formula <- as.formula(paste("~", design.str, sep = ""))
## Read featureCounts results
if (!file.exists(fcount.file)) {
stop("Error: featureCounts file not found.")
} else {
read.counts <- read.table(fcount.file,
header = TRUE,
check.names = FALSE) %>%
column_to_rownames(var = "Geneid") %>%
select(-c(1:5))
}
orig.names <- names(read.counts)
ds <- list()
cond <- list()
for (i in seq(orig.names[1:length(orig.names)])) {
ds[i] <- unlist(strsplit(orig.names[i], ":"))[2]
cond[i] <- unlist(strsplit(orig.names[i], ":"))[3]
}
ds <- unlist(ds)
cond <- unlist(cond)
## Samples - Conditions
sample_info <- as.data.frame(str_split_fixed(cond, ",", num_cond))
colnames(sample_info) <- str_split_fixed(formula.str, ",", num_cond)
rownames(sample_info) <- ds
colnames(read.counts) <- ds
sample_info$condition <- factor(sample_info$condition)
last_condition <- str_split_fixed(formula.str, ",", num_cond)[num_cond]
batch_cond <- str_split_fixed(formula.str, ",", num_cond)[num_cond - 1]
## thresholds
adj.pval.thr <- 0.05
logfc.thr <- 0
## DESeq2
DESeq.ds <- DESeqDataSetFromMatrix(countData = read.counts,
colData = sample_info,
design = design.formula)
## Remove genes without any counts
DESeq.ds <- DESeq.ds[rowSums(counts(DESeq.ds)) > 0, ]
## RunDGE
DESeq.ds <- DESeq(DESeq.ds)
## obtain regularized log-transformed values
DESeq.rlog <- rlogTransformation(DESeq.ds, blind = TRUE)
assay(DESeq.rlog) <- limma::removeBatchEffect(assay(DESeq.rlog), DESeq.rlog[[batch_cond]])
ctrs <- unlist(strsplit(contr.list[1], ":"))
c1 <- ctrs[1]
c2 <- ctrs[2]
DGE.results <- results(DESeq.ds,
pAdjustMethod = "BH",
independentFiltering = TRUE,
contrast = c(last_condition, c1, c2),
alpha = adj.pval.thr)
# Change format
DGE.results.annot <- as.data.frame(DGE.results) %>%
dplyr::select(log2FoldChange, lfcSE, baseMean, pvalue, padj)
colnames(DGE.results.annot) <- c("logFC", "lfcSE", "baseMean", "PValue", "FDR")
## sort the results according to the adjusted p-value
DGE.results.sorted <- DGE.results.annot[order(DGE.results.annot$FDR), ]
## Subset for only significant genes
DGE.results.sig <- subset(DGE.results.sorted,
FDR < adj.pval.thr & abs(logFC) > logfc.thr)
DGEgenes <- rownames(DGE.results.sig)
# ED_Fig2A ----------------------------------------------------------------
## surface genes
if(!file.exists("Scarfo_HEC2023/BulkRNAseq_1/table_S3_surfaceome.xlsx")){
download.file("http://wlab.ethz.ch/surfaceome/table_S3_surfaceome.xlsx",
destfile = "Scarfo_HEC2023/BulkRNAseq_1/table_S3_surfaceome.xlsx")
}
sg <- read.xlsx("Scarfo_HEC2023/BulkRNAseq_1/table_S3_surfaceome.xlsx",
sheet = "in silico surfaceome only",
colNames = T,
startRow = 2)
sig_sg <- DGE.results.sig[c(row.names(DGE.results.sig) %in% sg$UniProt.gene), ]
top10_sig_sg <- sig_sg[c(1:10),]
data <- data.frame(gene = row.names(DGE.results.sorted),
pval = -log10(DGE.results.sorted$FDR),
lfc = DGE.results.sorted$logFC)
data <- na.omit(data)
data <- mutate(data, color = case_when(data$lfc > logfc.thr & data$pval > -log10(adj.pval.thr) ~ "Overexpressed",data$lfc < -logfc.thr & data$pval > -log10(adj.pval.thr) ~ "Underexpressed",abs(data$lfc) < logfc.thr | data$pval < -log10(adj.pval.thr) ~ "NonSignificant"))
data$color <- factor(data$color, levels = c("Overexpressed", "Underexpressed", "NonSignificant"))
data <- data[order(data$pval, decreasing = TRUE),]
highl <- data[c(data$gene %in% row.names(top10_sig_sg)), ]
ggplot(data, aes(x = lfc, y = pval, colour = color)) +
theme_bw() +
theme(legend.position = "right",
text=element_text(size=17*96/72),
axis.text.x = element_text(size = 17*96/72,
color = "black"),
axis.text.y = element_text(size = 17*96/72,
color = "black"),
axis.title.y = element_text(colour = "black", size = 19*96/72),
axis.title.x = element_text(colour = "black", size = 19*96/72),
legend.text = element_text(colour = "black", size = 19*96/72),
legend.title = element_text(colour = "black", size = 19*96/72)) +
xlab("log2 Fold Change") +
ylab(expression(-log[10]("adjusted p-value"))) +
geom_hline(yintercept = -log10(adj.pval.thr), colour = "darkgray") +
geom_point(size = 3, alpha = 0.8, na.rm = T) +
scale_color_manual(name = "Expression",
values = c(Overexpressed = "red",
Underexpressed = "royalblue3",
NonSignificant = "darkgray")) +
geom_label_repel(data = highl,
aes(label = gene),
show.legend = FALSE,
size = 5)
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(pheatmap))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(openxlsx))
suppressPackageStartupMessages(library(org.Hs.eg.db))
suppressPackageStartupMessages(library(clusterProfiler))
## Input featureCounts file
fcount.file <- "Scarfo_HEC2023/BulkRNAseq_1/featureCounts_results_corrected.txt"
## Contrasts to perform
contr <- "ACE_pos:ACE_neg"
contr.list <- unlist(strsplit(contr, ","))
## Design Formula
formula.str <- "donor,condition"
design.str <- gsub(",", "+", formula.str)
num_cond <- str_count(formula.str, ",") + 1
design.formula <- as.formula(paste("~", design.str, sep = ""))
## Read featureCounts results
if (!file.exists(fcount.file)) {
stop("Error: featureCounts file not found.")
} else {
read.counts <- read.table(fcount.file,
header = TRUE,
check.names = FALSE) %>%
column_to_rownames(var = "Geneid") %>%
select(-c(1:5))
}
orig.names <- names(read.counts)
ds <- list()
cond <- list()
for (i in seq(orig.names[1:length(orig.names)])) {
ds[i] <- unlist(strsplit(orig.names[i], ":"))[2]
cond[i] <- unlist(strsplit(orig.names[i], ":"))[3]
}
ds <- unlist(ds)
cond <- unlist(cond)
## Samples - Conditions
sample_info <- as.data.frame(str_split_fixed(cond, ",", num_cond))
colnames(sample_info) <- str_split_fixed(formula.str, ",", num_cond)
rownames(sample_info) <- ds
colnames(read.counts) <- ds
sample_info$condition <- factor(sample_info$condition)
last_condition <- str_split_fixed(formula.str, ",", num_cond)[num_cond]
batch_cond <- str_split_fixed(formula.str, ",", num_cond)[num_cond - 1]
## thresholds
adj.pval.thr <- 0.05
logfc.thr <- 0
## DESeq2
DESeq.ds <- DESeqDataSetFromMatrix(countData = read.counts,
colData = sample_info,
design = design.formula)
## Remove genes without any counts
DESeq.ds <- DESeq.ds[rowSums(counts(DESeq.ds)) > 0, ]
## RunDGE
DESeq.ds <- DESeq(DESeq.ds)
## obtain regularized log-transformed values
DESeq.rlog <- rlogTransformation(DESeq.ds, blind = TRUE)
assay(DESeq.rlog) <- limma::removeBatchEffect(assay(DESeq.rlog), DESeq.rlog[[batch_cond]])
ctrs <- unlist(strsplit(contr.list[1], ":"))
c1 <- ctrs[1]
c2 <- ctrs[2]
DGE.results <- results(DESeq.ds,
pAdjustMethod = "BH",
independentFiltering = TRUE,
contrast = c(last_condition, c1, c2),
alpha = adj.pval.thr)
# Change format
DGE.results.annot <- as.data.frame(DGE.results) %>%
dplyr::select(log2FoldChange, lfcSE, baseMean, pvalue, padj)
colnames(DGE.results.annot) <- c("logFC", "lfcSE", "baseMean", "PValue", "FDR")
## sort the results according to the adjusted p-value
DGE.results.sorted <- DGE.results.annot[order(DGE.results.annot$FDR), ]
## Subset for only significant genes
DGE.results.sig <- subset(DGE.results.sorted,
FDR < adj.pval.thr & abs(logFC) > logfc.thr)
DGEgenes <- rownames(DGE.results.sig)
# Fig1B ---------------------------------------------------------------------
plotPCA(DESeq.rlog, intgroup = last_condition, ntop = 500) +
theme_bw() +
ggtitle(label = "Principal Component Analysis (PCA)",
subtitle = "Top 500 DEG (with batch correction)") +
geom_point(size = 2) +
#coord_fixed(ratio=3) +
geom_text(aes(label = rownames(DESeq.rlog@colData)), vjust = 0, nudge_y = 0.5,
show.legend = FALSE)
# Fig1C ----------------------------------------------------------------
## DEGs
s_info <- sample_info[sample_info[[last_condition]] == c1 | sample_info[[last_condition]] == c2,, drop = FALSE]
hm.mat_DGEgenes <- assay(DESeq.rlog)[DGEgenes, ]
hm.mat_DGEgenes.filt <- hm.mat_DGEgenes[, row.names(s_info)]
pheatmap::pheatmap(hm.mat_DGEgenes.filt,
color = colorRampPalette(rev(brewer.pal(n = 9, name ="RdYlBu")))(100),
fontsize_row = 5,
show_rownames = FALSE,
scale = "row",
angle_col = 90,
cluster_rows = TRUE,
annotation_col = s_info,
cluster_cols = TRUE)
# Fig1D ----------------------------------------------------------------
## surface genes
if(!file.exists("Scarfo_HEC2023/BulkRNAseq_1/table_S3_surfaceome.xlsx")){
download.file("http://wlab.ethz.ch/surfaceome/table_S3_surfaceome.xlsx",
destfile = "Scarfo_HEC2023/BulkRNAseq_1/table_S3_surfaceome.xlsx")
}
sg <- read.xlsx("Scarfo_HEC2023/BulkRNAseq_1/table_S3_surfaceome.xlsx",
sheet = "in silico surfaceome only",
colNames = T,
startRow = 2)
sig_sg <- DGE.results.sig[c(row.names(DGE.results.sig) %in% sg$UniProt.gene), ]
markers <- c("CD34","CDH5","PECAM1","TEK","NR2F2","FLRT2","BMX","GJA5","DLL4","CXCR4","HEY2","MYB","GFI1","CD44")
DESeq.rlog_mat <- assay(DESeq.rlog)
DESeq.rlog_mat <- DESeq.rlog_mat[c(row.names(DESeq.rlog_mat) %in% markers), ]
DESeq.rlog_mat <- DESeq.rlog_mat[markers,]
# Specify colors
ann_colors = list(
condition = c(ACE_neg = "gold3",
ACE_pos = "turquoise3"),
donor = c(E1 = "#E76BF3",
E2 = "#F8766D",
E3 = "#529EFF",
E4 = "springgreen3")
)
pheatmap::pheatmap(DESeq.rlog_mat,
color = colorRampPalette(rev(brewer.pal(n = 9, name ="RdYlBu")))(100),
annotation_col = s_info,
annotation_names_row = FALSE,
annotation_colors = ann_colors,
show_rownames = TRUE,
show_colnames = TRUE,
cluster_rows = FALSE,
angle_col = 45,
fontsize = 14,
fontsize_row = 14,
fontsize_col = 18,
cellheight = 20,
cellwidth = 30)
# Fig1E --------------------------------------------
gmt.obj <- read.gmt("Scarfo_HEC2023/BulkRNAseq_1/c5.all.v7.2.symbols.gmt")
organism.db <- org.Hs.eg.db
# Remove genes with no FDR
DGE.results.annot <- DGE.results.annot[!is.na(DGE.results.annot$FDR), ]
gene_univ <- row.names(DGE.results.annot)
# get positive and negative gene names
gene.res.top <- subset(DGE.results.annot, FDR < adj.pval.thr & abs(logFC) > logfc.thr)
gene.res.top.pos <- subset(gene.res.top, logFC > 0)
gene.res.top.neg <- subset(gene.res.top, logFC < 0)
## Over Representation Analysis
comp <- c("gene.res.top.pos","gene.res.top.neg")
for (i in comp) {
print(paste0("Analyzing...",i," genes"))
obj_i <- get(i)
contr.enricher <- enricher(row.names(obj_i),
universe = gene_univ,
pvalueCutoff = adj.pval.thr,
TERM2GENE = gmt.obj)
contr.enricher.res <- as.data.frame(contr.enricher)
assign(paste("contr.enricher",i,sep = "."), contr.enricher.res)
}
## barplots
terms.pos <- data.frame(
ID = c("GO_LEUKOCYTE_MIGRATION","GO_AMEBOIDAL_TYPE_CELL_MIGRATION","GO_TISSUE_MIGRATION","GO_POSITIVE_REGULATION_OF_EPITHELIAL_CELL_MIGRATION","GO_REGULATION_OF_EPITHELIAL_CELL_MIGRATION","GO_ENDOTHELIAL_CELL_MIGRATION","GO_REGULATION_OF_ENDOTHELIAL_CELL_MIGRATION","GO_COLLAGEN_CONTAINING_EXTRACELLULAR_MATRIX","GO_EXTRACELLULAR_MATRIX","GO_CELL_CELL_JUNCTION","GO_NEGATIVE_REGULATION_OF_CELL_ADHESION","GO_REGULATION_OF_CELL_CELL_ADHESION","GO_LEUKOCYTE_CELL_CELL_ADHESION","GO_EXTRACELLULAR_MATRIX_BINDING"),
Type = c(rep("Migration", 7), rep("Adhesion", 7)))
terms.pos <- left_join(terms.pos,contr.enricher.gene.res.top.pos,by="ID")
terms.neg <- data.frame(
ID = c("GO_POSITIVE_REGULATION_OF_CELL_CYCLE_PROCESS","GO_POSITIVE_REGULATION_OF_CELL_CYCLE","GO_POSITIVE_REGULATION_OF_CELL_CYCLE_G2_M_PHASE_TRANSITION","GO_POSITIVE_REGULATION_OF_CELL_CYCLE_PHASE_TRANSITION","GO_MITOTIC_CELL_CYCLE_CHECKPOINT","GO_MITOTIC_NUCLEAR_DIVISION","GO_POSITIVE_REGULATION_OF_MITOTIC_CELL_CYCLE"),
Type = "Cell Cycle")
terms.neg <- left_join(terms.neg,contr.enricher.gene.res.top.neg,by="ID")
terms.to.plot <- rbind(terms.pos, terms.neg)
terms.to.plot <- na.omit(terms.to.plot)
parsed.terms <- as.data.frame(str_split_fixed(terms.to.plot$ID, "GO_", 2))
renamed.terms <- as.data.frame(str_replace_all(parsed.terms$V2, "_", " "))
colnames(renamed.terms) <- "id"
renamed.terms$Terms <- str_to_title(renamed.terms$id)
terms.to.plot <- cbind(terms.to.plot,renamed.terms)
terms.to.plot <- terms.to.plot[,c(12,10,7,2)]
terms.to.plot$Type <- factor(terms.to.plot$Type, levels = c("Cell Cycle", "Migration", "Adhesion"))
terms.to.plot$Terms <- factor(terms.to.plot$Terms, levels = terms.to.plot$Terms)
terms.to.plot <- terms.to.plot %>%
mutate(color = ifelse(Type == "Cell Cycle", "gold3", "turquoise3"),
regulated = ifelse(Type == "Cell Cycle", "Upregulated in ACEneg", "Upregulated in ACE+"))
terms.to.plot$regulated <- factor(terms.to.plot$regulated,
levels = c("Upregulated in ACEneg","Upregulated in ACE+"))
terms.to.plot$log_pvalue <- -log10(terms.to.plot$p.adjust)
ggplot(data=terms.to.plot, aes(x=Terms,
y=log_pvalue,
fill=regulated,
xmin = min(log_pvalue),
xmax = max(log_pvalue))) +
geom_bar(stat="identity", width = 0.7) +
scale_fill_manual(values = c("gold3", "turquoise3"))+
theme_bw() +
coord_flip() +
ylab(expression(-log[10]("adjusted p-value"))) +
xlab("GO Terms") +
#scale_x_discrete(limits = GO_terms_all$Term) +
facet_grid(Type ~., scales = "free_y", space = "free_y") +
theme(
axis.title.x = element_text(colour="black",family = "Arial", size=16),
axis.text.x = element_text(colour="black",family = "Arial", size=12),
axis.title.y = element_text(colour="black",family = "Arial", size=16),
axis.text.y = element_text(color="black",family = "Arial", size=12),
legend.text = element_text(colour="black",family = "Arial", size=12),
legend.title = element_text(colour="white",family = "Arial", size=12),
strip.background = element_rect(colour="black",
fill="white"),
strip.text = element_text(family = "Arial", size=12))
## GSEA
DGE.results.annot.sorted <- DGE.results.annot[order(DGE.results.annot$logFC, decreasing = TRUE), ]
# LogFC with Symbols
gene.res.logfc.symbol <- as.vector(DGE.results.annot.sorted$logFC)
names(gene.res.logfc.symbol) <- row.names(DGE.results.annot.sorted)
contr.gsea <- GSEA(gene.res.logfc.symbol,
TERM2GENE = gmt.obj,
nPerm = 10000,
pvalueCutoff = adj.pval.thr)
contr.gsea.res <- as.data.frame(contr.gsea)
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(pheatmap))
suppressPackageStartupMessages(library(openxlsx))
suppressPackageStartupMessages(library(RColorBrewer))
## Input featureCounts file
fcount.file <- "Scarfo_HEC2023/BulkRNAseq_1/featureCounts_results_corrected.txt"
## Contrasts to perform
contr <- "ACE_pos:ACE_neg"
contr.list <- unlist(strsplit(contr, ","))
## Design Formula
formula.str <- "donor,condition"
design.str <- gsub(",", "+", formula.str)
num_cond <- str_count(formula.str, ",") + 1
design.formula <- as.formula(paste("~", design.str, sep = ""))
## Read featureCounts results
if (!file.exists(fcount.file)) {
stop("Error: featureCounts file not found.")
} else {
read.counts <- read.table(fcount.file,
header = TRUE,
check.names = FALSE) %>%
column_to_rownames(var = "Geneid") %>%
select(-c(1:5))
}
orig.names <- names(read.counts)
ds <- list()
cond <- list()
for (i in seq(orig.names[1:length(orig.names)])) {
ds[i] <- unlist(strsplit(orig.names[i], ":"))[2]
cond[i] <- unlist(strsplit(orig.names[i], ":"))[3]
}
ds <- unlist(ds)
cond <- unlist(cond)
## Samples - Conditions
sample_info <- as.data.frame(str_split_fixed(cond, ",", num_cond))
colnames(sample_info) <- str_split_fixed(formula.str, ",", num_cond)
rownames(sample_info) <- ds
colnames(read.counts) <- ds
sample_info$condition <- factor(sample_info$condition)
last_condition <- str_split_fixed(formula.str, ",", num_cond)[num_cond]
batch_cond <- str_split_fixed(formula.str, ",", num_cond)[num_cond - 1]
## thresholds
adj.pval.thr <- 0.05
logfc.thr <- 0
## DESeq2
DESeq.ds <- DESeqDataSetFromMatrix(countData = read.counts,
colData = sample_info,
design = design.formula)
## Remove genes without any counts
DESeq.ds <- DESeq.ds[rowSums(counts(DESeq.ds)) > 0, ]
## RunDGE
DESeq.ds <- DESeq(DESeq.ds)
## obtain regularized log-transformed values
DESeq.rlog <- rlogTransformation(DESeq.ds, blind = TRUE)
assay(DESeq.rlog) <- limma::removeBatchEffect(assay(DESeq.rlog), DESeq.rlog[[batch_cond]])
ctrs <- unlist(strsplit(contr.list[1], ":"))
c1 <- ctrs[1]
c2 <- ctrs[2]
DGE.results <- results(DESeq.ds,
pAdjustMethod = "BH",
independentFiltering = TRUE,
contrast = c(last_condition, c1, c2),
alpha = adj.pval.thr)
# Change format
DGE.results.annot <- as.data.frame(DGE.results) %>%
dplyr::select(log2FoldChange, lfcSE, baseMean, pvalue, padj)
colnames(DGE.results.annot) <- c("logFC", "lfcSE", "baseMean", "PValue", "FDR")
## sort the results according to the adjusted p-value
DGE.results.sorted <- DGE.results.annot[order(DGE.results.annot$FDR), ]
## Subset for only significant genes
DGE.results.sig <- subset(DGE.results.sorted,
FDR < adj.pval.thr & abs(logFC) > logfc.thr)
DGEgenes <- rownames(DGE.results.sig)
# Fig2A ----------------------------------------------------------------
## surface genes
if(!file.exists("Scarfo_HEC2023/BulkRNAseq_1/table_S3_surfaceome.xlsx")){
download.file("http://wlab.ethz.ch/surfaceome/table_S3_surfaceome.xlsx",
destfile = "Scarfo_HEC2023/BulkRNAseq_1/table_S3_surfaceome.xlsx")
}
sg <- read.xlsx("Scarfo_HEC2023/BulkRNAseq_1/table_S3_surfaceome.xlsx",
sheet = "in silico surfaceome only",
colNames = T,
startRow = 2)
sig_sg <- DGE.results.sig[c(row.names(DGE.results.sig) %in% sg$UniProt.gene), ]
top10_sig_sg <- sig_sg[c(1:10),]
s_info <- sample_info[sample_info[[last_condition]] == c1 | sample_info[[last_condition]] == c2,, drop = FALSE]
hm.mat_DGEgenes <- assay(DESeq.rlog)[row.names(top10_sig_sg), ]
hm.mat_DGEgenes.filt <- hm.mat_DGEgenes[, row.names(s_info)]
pheatmap::pheatmap(hm.mat_DGEgenes.filt,
scale = "row",
color = colorRampPalette(rev(brewer.pal(n = 9, name ="RdYlBu")))(100),
annotation_col = s_info,
cluster_rows = TRUE,
cluster_cols = TRUE,
show_rownames = TRUE,
show_colnames = TRUE,
angle_col = 90,
fontsize = 15)
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(pheatmap))
suppressPackageStartupMessages(library(RColorBrewer))
## Input featureCounts file
fcount.file <- "Scarfo_HEC2023/BulkRNAseq_2/featureCounts_results_corrected.txt"
## Contrasts to perform
contr <- "DLL4_plus:CD32_plus"
contr.list <- unlist(strsplit(contr, ","))
## Design Formula
formula.str <- "condition"
design.str <- gsub(",", "+", formula.str)
num_cond <- str_count(formula.str, ",") + 1
design.formula <- as.formula(paste("~", design.str, sep = ""))
## Read featureCounts results
if (!file.exists(fcount.file)) {
stop("Error: featureCounts file not found.")
} else {
read.counts <- read.table(fcount.file,
header = TRUE,
check.names = FALSE) %>%
column_to_rownames(var = "Geneid") %>%
select(-c(1:5))
}
orig.names <- names(read.counts)
ds <- list()
cond <- list()
for (i in seq(orig.names[1:length(orig.names)])) {
ds[i] <- unlist(strsplit(orig.names[i], ":"))[2]
cond[i] <- unlist(strsplit(orig.names[i], ":"))[3]
}
ds <- unlist(ds)
cond <- unlist(cond)
## Samples - Conditions
sample_info <- as.data.frame(str_split_fixed(cond, ",", num_cond))
colnames(sample_info) <- str_split_fixed(formula.str, ",", num_cond)
rownames(sample_info) <- ds
colnames(read.counts) <- ds
sample_info$condition <- factor(sample_info$condition)
last_condition <- str_split_fixed(formula.str, ",", num_cond)[num_cond]
## thresholds
adj.pval.thr <- 0.05
logfc.thr <- 0
## DESeq2
DESeq.ds <- DESeqDataSetFromMatrix(countData = read.counts,
colData = sample_info,
design = design.formula)
## Remove genes without any counts
DESeq.ds <- DESeq.ds[rowSums(counts(DESeq.ds)) > 0, ]
## RunDGE
DESeq.ds <- DESeq(DESeq.ds)
## obtain regularized log-transformed values
DESeq.rlog <- rlogTransformation(DESeq.ds, blind = TRUE)
ctrs <- unlist(strsplit(contr.list[1], ":"))
c1 <- ctrs[1]
c2 <- ctrs[2]
DGE.results <- results(DESeq.ds,
pAdjustMethod = "BH",
independentFiltering = TRUE,
contrast = c(last_condition, c1, c2),
alpha = adj.pval.thr)
# Change format
DGE.results.annot <- as.data.frame(DGE.results) %>%
dplyr::select(log2FoldChange, lfcSE, baseMean, pvalue, padj)
colnames(DGE.results.annot) <- c("logFC", "lfcSE", "baseMean", "PValue", "FDR")
## sort the results according to the adjusted p-value
DGE.results.sorted <- DGE.results.annot[order(DGE.results.annot$FDR), ]
## Subset for only significant genes
DGE.results.sig <- subset(DGE.results.sorted,
FDR < adj.pval.thr & abs(logFC) > logfc.thr)
DGEgenes <- rownames(DGE.results.sig)
# ED_Fig5A -----------------------------------------------------------------
## scorecard
EHT_scorecard = c("CDH5","RUNX1","PTPRC", "SPN","CD44","ITGA2B","HLF","GFI1","MLLT3","HOXA9","MKI67","NRP2","NR2F2","GJA4","HEY1","DLL4","CXCR4","SOX17","SMAD6","GJA5","HES4","MECOM","NID2","PRND","ESM1","PDGFB","PALMD","TMEM100","EDN1","LTBP4","HEY2","SULF1","IL33","CYP26B1","ADGRG6","COL23A1","GATA6","BMX","TMCC3","DKK1","AGTR2","FBN2","ELN","ALDH1A1","NKX2-3","PROCR","GATA3","GBP4","MYCN","KCNK17","MYB","STAT5A","SMIM24","RAB27B","SPINK2")
hm.mat_DGEgenes <- assay(DESeq.rlog)[DGEgenes, ]
counts_eht <- hm.mat_DGEgenes[c(rownames(hm.mat_DGEgenes) %in% EHT_scorecard), ]
eht_sig <- counts_eht[c(rownames(counts_eht) %in% row.names(DGE.results.sig)), ]
eht_sig <- eht_sig[,c(1,3,6,2,5,8)]
annot <- data.frame(row.names = colnames(eht_sig),
Sample = c(rep("DLL4+", 3), rep("CD32+", 3)),
Donor = rep(c("RS227","RS242","RS243"), 2))
annot_colors <- list(Sample=c("DLL4+" ="blueviolet",
"CD32+" = "gold1"),
Donor=c("RS227"="darkslategray3",
"RS242"="brown3",
"RS243"="limegreen"))
pheatmap::pheatmap(eht_sig,
scale = "row",
annotation_col = annot,
annotation_colors = annot_colors,
cluster_rows = TRUE,
cluster_cols = TRUE,
show_rownames = TRUE,
show_colnames = TRUE,
angle_col = 90,
legend = TRUE,
legend_breaks = c(-1.5,0,1.5), # legend breaks to be shown
legend_labels = c(-1.5,0,1.5), # legend labels to be shown
treeheight_row = 25*96/72, # size of row dendogram
treeheight_col = 15*96/72, # size of column dendogram
fontsize = 10*96/72,
fontsize_row = 8*96/72,
fontsize_col = 10*96/72,
cellwidth = 20*96/72,
cellheight = 8*96/72,
border_color = NA
)
This source diff could not be displayed because it is too large. You can view the blob instead.
# Scarfo_HEC2023
This repository includes essential scripts for producing final tables and figures embedded in the following manuscript:
**Scarfò et al**,_The Fc receptor CD32 identifies human hemogenic endothelial cells irreversibly bound to the endothelial-to-hematopoietic transition_
PubMed:
DOI:
GEO: GSE223223
---
The repository is divided in three folders, 2 bulk RNAseq analyses and 1 scRNAseq analysis.
Below a brief description of the bulk and scRNAseq workflows adopted in this work.
**Bulk RNAseq** analysis was performed using a standard pipeline that includes the follwing steps:
1. Quality control by [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)
2. Trimming of bad quality reads with [TrimGalore](https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/)<details><summary>Running command</summary>trim_galore --quality 20 --fastqc --length 25 --output_dir {outdir} --paired {input.r1} {inout.r2}</details>
3. Alignment with [STAR](https://github.com/alexdobin/STAR)
<details><summary>Running command</summary>
"STAR " +
"--runThreadN {threads} " +
"--genomeDir {input.genome} " +
"--readFilesIn {params.trim_seq} " +
"--outSAMstrandField intronMotif " +
"--outFileNamePrefix {params.aln_seq_prefix} " +
"--outSAMtype BAM SortedByCoordinate " +
"--outSAMmultNmax 1 " +
"--outFilterMismatchNmax 10 " +
"--outReadsUnmapped Fastx " +
"--readFilesCommand zcat "
</details>
4. Gene expression quantification with [featureCounts](https://academic.oup.com/bioinformatics/article/30/7/923/232889)
<details><summary>Running command</summary>
"featureCounts " +
"-a {input.annot} " +
"-o {output.fcount} " +
"-g gene_name " +
"-p -B -C " +
"-s {params.strand} " +
"--minOverlap 10 " +
"-T {threads} " +
"{input.bams} "
</details>
5. Differential Expression analysis with [DESeq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html).
For Differential Gene Expression analysis we followed the standard workflow provided by package.
6. Dowstream functional Analysis with [ClusterProfiler](https://bioconductor.org/packages/release/bioc/html/clusterProfiler.html).
In order to retrieve functional annotation from DE analysis, we performed **O**ver **R**epresentation **A**nalysis by using the _enrichr_ function provided by the package.
**ORA** analysis was performed in particular using the C5 gene set from the MSigDB database (version 7.2)
---
**scRNAseq** analysis was performed using a standard pipeline that includes the following steps:
scRNAseq analysis was performed with [Seurat](https://satijalab.org/seurat/). Below are the main steps of the basic data analysis workflow that start from a minimal object after loading of 10X data to markers identification:
1. Quality control and filtering
2. Cell cycle scoring
3. Normalization (default seurat settings)
4. Scaling (with following variables to regress out: percent.mt + nCount_RNA and CC.Difference calculated as show in [vignette](https://satijalab.org/seurat/articles/cell_cycle_vignette.html#alternate-workflow-1))
4. Dimensionality reduction: PCA
5. Clustering
6. Markers identification
- [6.1] Clusters related markers
- [6.2] Intracluster differential expression analysis according to comparison of interest
Pseudotime analysis was performed using [Monocle3](http://cole-trapnell-lab.github.io/monocle3/)
Input files for scRNAseq analysis are available in the following [link](https://www.dropbox.com/sh/83dxrxqer8cl081/AAC8zALRuYRGh1mEe4lZuaHZa?dl=0)
---
An interactive querying and exploration of the scRNAseq dataset is available at:
http://bioinfotiget.it/he/
---
suppressPackageStartupMessages(library(Seurat))
suppressPackageStartupMessages(library(monocle3))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(dplyr))
# ED_Fig5B ----------------------------------------------------------------
## load scRNAseq data from public dataset GSE162950
load("Scarfo_HEC2023/scRNAseq/seurat_object.Rdata")
EHT_scorecard <- c("CXCR4","SOX17","GJA5","MECOM","HOXA9","SPN","CD44","ITGA2B","HLF","GFI1","MLLT3","KCNK17","MYB","STAT5A","SMIM24","RAB27B","SPINK2")
## Select HE cells
sample <- SetIdent(sample, value = sample@meta.data$seurat_clusters)
index1 = GetAssayData(sample, slot="counts")["RUNX1",]>0
index2 = GetAssayData(sample, slot="counts")["CDH5",]>0
index3 = GetAssayData(sample, slot="counts")["PTPRC",]==0
index4 = GetAssayData(sample, slot="counts")["FCGR2B",]==0
bars = colnames(sample)[Idents(sample) %in% c(0,3)]
bars1 = colnames(sample)[index1 & index2 & index3 & index4]
bars2 = intersect(bars, bars1)
## Establish identity for FCGR2B- HE cells
Idents(sample, cells=bars2) = "FCGR2B=0"
index5 = GetAssayData(sample, slot="counts")["FCGR2B",]>0
bars3 = colnames(sample)[index1 & index2 & index3 & index5]
bars4 = intersect(bars, bars3)
## Establish identity for FCGR2B+ HE cells
Idents(sample, cells=bars4) = "FCGR2B_exp"
sample$CellType <- Idents(sample)
sample_sub <- subset(sample, CellType=="FCGR2B_exp" | CellType=="FCGR2B=0")
DotPlot(sample_sub,
features=EHT_scorecard,
cols=c("grey90","red3"),
dot.scale = 10) +
coord_flip()
# ED_Fig5C ------------------------------------------------------------------
## load Seurat data
sample <- readRDS("Scarfo_HEC2023/scRNAseq/WNTd_HE.rds")
sample@meta.data[["Cell.Annotations"]] <- recode_factor(sample@meta.data[["RNA_snn_res.0.6"]],
"0" = "HECs",
"1" = "HECs",
"2" = "HECs",
"3" = "Venous cells",
"4" = "Venous cells",
"5" = "Arterial ECs",
"6" = "Arterial ECs",
"7" = "Venous cells",
"8" = "ECs (M phase)",
"9" = "Venous cells",
"10" = "EndoMT cells",
"11" = "HECs",
"12" = "Arterial ECs",
"13" = "Lymphatic ECs",
"14" = "Arterial ECs",
"15" = "Allantois/Placenta",
"16" = "HECs",
"17" = "HECs",
"18" = "EndoMT cells",
"19" = "SM cells",
"20" = "EndoMT cells",
"21" = "EndoMT cells")
DimPlot(object = sample,
reduction = "umap",
group.by = "Cell.Annotations",
label = F,
repel = T,
pt.size = 0.8,
label.size = 6,
cols = c('HECs' = "#377EB8",
'Lymphatic ECs' = "#4DAF4A",
'Arterial ECs' = "#E41A1C",
'Venous cells' = "#FF7F00",
'EndoMT cells' = "#E6AB02",
'Allantois/Placenta' = "#984EA3",
'SM cells' = "#A6761D",
"ECs (M phase)" = "#A50026"))
# ED_Fig5D - ED_Fig5E ------------------------------------------------------------------
FeaturePlot(object = sample,
features = c("RUNX1", "FCGR2B"),
cols = c("grey90","blue"),
order = T,
pt.size = 0.1)
# ED_Fig5F ------------------------------------------------------------------
runx1 <- readRDS("Scarfo_HEC2023/scRNAseq/RUNX1_clusters.rds")
## Monocle3
Idents(runx1) <- "RNA_snn_res.0.6"
start = WhichCells(runx1,idents = "0")
expression_matrix <- runx1@assays[["RNA"]]@counts
cell_metadata <- runx1@meta.data
cell_metadata$orig.ident <- factor(cell_metadata$orig.ident, levels = unique(cell_metadata$orig.ident))
gene_annotation <- data.frame("gene_short_name" = rownames(runx1))
rownames(gene_annotation) <- gene_annotation$gene_short_name
cds <- new_cell_data_set(expression_data = expression_matrix,
cell_metadata = cell_metadata,
gene_metadata = gene_annotation)
cds <- preprocess_cds(cds, num_dim = 50)
cds <- reduce_dimension(cds)
## Construct and assign the made up partition
recreate.partition <- c(rep(1, length(cds@colData@rownames)))
names(recreate.partition) <- cds@colData@rownames
recreate.partition <- as.factor(recreate.partition)
cds@clusters@listData[["UMAP"]][["partitions"]] <- recreate.partition
## Assign the cluster info
list_cluster <- runx1@meta.data[["RNA_snn_res.0.6"]]
names(list_cluster) <- runx1@assays[["RNA"]]@data@Dimnames[[2]]
cds@clusters@listData[["UMAP"]][["clusters"]] <- list_cluster
cds@clusters@listData[["UMAP"]][["louvain_res"]] <- "NA"
## Assign UMAP coordinate
reducedDims(cds)[["UMAP"]] <- runx1@reductions[["umap"]]@cell.embeddings
### Reset colnames and rownames (consequence of UMAP replacement)
rownames(cds@principal_graph_aux[['UMAP']]$dp_mst) <- NULL
## Learn Graph
cds <- learn_graph(cds = cds,use_partition = T,learn_graph_control=list(ncenter=220,minimal_branch_len=15),verbose = T)
cds <- order_cells(cds, root_cells = start)
plot_cells(cds,
color_cells_by = "pseudotime",
label_groups_by_cluster=FALSE,
label_leaves=T,
label_branch_points=T,
group_label_size = 8,
graph_label_size = 3,
cell_size = 1,
trajectory_graph_segment_size = 1)
# ED_Fig5G ------------------------------------------------------------------
cds <- estimate_size_factors(cds)
gi <- c("H19","KCNK17","RUNX1", "MYB", "SPN")
cds_gi <- cds[rowData(cds)$gene_short_name %in% gi,]
plot_genes_in_pseudotime(cds_subset = cds_gi,
ncol = 3,
color_cells_by = "RNA_snn_res.0.6",
min_expr = NULL,
cell_size = 1)
suppressPackageStartupMessages(library(Seurat))
suppressPackageStartupMessages(library(monocle3))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(reshape2))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(patchwork))
suppressPackageStartupMessages(library(clusterProfiler))
## load Seurat data
sample <- readRDS("Scarfo_HEC2023/scRNAseq/WNTd_HE.rds")
# Fig5A ------------------------------------------------------------------
DimPlot(object = sample,
reduction = "umap",
group.by = "RNA_snn_res.0.6",
label = T,
repel = T,
pt.size = 0.8,
label.size = 8,
cols = c("0" = "#F8766D",
"1" = "#B79F00",
"2" = "#00BA38",
"3" = "#EC8239",
"4"= "#00B81B",
"5"= "#7C96FF",
"6"= "#00C085",
"7" = "#00C1A7",
"8" = "#00BFC4",
"9" = "#EF67EB",
"10" = "#00B2F3",
"11"= "#00BFC4",
"12"= "#00A6FF",
"13"= "#DB8E00",
"14"= "#FD61D3",
"15"= "#FF63B6",
"16"= "#619CFF",
"17"= "#F564E3",
"18"= "#00BADE",
"19"= "#B385FF",
"20"= "#D874FD",
"21"= "#FF6B94")) +
ggtitle("") +
xlab("UMAP1") +
ylab("UMAP2") +
theme(legend.text = element_text(size = 16*96/72),
axis.text.x = element_text(color = "black", family = "Arial",size = 14*96/72),
axis.text.y = element_text(color = "black", family = "Arial",size = 14*96/72),
axis.title.x = element_text(color = "black", family = "Arial",size = 16*96/72),
axis.title.y = element_text(color = "black", family = "Arial",size = 16*96/72),
aspect.ratio = 1,
panel.background = element_blank()) +
guides(color = guide_legend(override.aes = list(size=4)))
## Seurat markers
markers <- FindAllMarkers(sample,
only.pos = T,
min.pct = 0.1,
logfc.threshold = 0.25)
## define RUNX1 clusters
runx1.clusters <- subset(markers, markers$gene=="RUNX1")
runx1.clusters <- droplevels(runx1.clusters$cluster)
## subset RUNX1 clusters into new object
runx1 <- subset(sample, idents = runx1.clusters)
DimPlot(object = runx1,
reduction = "umap",
group.by = "RNA_snn_res.0.6",
label = T,
repel = T,
pt.size = 0.8,
label.size = 8)
# Fig5B ----------------------------------------------------------------
## Monocle3
Idents(runx1) <- "RNA_snn_res.0.6"
start = WhichCells(runx1,idents = "0")
expression_matrix <- runx1@assays[["RNA"]]@counts
cell_metadata <- runx1@meta.data
cell_metadata$orig.ident <- factor(cell_metadata$orig.ident, levels = unique(cell_metadata$orig.ident))
gene_annotation <- data.frame("gene_short_name" = rownames(runx1))
rownames(gene_annotation) <- gene_annotation$gene_short_name
cds <- new_cell_data_set(expression_data = expression_matrix,
cell_metadata = cell_metadata,
gene_metadata = gene_annotation)
cds <- preprocess_cds(cds, num_dim = 50)
cds <- reduce_dimension(cds)
## Construct and assign the made up partition
recreate.partition <- c(rep(1, length(cds@colData@rownames)))
names(recreate.partition) <- cds@colData@rownames
recreate.partition <- as.factor(recreate.partition)
cds@clusters@listData[["UMAP"]][["partitions"]] <- recreate.partition
## Assign the cluster info
list_cluster <- runx1@meta.data[["RNA_snn_res.0.6"]]
names(list_cluster) <- runx1@assays[["RNA"]]@data@Dimnames[[2]]
cds@clusters@listData[["UMAP"]][["clusters"]] <- list_cluster
cds@clusters@listData[["UMAP"]][["louvain_res"]] <- "NA"
## Assign UMAP coordinate
reducedDims(cds)[["UMAP"]] <- runx1@reductions[["umap"]]@cell.embeddings
### Reset colnames and rownames (consequence of UMAP replacement)
rownames(cds@principal_graph_aux[['UMAP']]$dp_mst) <- NULL
## Learn Graph
cds <- learn_graph(cds = cds,use_partition = T,learn_graph_control=list(ncenter=220,minimal_branch_len=15),verbose = T)
plot_cells(cds,
color_cells_by = "RNA_snn_res.0.6",
label_groups_by_cluster=FALSE,
label_leaves=F,
label_branch_points=F,
graph_label_size=1.5,
group_label_size = 6,
cell_size = 1)
# Fig5C -------------------------------------------------------------------
## differential markers
degs_clu11_vs_clu0_1_2 <- FindMarkers(object = runx1, ident.1 = 11, ident.2 = c(0,1,2), only.pos = FALSE, min.pct = 0, test.use = "wilcox", logfc.threshold = 0, min.cells.group = 0)
degs_clu11_vs_clu16_17 <- FindMarkers(object = runx1, ident.1 = 11, ident.2 = c(16,17), only.pos = FALSE, min.pct = 0, test.use = "wilcox", logfc.threshold = 0, min.cells.group = 0)
## GSEA
adj.pval.thr <- 0.05
gmt.obj <- read.gmt("Scarfo_HEC2023/BulkRNAseq_1/c5.all.v7.2.symbols.gmt")
comp <- c("clu11_vs_clu0_1_2", "clu11_vs_clu16_17")
for (i in comp) {
degs <- get(paste("degs",i,sep="_"))
gene.res.sorted <- degs[order(degs$avg_logFC, decreasing = TRUE), ]
# LogFC with Symbols
gene.res.logfc.symbol <- as.vector(gene.res.sorted$avg_logFC)
names(gene.res.logfc.symbol) <- row.names(gene.res.sorted)
contr.gsea <- GSEA(gene.res.logfc.symbol,
TERM2GENE = gmt.obj,
nPerm = 10000,
pvalueCutoff = adj.pval.thr)
contr.gsea.res <- as.data.frame(contr.gsea)
assign(paste0("contr.gsea.res_",i), contr.gsea.res)
if (i == "clu11_vs_clu0_1_2") {
terms <- c("GO_CELL_CELL_JUNCTION_ORGANIZATION",
"GO_ACTIN_CYTOSKELETON",
"GO_EXTRACELLULAR_MATRIX_STRUCTURAL_CONSTITUENT",
"GO_STRUCTURAL_CONSTITUENT_OF_RIBOSOME",
"GO_RIBOSOME",
"GO_TRANSLATIONAL_INITIATION")
} else {
terms <- c("GO_MITOTIC_NUCLEAR_DIVISION",
"GO_CELL_DIVISION",
"GO_MITOTIC_CELL_CYCLE",
"GO_REGULATION_OF_NOTCH_SIGNALING_PATHWAY",
"GO_CYTOSOLIC_RIBOSOME",
"GO_ENDOTHELIAL_CELL_MIGRATION")
}
ti <- contr.gsea.res[c(contr.gsea.res$Description %in% terms), ]
ti <- mutate(ti, color = case_when(ti$NES > 0 ~ "Upregulated",
ti$NES < 0 ~ "Downregulated"))
ti <- arrange(ti, NES)
ti$Description <- factor(ti$Description, levels = ti$Description)
p <- ggplot(data=ti, aes(x=NES, y=Description, fill = color)) +
geom_bar(stat="identity", width = 0.7) +
scale_fill_manual(values = c("Upregulated" = "red","Downregulated" = "blue"),name = "") +
ylab("Terms") +
ggtitle(paste(i)) +
theme_bw() +
theme(
axis.title.x = element_text(colour="black",family = "Arial", size=12*96/72),
axis.text.x = element_text(colour="black",family = "Arial", size=10*96/72),
axis.title.y = element_text(colour="black",family = "Arial", size=12*96/72),
axis.text.y = element_text(colour="black",family = "Arial", size=8*96/72),
aspect.ratio = 0.8)
assign(paste0("p_",i), p)
}
# Fig5D -------------------------------------------------------------------
runx1@meta.data[["Phase"]] <- recode_factor(runx1@meta.data[["Phase"]],
"G1" = "G1",
"S" = "S",
"G2M" = "G2/M")
## grouping clusters 0,1,2 and cluster 16,17
runx1@meta.data[["RNA_snn_res.0.6"]] <- recode_factor(runx1@meta.data[["RNA_snn_res.0.6"]],
"0" = "0,1,2",
"1" = "0,1,2",
"2" = "0,1,2",
"11" = "11",
"16" = "16,17",
"17" = "16,17")
cc_umap <- DimPlot(object = runx1,
reduction = "umap",
group.by = "Phase",
split.by = "RNA_snn_res.0.6",
label = F,
repel = T,
pt.size = 0.8,
label.size = 8,
cols = c("G1" = "#E6AB02",
"S" = "#7570B3",
"G2/M" = "#66A61E")) +
ggtitle("") +
xlab("UMAP1") +
ylab("UMAP2") +
theme(legend.text = element_text(size = 12*96/72),
axis.text.x = element_text(color = "black", family = "Arial",size = 10*96/72),
axis.text.y = element_text(color = "black", family = "Arial",size = 10*96/72),
axis.title.x = element_text(color = "black", family = "Arial",size = 12*96/72),
axis.title.y = element_text(color = "black", family = "Arial",size = 12*96/72),
aspect.ratio = 1,
plot.margin = margin(0, 0, 0, 0, "pt"),
panel.background = element_blank()) +
guides(color = guide_legend(override.aes = list(size=4)))
## average cell counts per cluster
ggp_obj_clu <- melt(table(runx1$Phase, runx1$RNA_snn_res.0.6))
colnames(ggp_obj_clu) <- c("Phase","Cluster", "Count")
cc.clusters <- c("0,1,2","11","16,17")
ggp_obj_clu <- ggp_obj_clu[c(ggp_obj_clu$Cluster %in% cc.clusters), ]
for (i in cc.clusters) {
totalB <- subset(ggp_obj_clu, ggp_obj_clu$Cluster==i)
totalB <- totalB %>%
mutate(percentage = Count/sum(Count))
totalB$percentage <- totalB$percentage*100
totalB$fraction = totalB$Count / sum(totalB$Count)
# Compute the cumulative percentages
totalB$ymax <- cumsum(totalB$fraction)
totalB$ymin <- c(0, head(totalB$ymax, n=-1))
totalB$labelPosition <- (totalB$ymax + totalB$ymin) / 2
# Compute a good label
totalB$label <- paste0(totalB$Phase, ":","\n",round(totalB$percentage, 2), "%")
assign(paste0("totalB_clu_", i), totalB)
p <- ggplot(totalB, aes(ymax=ymax, ymin=ymin, xmax=4, xmin=3, fill=Phase)) +
geom_rect() +
geom_text(x=5, aes(y=labelPosition, label=label), size=5) +
#scale_fill_brewer(palette = "Set1") +
scale_fill_manual(values = c("G1" = "#E6AB02",
"S" = "#7570B3",
"G2/M" = "#66A61E")) +
coord_polar(theta="y") +
xlim(c(2, 5)) +
theme_void() +
theme(plot.margin = margin(0, 0, 0, 0, "pt")) +
theme(legend.position = "none")
assign(paste0("p_clu_",i), p)
}
p_final <- cc_umap / (`p_clu_0,1,2` | p_clu_11 | `p_clu_16,17`)
# Fig5E -------------------------------------------------------------------
cds <- order_cells(cds, root_cells = start)
cds <- estimate_size_factors(cds)
gi <- c("FCGR2B","HES1","HEY1", "HEY2")
cds_gi <- cds[rowData(cds)$gene_short_name %in% gi,]
plot_genes_in_pseudotime(cds_subset = cds_gi,
ncol = 2,
color_cells_by = "RNA_snn_res.0.6",
min_expr = NULL,
cell_size = 1)
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment