Commit d811fce1 authored by Matteo Barcella's avatar Matteo Barcella
Browse files

Adding music deconvolution analysis

parent 2c62bf6c
library(MuSiC)
library(Biobase)
library(magrittr)
library(tibble)
library(dplyr)
library(tidyr)
library(knitr)
library(xlsx)
library(ggplot2)
library(openxlsx)
library(RColorBrewer)
library(clusterProfiler)
library(pheatmap)
## what is needed: mydataframe, coldata
coldata <- read.xlsx("Dataframe/mydataframe_all.xlsx")
coldata <- as.data.frame(coldata)
coldata <- subset.data.frame(x = coldata, subset = !coldata$expgroup == "THAL_GMP" )
coldata <- subset.data.frame(x = coldata, subset = !coldata$expgroup == "HD_GMP" )
coldata$sex <- c(rep("Male", 16), rep("Female", 8), rep("Male", 4), rep("Female", 11))
coldata$sex <- as.factor(coldata$sex)
coldata$time <- as.factor(coldata$time)
coldata$disease <- as.factor(coldata$disease)
coldata$name <- as.factor(coldata$name)
rownames(coldata) <- coldata$rownames
## CountData
countData <- read.table(file = "Full-gene-counts.txt", header = TRUE, stringsAsFactors = FALSE, row.names = 1)
countData <- countData[ , c(1:8,11:18,21:43)]
colnames(countData) <- coldata$name
coldata <- droplevels.data.frame(coldata)
coldata$time <- factor(coldata$time, levels = c("HSC","MPP","CMP","MEP", "R5","R6","R7","R8"))
## covert data.frame in matrix for expressionset
exprs_count <- as.matrix(countData)
minimalSet <- ExpressionSet(assayData = exprs_count)
pdata <- coldata
rownames(pdata) <- pdata$name
## verify the order of pdata and exprs count ##
all(rownames(pdata) == colnames(exprs_count)) ## MUST BE EQUAL
metadata <- data.frame(labelDescription =
c("rownames used" , "Patient name", "differentiation time",
"Thal/HD condition", "expgroup", "patient gender"),
row.names = c("rownames","name", "time", "disease", "expgroup", "sex"))
phenodata <- new("AnnotatedDataFrame", data = pdata, varMetadata = metadata)
rnaseqSet <- ExpressionSet(assayData = exprs_count,
phenoData = phenodata)
saveRDS(rnaseqSet, "rnaseqSet.rds")
# Prepare expression sets from Tusi dataset:
# Steps:
# 1. Import data
# 2. Re-format (traspose and remove meta)
# 2.1 Convert alias to symbols
# 2.2 Convert mouse to human symbols
# 3. Create expression set with PhenoData added with info
library(data.table)
library(nichenetr)
library(dplyr)
# for inputs see Tusi et al manuscript.
infiles <- list(P1 = "GSM2985844_P1.raw_umifm_counts.csv",
P1CD71 = "GSM2985845_P1-CD71hi.raw_umifm_counts.csv",
P2 = "GSM2985846_P2.raw_umifm_counts.csv",
P5 = "GSM2985849_P5.raw_umifm_counts.csv")
convmts <- list()
for(i in c("P1","P1CD71","P2","P5")){
obj <- read.csv(file = infiles[[i]], header = T, stringsAsFactors = F, quote = "", as.is = T)
obj$CellID <- paste(obj$Sample, obj$Library, gsub(obj$Barcode_Seq, pattern = "-", replacement = ""), sep = "_")
row.names(obj) <- obj$CellID
obj$CellID <- NULL
obj_t <- t(obj)
mymat <- obj_t[10:nrow(obj_t),]
ids <- rownames(mymat)
human_symbols = ids %>% nichenetr::convert_mouse_to_human_symbols()
names(human_symbols) <- NULL
mymat <- mymat[!is.na(human_symbols),]
rownames(mymat) <- human_symbols[!is.na(human_symbols)]
#mymat <- noquote(mymat)
convmts[[i]] <- as.data.frame(mymat)
}
saveRDS(object = convmts, file = "convmts.rds")
full <- merge.data.frame(convmts$P1, convmts$P1CD71, by = 0, sort = F)
rownames(full) <- full$Row.names
full$Row.names <- NULL
full <- merge.data.frame(full, convmts$P2, by = 0, sort = F)
rownames(full) <- full$Row.names
full$Row.names <- NULL
full <- merge.data.frame(full, convmts$P5, by = 0, sort = F)
rownames(full) <- full$Row.names
full$Row.names <- NULL
full_num <- full
full_num <- apply(full, 2, as.numeric)
rownames(full_num) <- rownames(full)
# create Expression set
library(Biobase)
minimalSet <- ExpressionSet(assayData=full_num)
saveRDS(minimalSet,file = "minimalset.rds")
library(MuSiC)
library(dplyr)
library(nichenetr)
library(openxlsx)
library(ggplot2)
library(patchwork)
minimalset <- readRDS("minimalset.rds")
# identify genes with double entry
obj <- read.csv(file = "GSM2985844_P1.raw_umifm_counts.csv",
header = T, stringsAsFactors = F, quote = "", as.is = T)
human_symbols = colnames(obj)[10:ncol(obj)] %>% nichenetr::convert_mouse_to_human_symbols()
human_ids_doubled <- gsub(pattern = "\\.1$",
replacement = "",
x = grep(x = rownames(minimalset@assayData$exprs[rowSums(minimalset@assayData$exprs) > 10,]),
pattern = "\\.1$",
perl = T,
value = T))
a <- minimalset@assayData$exprs[rownames(minimalset@assayData$exprs) %in% human_ids_doubled,]
a <- a[order(rownames(a)), ]
b <- minimalset@assayData$exprs[rownames(minimalset@assayData$exprs) %in% paste0(human_ids_doubled,".1"),]
b <- b[order(rownames(b)), ]
u <- a + b
# Re-Creating expression matrix (filtered with rowsum 10 + fixing human genes coming from 2 different mouse genes)
ori.mt <- minimalset@assayData$exprs
ori.mt.filt <- ori.mt[rowSums(ori.mt) > 10,]
ori.mt.distinct <- ori.mt.filt[!rownames(ori.mt.filt) %in% c(human_ids_doubled, paste0(human_ids_doubled,".1")),]
# checks
table(rownames(u) %in% rownames(ori.mt.distinct))
dim(u)
dim(ori.mt.distinct)
rm(a); rm(b)
# final matrix
mt <- rbind(ori.mt.distinct, u)
mt <- mt[order(rownames(mt)), ]
# recreating minimal set
minimalset <- ExpressionSet(assayData=mt)
rm(mt)
rm(ori.mt)
rm(ori.mt.distinct)
rm(ori.mt.filt)
rm(u)
rm(obj)
rownames(minimalset@phenoData@data) <- colnames(minimalset@assayData$exprs)
minimalset@phenoData@data$SampleID <- rownames(minimalset@phenoData@data)
minimalset@phenoData@data$Cluster <- c(rep("cl1",9640),
rep("cl2",2537),
rep("cl3",3562),
rep("cl4",3695))
minimalset@phenoData@data$CellType <- c(rep("CFUe",9640),
rep("CFUe_CD71",2537),
rep("BFUe",3562),
rep("EryBiased",3695))
# Loading rnaseq
rnaseqSet <- readRDS("rnaseqSet.rds")
# Running music default
Est.prop.Tusi <- music_prop(bulk.eset = rnaseqSet, sc.eset = minimalset, clusters = 'Cluster',
samples = 'SampleID', verbose = T)
saveRDS(Est.prop.Tusi,file = "Est.prop.Tusi.rds")
write.xlsx(x = Est.prop.Tusi, "Estimate_proportions_full_rnaseq_dataset.xlsx", rowNames = T)
saveRDS(object = minimalset, file = "Expression_set_singlecell_rsum_10.rds")
# Version filtered (rowsum 25 - Tusi)
exprs_count <- rnaseqSet@assayData$exprs[rowSums(rnaseqSet@assayData$exprs) > 25,]
pheno <- readRDS(file = "phenodata.rds")
rnaseqSet <- ExpressionSet(assayData = exprs_count,
phenoData = pheno)
Est.prop.Tusi_rowsum25 <- music_prop(bulk.eset = rnaseqSet, sc.eset = minimalset, clusters = 'Cluster',
samples = 'SampleID', verbose = T)
write.xlsx(x = Est.prop.Tusi_rowsum25, "Estimate_proportions_rowsum25_rnaseq_dataset.xlsx", rowNames = T)
saveRDS(Est.prop.Tusi_rowsum25,file = "Est.prop.Tusi_rowsum25.rds")
## considering values from Est.prop.Tusi_rowsum25 ##
# p subset of values only relative to hsc and mpp
p$disease <- factor(p$disease,
levels = c("HD", "Bthal"))
hscmpp_cl1 <- ggplot(p, aes(x= celltype, y = cl1)) +
geom_boxplot(aes(fill = disease)) +
geom_point(position = position_dodge(width = 0.7), aes(group = disease)) +
theme(legend.title = element_text(size=12)) +
theme(legend.text = element_text( size=12)) +
#facet_grid(.~ variable) +
labs(title = "Estimation of P1 population", x="Cell type",
y = "Est. prop.") +
scale_fill_manual(values = c("gray64", "red2"))
hscmpp_cl2 <- ggplot(p, aes(x= celltype, y = cl2)) +
geom_boxplot(aes(fill = disease)) +
geom_point(position = position_dodge(width = 0.7), aes(group = disease)) +
theme(legend.title = element_text(size=12)) +
theme(legend.text = element_text( size=12)) +
#facet_grid(.~ variable) +
labs(title = "Estimation of P1 CD71high population", x="Cell type",
y = "Est. prop.") +
scale_fill_manual(values = c("gray64", "red2"))
png(filename = "Estimation_P1_P1cd71_hscmpp.png", width = 14, height = 7, units = "in", res = 200)
hscmpp_cl2 + hscmpp_cl1 + plot_annotation(tag_levels = "A")
dev.off()
......@@ -34,8 +34,9 @@ For Thal vs HD comparison intra population (HSC or MPP - n = 2) we used the simp
6. Downstream analysis
We performed Over Representation Analyses (ORA) and Gene Set Enrichment Analyses (GSEA) using R/Bioconductor package [clusterProfiler](https://bioconductor.org/packages/release/bioc/html/clusterProfiler.html) (v.3.8.1). Pre-ranked gene list according to log2FC was used to perform Gene Set Enrichment Analysis (GSEA). Terms with adjusted p-value (Benjiamini-Hochber correction) less than 0.05 were considered significantly enriched.
We performed Over Representation Analyses (ORA) and Gene Set Enrichment Analyses (GSEA) using R/Bioconductor package [clusterProfiler](https://bioconductor.org/packages/release/bioc/html/clusterProfiler.html) (v.3.8.1). Pre-ranked gene list according to log2FC was used to perform Gene Set Enrichment Analysis (GSEA). Terms with adjusted p-value (Benjiamini-Hochberg correction) less than 0.05 were considered significantly enriched.
7. Supplementary analysis: Music deconvolution analysis
We also applied Music to perform BulkRNAseq deconvolution according to celltype specific transcriptomic profiles obtained from scRNAseq dataset (Tusi et al 2021).
For details see content of Music folder.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment