Adding music deconvolution analysis

d811fce1 · Matteo Barcella · 2c62bf6c · d811fce1 · d811fce1 · d811fce1
Commit d811fce1 authored Apr 11, 2025 by Matteo Barcella
--- a/bulkRNAseq/Music/Create_minimal_bulkEset.R
+++ b/bulkRNAseq/Music/Create_minimal_bulkEset.R
+library(MuSiC)
+library(Biobase)
+library(magrittr)
+library(tibble)
+library(dplyr)
+library(tidyr)
+library(knitr)
+library(xlsx)
+library(ggplot2)
+library(openxlsx)
+library(RColorBrewer)
+library(clusterProfiler)
+library(pheatmap)
+
+## what is needed: mydataframe, coldata
+coldata <- read.xlsx("Dataframe/mydataframe_all.xlsx")
+coldata <- as.data.frame(coldata)
+coldata <- subset.data.frame(x = coldata, subset = !coldata$expgroup == "THAL_GMP" )
+coldata <- subset.data.frame(x = coldata, subset = !coldata$expgroup == "HD_GMP" )
+coldata$sex <- c(rep("Male", 16), rep("Female", 8), rep("Male", 4), rep("Female", 11))
+coldata$sex <- as.factor(coldata$sex)
+coldata$time <- as.factor(coldata$time)
+coldata$disease <- as.factor(coldata$disease)
+coldata$name <- as.factor(coldata$name)
+
+rownames(coldata) <- coldata$rownames
+
+## CountData
+countData <- read.table(file = "Full-gene-counts.txt", header = TRUE, stringsAsFactors = FALSE, row.names = 1)
+countData <- countData[ , c(1:8,11:18,21:43)]
+colnames(countData) <- coldata$name
+
+coldata <- droplevels.data.frame(coldata)
+coldata$time <- factor(coldata$time, levels = c("HSC","MPP","CMP","MEP", "R5","R6","R7","R8"))
+
+## covert data.frame in matrix for expressionset
+exprs_count <- as.matrix(countData)
+minimalSet <- ExpressionSet(assayData = exprs_count)
+pdata <- coldata
+rownames(pdata) <- pdata$name
+ 
+## verify the order of pdata and exprs count ##
+all(rownames(pdata) == colnames(exprs_count)) ## MUST BE EQUAL 
+
+metadata <- data.frame(labelDescription = 
+                         c("rownames used" , "Patient name", "differentiation time",
+                           "Thal/HD condition", "expgroup", "patient gender"),
+                       row.names = c("rownames","name", "time", "disease", "expgroup", "sex"))
+
+phenodata <- new("AnnotatedDataFrame", data = pdata, varMetadata = metadata)
+rnaseqSet <- ExpressionSet(assayData = exprs_count,
+                           phenoData = phenodata)
+
+saveRDS(rnaseqSet, "rnaseqSet.rds")
--- a/bulkRNAseq/Music/Create_minimal_scEset.R
+++ b/bulkRNAseq/Music/Create_minimal_scEset.R
+# Prepare expression sets from Tusi dataset:
+
+# Steps: 
+# 1. Import data 
+# 2. Re-format (traspose and remove meta)
+# 2.1 Convert alias to symbols
+# 2.2 Convert mouse to human symbols
+# 3. Create expression set with PhenoData added with info
+
+library(data.table)
+library(nichenetr)
+library(dplyr)
+
+# for inputs see Tusi et al manuscript. 
+
+infiles <- list(P1 = "GSM2985844_P1.raw_umifm_counts.csv", 
+                P1CD71 = "GSM2985845_P1-CD71hi.raw_umifm_counts.csv",
+                P2 = "GSM2985846_P2.raw_umifm_counts.csv",
+                P5 = "GSM2985849_P5.raw_umifm_counts.csv") 
+
+convmts <- list()
+
+for(i in c("P1","P1CD71","P2","P5")){
+  
+  obj <- read.csv(file = infiles[[i]], header = T, stringsAsFactors = F, quote = "", as.is = T)
+  obj$CellID <- paste(obj$Sample, obj$Library, gsub(obj$Barcode_Seq, pattern = "-", replacement = ""), sep = "_")
+  row.names(obj) <- obj$CellID
+  obj$CellID <- NULL
+  obj_t <- t(obj)
+  mymat <- obj_t[10:nrow(obj_t),]
+  ids <- rownames(mymat)
+  human_symbols = ids %>% nichenetr::convert_mouse_to_human_symbols()
+  names(human_symbols) <- NULL
+  mymat <- mymat[!is.na(human_symbols),]
+  rownames(mymat) <- human_symbols[!is.na(human_symbols)]
+  #mymat <- noquote(mymat)
+  convmts[[i]] <- as.data.frame(mymat)
+}
+
+saveRDS(object = convmts, file = "convmts.rds")
+
+full <- merge.data.frame(convmts$P1, convmts$P1CD71, by = 0, sort = F)
+rownames(full) <- full$Row.names
+full$Row.names <- NULL
+full <- merge.data.frame(full, convmts$P2, by = 0, sort = F)
+rownames(full) <- full$Row.names
+full$Row.names <- NULL
+full <- merge.data.frame(full, convmts$P5, by = 0, sort = F)
+rownames(full) <- full$Row.names
+full$Row.names <- NULL
+
+full_num <- full
+full_num <- apply(full, 2, as.numeric)
+rownames(full_num) <- rownames(full)
+
+# create Expression set
+
+library(Biobase)
+
+minimalSet <- ExpressionSet(assayData=full_num)
+saveRDS(minimalSet,file = "minimalset.rds")
+
--- a/bulkRNAseq/Music/MusicAnalysis.R
+++ b/bulkRNAseq/Music/MusicAnalysis.R
+library(MuSiC)
+library(dplyr)
+library(nichenetr)
+library(openxlsx)
+library(ggplot2)
+library(patchwork)
+
+minimalset <- readRDS("minimalset.rds")
+
+# identify genes with double entry 
+
+obj <- read.csv(file = "GSM2985844_P1.raw_umifm_counts.csv",
+                header = T, stringsAsFactors = F, quote = "", as.is = T)
+
+human_symbols = colnames(obj)[10:ncol(obj)] %>% nichenetr::convert_mouse_to_human_symbols()
+human_ids_doubled <- gsub(pattern = "\\.1$", 
+                          replacement = "", 
+                          x = grep(x = rownames(minimalset@assayData$exprs[rowSums(minimalset@assayData$exprs) > 10,]), 
+                                   pattern = "\\.1$", 
+                                   perl = T, 
+                                   value = T))
+
+a <- minimalset@assayData$exprs[rownames(minimalset@assayData$exprs) %in% human_ids_doubled,]
+a <- a[order(rownames(a)), ]
+b <- minimalset@assayData$exprs[rownames(minimalset@assayData$exprs) %in% paste0(human_ids_doubled,".1"),]
+b <- b[order(rownames(b)), ]
+
+u <- a + b 
+
+# Re-Creating expression matrix (filtered with rowsum 10 + fixing human genes coming from 2 different mouse genes)
+
+ori.mt <- minimalset@assayData$exprs 
+ori.mt.filt <- ori.mt[rowSums(ori.mt) > 10,]
+ori.mt.distinct <- ori.mt.filt[!rownames(ori.mt.filt) %in% c(human_ids_doubled, paste0(human_ids_doubled,".1")),]
+
+# checks
+table(rownames(u) %in% rownames(ori.mt.distinct))
+dim(u)
+dim(ori.mt.distinct)
+
+rm(a); rm(b)
+
+# final matrix
+mt <- rbind(ori.mt.distinct, u)
+mt <- mt[order(rownames(mt)), ]
+
+# recreating minimal set
+
+minimalset <- ExpressionSet(assayData=mt)
+rm(mt)
+rm(ori.mt)
+rm(ori.mt.distinct)
+rm(ori.mt.filt)
+rm(u)
+rm(obj)
+
+
+rownames(minimalset@phenoData@data) <- colnames(minimalset@assayData$exprs)
+minimalset@phenoData@data$SampleID <- rownames(minimalset@phenoData@data)
+minimalset@phenoData@data$Cluster <- c(rep("cl1",9640),
+                                       rep("cl2",2537),
+                                       rep("cl3",3562),
+                                       rep("cl4",3695))
+minimalset@phenoData@data$CellType <- c(rep("CFUe",9640),
+                                       rep("CFUe_CD71",2537),
+                                       rep("BFUe",3562),
+                                       rep("EryBiased",3695))
+
+# Loading rnaseq
+
+rnaseqSet <- readRDS("rnaseqSet.rds")
+
+# Running music default
+Est.prop.Tusi <- music_prop(bulk.eset = rnaseqSet, sc.eset = minimalset, clusters = 'Cluster',
+                            samples = 'SampleID', verbose = T)
+
+saveRDS(Est.prop.Tusi,file = "Est.prop.Tusi.rds")
+write.xlsx(x = Est.prop.Tusi, "Estimate_proportions_full_rnaseq_dataset.xlsx", rowNames = T)
+saveRDS(object = minimalset, file = "Expression_set_singlecell_rsum_10.rds")
+
+# Version filtered (rowsum 25 - Tusi)
+
+exprs_count <- rnaseqSet@assayData$exprs[rowSums(rnaseqSet@assayData$exprs) > 25,]
+pheno <- readRDS(file = "phenodata.rds")
+rnaseqSet <- ExpressionSet(assayData = exprs_count,
+                           phenoData = pheno)
+Est.prop.Tusi_rowsum25 <- music_prop(bulk.eset = rnaseqSet, sc.eset = minimalset, clusters = 'Cluster',
+                                    samples = 'SampleID', verbose = T)
+
+write.xlsx(x = Est.prop.Tusi_rowsum25, "Estimate_proportions_rowsum25_rnaseq_dataset.xlsx", rowNames = T)
+saveRDS(Est.prop.Tusi_rowsum25,file = "Est.prop.Tusi_rowsum25.rds")
+
+## considering values from Est.prop.Tusi_rowsum25 ##
+# p subset of values only relative to hsc and mpp 
+
+p$disease <- factor(p$disease,
+                     levels = c("HD", "Bthal")) 
+
+hscmpp_cl1 <- ggplot(p, aes(x= celltype, y = cl1)) + 
+  geom_boxplot(aes(fill = disease)) +
+  geom_point(position = position_dodge(width = 0.7), aes(group = disease)) +
+  theme(legend.title = element_text(size=12)) +
+  theme(legend.text = element_text( size=12)) + 
+  #facet_grid(.~ variable) +
+  labs(title = "Estimation of P1 population", x="Cell type",
+       y = "Est. prop.") +
+  scale_fill_manual(values = c("gray64", "red2")) 
+  
+hscmpp_cl2 <- ggplot(p, aes(x= celltype, y = cl2)) + 
+  geom_boxplot(aes(fill = disease)) +
+  geom_point(position = position_dodge(width = 0.7), aes(group = disease)) +
+  theme(legend.title = element_text(size=12)) +
+  theme(legend.text = element_text( size=12)) + 
+  #facet_grid(.~ variable) +
+  labs(title = "Estimation of P1 CD71high population", x="Cell type",
+       y = "Est. prop.") +
+  scale_fill_manual(values = c("gray64", "red2")) 
+
+png(filename = "Estimation_P1_P1cd71_hscmpp.png", width = 14, height = 7, units = "in", res = 200)
+hscmpp_cl2 + hscmpp_cl1 + plot_annotation(tag_levels = "A")
+dev.off()
+
--- a/bulkRNAseq/Music/convmts.rds
+++ b/bulkRNAseq/Music/convmts.rds
--- a/bulkRNAseq/Music/minimalset.rds
+++ b/bulkRNAseq/Music/minimalset.rds
--- a/bulkRNAseq/Music/phenodata.rds
+++ b/bulkRNAseq/Music/phenodata.rds
--- a/bulkRNAseq/Music/rnaseqSet.rds
+++ b/bulkRNAseq/Music/rnaseqSet.rds
--- a/bulkRNAseq/README.md
+++ b/bulkRNAseq/README.md
@@ -34,8 +34,9 @@ For Thal vs HD comparison intra population (HSC or MPP - n = 2) we used the simp

 6. Downstream analysis

-We performed Over Representation Analyses (ORA) and Gene Set Enrichment Analyses (GSEA) using R/Bioconductor package [clusterProfiler](https://bioconductor.org/packages/release/bioc/html/clusterProfiler.html) (v.3.8.1). Pre-ranked gene list according to log2FC was used to perform Gene Set Enrichment Analysis (GSEA). Terms with adjusted p-value (Benjiamini-Hochber correction) less than 0.05 were considered significantly enriched.  
-
-
+We performed Over Representation Analyses (ORA) and Gene Set Enrichment Analyses (GSEA) using R/Bioconductor package [clusterProfiler](https://bioconductor.org/packages/release/bioc/html/clusterProfiler.html) (v.3.8.1). Pre-ranked gene list according to log2FC was used to perform Gene Set Enrichment Analysis (GSEA). Terms with adjusted p-value (Benjiamini-Hochberg correction) less than 0.05 were considered significantly enriched.  

+7. Supplementary analysis: Music deconvolution analysis  

+We also applied Music to perform BulkRNAseq deconvolution according to celltype specific transcriptomic profiles obtained from scRNAseq dataset (Tusi et al 2021).
+For details see content of Music folder.