Upload New File

a988d531 · Ivan Merelli · f4a3560c · a988d531
Commit a988d531 authored Nov 22, 2024 by Ivan Merelli
--- a/fiumara_20241115_UploadScript.R
+++ b/fiumara_20241115_UploadScript.R
+### upload on gitlab
+# 1) load R packages
+library(Seurat)
+library(GEOquery)
+library(limma)
+library(dplyr)
+library(data.table)
+library(clusterProfiler)
+library(ggplot2)
+library(RColorBrewer)
+library(qusage)
+library(UCell)
+# 2) define input and output paths
+# Root folder of every input/output file. File names are appended with
+# paste0(), so a non-empty value must end with "/".
+path <- ""
+# 3) load scRNA-seq BM-object
+## 3.1) restrict analysis to CD34+ clusters
+bm_file <- paste0(path, "BM_20240222.rds")
+obj <- readRDS(file = bm_file)
+# ids of the clusters retained as CD34+ (compared as character strings)
+CD34p_clusters <- as.character(c(2, 8, 14, 15, 16, 19, 20, 26, 27, 30, 33))
+# use the "RNA_snn_h.orig.ident_res.1.8" metadata column as cluster identity
+obj$seurat <- obj$RNA_snn_h.orig.ident_res.1.8
+obj <- SetIdent(object = obj, value = "seurat")
+obj <- subset(obj, subset = (seurat %in% CD34p_clusters))
+saveRDS(object = obj, file = paste0(path, "BM_CD34p.rds"))
+## 3.2) free memory space
+# drop everything except `path`
+rm(list = setdiff(ls(), "path"))
+gc()
+# 4) load published scRNA-seq dataset
+## 4.1) Ainciburu et al.
+### 4.1.1) download tar file and metadata
+# GEO series GSE180298: the raw archive plus one metadata table per cohort.
+destfolder <- paste0(path, "GSE180298/")
+dir.create(path = destfolder, showWarnings = FALSE, recursive = TRUE)
+destfile <- paste0(destfolder, "GSE180298_RAW.tar")
+# mode = "wb": the files are binary and the URLs do not end with a file
+# extension, so a text-mode transfer (the default on Windows) would corrupt them.
+download.file(url = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE180298&format=file",
+              destfile = destfile, mode = "wb")
+cohorts <- c("elderly", "mds", "young")
+metadata_files <- paste0("https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE180298&format=file&file=GSE180298%5F",
+                         cohorts, "%5Fmetadata%2Etxt%2Egz")
+metadata_dest <- paste0(destfolder, "GSE180298_", cohorts, "_metadata.txt.gz")
+# one call per file: passing several URLs at once is only supported by the
+# "libcurl" download method
+for(k in seq_along(metadata_files)){
+  download.file(url = metadata_files[k], destfile = metadata_dest[k], mode = "wb")
+}
+### 4.1.2) untar file
+rawfolder <- paste0(destfolder, "GSE180298_RAW/")
+untar(tarfile = destfile, exdir = rawfolder)
+files <- list.files(path = rawfolder, full.names = TRUE)
+# list only the compressed metadata tables, so that a rerun does not also pick
+# up the decompressed copies that step 4.1.3 writes next to them
+md_files <- list.files(path = destfolder, pattern = "metadata.*\\.gz$", full.names = TRUE)
+### 4.1.3) define seurat object
+# Decompress every metadata table (the .gz is kept) and stack them row-wise.
+metadata <- lapply(md_files, function(gz){
+  gunzip(gz, remove = FALSE, overwrite = TRUE)
+  # strip only the trailing ".gz" extension (literal dot, anchored at the end)
+  read.delim(file = sub(gz, pattern = "\\.gz$", replacement = ""), header = TRUE, sep = "\t")
+})
+metadata <- as.data.frame(do.call(rbind, metadata))
+# sample id = second "_"-separated field of the row (cell) name
+metadata$orig.ident <- strsplit2(rownames(metadata), split = "\\_")[, 2]
+counts <- cell_counts <- samples <- c()
+for(i in seq_along(files)){
+  # sample id = second "_"-separated field of the file name
+  sampleid <- strsplit2(basename(files[i]), split = "\\_")[, 2]
+  print(paste0("Processing file ", sampleid, " (", i, " over ", length(files), ")"))
+  # load h5 object and rename the cells as <barcode before "-">_<sampleid>
+  temp <- Read10X_h5(filename = files[i], use.names = TRUE, unique.features = TRUE)
+  colnames(temp) <- paste0(strsplit2(colnames(temp), split = "-")[, 1], "_", sampleid)
+  # store the counts as a base (dense) matrix with gene/cell names
+  temp_object <- CreateSeuratObject(counts = temp)
+  counts_temp <- matrix(data = temp_object@assays$RNA@counts, ncol = ncol(temp_object))
+  rownames(counts_temp) <- rownames(temp_object)
+  colnames(counts_temp) <- colnames(temp_object)
+  # store info
+  cell_counts <- c(cell_counts, ncol(counts_temp))
+  samples <- c(samples, sampleid)
+  if(is.null(nrow(counts))){
+    counts <- counts_temp
+    gs <- rownames(counts)
+  }else{
+    # keep only the genes shared by all the files read so far;
+    # drop = FALSE keeps a matrix even if a single gene is left
+    gs <- intersect(gs, rownames(counts_temp))
+    counts <- cbind(counts[match(gs, rownames(counts)), , drop = FALSE],
+                    counts_temp[match(gs, rownames(counts_temp)), , drop = FALSE])
+  }
+}
+names(cell_counts) <- samples
+# keep the annotated cells only and put the count columns in metadata order
+md <- metadata[rownames(metadata) %in% colnames(counts), ]; dim(md)
+m <- match(rownames(md), colnames(counts)); table(is.na(m))
+counts <- counts[, m]
+# meta.data is the table restricted to (and ordered as) the retained cells
+GSE180298 <- CreateSeuratObject(counts = counts, meta.data = md)
+### 4.1.4) select only elderly donors
+GSE180298 <- SetIdent(object = GSE180298, value = "orig.ident")
+# sample ids whose name contains "elderly"
+sample_ids <- unique(GSE180298$orig.ident)
+elderly_sampleid <- sample_ids[grepl(sample_ids, pattern = "elderly")]
+GSE180298 <- subset(GSE180298, subset = (orig.ident %in% elderly_sampleid))
+saveRDS(object = GSE180298, file = paste0(path, "GSE180298_elderly.rds"))
+gc()
+## 4.1.5) free memory space
+# drop everything except `path`
+rm(list = setdiff(ls(), "path"))
+gc()
+## 4.2) Wu et al.
+### 4.2.1) download tar file
+destfolder <- paste0(path, "GSE196052/")
+dir.create(path = destfolder, showWarnings = FALSE, recursive = TRUE)
+destfile <- paste0(destfolder, "GSE196052_RAW.tar")
+# mode = "wb": the files are binary and the URLs do not end with a file
+# extension, so a text-mode transfer (the default on Windows) would corrupt them.
+download.file(url = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE196052&format=file",
+              destfile = destfile, mode = "wb")
+# despite the variable name, this is the CD34 count table (read as counts in 4.2.3)
+metadata_files <- c("https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE196052&format=file&file=GSE196052%5FdataCount%5FCD34%2Ecsv%2Egz")
+download.file(url = metadata_files,
+              destfile = paste0(destfolder, "GSE196052_dataCount_CD34.csv.gz"),
+              mode = "wb")
+### 4.2.2) untar file
+untar(tarfile = destfile, exdir = paste0(destfolder, "GSE196052_RAW/"))
+# content of the untarred archive, and content of the dataset folder itself
+files_annotation <- list.files(path = paste0(destfolder, "GSE196052_RAW/"), full.names = TRUE)
+files <- list.files(path = destfolder, full.names = TRUE)
+### 4.2.3) define seurat object
+# NOTE(review): the "SraRun" table and the annotation csv read below are not
+# downloaded by this script; presumably they must already sit in destfolder.
+sra_file <- files[grepl(files, pattern = "SraRun")]
+md <- read.delim(file = sra_file, header = TRUE, sep = ",")
+table(md$subject_id.status, md$tissue.cell_type)
+# stack the per-sample ("GSM") tables found in the untarred archive
+mdfiles <- files_annotation[grepl(files_annotation, pattern = "GSM")]
+metadata <- do.call(rbind, lapply(mdfiles, function(gsm){
+  read.delim(gzfile(gsm), header = TRUE, sep = ",")
+}))
+# CD34 count table: the first column becomes the row names
+f <- files[grepl(files, pattern = "CD34.csv.gz")]
+counts_cd34 <- as.data.frame(fread(f))
+rownames(counts_cd34) <- as.character(counts_cd34[, 1])
+counts_cd34 <- counts_cd34[, -1]
+# cell ids without "CD34_", used to look the cells up in the annotation table
+cd34_cells <- gsub(x = colnames(counts_cd34), pattern = "CD34_", replacement = "")
+wu_annot <- read.csv(file = paste0(destfolder, "CD34_metaDatatSNECellType_ALiceManual.csv"), header = TRUE)
+m <- match(cd34_cells, wu_annot$orig.ident); table(is.na(m))
+# per-cell metadata: subject and group of the matched annotation row
+mdt <- data.frame(sampleid = wu_annot$subject[m], status = wu_annot$group[m])
+rownames(mdt) <- colnames(counts_cd34)
+GSE196052 <- CreateSeuratObject(counts = counts_cd34, meta.data = mdt)
+GSE196052 <- SetIdent(object = GSE196052, value = "orig.ident")
+saveRDS(object = GSE196052, file = paste0(path, "GSE196052.rds"))
+## 4.2.4) free memory space
+# drop everything except `path`
+rm(list = setdiff(ls(), "path"))
+gc()
+# 5) Define single seurat object
+## 5.1) load datasets
+fiumara <- readRDS(paste0(path, "BM_CD34p.rds"))
+GSE180298 <- readRDS(paste0(path, "GSE180298_elderly.rds"))
+GSE196052 <- readRDS(paste0(path, "GSE196052.rds"))
+## 5.2) define common features
+# a gene is kept when it occurs in all three feature lists (count == 3)
+genes <- table(c(rownames(fiumara), rownames(GSE180298), rownames(GSE196052)))
+common_genes <- names(genes)[genes == 3]; length(common_genes)
+## 5.3) add sampleinfo
+# sample_info = <dataset>_<sample id>, so sample ids cannot clash across datasets
+fiumara$source <- "fiumara"
+fiumara$sample_info <- paste0("fiumara_", fiumara$orig.ident)
+GSE180298$source <- "GSE180298"
+GSE180298$sample_info <- paste0("GSE180298_", GSE180298$orig.ident)
+GSE196052$source <- "GSE196052"
+GSE196052$sample_info <- paste0("GSE196052_", GSE196052$sampleid)
+## 5.4) extract counts relative to these genes and define Seurat object
+counts <- cbind(fiumara@assays$RNA@counts[common_genes,],
+                GSE180298@assays$RNA@counts[common_genes,],
+                GSE196052@assays$RNA@counts[common_genes,])
+vars <- c("source", "sample_info", "nCount_RNA", "nFeature_RNA")
+md <- rbind(fiumara@meta.data[, vars],
+            GSE180298@meta.data[, vars],
+            GSE196052@meta.data[, vars])
+obj <- CreateSeuratObject(counts = counts, meta.data = md)
+# percentage of each cell's counts coming from genes matching "^MT-" / "^RPL".
+# drop = FALSE keeps a matrix even when a pattern matches a single gene,
+# which colSums() requires.
+raw_counts <- obj@assays$RNA@counts
+total_counts <- colSums(raw_counts)
+obj$percent.mt <- (colSums(raw_counts[grepl(rownames(obj), pattern = "^MT-"), , drop = FALSE])/total_counts)*100
+obj$percent.rb <- (colSums(raw_counts[grepl(rownames(obj), pattern = "^RPL"), , drop = FALSE])/total_counts)*100
+saveRDS(object = obj, file = paste0(path, "combined.rds")); gc()
+## 5.5) free memory space
+# drop everything except `path` and `obj`
+rm(list = setdiff(ls(), c("path", "obj"))); gc()
+# 6) removing low quality cells, normalization, scaling, integration and clustering
+## 6.1) cell filtering
+# a cell is kept with more than 200 detected features and less than 25% of
+# mitochondrial counts
+passes_features <- obj$nFeature_RNA > 200
+passes_mito <- obj$percent.mt < 25
+obj$keep <- passes_features & passes_mito
+table(obj$keep, obj$source)
+obj <- subset(obj, subset = (keep %in% TRUE))
+saveRDS(obj, file = paste0(path, "combined_filtered.rds"))
+## 6.2) normalization
+varfeatures <- 1000
+obj <- NormalizeData(object = obj)
+obj <- FindVariableFeatures(obj, selection.method = "vst", nfeatures = varfeatures, verbose = TRUE)
+## 6.3) scaling
+# every gene is scaled; the variables in reg_vars are regressed out
+reg_vars <- c("percent.mt", "nCount_RNA")
+obj <- ScaleData(object = obj,
+                 features = rownames(obj),
+                 vars.to.regress = reg_vars,
+                 display.progress = TRUE)
+saveRDS(obj, file = paste0(path, "combined_normscaled.rds"))
+## 6.4) dimensionality reduction
+max_pca <- 100
+obj <- RunPCA(object = obj,
+              features = VariableFeatures(object = obj),
+              npcs = max_pca,
+              reduction.name = "pca",
+              reduction.key = "PC_")
+# share (%) of the variance of the computed PCs that each PC accounts for
+pc_var <- obj@reductions$pca@stdev^2
+explvar <- (pc_var/sum(pc_var))*100
+# drop in that share from each PC to the next one (the last PC is compared with 0)
+delta <- -diff(c(explvar, 0))
+# number of PCs whose drop exceeds 0.01
+opt_delta <- sum(delta > 1e-2)
+# first PC at which the cumulative share exceeds 80%
+opt_explvar <- min(which(cumsum(explvar) > 80))
+# the smaller of the two is the number of dimensions used downstream
+opt <- min(opt_delta, opt_explvar)
+obj <- RunUMAP(object = obj, reduction = "pca", dims = 1:opt, seed.use = 123)
+## 6.5) harmony dataset integration
+# RunHarmony() is exported by the harmony package, which is not attached by the
+# library() calls at the top of this script: call it through its namespace.
+# Grouping variables: dataset of origin and sample.
+integration.var <- c("source", "sample_info")
+obj <- harmony::RunHarmony(object = obj,
+                           group.by.vars = integration.var,
+                           max.iter.harmony = 30,
+                           plot_convergence = FALSE,
+                           reduction.save = "harmony")
+# UMAP on the harmony embedding, stored as a separate reduction ("harmony_umap")
+obj <- RunUMAP(object = obj,
+               seed.use = 123,
+               dims = 1:opt,
+               reduction = "harmony",
+               reduction.name = "harmony_umap",
+               reduction.key = "UMAPh_",
+               return.model = TRUE)
+## 6.6) find neighboring cells
+# neighbour graphs on the first `opt` harmony dimensions
+obj <- FindNeighbors(object = obj,
+                     reduction = "harmony",
+                     dims = 1:opt,
+                     graph.name = c("RNA_nn_h.iv", "RNA_snn_h.iv"),
+                     force.recalc = TRUE)
+## 6.7) cell clustering
+# one FindClusters() run per resolution (0.1 to 1 in steps of 0.1) on the
+# "RNA_snn_h.iv" graph
+clu_res <- seq(0.1, 1, by = 0.1)
+for(k in seq_along(clu_res)){
+  obj <- FindClusters(object = obj,
+                      graph.name = "RNA_snn_h.iv",
+                      resolution = as.numeric(clu_res[k]),
+                      algorithm = 1)
+}
+## 6.8) assign celltype to each cluster
+# clusters of the resolution-0.5 run are used for the annotation
+obj$seurat <- obj$RNA_snn_h.iv_res.0.5
+# cell type label -> ids of the clusters carrying that label
+celltype_clusters <- list(
+  "HSC/MPP" = "0",
+  "Mature Ery" = c("1", "6", "11"),
+  "VEXAS Ery/CMP" = "7",
+  "Immature Ery" = c("8", "22"),
+  "MPP/CMP" = "2",
+  "PreBNK" = "12",
+  "CMP/GMP" = "3",
+  "GP" = "4",
+  "MDP" = "13",
+  "MyeloLympho/CMP" = "5",
+  "VEXAS Immature Ery" = c("9", "21"),
+  "Undefined" = "10",
+  "BEM" = "14",
+  "Monocyte Progenitors" = "16",
+  "MLP" = c("15", "19"),
+  "MEP" = "17",
+  "VEXAS Mature Ery" = "18",
+  "VEXAS MPP-Ery" = "20",
+  "VEXAS Myelo/CMP" = "23"
+)
+# flat lookup: cluster id -> label
+cluster2celltype <- setNames(rep(names(celltype_clusters), lengths(celltype_clusters)),
+                             unlist(celltype_clusters, use.names = FALSE))
+# clusters without an entry in the lookup stay NA
+cell_labels <- unname(cluster2celltype[as.character(obj$seurat)])
+names(cell_labels) <- colnames(obj)
+obj$seurat_annotation <- cell_labels
+# colour of each cell type; the order of this vector is also the factor level order
+annotated_cols <- c("HSC/MPP" = "#FB0207",
+                    "MyeloLympho/CMP" = "#c6dbef",
+                    "MPP/CMP" = "#7fcdbb",
+                    "CMP/GMP" = "#9ecae1",
+                    "Monocyte Progenitors" = "#66CCFF",
+                    "MDP" = "#0F80FF",
+                    "GP" = "#08519c",
+                    "PreBNK" = "#118040",
+                    "MLP" = "#FECC66",
+                    "BEM" = "#bdbdbd",
+                    "MEP" = "#f768a1",
+                    "Immature Ery" = "#B17DFC",
+                    "Mature Ery" = "#800080",
+                    "VEXAS MPP-Ery" = "#fde0dd",
+                    "VEXAS Ery/CMP" = "#fcc5c0",
+                    "VEXAS Immature Ery" = "#fa9fb5",
+                    "VEXAS Mature Ery" = "#dd3497",
+                    "VEXAS Myelo/CMP" = "#ccebc5",
+                    "Undefined" = "#d9d9d9")
+levs <- names(annotated_cols)
+obj$seurat_annotation <- factor(obj$seurat_annotation, levels = levs)
+## 6.9) define status and vexas-mutation variables
+# second "_"-separated field of each cell name
+obj$cell_barcode <- strsplit2(rownames(obj@meta.data), split = "\\_")[, 2]
+GSE196052_annot <- read.csv(file = paste0(path, "GSE196052/CD34_metaDatatSNECellType_ALiceManual.csv"), header = TRUE)
+# annotation rows whose group is "PT"
+GSE196052_cases <- GSE196052_annot[GSE196052_annot$group %in% "PT",]
+### 6.9.1) status
+# "HD" by default; "PT" for the GSE196052 cells listed among the "PT" rows of
+# the annotation and for every fiumara cell
+is_wu_patient <- (obj$source %in% "GSE196052") & (obj$cell_barcode %in% GSE196052_cases$orig.ident)
+is_fiumara <- obj$source %in% "fiumara"
+obj$status <- "HD"
+obj$status[is_wu_patient | is_fiumara] <- "PT"
+table(obj$source, obj$status)
+### 6.9.2) VEXAS mutation
+# relabel the GSE196052_PT<k> samples with their GSE196052_UPN<n> code
+# (values = PT labels, names = UPN labels)
+upn_codes <- c(6, 11, 1, 10, 13, 14, 15, 16, 17)
+GSE196052_pt2upn <- setNames(paste0("GSE196052_PT", 1:9),
+                             paste0("GSE196052_UPN", upn_codes))
+obj$sample_info_upn <- obj$sample_info
+pt_idx <- match(obj$sample_info_upn, GSE196052_pt2upn)
+is_pt_label <- !is.na(pt_idx)
+obj$sample_info_upn[is_pt_label] <- names(GSE196052_pt2upn)[pt_idx[is_pt_label]]
+table(obj$source, obj$sample_info_upn)
+table(obj$sample_info, obj$sample_info_upn)
+# mutation label -> samples carrying it; every other sample stays "HD"
+mutation_samples <- list(
+  THR = c(paste0("fiumara_BM-0", 1),
+          paste0("GSE196052_UPN", c(1, 10, 11, 13, 16, 17))),
+  VAL = c(paste0("fiumara_BM-0", c(2, 3, 8)),
+          paste0("GSE196052_UPN", c(6))),
+  LEU = c(paste0("fiumara_BM-0", c(4, 9)),
+          paste0("GSE196052_UPN", c(14, 15)))
+)
+obj$vexas_mutation <- "HD"
+for(mutation in names(mutation_samples)){
+  obj$vexas_mutation[obj$sample_info_upn %in% mutation_samples[[mutation]]] <- mutation
+}
+saveRDS(obj, file = paste0(path, "combined_annotated.rds"))
+table(obj$status, obj$vexas_mutation)
+table(obj$source, obj$vexas_mutation)
+table(obj$vexas_mutation, obj$sample_info_upn)
+## 6.10) free memory space
+# keep `annotated_cols` together with `path` and `obj`: section 7 reads it
+# (annclusters <- names(annotated_cols)) and so do the figures of section 9
+rm(list = setdiff(ls(), c("path", "obj", "annotated_cols"))); gc()
+# 7) Celltype-wise VEXAS vs HD (DE and GSEA)
+# output folder of the DE and GSEA tables (created when missing)
+outpath <- paste0(path, "DE_GSEA/")
+dir.create(path = outpath, recursive = TRUE, showWarnings = FALSE)
+## 7.1) define function to run clusterProfiler GSEA
+# marks: DE table with the columns avg_log2FC and gene_name
+# gmt:   gene-set file handed to clusterProfiler::read.gmt()
+# value: the result of GSEA(); nothing is filtered out (pvalueCutoff = 1)
+gsea_run <- function(marks, gmt){
+  # gene sets as a term-to-gene table
+  term2gene <- clusterProfiler::read.gmt(gmt)
+  # logFC named by gene, sorted from the highest to the lowest value;
+  # for a repeated gene name only the first (highest) entry is kept
+  ranked <- setNames(marks$avg_log2FC, marks$gene_name)
+  ranked <- ranked[order(ranked, decreasing = TRUE)]
+  ranked <- ranked[!duplicated(names(ranked))]
+  GSEA(geneList = ranked, TERM2GENE = term2gene, nPerm = 10000, pvalueCutoff = 1)
+}
+## 7.2) download hallmarks gene sets
+hallmarks_gsea <- c("https://data.broadinstitute.org/gsea-msigdb/msigdb/release/7.4/h.all.v7.4.symbols.gmt")
+# local copy of the gene sets; GSEA reads this file instead of fetching the URL
+# again for every cell type
+hallmarks_file <- paste0(outpath, "h.all.v7.4.symbols.gmt")
+download.file(url = hallmarks_gsea, destfile = hallmarks_file)
+## 7.3) VEXAS vs HD
+### 7.3.1) DE analysis
+annclusters <- names(annotated_cols)
+# minimum number of cells required in each of the two groups
+mincells <- 10
+for(i in seq_along(annclusters)){
+  cl <- annclusters[i]
+  # "/" is not allowed in file names
+  cl_id <- gsub(x = cl, pattern = "/", replacement = "-")
+  # cells per group in this cell type; an absent group counts as 0, so that
+  # FindMarkers() is never called with a missing identity and subset() is
+  # never asked for an empty selection
+  in_cl <- obj$seurat_annotation %in% cl
+  n_pt <- sum(in_cl & (obj$status %in% "PT"))
+  n_hd <- sum(in_cl & (obj$status %in% "HD"))
+  if(min(n_pt, n_hd) >= mincells){
+    temp <- subset(obj, subset = (seurat_annotation %in% cl))
+    temp <- SetIdent(object = temp, value = "status")
+    # logfc.threshold = 0: no fold-change filter, the table is ranked for GSEA later
+    de <- FindMarkers(temp,
+                      ident.1 = "PT",
+                      ident.2 = "HD",
+                      test.use = "wilcox",
+                      min.pct = 0.1,
+                      logfc.threshold = 0)
+    marks <- de[order(de$p_val_adj, decreasing = FALSE),]
+    marks$gene_name <- rownames(marks)
+    write.table(x = marks, file = paste0(outpath, "de_", cl_id, ".txt"), sep = '\t', row.names = FALSE)
+  }
+}
+### 7.3.2) GSEA Hallmarks
+# run only for the cell types for which a DE table was written
+for(i in seq_along(annclusters)){
+  cl <- annclusters[i]
+  cl_id <- gsub(x = cl, pattern = "/", replacement = "-")
+  defile <- paste0(outpath, 'de_', cl_id, ".txt")
+  if(file.exists(defile)){
+    marks <- read.table(file = defile, sep = "\t", header = TRUE)
+    gsea <- gsea_run(marks, gmt = hallmarks_file)
+    write.table(x = gsea, file = paste0(outpath, 'gsea_', cl_id, ".txt"), sep = '\t', row.names = FALSE)
+  }
+}
+## 7.4) VEXAS_MUT vs HD
+### 7.4.1) DE analysis
+annclusters <- names(annotated_cols)
+# minimum number of cells required in each of the two groups
+mincells <- 10
+mut <- c("LEU", "THR", "VAL")
+for(m in mut){
+  # cells carrying mutation m plus the HD cells (a dedicated name is used so
+  # that the cell type levels stored in `levs` are not overwritten)
+  groups <- c(m, "HD")
+  obj_m <- subset(obj, subset = (vexas_mutation %in% groups))
+  for(i in seq_along(annclusters)){
+    cl <- annclusters[i]
+    # "/" is not allowed in file names
+    cl_id <- gsub(x = cl, pattern = "/", replacement = "-")
+    # cells per group in this cell type; an absent group counts as 0, so that
+    # FindMarkers() is never called with a missing identity and subset() is
+    # never asked for an empty selection
+    in_cl <- obj_m$seurat_annotation %in% cl
+    n_mut <- sum(in_cl & (obj_m$vexas_mutation %in% m))
+    n_hd <- sum(in_cl & (obj_m$vexas_mutation %in% "HD"))
+    if(min(n_mut, n_hd) >= mincells){
+      temp <- subset(obj_m, subset = (seurat_annotation %in% cl))
+      temp <- SetIdent(object = temp, value = "vexas_mutation")
+      de <- FindMarkers(temp,
+                        ident.1 = m,
+                        ident.2 = "HD",
+                        test.use = "wilcox",
+                        min.pct = 0.1,
+                        logfc.threshold = 0)
+      marks <- de[order(de$p_val_adj, decreasing = FALSE),]
+      marks$gene_name <- rownames(marks)
+      write.table(x = marks, file = paste0(outpath, "de_", cl_id, "_", m, "vHD.txt"), sep = '\t', row.names = FALSE)
+    }
+  }
+}
+### 7.4.2) GSEA Hallmarks
+# local copy of the hallmark gene sets downloaded in step 7.2; reading it avoids
+# fetching the URL again for every mutation/cell type pair
+gmt_file <- paste0(outpath, "h.all.v7.4.symbols.gmt")
+for(m in mut){
+  for(i in seq_along(annclusters)){
+    cl <- annclusters[i]
+    cl_id <- gsub(x = cl, pattern = "/", replacement = "-")
+    defile <- paste0(outpath, "de_", cl_id, "_", m, "vHD.txt")
+    # run only when the DE table of this pair was written
+    if(file.exists(defile)){
+      marks <- read.table(file = defile, sep = "\t", header = TRUE)
+      gsea <- gsea_run(marks, gmt = gmt_file)
+      write.table(x = gsea, file = paste0(outpath, 'gsea_', cl_id, "_", m, "vHD.txt"), sep = '\t', row.names = FALSE)
+    }
+  }
+}
+## 7.5) free memory space
+# besides `path` and `obj`, keep what the figures of section 9 read:
+# the palette, the cell type list, the DE/GSEA folder and the mutation labels
+keep_vars <- c("path", "obj", "annotated_cols", "annclusters", "outpath", "mut")
+rm(list = setdiff(ls(), keep_vars)); gc()
+# 8) UCell module scores and wilcoxon test
+## 8.1) load marker gene set
+vexas_50 <- qusage::read.gmt(file = paste0(path, "xenograft_signatures/custom_vexas_50.gmt"))
+## 8.2) compute UCell module scores
+# AddModuleScore_UCell() expects a named list of signatures: the first gene set
+# of the gmt is wrapped in a one-element list so that it is scored as a single
+# signature (naming a bare gene vector would only label its first gene)
+sig_name <- "VEXAS_Xenograft_sig50"
+vexas_signature <- setNames(list(vexas_50[[1]]), sig_name)
+n_meta_cols <- ncol(obj@meta.data)
+obj <- AddModuleScore_UCell(obj, features = vexas_signature)
+# exactly one score column must have been appended; rename it to the bare
+# signature name used by the test and the figure below
+stopifnot(ncol(obj@meta.data) == n_meta_cols + 1)
+colnames(obj@meta.data)[n_meta_cols + 1] <- sig_name
+## 8.3) Celltype-wise wilcoxon test: VEXAS vs HD
+# long table (one row per cell) built directly from the metadata, so that no
+# melt() method for plain data.frames is needed
+x <- data.frame(status = obj@meta.data$status,
+                seurat_annotation = obj@meta.data$seurat_annotation,
+                variable = sig_name,
+                value = obj@meta.data[[sig_name]])
+# PT vs HD within each cell type; a cell type lacking one of the two groups
+# gets NA instead of stopping the whole run with a wilcox.test() error
+w <- x %>%
+  dplyr::group_by(variable, seurat_annotation) %>%
+  dplyr::summarise(pvalue = if(any(status == "PT") && any(status == "HD")){
+                     wilcox.test(x = value[status == "PT"], y = value[status == "HD"])$p.value
+                   }else{
+                     NA_real_
+                   },
+                   .groups = "drop")
+# multiple-testing correction with the default method of p.adjust()
+w$p.adjust <- p.adjust(p = w$pvalue)
+w <- w[order(w$p.adjust, decreasing = FALSE),]
+write.table(x = w, file = paste0(path, "xenograft_signatures/wilcoxon.txt"), sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
+# 9) figures
+figpath <- paste0(path, "figures/")
+dir.create(path = figpath, showWarnings = FALSE, recursive = TRUE)
+## 9.1) figure 5d: Annotated UMAP
+obj <- SetIdent(object = obj, value = "seurat_annotation")
+# legend order = palette order. The names are taken from annotated_cols
+# directly because `levs` is reassigned in step 7.4.1 and removed in step 7.5.
+celltype_levels <- names(annotated_cols)
+g <- DimPlot(obj, reduction = "harmony_umap",
+             label.box = TRUE, label = TRUE, label.color = TRUE, label.size = 2) +
+  scale_color_manual(values = annotated_cols, limits = celltype_levels) +
+  scale_fill_manual(values = annotated_cols, limits = celltype_levels) +
+  ggtitle("Cluster Annotation") +
+  theme(plot.title = element_text(hjust = 0.5),
+        legend.position = "right",
+        legend.text = element_text(size = 7))
+ggsave(g, filename = paste0(figpath, "Fig5D_UMAP_AnnotatedClusters.png"),
+       width = 10, height = 7, limitsize = FALSE)
+## 9.2) figure 5e/5f: GSEA Hallmarks
+### 9.2.1) load results
+# The inputs of this step are rebuilt here, so that it does not depend on
+# variables dropped by the memory cleanup of step 7.5: the cell types are the
+# levels of obj$seurat_annotation, the folder and mutation labels those of section 7.
+annclusters <- levels(obj$seurat_annotation)
+outpath <- paste0(path, "DE_GSEA/")
+mut <- c("LEU", "THR", "VAL")
+cl_ids <- gsub(x = annclusters, pattern = "/", replacement = "-")
+# Read one GSEA table, sort it by adjusted p-value and decreasing NES, and tag
+# it with its cell type and comparison. Returns NULL when the file is missing
+# or holds no rows, so that the result can be dropped by rbind().
+read_gsea <- function(file, celltype, test){
+  if(!file.exists(file)){
+    return(NULL)
+  }
+  x <- read.delim(file = file, header = TRUE)
+  if(nrow(x) == 0){
+    return(NULL)
+  }
+  x <- x[order(x$p.adjust, -x$NES),]
+  data.frame(x, celltype = celltype, test = test)
+}
+res_VEXASvHD <- do.call(rbind, lapply(seq_along(annclusters), function(i){
+  read_gsea(file = paste0(outpath, 'gsea_', cl_ids[i], ".txt"),
+            celltype = annclusters[i], test = "VEXASvOLD")
+}))
+res_MUTvHD <- do.call(rbind, lapply(mut, function(m){
+  do.call(rbind, lapply(seq_along(annclusters), function(i){
+    read_gsea(file = paste0(outpath, 'gsea_', cl_ids[i], "_", m, "vHD.txt"),
+              celltype = annclusters[i], test = paste0(m, "vHD"))
+  }))
+}))
+res <- rbind(res_VEXASvHD, res_MUTvHD)
+# significance stars from the adjusted p-value (the stricter cutoff wins)
+res$significance_asterisk <- ""
+res$significance_asterisk[res$p.adjust < 0.05] <- "*"
+res$significance_asterisk[res$p.adjust < 0.01] <- "**"
+res$significance_asterisk[res$p.adjust < 0.001] <- "***"
+# one row per gene set x cell type x comparison; missing combinations get NA
+# and are drawn with the na.value colour
+res <- res %>% tidyr::complete(ID, celltype, test) %>% as.data.frame()
+res$ID <- gsub(x = res$ID, pattern = "HALLMARK_", replacement = "")
+### 9.2.2) plot
+g <- res %>%
+  ggplot() +
+  theme_bw() +
+  facet_grid(. ~ test) +
+  geom_tile(aes(x = celltype,  y = ID, fill = NES)) +
+  geom_text(aes(x = celltype,  y = ID, label = significance_asterisk), size = 2) +
+  theme(plot.title = element_text(hjust = 0.5, size = 10),
+        axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
+        legend.position = "top",
+        strip.background = element_rect(fill = "white")) +
+  ylab("") + xlab("") +
+  scale_fill_gradientn(colours = colorRampPalette(rev(brewer.pal(11, "RdBu")))(100),
+                       limits = c(-4, 4),
+                       na.value = "grey")
+# figure size grows with the number of cell types, comparisons and gene sets
+ggsave(g, filename = paste0(figpath, "Fig5EF_GSEA_Celltype_Hallmarks.png"),
+       width = length(unique(res$celltype))*0.3*length(unique(res$test)),
+       height = length(unique(res$ID))*0.2, limitsize = FALSE)
+## 9.3) figure 5g: Monocyte xenograft signature CD34+
+### 9.3.1) load wilcoxon test results
+wilcoxon_file <- paste0(path, "xenograft_signatures/wilcoxon.txt")
+w <- read.table(file = wilcoxon_file, sep = "\t", header = TRUE)
+# significance stars from the adjusted p-value; cutoffs are applied from the
+# loosest to the strictest, so the strictest one that holds wins
+star_cutoffs <- c("*" = 0.05, "**" = 0.01, "***" = 0.001)
+w$significance_asterisk <- ""
+for(stars in names(star_cutoffs)){
+  w$significance_asterisk[w$p.adjust < star_cutoffs[[stars]]] <- stars
+}
+### 9.3.2) plot
+# violins of the module score per cell type, split by status; the stars are
+# written at y = 0.7 above each cell type
+status_fill <- adjustcolor(col = c("#F8766D", "#02818a"), alpha.f = 0.8)
+g <- ggplot(data = obj@meta.data) +
+  theme_classic() +
+  geom_violin(aes(x = seurat_annotation, y = VEXAS_Xenograft_sig50, fill = status), scale = "width") +
+  geom_text(data = w, aes(x = seurat_annotation, y = 0.7, label = significance_asterisk)) +
+  scale_fill_manual(values = status_fill, name = "") +
+  ggtitle("VEXAS xenograft signature (n = 50)") +
+  ylab("UCell Module score") + xlab("") +
+  theme(plot.title = element_text(hjust = 0.5, size = 10),
+        axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
+        legend.position = "right")
+# width scales with the number of annotated cell types
+n_celltypes <- length(unique(obj$seurat_annotation))
+ggsave(g,
+       filename = paste0(figpath, "Fig5G_UCell_MonocyteXenograft_WilcoxonTest.png"),
+       width = n_celltypes*5*0.1,
+       height = 5,
+       limitsize = FALSE)
\ No newline at end of file