# fiumara_20241115_UploadScript.R

### upload on gitlab

# 1) load R packages
library(Seurat)
library(GEOquery)
library(limma)
library(dplyr)
library(data.table)
library(clusterProfiler)
library(ggplot2)
library(RColorBrewer)
library(qusage)
library(UCell)

# 2) define input and output paths
# prefix prepended to every input/output file name used below
path <- ""

# 3) load scRNA-seq BM-object

## 3.1) restrict analysis to CD34+ clusters
# ids of the clusters that are retained
CD34p_clusters <- as.character(c(2, 8, 14, 15, 16, 19, 20, 26, 27, 30, 33))
obj <- readRDS(file = paste0(path, "BM_20240222.rds"))

# expose the chosen clustering as the "seurat" column and use it as identity
obj$seurat <- obj$RNA_snn_h.orig.ident_res.1.8
obj <- SetIdent(object = obj, value = "seurat")

# keep the cells belonging to the retained clusters and store the subset
obj <- subset(obj, subset = (seurat %in% CD34p_clusters))
saveRDS(object = obj, file = paste0(path, "BM_CD34p.rds"))

## 3.2) free memory space
# remove every workspace object except `path`
l <- setdiff(ls(), "path")
rm(list = l); gc()


# 4) load published scRNA-seq dataset

## 4.1) Ainciburu et al.

### 4.1.1) download tar file and metadata
destfolder <- paste0(path, "GSE180298/")
dir.create(path = destfolder, showWarnings = FALSE, recursive = TRUE)
destfile <- paste0(destfolder, "GSE180298_RAW.tar")
# mode = "wb": the archives are binary; a text-mode transfer corrupts them on
# Windows
download.file(url = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE180298&format=file",
              destfile = destfile, mode = "wb")

metadata_files <- c("https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE180298&format=file&file=GSE180298%5Felderly%5Fmetadata%2Etxt%2Egz",
                    "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE180298&format=file&file=GSE180298%5Fmds%5Fmetadata%2Etxt%2Egz",
                    "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE180298&format=file&file=GSE180298%5Fyoung%5Fmetadata%2Etxt%2Egz")
metadata_dest <- paste0(destfolder, "GSE180298_", c("elderly", "mds", "young"), "_metadata.txt.gz")
# one call per file: passing several URLs at once only works with some
# download methods
for (i in seq_along(metadata_files)) {
  download.file(url = metadata_files[i], destfile = metadata_dest[i], mode = "wb")
}

### 4.1.2) untar file
rawfolder <- paste0(destfolder, "GSE180298_RAW/")
untar(tarfile = destfile, exdir = rawfolder)
files <- list.files(path = rawfolder, full.names = TRUE)
# only the compressed metadata tables: the decompressed copies written in
# 4.1.3 also contain "metadata" in their name and must not be picked up when
# the script is run a second time
md_files <- list.files(path = destfolder, pattern = "metadata\\.txt\\.gz$", full.names = TRUE)

### 4.1.3) define seurat object
# decompress each metadata table next to its archive, read it, and stack the
# tables in a single call (row names, i.e. cell names, are preserved)
metadata <- lapply(md_files, function(gz) {
  gunzip(gz, remove = FALSE, overwrite = TRUE)
  read.delim(file = sub("\\.gz$", "", gz), header = TRUE, sep = "\t")
})
metadata <- as.data.frame(do.call(rbind, metadata))
# sample id = second "_"-separated token of the cell name
metadata$orig.ident <- strsplit2(rownames(metadata), split = "\\_")[, 2]

counts <- NULL
gs <- NULL
cell_counts <- integer(length(files))
samples <- character(length(files))
for (i in seq_along(files)) {

  # sample id = second "_"-separated token of the file name
  sampleid <- strsplit2(basename(files[i]), split = "\\_")[, 2]
  print(paste0("Processing file ", sampleid, " (", i, " over ", length(files), ")"))

  # load h5 object and rename cells to "<barcode>_<sampleid>"
  temp <- Read10X_h5(filename = files[i], use.names = TRUE, unique.features = TRUE)
  colnames(temp) <- paste0(strsplit2(colnames(temp), split = "-")[, 1], "_", sampleid)

  # take the counts back from a Seurat object so that feature names are the
  # ones Seurat will use; the matrix is kept sparse instead of being expanded
  # to a dense matrix (CreateSeuratObject below accepts either form)
  temp_object <- CreateSeuratObject(counts = temp)
  counts_temp <- temp_object@assays$RNA@counts

  # store info
  cell_counts[i] <- ncol(counts_temp)
  samples[i] <- sampleid

  if (is.null(counts)) {
    counts <- counts_temp
    gs <- rownames(counts)
  } else {
    # keep only the genes shared by all samples processed so far
    gs <- intersect(gs, rownames(counts_temp))
    counts <- cbind(counts[match(gs, rownames(counts)), , drop = FALSE],
                    counts_temp[match(gs, rownames(counts_temp)), , drop = FALSE])
  }
}
names(cell_counts) <- samples

# align counts and metadata on the cells present in both
md <- metadata[rownames(metadata) %in% colnames(counts), ]; dim(md)
m <- match(rownames(md), colnames(counts)); table(is.na(m))
counts <- counts[, m]
# pass the filtered table `md` (not the full `metadata`), so that metadata
# rows and count columns refer to exactly the same cells
GSE180298 <- CreateSeuratObject(counts = counts, meta.data = md)

### 4.1.4) select only elderly donors
GSE180298 <- SetIdent(object = GSE180298, value = "orig.ident")
all_sampleid <- unique(GSE180298$orig.ident)
elderly_sampleid <- all_sampleid[grepl(all_sampleid, pattern = "elderly")]
GSE180298 <- subset(GSE180298, subset = (orig.ident %in% elderly_sampleid))
saveRDS(object = GSE180298, file = paste0(path, "GSE180298_elderly.rds")); gc()

## 4.1.5) free memory space
# remove every workspace object except `path`
l <- setdiff(ls(), "path")
rm(list = l); gc()


## 4.2) Wu et al.

### 4.2.1) download tar file
destfolder <- paste0(path, "GSE196052/")
dir.create(path = destfolder, showWarnings = FALSE, recursive = TRUE)
destfile <- paste0(destfolder, "GSE196052_RAW.tar")
# mode = "wb": the archives are binary; a text-mode transfer corrupts them on
# Windows
download.file(url = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE196052&format=file",
              destfile = destfile, mode = "wb")

metadata_files <- c("https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE196052&format=file&file=GSE196052%5FdataCount%5FCD34%2Ecsv%2Egz")
counts_file <- paste0(destfolder, "GSE196052_dataCount_CD34.csv.gz")
download.file(url = metadata_files, destfile = counts_file, mode = "wb")

### 4.2.2) untar file
untar(tarfile = destfile, exdir = paste0(destfolder, "GSE196052_RAW/"))
files_annotation <- list.files(path = paste0(destfolder, "GSE196052_RAW/"), full.names = TRUE)
files <- list.files(path = destfolder, full.names = TRUE)

### 4.2.3) define seurat object
# Two inputs are NOT downloaded by this script and have to be placed in
# `destfolder` beforehand: a table whose file name contains "SraRun" and the
# cell annotation csv. Stop with an explicit message when one is missing.
sra_table <- files[grepl(files, pattern = "SraRun")]
annot_file <- paste0(destfolder, "CD34_metaDatatSNECellType_ALiceManual.csv")
if (length(sra_table) != 1) {
  stop("Expected exactly one file with 'SraRun' in its name in ", destfolder,
       " (found ", length(sra_table), ")", call. = FALSE)
}
if (!file.exists(annot_file)) {
  stop("Missing annotation file: ", annot_file, call. = FALSE)
}

md <- read.delim(file = sra_table, header = TRUE, sep = ",")
table(md$subject_id.status, md$tissue.cell_type)

# NOTE(review): `metadata` is not used further down in this section; it is
# still built here as in the original script -- confirm whether it is needed
mdfiles <- files_annotation[grepl(files_annotation, pattern = "GSM")]
metadata <- do.call(rbind, lapply(mdfiles, function(x) {
  read.delim(gzfile(x), header = TRUE, sep = ",")
}))

# count table: first column holds the feature names, the others are cells
counts_cd34 <- fread(counts_file) %>% as.data.frame()
rn <- as.character(counts_cd34[, 1])
counts_cd34 <- counts_cd34[, -1]
rownames(counts_cd34) <- rn

# cell names without the "CD34_" tag are matched against the annotation table
cd34_cells <- colnames(counts_cd34)
cd34_cells <- gsub(x = cd34_cells, pattern = "CD34_", replacement = "")
wu_annot <- read.csv(file = annot_file, header = TRUE)
m <- match(cd34_cells, wu_annot$orig.ident); table(is.na(m))
status <- wu_annot$group[m]
sampleid <- wu_annot$subject[m]
mdt <- data.frame(sampleid, status)
rownames(mdt) <- colnames(counts_cd34)

GSE196052 <- CreateSeuratObject(counts = counts_cd34, meta.data = mdt)
GSE196052 <- SetIdent(object = GSE196052, value = "orig.ident")
saveRDS(object = GSE196052, file = paste0(path, "GSE196052.rds"))

## 4.2.4) free memory space
# remove every workspace object except `path`
l <- setdiff(ls(), "path")
rm(list = l); gc()


# 5) Define single seurat object

## 5.1) load datasets
fiumara <- readRDS(paste0(path, "BM_CD34p.rds"))
GSE180298 <- readRDS(paste0(path, "GSE180298_elderly.rds"))
GSE196052 <- readRDS(paste0(path, "GSE196052.rds"))

## 5.2) define common features
# a gene is kept when it occurs in the feature names of all three objects
genes <- table(c(rownames(fiumara), rownames(GSE180298), rownames(GSE196052)))
common_genes <- names(genes)[genes == 3]; length(common_genes)

## 5.3) add sampleinfo
# `source` = dataset of origin, `sample_info` = "<dataset>_<sample id>"
fiumara$source <- "fiumara"
fiumara$sample_info <- paste0("fiumara_", fiumara$orig.ident)
GSE180298$source <- "GSE180298"
GSE180298$sample_info <- paste0("GSE180298_", GSE180298$orig.ident)
GSE196052$source <- "GSE196052"
GSE196052$sample_info <- paste0("GSE196052_", GSE196052$sampleid)

## 5.4) extract counts relative to these genes and define Seurat object
counts <- cbind(fiumara@assays$RNA@counts[common_genes, ],
                GSE180298@assays$RNA@counts[common_genes, ],
                GSE196052@assays$RNA@counts[common_genes, ])

vars <- c("source", "sample_info", "nCount_RNA", "nFeature_RNA")
md <- rbind(fiumara@meta.data[, vars],
            GSE180298@meta.data[, vars],
            GSE196052@meta.data[, vars])

obj <- CreateSeuratObject(counts = counts, meta.data = md)

# Percentage of counts coming from genes matching "^MT-" / "^RPL".
# Matrix::colSums is called explicitly: the counts are a sparse matrix and
# base::colSums only handles them when the Matrix package happens to be
# attached. drop = FALSE keeps a matrix even if a pattern matches one gene.
# NOTE(review): "^RPL" does not match "RPS" genes -- confirm this is intended
raw_counts <- obj@assays$RNA@counts
total_counts <- Matrix::colSums(raw_counts)
mt_genes <- grepl(rownames(obj), pattern = "^MT-")
rb_genes <- grepl(rownames(obj), pattern = "^RPL")
obj$percent.mt <- (Matrix::colSums(raw_counts[mt_genes, , drop = FALSE]) / total_counts) * 100
obj$percent.rb <- (Matrix::colSums(raw_counts[rb_genes, , drop = FALSE]) / total_counts) * 100
saveRDS(object = obj, file = paste0(path, "combined.rds")); gc()

## 5.5) free memory space
# remove every workspace object except `path` and `obj`
l <- setdiff(ls(), c("path", "obj"))
rm(list = l); gc()


# 6) removing low quality cells, normalization, scaling, integration and clustering

## 6.1) cell filtering
# keep cells with more than 200 detected features and less than 25% MT counts
obj$keep <- (obj$nFeature_RNA > 200) & (obj$percent.mt < 25)
table(obj$keep, obj$source)
obj <- subset(obj, subset = (keep %in% TRUE))
saveRDS(obj, file = paste0(path, "combined_filtered.rds"))

## 6.2) normalization
varfeatures <- 1000
obj <- NormalizeData(object = obj)
obj <- FindVariableFeatures(obj, selection.method = "vst",
                            nfeatures = varfeatures,
                            verbose = TRUE)

## 6.3) scaling
# all genes are scaled; percent.mt and nCount_RNA are regressed out
reg_vars <- c("percent.mt", "nCount_RNA")
obj <- ScaleData(object = obj, vars.to.regress = reg_vars, verbose = TRUE, features = rownames(obj))
saveRDS(obj, file = paste0(path, "combined_normscaled.rds"))

## 6.4) dimensionality reduction
max_pca <- 100
obj <- RunPCA(object = obj, features = VariableFeatures(object = obj), npcs = max_pca, reduction.name = "pca", reduction.key = "PC_")

# number of PCs used downstream = the smaller of
#  - the number of PCs whose explained variance exceeds the next PC's by more
#    than 0.01 percentage points
#  - the first PC at which the cumulative explained variance passes 80%
# (percentages are relative to the variance of the `max_pca` computed PCs)
explvar <- ((obj@reductions$pca@stdev^2) / sum((obj@reductions$pca@stdev^2))) * 100
delta <- explvar - c(explvar[-1], 0)
opt_delta <- length(delta[delta > 1e-2])
opt_explvar <- min(which(cumsum(explvar) > 80))
opt <- min(opt_delta, opt_explvar)

obj <- RunUMAP(object = obj, seed.use = 123, reduction = "pca", dims = 1:opt)

## 6.5) harmony dataset integration
# RunHarmony comes from the harmony package, which is not attached by the
# library() calls at the top of this script, hence the explicit namespace
integration.var <- c("source", "sample_info")
obj <- harmony::RunHarmony(object = obj,
                           group.by.vars = integration.var,
                           max.iter.harmony = 30,
                           plot_convergence = FALSE,
                           reduction.save = "harmony")
obj <- RunUMAP(object = obj,
               seed.use = 123,
               dims = 1:opt,
               reduction = "harmony",
               reduction.name = "harmony_umap",
               reduction.key = "UMAPh_",
               return.model = TRUE)

## 6.6) find neighboring cells
obj <- FindNeighbors(object = obj,
                     dims = 1:opt,
                     force.recalc = TRUE,
                     reduction = "harmony",
                     graph.name = c("RNA_nn_h.iv", "RNA_snn_h.iv"))

## 6.7) cell clustering
# one clustering per resolution, all on the harmony-based SNN graph
clu_res <- seq(0.1, 1, by = 0.1)
for (res in clu_res) {
  obj <- FindClusters(object = obj,
                      algorithm = 1,
                      resolution = as.numeric(res),
                      graph.name = "RNA_snn_h.iv")
}

## 6.8) assign celltype to each cluster
# colour of every annotated cell type; the order of the names is also the
# order of the factor levels of `seurat_annotation`
annotated_cols <- c("HSC/MPP" = '#FB0207',
                    "MyeloLympho/CMP" = '#c6dbef',
                    "MPP/CMP" = "#7fcdbb",
                    "CMP/GMP" = '#9ecae1',
                    "Monocyte Progenitors" = '#66CCFF',
                    "MDP" = '#0F80FF',
                    "GP" = "#08519c",
                    "PreBNK" = '#118040',
                    "MLP" = "#FECC66",
                    "BEM" = "#bdbdbd",
                    "MEP" = '#f768a1',
                    "Immature Ery" = '#B17DFC',
                    "Mature Ery" = "#800080",
                    "VEXAS MPP-Ery" = '#fde0dd',
                    "VEXAS Ery/CMP" = '#fcc5c0',
                    "VEXAS Immature Ery" = '#fa9fb5',
                    "VEXAS Mature Ery" = "#dd3497",
                    "VEXAS Myelo/CMP" = "#ccebc5",
                    "Undefined" = '#d9d9d9')
levs <- names(annotated_cols)

# cluster id (resolution 0.5) -> cell type label
cluster_labels <- c("0" = "HSC/MPP",
                    "1" = "Mature Ery",
                    "2" = "MPP/CMP",
                    "3" = "CMP/GMP",
                    "4" = "GP",
                    "5" = "MyeloLympho/CMP",
                    "6" = "Mature Ery",
                    "7" = "VEXAS Ery/CMP",
                    "8" = "Immature Ery",
                    "9" = "VEXAS Immature Ery",
                    "10" = "Undefined",
                    "11" = "Mature Ery",
                    "12" = "PreBNK",
                    "13" = "MDP",
                    "14" = "BEM",
                    "15" = "MLP",
                    "16" = "Monocyte Progenitors",
                    "17" = "MEP",
                    "18" = "VEXAS Mature Ery",
                    "19" = "MLP",
                    "20" = "VEXAS MPP-Ery",
                    "21" = "VEXAS Immature Ery",
                    "22" = "Immature Ery",
                    "23" = "VEXAS Myelo/CMP")

obj$seurat <- obj$RNA_snn_h.iv_res.0.5
# look every cell's cluster up in the table; a cluster missing from the table
# gives NA
cell_labels <- unname(cluster_labels[as.character(obj$seurat)])
names(cell_labels) <- rownames(obj@meta.data)
obj$seurat_annotation <- factor(cell_labels, levels = levs)

## 6.9) define status and vexas-mutation variables
# second "_"-separated token of the cell name
obj$cell_barcode <- strsplit2(rownames(obj@meta.data), split = "\\_")[, 2] 
GSE196052_annot <- read.csv(file = paste0(path, "GSE196052/CD34_metaDatatSNECellType_ALiceManual.csv"), header = T)
GSE196052_cases <- GSE196052_annot[GSE196052_annot$group %in% "PT",]

### 6.9.1) status
# "PT" = every fiumara cell and the GSE196052 cells listed in the "PT" group
# of the annotation table; all remaining cells are "HD"
from_fiumara <- obj$source %in% "fiumara"
wu_case <- (obj$source %in% "GSE196052") & (obj$cell_barcode %in% GSE196052_cases$orig.ident)
obj$status <- "HD"
obj$status[wu_case | from_fiumara] <- "PT"
table(obj$source, obj$status)

### 6.9.2) VEXAS mutation
# rename the GSE196052 "PT<n>" sample labels to the matching "UPN<n>" labels
pt_labels <- paste0("GSE196052_PT", 1:9)
upn_labels <- paste0("GSE196052_UPN", c(6, 11, 1, 10, 13, 14, 15, 16, 17))
sample_labels <- obj$sample_info
pt_index <- match(sample_labels, pt_labels)
is_pt <- !is.na(pt_index)
sample_labels[is_pt] <- upn_labels[pt_index[is_pt]]
obj$sample_info_upn <- sample_labels
table(obj$source, obj$sample_info_upn)
table(obj$sample_info, obj$sample_info_upn)

# samples carrying each mutation; cells of any other sample stay "HD"
mutation_samples <- list(
  THR = c(paste0("fiumara_BM-0", 1),
          paste0("GSE196052_UPN", c(1, 10, 11, 13, 16, 17))),
  VAL = c(paste0("fiumara_BM-0", c(2, 3, 8)),
          paste0("GSE196052_UPN", c(6))),
  LEU = c(paste0("fiumara_BM-0", c(4, 9)),
          paste0("GSE196052_UPN", c(14, 15)))
)
obj$vexas_mutation <- "HD"
for (mutation in names(mutation_samples)) {
  obj$vexas_mutation[obj$sample_info_upn %in% mutation_samples[[mutation]]] <- mutation
}
saveRDS(obj, file = paste0(path, "combined_annotated.rds"))

table(obj$status, obj$vexas_mutation)
table(obj$source, obj$vexas_mutation)
table(obj$vexas_mutation, obj$sample_info_upn)
## 6.10) free memory space
# `annotated_cols` and `levs` are kept on purpose: the cell type list of
# section 7 and the colour scales of section 9 are built from them
l <- setdiff(ls(), c("path", "obj", "annotated_cols", "levs"))
rm(list = l); gc()


# 7) Celltype-wise VEXAS vs HD (DE and GSEA)
outpath <- paste0(path, "DE_GSEA/")
dir.create(path = outpath, showWarnings = FALSE, recursive = TRUE)

## 7.1) define function to run clusterProfiler GSEA
# Run a pre-ranked GSEA on one differential expression table.
#   marks: data.frame with columns `avg_log2FC` and `gene_name`
#   gmt:   GMT file with the gene sets, read with clusterProfiler::read.gmt
# Genes are ranked by decreasing avg_log2FC; when a gene name occurs more than
# once only its highest-ranked entry is kept. pvalueCutoff = 1 keeps every
# tested gene set in the result.
# Returns the object produced by clusterProfiler's GSEA().
gsea_run <- function(marks, gmt) {

  # load gmt file
  gmt.obj <- clusterProfiler::read.gmt(gmt)

  # order DE results by logFC
  genes <- marks$avg_log2FC
  names(genes) <- marks$gene_name
  genes <- genes[order(genes, decreasing = TRUE)]
  genes <- genes[!duplicated(names(genes))]

  # run GSEA
  GSEA(geneList = genes, TERM2GENE = gmt.obj, nPerm = 10000, pvalueCutoff = 1)
}

# Differential expression (Wilcoxon) of `case` vs `control` cells, both taken
# from the `group_var` metadata column, within one annotated cell type.
# The test runs only when BOTH groups have at least `mincells` cells in that
# cell type; a group with no cell at all therefore skips the cell type instead
# of stopping the script. The table, sorted by adjusted p-value and with the
# gene names in `gene_name`, is written to `outfile`.
# Returns (invisibly) TRUE when a table was written, FALSE otherwise.
run_celltype_de <- function(object, celltype, group_var, case, control,
                            outfile, mincells = 10) {
  md <- object@meta.data
  in_celltype <- md$seurat_annotation %in% celltype
  is_case <- in_celltype & (md[[group_var]] %in% case)
  is_control <- in_celltype & (md[[group_var]] %in% control)
  if (sum(is_case) < mincells || sum(is_control) < mincells) {
    return(invisible(FALSE))
  }

  temp <- subset(object, cells = rownames(md)[is_case | is_control])
  temp <- SetIdent(object = temp, value = group_var)
  de <- FindMarkers(temp,
                    ident.1 = case,
                    ident.2 = control,
                    test.use = "wilcox",
                    min.pct = 0.1,
                    logfc.threshold = 0)
  marks <- de[order(de$p_val_adj, decreasing = FALSE), ]
  marks$gene_name <- rownames(marks)

  write.table(x = marks, file = outfile, sep = "\t", row.names = FALSE)
  invisible(TRUE)
}

# GSEA of one DE table: reads `defile`, runs gsea_run() with the gene sets in
# `gmt` and writes the result to `outfile`. Nothing is done when `defile` does
# not exist (the DE step skipped that comparison).
# Returns (invisibly) TRUE when a table was written, FALSE otherwise.
run_celltype_gsea <- function(defile, outfile, gmt) {
  if (!file.exists(defile)) {
    return(invisible(FALSE))
  }
  marks <- read.table(file = defile, sep = "\t", header = TRUE)
  gsea <- gsea_run(marks, gmt = gmt)
  write.table(x = gsea, file = outfile, sep = "\t", row.names = FALSE)
  invisible(TRUE)
}

## 7.2) download hallmarks gene sets
hallmarks_gsea <- c("https://data.broadinstitute.org/gsea-msigdb/msigdb/release/7.4/h.all.v7.4.symbols.gmt")
hallmarks_gmt <- paste0(outpath, "h.all.v7.4.symbols.gmt")
download.file(url = hallmarks_gsea, destfile = hallmarks_gmt)

## 7.3) VEXAS vs HD

### 7.3.1) DE analysis
annclusters <- names(annotated_cols)
mincells <- 10
for (cl in annclusters) {
  # "/" cannot be part of a file name
  cl_id <- gsub(x = cl, pattern = "/", replacement = "-")
  run_celltype_de(obj, celltype = cl, group_var = "status",
                  case = "PT", control = "HD",
                  outfile = paste0(outpath, "de_", cl_id, ".txt"),
                  mincells = mincells)
}

### 7.3.2) GSEA Hallmarks
# the gene sets are read from the local copy downloaded in 7.2 rather than
# from the URL again at every call
for (cl in annclusters) {
  cl_id <- gsub(x = cl, pattern = "/", replacement = "-")
  run_celltype_gsea(defile = paste0(outpath, "de_", cl_id, ".txt"),
                    outfile = paste0(outpath, "gsea_", cl_id, ".txt"),
                    gmt = hallmarks_gmt)
}


## 7.4) VEXAS_MUT vs HD

### 7.4.1) DE analysis
mut <- c("LEU", "THR", "VAL")

for (m in mut) {
  for (cl in annclusters) {
    cl_id <- gsub(x = cl, pattern = "/", replacement = "-")
    run_celltype_de(obj, celltype = cl, group_var = "vexas_mutation",
                    case = m, control = "HD",
                    outfile = paste0(outpath, "de_", cl_id, "_", m, "vHD.txt"),
                    mincells = mincells)
  }
}

### 7.4.2) GSEA Hallmarks
for (m in mut) {
  for (cl in annclusters) {
    cl_id <- gsub(x = cl, pattern = "/", replacement = "-")
    run_celltype_gsea(defile = paste0(outpath, "de_", cl_id, "_", m, "vHD.txt"),
                      outfile = paste0(outpath, "gsea_", cl_id, "_", m, "vHD.txt"),
                      gmt = hallmarks_gmt)
  }
}


## 7.5) free memory space
# keep what the figure section (9) reads back: colours, annotation levels,
# cell type list, mutation list and the DE/GSEA output folder
l <- setdiff(ls(), c("path", "obj", "annotated_cols", "levs",
                     "annclusters", "mut", "outpath"))
rm(list = l); gc()


# 8) UCell module scores and wilcoxon test

## 8.1) load marker gene set
vexas_50 <- qusage::read.gmt(file = paste0(path, "xenograft_signatures/custom_vexas_50.gmt"))

## 8.2) compute UCell module scores
# The signature is passed as a NAMED LIST with one element (the first gene set
# of the gmt file), so that all its genes are scored together as one signature
# and a single score column is added. Passing the bare character vector would
# not describe one multi-gene signature.
vexas_signature <- list(VEXAS_Xenograft_sig50 = vexas_50[[1]])
n_meta_cols <- ncol(obj@meta.data)
obj <- AddModuleScore_UCell(obj, features = vexas_signature)
# exactly one new metadata column per signature is expected; give it the bare
# signature name, which is the column name used below and in the figures
stopifnot(ncol(obj@meta.data) == n_meta_cols + length(vexas_signature))
colnames(obj@meta.data)[n_meta_cols + seq_along(vexas_signature)] <- names(vexas_signature)

## 8.3) Celltype-wise wilcoxon test: VEXAS vs HD
# p-value of the two-sample Wilcoxon test, or NA when one of the two groups
# has no cell (wilcox.test would stop with an error in that case)
wilcox_pvalue <- function(case, control) {
  if (length(case) == 0 || length(control) == 0) {
    return(NA_real_)
  }
  wilcox.test(x = case, y = control)$p.value
}

# melt() is the data.table one (data.table is attached); its method is
# written for data.tables, so the metadata data.frame is converted first
x <- melt(data = data.table::as.data.table(obj@meta.data),
          id.vars = c("status", "seurat_annotation"),
          measure.vars = c("VEXAS_Xenograft_sig50"))
w <- x %>%
  dplyr::group_by(variable, seurat_annotation) %>%
  dplyr::summarise(pvalue = wilcox_pvalue(case = value[status == "PT"],
                                          control = value[status == "HD"]))
# p.adjust() default correction; NA p-values are left as NA and sorted last
w$p.adjust <- p.adjust(p = w$pvalue)
w <- w[order(w$p.adjust, decreasing = FALSE), ]
write.table(x = w, file = paste0(path, "xenograft_signatures/wilcoxon.txt"), sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)


# 9) figures
figpath <- paste0(path, "figures/")
dir.create(path = figpath, showWarnings = FALSE, recursive = TRUE)

# Inputs of this section are re-derived here instead of being taken from
# variables left over by earlier sections (the "free memory" steps may have
# removed or overwritten them). Only `obj`, `path` and the colour vector
# `annotated_cols` are required to exist.
celltype_levels <- names(annotated_cols)
annclusters <- celltype_levels
mut <- c("LEU", "THR", "VAL")
outpath <- paste0(path, "DE_GSEA/")

# significance code for a vector of adjusted p-values:
# "*" < 0.05, "**" < 0.01, "***" < 0.001, "" otherwise (including NA)
significance_stars <- function(p) {
  stars <- rep("", length(p))
  stars[which(p < 0.05)] <- "*"
  stars[which(p < 0.01)] <- "**"
  stars[which(p < 0.001)] <- "***"
  stars
}

## 9.1) figure 5d: Annotated UMAP
obj <- SetIdent(object = obj, value = "seurat_annotation")
g <- DimPlot(obj, reduction = "harmony_umap",
             label.box = TRUE, label = TRUE, label.color = TRUE, label.size = 2) +
  scale_color_manual(values = annotated_cols, limits = celltype_levels) +
  scale_fill_manual(values = annotated_cols, limits = celltype_levels) +
  ggtitle("Cluster Annotation") +
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = "right",
        legend.text = element_text(size = 7))
ggsave(g, filename = paste0(figpath, "Fig5D_UMAP_AnnotatedClusters.png"),
       width = 10, height = 7, limitsize = FALSE)

## 9.2) figure 5e/5f: GSEA Hallmarks

### 9.2.1) load results
# Read one GSEA table, sort it by adjusted p-value (ties: decreasing NES) and
# tag it with its cell type and comparison. Returns NULL when the file does
# not exist or holds no row, so the comparison is left out of the merge below.
read_gsea <- function(gsea_file, celltype, test) {
  if (!file.exists(gsea_file)) {
    return(NULL)
  }
  x <- read.delim(file = gsea_file, header = TRUE)
  if (nrow(x) == 0) {
    return(NULL)
  }
  x <- x[order(x$p.adjust, -x$NES), ]
  data.frame(x, celltype = celltype, test = test)
}

# "/" is replaced by "-" in the file names written by section 7
cluster_ids <- gsub(x = annclusters, pattern = "/", replacement = "-")

res_VEXASvHD <- do.call(rbind, lapply(seq_along(annclusters), function(i) {
  read_gsea(paste0(outpath, "gsea_", cluster_ids[i], ".txt"),
            celltype = annclusters[i], test = "VEXASvOLD")
}))

res_MUTvHD <- do.call(rbind, lapply(mut, function(m) {
  do.call(rbind, lapply(seq_along(annclusters), function(i) {
    read_gsea(paste0(outpath, "gsea_", cluster_ids[i], "_", m, "vHD.txt"),
              celltype = annclusters[i], test = paste0(m, "vHD"))
  }))
}))

res <- rbind(res_VEXASvHD, res_MUTvHD)
if (is.null(res)) {
  stop("No GSEA result table found in ", outpath, call. = FALSE)
}
res$significance_asterisk <- significance_stars(res$p.adjust)
# add the missing gene set / cell type / comparison combinations (filled with
# NA) so that every facet shows the full grid
res <- res %>% tidyr::complete(ID, celltype, test) %>% as.data.frame()
res$ID <- gsub(x = res$ID, pattern = "HALLMARK_", replacement = "")

### 9.2.2) plot
g <- res %>%
  ggplot() +
  theme_bw() +
  facet_grid(. ~ test) +
  geom_tile(aes(x = celltype, y = ID, fill = NES)) +
  geom_text(aes(x = celltype, y = ID, label = significance_asterisk), size = 2) +
  theme(plot.title = element_text(hjust = 0.5, size = 10),
        axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
        legend.position = "top",
        strip.background = element_rect(fill = "white")) +
  ylab("") + xlab("") +
  scale_fill_gradientn(colours = colorRampPalette(rev(brewer.pal(11, "RdBu")))(100),
                       limits = c(-4, 4),
                       na.value = "grey")
# figure size grows with the number of cell types, comparisons and gene sets
ggsave(g, filename = paste0(figpath, "Fig5EF_GSEA_Celltype_Hallmarks.png"),
       width = length(unique(res$celltype)) * 0.3 * length(unique(res$test)),
       height = length(unique(res$ID)) * 0.2, limitsize = FALSE)


## 9.3) figure 5g: Monocyte xenograft signature CD34+

### 9.3.1) load wilcoxon test results
w <- read.table(file = paste0(path, "xenograft_signatures/wilcoxon.txt"), sep = "\t", header = TRUE)
w$significance_asterisk <- significance_stars(w$p.adjust)

### 9.3.2) plot
# violins of the module score per cell type, split by status; the
# significance code of each cell type is printed at y = 0.7
g <- obj@meta.data %>%
  ggplot() +
  theme_classic() +
  geom_violin(aes(x = seurat_annotation, y = VEXAS_Xenograft_sig50, fill = status), scale = "width") +
  geom_text(data = w, aes(x = seurat_annotation, y = 0.7, label = significance_asterisk)) +
  theme(plot.title = element_text(hjust = 0.5, size = 10),
        axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
        legend.position = "right") +
  ylab("UCell Module score") + xlab("") +
  ggtitle("VEXAS xenograft signature (n = 50)") +
  scale_fill_manual(values = adjustcolor(col = c("#F8766D", "#02818a"), alpha.f = 0.8), name = "")
ggsave(g,
       filename = paste0(figpath, "Fig5G_UCell_MonocyteXenograft_WilcoxonTest.png"),
       width = length(unique(obj$seurat_annotation)) * 5 * 0.1,
       height = 5,
       limitsize = FALSE)