# ========================================================-
# Title:         GSVA analysis on scRNAseq and snRNAseq samples
# Description:   Code to generate SuppFig13
# Author:        Monah Abou Alezz
# Date:          2025-03-06
# ========================================================-
suppressPackageStartupMessages(library(Seurat))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(openxlsx))
suppressPackageStartupMessages(library(patchwork))
suppressPackageStartupMessages(library(pheatmap))
suppressPackageStartupMessages(library(GSVA))
suppressPackageStartupMessages(library(purrr))
suppressPackageStartupMessages(library(presto))
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(cogena))

#### loading the hallmark gmt file as a list using the gmt2list function from cogena
gs_human <- gmt2list("~/databases/GSEA/GSEA_human_v7.2/h.all.v7.2.symbols.gmt")
gs_mouse <- gmt2list("~/databases/GSEA/GSEA_mouse_v7.2/h.all.v7.2.symbols_mouse.gmt")

## loading datasets
mouse.combined <- readRDS("data/mouse_striatum_snRNA_AAV9.rds")
unique(mouse.combined$Anno)
mouse.combined <- subset(mouse.combined, Anno == "Astrocytes" |
                                        Anno == "Mature_Oligos" |
                                        Anno == "Immature_Oligos" |
                                        Anno == "Excitatory" |
                                        Anno == "Inhibitory")
mouse.combined@meta.data[['Anno']] <- recode_factor(
  mouse.combined@meta.data[["Anno"]],
  "Astrocytes" = "Astrocytes",
  "Mature_Oligos" = "Oligodendrocytes",
  "Immature_Oligos" = "Oligodendrocytes",
  "Excitatory" = "Neurons",
  "Inhibitory" = "Neurons")
mouse.combined$Anno <- droplevels(mouse.combined$Anno)
unique(mouse.combined$Anno)
mouse.combined$CellType <- mouse.combined$Anno

d4_1707 <- readRDS("data/mixed_D4_final.rds")
d4_1707@meta.data[['orig.ident']] <- recode_factor(
  d4_1707@meta.data[["orig.ident"]],
  "Sample2" = "AAV2",
  "Sample4" = "d4_AAV9",
  "Sample6" = "Spk",
  "Sample8" = "d4_UT")
d4_1707@meta.data[["RNA_snn_h.orig.ident_res.1.2"]] <- recode_factor(
  d4_1707@meta.data[["RNA_snn_h.orig.ident_res.1.2"]], 
  "0" = "Undifferentiated", 
  "1" = "d4_Oligodendrocytes", 
  "2" = "d4_Neurons", 
  "3" = "d4_Astrocytes", 
  "4" = "d4_Neurons", 
  "5" = "d4_Oligodendrocytes", 
  "6" = "d4_Astrocytes", 
  "7" = "d4_Oligodendrocytes", 
  "8" = "d4_Oligodendrocytes",
  "9" = "d4_Neurons",
  "10" = "d4_Oligodendrocytes",
  "11" = "d4_Oligodendrocytes",
  "12" = "d4_Astrocytes",
  "13" = "d4_Astrocytes",
  "14" = "d4_Oligodendrocytes",
  "15" = "d4_Astrocytes",
  "16" = "d4_Astrocytes")
Idents(d4_1707) <- "RNA_snn_h.orig.ident_res.1.2"
d4_1707$RNA_snn_h.orig.ident_res.1.2 <- factor(d4_1707$RNA_snn_h.orig.ident_res.1.2,
                                               levels = c("d4_Astrocytes",
                                                          "d4_Neurons",
                                                          "d4_Oligodendrocytes",
                                                          "Undifferentiated"))
d4_1707 <- subset(d4_1707, RNA_snn_h.orig.ident_res.1.2 != "Undifferentiated")
d4_1707$RNA_snn_h.orig.ident_res.1.2 <- droplevels(d4_1707$RNA_snn_h.orig.ident_res.1.2)
unique(d4_1707$RNA_snn_h.orig.ident_res.1.2)
d4_1707 <- subset(d4_1707, orig.ident == "d4_AAV9" | orig.ident == "d4_UT")
d4_1707$orig.ident <- droplevels(d4_1707$orig.ident)
d4_1707$CellType <- d4_1707$RNA_snn_h.orig.ident_res.1.2

d5_1743 <- readRDS("data/organoids_D5_final.rds")
d5_1743@meta.data[['orig.ident']] <- recode_factor(
  d5_1743@meta.data[["orig.ident"]],
  "Sample2" = "AAV2",
  "Sample4" = "d5_AAV9",
  "Sample6" = "Spk",
  "Sample8" = "d5_UT")
d5_1743@meta.data[["RNA_snn_h.orig.ident_res.0.6"]] <- recode_factor(
  d5_1743@meta.data[["RNA_snn_h.orig.ident_res.0.6"]], 
  "0" = "d5_Astrocytes", 
  "1" = "d5_Neurons", 
  "2" = "Undifferentiated", 
  "3" = "d5_Neurons", 
  "4" = "d5_Oligodendrocytes", 
  "5" = "Pericytes", 
  "6" = "Pericytes", 
  "7" = "d5_Astrocytes", 
  "8" = "d5_Oligodendrocytes",
  "9" = "d5_Neurons",
  "10" = "d5_Astrocytes",
  "11" = "Pericytes",
  "12" = "d5_Astrocytes",
  "13" = "d5_Neurons")
d5_1743 <- subset(d5_1743, RNA_snn_h.orig.ident_res.0.6 != "Pericytes")
Idents(d5_1743) <- "RNA_snn_h.orig.ident_res.0.6"
d5_1743$RNA_snn_h.orig.ident_res.0.6 <- factor(d5_1743$RNA_snn_h.orig.ident_res.0.6,
                                               levels = c("d5_Astrocytes",
                                                          "d5_Neurons",
                                                          "d5_Oligodendrocytes",
                                                          "Undifferentiated"))
d5_1743 <- subset(d5_1743, RNA_snn_h.orig.ident_res.0.6 != "Undifferentiated")
d5_1743$RNA_snn_h.orig.ident_res.0.6 <- droplevels(d5_1743$RNA_snn_h.orig.ident_res.0.6)
unique(d5_1743$RNA_snn_h.orig.ident_res.0.6)
d5_1743 <- subset(d5_1743, orig.ident == "d5_AAV9" | orig.ident == "d5_UT")
d5_1743$orig.ident <- droplevels(d5_1743$orig.ident)
d5_1743$CellType <- d5_1743$RNA_snn_h.orig.ident_res.0.6
## create a pseudobulk and normalize
datasets_pseudo <- c("mouse.combined", "d4_1707", "d5_1743")
for (i in datasets_pseudo) {
  print(paste0("Analyzing... ",i))
  obj_i <- get(i)
  data_collapsed <- presto::collapse_counts(obj_i@assays$RNA@counts, 
                                            obj_i@meta.data, 
                                            c('CellType', 'orig.ident'))
  meta_data<- data_collapsed$meta_data
  mat <- data_collapsed$counts_mat
  colnames(mat)<- paste(meta_data$CellType, meta_data$orig.ident, sep="_")
  rownames(meta_data) <- paste(meta_data$CellType, meta_data$orig.ident, sep="_")
  pseudo_bulk_obj <- list(mat = mat, meta_data = meta_data)
  assign(paste0(i, "_mat"), mat)
  assign(paste0(i,"_pseduo_bulk_obj"), pseudo_bulk_obj)
  DESeq.ds <- DESeq2::DESeqDataSetFromMatrix(countData = pseudo_bulk_obj$mat,
                                             colData = pseudo_bulk_obj$meta_data,
                                             design = ~orig.ident)
  DESeq.ds <- DESeq.ds[rowSums(counts(DESeq.ds)) > 0, ]
  DESeq.ds <- DESeq(DESeq.ds)
  DESeq.rlog <- rlogTransformation(DESeq.ds, blind = TRUE)
  counts <- assay(DESeq.rlog)
  assign(paste0(i,"_counts"), counts)
}
## converting matrices into expressionsets
d4_1707_counts_es <- ExpressionSet(assayData = as.matrix(d4_1707_counts), 
                                   annotation = "org.Hs.eg.db")
d5_1743_counts_es <- ExpressionSet(assayData = as.matrix(d5_1743_counts), 
                                   annotation = "org.Hs.eg.db")
mouse_counts_es <- ExpressionSet(assayData = as.matrix(mouse.combined_counts),
                                 annotation = "org.Mm.eg.db")
gsva_d4_1707_counts <- gsva(as.matrix(d4_1707_counts), gs_human, min.sz=5, max.sz=500)
gsva_d5_1743_counts <- gsva(as.matrix(d5_1743_counts), gs_human, min.sz=5, max.sz=500)
gsva_mouse.combined_counts <- gsva(as.matrix(mouse.combined_counts), 
                                   gs_mouse, min.sz=5, max.sz=500)

gsva_counts_combined <- cbind(gsva_d4_1707_counts, gsva_d5_1743_counts, gsva_mouse.combined_counts)

p <- pheatmap(gsva_counts_combined)