library(readxl)
library(ggplot2)
library(ggrepel)
library(dplyr)
library(fgsea)
library(clusterProfiler)
library(enrichplot)

##### Directories #####
# us <-"C:/Users/bresesti.chiara/"
# wdir<-paste0(us,"/Dropbox (HSR Global)/90-857433247_RNAseq_Squadrito/05-DGE-NoOut-Corr")
# wdir_CB<-paste0(us, "/Dropbox (HSR Global)/90-857433247_RNAseq_Squadrito/Analysis CB_v2")
# fdir <- paste0(us,"/Dropbox (HSR Global)/CancerGeneTherapy/Cancer Gene Therapy/MS/2024 Bresesti et al/Scripts/plots and tables used in figures")

input_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/reference"
output_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/Output"

################################################################################
# 
# R script to generate data and figures for manuscript figures 3A, 3B, 3C, and 3D.
# This script performs differential gene expression analysis and gene set enrichment
# analysis to generate volcano plots, empirical cumulative distribution function (ECDF) plots, 
# dot plots, and enrichment maps.
#
# Figures generated:
# - Figure 3A & 3B: Volcano plots of differentially expressed genes in miR-342-3p
# overexpression and sponge experiments, and ECDF plots comparing
# logFC distributions of all genes vs. miR-342-3p target genes.
# - Figure 3C: Dot plot visualizing enriched pathways from gene set enrichment analysis (GSEA).
# - Figure 3D: Enrichment map visualizing relationships between enriched pathways.
#
# Input data files:
# - Figure 3A & 3B: "edgeR_results.xlsx" (Sheet "miR342-control" and "spongeBT-control")
# (Output from differential gene expression analysis, likely using edgeR)
# - Figure 3A & 3B: "TargetScan8.0__miR-342-3p.predicted_targets.xlsx" (Sheet 1)
# (List of predicted miR-342-3p target genes from TargetScan)
# - Figure 3C & 3D: "EdgeR_results.xlsx" (Sheet "miR342-control")
# (Same as input for Figure 3A & 3B, used for GSEA)
# - Figure 3C & 3D: "miDB_sig5.MLS.rds" (RDS file containing gene sets for gene set enrichment analysis)
#
################################################################################


#### Figure 3A&B ####

#Import df
miR_ctrl <- read_excel(paste0(input_dir, "/edgeR_results.xlsx"), sheet = "miR342-control")
sponge_ctrl <- read_excel(paste0(input_dir, "/edgeR_results.xlsx"), sheet = "spongeBT-control")

#Add DEG color and label for volcano plot
miR_ctrl$DEG <- "NO"
miR_ctrl$DEG[miR_ctrl$logFC > 1 & miR_ctrl$PValue < 0.05] <- "UP"
miR_ctrl$DEG[miR_ctrl$logFC < (-1) & miR_ctrl$PValue < 0.05] <- "DOWN"
miR_ctrl$DEG_label <- NA
miR_ctrl$DEG_label[miR_ctrl$DEG != "NO"] <- miR_ctrl$...1[miR_ctrl$DEG != "NO"]

sponge_ctrl$DEG <- "NO"
sponge_ctrl$DEG[sponge_ctrl$logFC > 1 & sponge_ctrl$PValue < 0.05] <- "UP"
sponge_ctrl$DEG[sponge_ctrl$logFC < (-1) & sponge_ctrl$PValue < 0.05] <- "DOWN"
sponge_ctrl$DEG_label <- NA
sponge_ctrl$DEG_label[sponge_ctrl$DEG != "NO"] <- sponge_ctrl$...1[sponge_ctrl$DEG != "NO"]

#Volcano plot (left panel)
data <- miR_ctrl
ggplot(data=data, aes(x=logFC, y=-log10(PValue), col=DEG, label=DEG_label)) +
  geom_point() + 
  theme_minimal() +
  geom_text_repel() +
  scale_color_manual(values=c("blue", "grey", "red")) +
  geom_vline(xintercept=c(-1, 1), col="black") +
  geom_hline(yintercept=-log10(0.05), col="black") +
  scale_x_continuous(limits = c(-4.1,4.1), breaks = seq(-4,4,1)) +
  scale_y_continuous(limits = c(0,20)) +
  xlab(bquote(Log[2](FC))) +
  ylab(bquote(-Log[10](PVal)))

data <- sponge_ctrl
ggplot(data=data, aes(x=logFC, y=-log10(PValue), col=DEG, label=DEG_label)) +
  geom_point() + 
  theme_minimal() +
  geom_text_repel() +
  scale_color_manual(values=c("blue", "grey", "red")) +
  geom_vline(xintercept=c(-1, 1), col="black") +
  geom_hline(yintercept=-log10(0.05), col="black") +
  scale_x_continuous(limits = c(-4.1,4.1), breaks = seq(-4,4,1)) +
  scale_y_continuous(limits = c(0,20)) +
  xlab(bquote(Log[2](FC))) +
  ylab(bquote(-Log[10](PVal)))

# Import miR-342-3p target list (from TargetScan)
miR342_targets <-  read_excel(paste0(input_dir, "/TargetScan8.0__miR-342-3p.predicted_targets.xlsx"))
miR342_targets <-  filter(miR342_targets, miR342_targets$`Cumulative weighted context++ score`< (-0.3))

#ecdf plot (right panel)
data <- miR_ctrl
test <-filter(data, data$...1 %in% miR342_targets$'Target gene')
plot(ecdf(data$logFC), lwd = 2, do.points=F, verticals=T,
     ylab = "Fraction of DEG",
     xlab = bquote(Log[2](FC)),
     xlim=c(-0.5,0.5),
     ylim=c(0,1),
     main="ECDF of gene expression: miR vs mut") +
  lines(ecdf(test$logFC), col= "blue", do.points=F, lwd = 2, verticals=T) +
  abline(v=0, col="black") + 
  abline(h=0.5, col="black") +
  legend("bottomright", c("All genes","miR-342-3p targets"),
         col = c("black","blue"), lwd=2, cex = 0.7)
ks.test(test$logFC, data$logFC, alternative = "g") #Kolmogorov-Smirnov test to calculate p-val of miR target distribution *not* greater than average data

data <- sponge_ctrl
test <-filter(data, data$...1 %in% miR342_targets$'Target gene')
plot(ecdf(data$logFC), lwd = 2, do.points=F, verticals=T,
     ylab = "Fraction of DEG",
     xlab = bquote(Log[2](FC)),
     xlim=c(-0.3,0.3),
     ylim=c(0,1),
     main="ECDF of gene expression: sponge vs scr") +
  lines(ecdf(test$logFC), col= "red", do.points=F, lwd = 2, verticals=T) +
  abline(v=0, col="black") + 
  abline(h=0.5, col="black") +
  legend("bottomright", c("All genes","miR-342-3p targets"),
         col = c("black","red"), lwd=2, cex = 0.7)
ks.test(test$logFC, data$logFC, alternative = "l") # Kolmogorov-Smirnov test to calculate p-val of miR target distribution *not* less than average data

#### Figure 3C&D ####

# Load df
df1<- read_excel(paste0(input_dir, "/EdgeR_results.xlsx"), sheet = "miR342-control")
miDB_sig5 <- readRDS(paste0(input_dir, "/miDB_sig5.MLS.rds")) #GO terms db

#Rename pathways of interest in miDB_sig5
names(miDB_sig5)[names(miDB_sig5) == "HALLMARK_OXIDATIVE_PHOSPHORYLATION"] <- "Oxydative phosphorylation"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_REGULATION_OF_CHOLESTEROL_METABOLIC_PROCESS"] <- "Regulation of cholesterol metabolic process"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_RESPONSE_TO_INTERLEUKIN_12"] <- "Response to IL12"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_REGULATION_OF_LIPID_BIOSYNTHETIC_PROCESS"] <- "Regulation of lipid biosynthetic process"
names(miDB_sig5)[names(miDB_sig5) == "GOMF_MHC_CLASS_I_PROTEIN_BINDING"] <- "MHC-I protein binding"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_TUMOR_NECROSIS_FACTOR_MEDIATED_SIGNALING_PATHWAY"] <- "TNFa mediated signalling pathway"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_NEGATIVE_REGULATION_OF_CELL_CYCLE_G2_M_PHASE_TRANSITION"] <- "Negative regulation of cell cycle progression"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_FATTY_ACYL_COA_BIOSYNTHETIC_PROCESS"] <- "Fatty acyl-CoA biosynthetic process"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_RESPONSE_TO_TUMOR_NECROSIS_FACTOR"] <- "Response to TNFa"
names(miDB_sig5)[names(miDB_sig5) == "HALLMARK_G2M_CHECKPOINT"] <- "G2M checkpoint"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_RESPONSE_TO_TYPE_I_INTERFERON"] <- "Response to type I interferon"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_INTERLEUKIN_1_MEDIATED_SIGNALING_PATHWAY"] <- "IL1 mediated signalling pathway"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_RESPONSE_TO_INTERLEUKIN_1"] <- "Response to IL1"
names(miDB_sig5)[names(miDB_sig5) == "LPS_RO"] <- "LPS response genes"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_CYTOKINE_MEDIATED_SIGNALING_PATHWAY"] <- "Cytokine mediated signalling pathway"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_MYELOID_CELL_DIFFERENTIATION"] <- "Myeloid cell differentiation"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_INTERLEUKIN_10_PRODUCTION"] <- "IL10 production"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_POSITIVE_REGULATION_OF_ENDOTHELIAL_CELL_PROLIFERATION"] <- "Regulation of endothelial cell proliferation"
names(miDB_sig5)[names(miDB_sig5) == "GOBP_TUMOR_NECROSIS_FACTOR_SUPERFAMILY_CYTOKINE_PRODUCTION"] <- "TNFa superfamily cytokine production"
names(miDB_sig5)[names(miDB_sig5) == "HALLMARK_ANGIOGENESIS"] <- "Angiogenesis"
names(miDB_sig5)[names(miDB_sig5) == "GOMF_PATTERN_RECOGNITION_RECEPTOR_ACTIVITY"] <- "Pattern recognition receptor activity"
names(miDB_sig5)[names(miDB_sig5) == "PGE2_RO"] <- "PGE2 response genes"

# Filter low expression, NA and order by FC
df1 <- filter(df1, df1$logCPM>5 & !is.na(df1$...1))
df1<-df1[order(df1$logFC),]
df2<-df1$logFC
names(df2)<-df1$...1

# Run GSEA with fgsea and filter by PVal
sig <- miDB_sig5
test1 <- fgsea(sig, df2, minSize=7, maxSize=500, nproc=1)
PvalResult <- filter(test1, test1$padj <= 0.05)

# DotPlot pathways of interest (Fig.3C)
pathways <- c("Oxydative phosphorylation",
              "Regulation of cholesterol metabolic process",
              "Response to IL12",
              "Regulation of lipid biosynthetic process",
              "MHC-I protein binding",
              "TNFa mediated signalling pathway",
              "Negative regulation of cell cycle progression",
              "Fatty acyl-CoA biosynthetic process",
              "Response to TNFa",
              "G2M checkpoint",
              "Response to type I interferon",
              "IL1 mediated signalling pathway",
              "Response to IL1",
              "LPS response genes",
              "Cytokine mediated signalling pathway",
              "Myeloid cell differentiation",
              "IL10 production",
              "Regulation of endothelial cell proliferation",
              "TNFa superfamily cytokine production",
              "Angiogenesis",
              "Pattern recognition receptor activity",
              "PGE2 response genes")

genelist <- as.data.frame(PvalResult[PvalResult$pathway %in% pathways,])
ggplot(genelist, aes(x=NES, y=reorder(pathway,NES), size=size ,color=padj)) + 
  geom_point() +
  scale_size_area(limits=c(10,450), max_size = 15) +
  scale_colour_gradient(low="red",high="blue") +
  labs(y='Pathway', x='NES')

#Run GSEA with ClusterProfiler
genelist <- data.frame(term = rep(names(miDB_sig5), sapply(miDB_sig5, length)),
                       gene = unlist(miDB_sig5))
df2 = sort(df2, decreasing = TRUE)
test2 <- GSEA(df2,TERM2GENE = genelist)

#Enrichment map (Fig.3D)
test3 <- filter(test2, ID %in% pathways)
test3 <- pairwise_termsim(test3)
emapplot(test3, min_edge = 0.01, color = "NES", layout="fr", repel =T) +
  scale_fill_gradient2(name=bquote(NES),low="blue", high="red")