library(data.table)
library(dplyr)
library(openxlsx)
library(readxl)
library(gridExtra)
library(ggplot2)
library(survival)
library(survminer)
#library(babelgene)

# Set directories
wdir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts"
setwd(wdir)

input_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/Input"
output_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/Output"


################################################################################
# R script for TCGA survival analysis of miR-342-3p expression.
#
# This script analyzes TCGA pancancer miRNA expression and survival data to assess
# the prognostic value of hsa-miR-342-3p across different tumor types.
# It calculates hazard ratios (HR) and generates Kaplan-Meier survival curves
# to visualize the association between miR-342-3p expression levels and patient survival.
#
# Figures generated:
#   - HR_alltumors_342.pdf (HR forest plot): Forest plot visualizing hazard ratios and
#     significance of miR-342-3p expression on overall survival across various tumor types.
#   - Survival_***_342 (Survival curves - multiple tumors): Set of Kaplan-Meier survival plots
#     for tumor types showing significant hazard ratios, illustrating survival differences
#     between patients with high and low miR-342-3p expression.
#   - Survival_metastatic_342.pdf (Survival curve - metastatic tumors): Kaplan-Meier survival plot
#     specifically for metastatic tumors, showing the impact of miR-342-3p expression on survival
#     in this patient subgroup.
#
# Input data files:
#   - TCGA_phenotype_denseDataOnlyDownload.tsv.gz: TCGA patient phenotype data (metadata),
#     downloaded from TCGA or Xena, contains clinical and demographic information.
#   - Survival_SupplementalTable_S1_20171025_xena_sp: TCGA patient survival data, provides overall survival (OS) time and status.
#   - pancanMiRs_EBadjOnProtocolPlatformWithoutRepsWithUnCorrectMiRs_08_04_16.xena.gz:
#     TCGA pancancer miRNA expression data (FPKM values), downloaded from Xena,
#     contains miRNA expression levels across different tumor samples.
#
# Note:
#   - The script filters tumor types based on p-value significance (p < 0.1 for HR plot, p < 0.05 for survival curves)
#     and minimum patient number (n > 100 for HR plot). These thresholds can be adjusted within the script.
################################################################################

# Load dataset
metaD <- data.frame(fread(paste0(input_dir, "/TCGA_phenotype_denseDataOnlyDownload.tsv.gz"), sep='\t', colClasses=c("character"), data.table=FALSE))
Surv01 <- data.frame(fread(paste0(input_dir, "/Survival_SupplementalTable_S1_20171025_xena_sp"), sep='\t', colClasses=c("character"), data.table=FALSE))
# FPKM01 <- fread(paste0(input_dir, "/tcga_RSEM_Hugo_norm_count"))
FPKM01 <- fread(paste0(input_dir, "/pancanMiRs_EBadjOnProtocolPlatformWithoutRepsWithUnCorrectMiRs_08_04_16.xena.gz"))
FPKM01 <- as.data.frame(FPKM01)
# Annot <- data.frame(fread(paste0(input_dir, "/probeMap_gencode.v23.annotation.gene.probemap"), sep='\t',colClasses=c("character"),data.table=FALSE))
allGenes <- FPKM01[,1]

#####################################################################
#####PLOT HR Score of signature for multiple tumor types#############
#####################################################################
humanGenes <- list(c("hsa-miR-342-3p"))

# Create xlsx file with genes, patients and tumor types
Pan.data3 <- metaD[,]
Pan.data4 <- merge(Surv01,Pan.data3,by=1)
Genes_selected <- FPKM01[FPKM01[,1]%in%humanGenes,]

rownames(Genes_selected)<-Genes_selected[,1]
Genes_selected <- Genes_selected[,-1]
Genes_selected <- data.frame(t(Genes_selected))

Genes_selected$sample<-rownames(Genes_selected)
Pan.data5 <- merge(Genes_selected, Pan.data4, by.y="sample")
colnames(Pan.data5)
write.xlsx(Pan.data5, paste0(output_dir, "/miR342_patients_survival.xlsx"))


# Regression with defined k2
k2 <- 0.5 # define quartile for selecting low/high score
df2 <- NULL
for (i in unique(Pan.data5$cancer.type.abbreviation)){
  Pan.data6 <- Pan.data5[Pan.data5$cancer.type.abbreviation%in%i,]
  if(nrow(Pan.data6)>2){
    P.OS <- Surv(as.numeric(Pan.data6$OS.time), as.numeric(Pan.data6$OS))
    cohort.score<-rep(0,nrow(P.OS))
    M1 <- quantile(Pan.data6$hsa.miR.342.3p,k2)
    cohort.score[Pan.data6$hsa.miR.342.3p>=M1]<-1
    a1 <- coxph(P.OS ~ as.numeric(cohort.score))
    df1 <- data.frame(Tumor = unique(Pan.data6$cancer.type.abbreviation),npatients=length(Pan.data6$X_PATIENT), HR = summary(a1)$coefficients[2], SE=summary(a1)$coefficients[3],
                      p = summary(a1)$coefficients[5] )
    df2 <- rbind(df2,df1)
    df2 <- df2[(order(df2$p)),]
  }else{}}
write.xlsx(df2, paste0(output_dir, "/miR342_HR_by_tumortype.xlsx"))


# Filters selected tumors
df3<-df2
df3<-df3[df3$p<0.1,]
df3<-df3[df3$npatients>100,]
df3$significance <- ifelse(df3$p < 0.05, "p<0.05", "NS")
a<-ggplot(df3, aes(x = reorder(Tumor, HR), y = HR, color = significance, label = significance)) +
  geom_point(size = 4) +
  geom_errorbar(aes(ymin = pmax(HR-SE, 0), ymax = HR + SE), width = 0.2) +
  scale_color_manual(name = "Color", values = c("red", "blue")) +
  labs(title = "HR by tumor type", x = "Tumor Type", y = "HR") +
  geom_rect(aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = 1), fill = "lightblue", alpha = 0.01) +
  geom_rect(aes(xmin = -Inf, xmax = Inf, ymin = 1, ymax = Inf), fill = "pink", alpha = 0.01)+
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust= 1, color = "Black")) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "gray")
ggsave(filename = paste0(output_dir, "/HR_alltumors_342.pdf"), plot=a, width=9, height=4)


# Plot survival of chosen tumors
# Select type of tumor
df3 <- df3[order(df3$HR),]
df3 <- df3[df3$p<0.05,]

pdf(paste0(output_dir, "/Survival_in_chosen_tumors_342.pdf"), width=10 ,height=6)
par(mfrow = c(2,4))
survival<-list()
for (i in df3$Tumor){
  tumor.type<-i
  Pan.data6 <- Pan.data5[Pan.data5$cancer.type.abbreviation%in%i,]
  P.OS <- Surv(as.numeric(Pan.data6$OS.time), as.numeric(Pan.data6$OS))
  cohort.score<-rep(0,nrow(P.OS))
  M1 <- quantile(Pan.data6$hsa.miR.342.3p,k2)
  cohort.score[Pan.data6$hsa.miR.342.3p>=M1]<-1
  # Plotting survival
  fit.score <- survfit(P.OS ~ cohort.score)
  cox_model <- coxph(P.OS ~ as.numeric(cohort.score))
  p_value <- summary(cox_model)$coefficients["as.numeric(cohort.score)", "Pr(>|z|)"]
  formatted_p_value <- ifelse(p_value < 0.0001, "<0.0001",  as.numeric(round(p_value,5)))
  par(las = 0)
  plot(fit.score, lty = c(1, 1), col = c("blue", "red"), xlab = "Time (d)", ylab = "Overall Survival", lwd = 2, bty = "n",
       main = paste0("Survival in ", i)) 
  # Add legend
  legend(x = max(fit.score$time)/2, y = 1, legend = c("miRNA high", "miRNA low"), lty = c(1, 1), col = c("red", "blue"), lwd = 2,bty = "n")
  text(x = max(fit.score$time)/1.8, y = 0.70, label = paste("p=", (formatted_p_value)), adj = c(0, 1))
  text(x = 10, y = 0.05, label = paste("n =", length(cohort.score)), adj = c(0, 1))
  survival[[i]] <- recordPlot()
}
dev.off()


# Plot survival of metastastic/primary cancers
#k2<-0.5
Pan.data7 <- Pan.data5[Pan.data5$sample_type == "Metastatic",]
#Pan.data7 <- Pan.data5[Pan.data5$sample_type == "Primary Tumor",]
P.OS <- Surv(as.numeric(Pan.data7$OS.time), as.numeric(Pan.data7$OS))
cohort.score<-rep(0,nrow(P.OS))
M1 <- quantile(Pan.data7$hsa.miR.342.3p,k2)
cohort.score[Pan.data7$hsa.miR.342.3p>M1]<-1
fit.score <- survfit(P.OS ~ cohort.score)
cox_model <- coxph(P.OS ~ as.numeric(cohort.score))
p_value <- summary(cox_model)$coefficients["as.numeric(cohort.score)", "Pr(>|z|)"]
formatted_p_value <- ifelse(p_value < 0.0001, "<0.0001",  as.numeric(round(p_value,5)))
par(las = 0)


pdf(paste0(output_dir, "/Survival_metastatic_342.pdf"), width=5, height=5)
plot(fit.score, lty = c(1, 1), col = c("blue", "red"), xlab = "Time (d)", ylab = "Overall Survival", lwd = 2, bty = "n",
     main = "Metastatic tumors") 
# Add legend
legend(x = max(fit.score$time)/2, y = 1, legend = c("miRNA high", "miRNA low"), lty = c(1, 1), col = c("red", "blue"), lwd = 2,bty = "n")
text(x = max(fit.score$time)/1.8, y = 0.70, label = paste("p=", (formatted_p_value)), adj = c(0, 1))
text(x = 10, y = 0.05, label = paste("n =", length(cohort.score)), adj = c(0, 1))
dev.off()