library(data.table) library(dplyr) library(openxlsx) library(readxl) library(gridExtra) library(ggplot2) library(survival) library(survminer) #library(babelgene) # Set directories wdir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts" setwd(wdir) input_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/Input" output_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/Output" ################################################################################ # R script for TCGA survival analysis of miR-342-3p expression. # # This script analyzes TCGA pancancer miRNA expression and survival data to assess # the prognostic value of hsa-miR-342-3p across different tumor types. # It calculates hazard ratios (HR) and generates Kaplan-Meier survival curves # to visualize the association between miR-342-3p expression levels and patient survival. # # Figures generated: # - HR_alltumors_342.pdf (HR forest plot): Forest plot visualizing hazard ratios and # significance of miR-342-3p expression on overall survival across various tumor types. # - Survival_***_342 (Survival curves - multiple tumors): Set of Kaplan-Meier survival plots # for tumor types showing significant hazard ratios, illustrating survival differences # between patients with high and low miR-342-3p expression. # - Survival_metastatic_342.pdf (Survival curve - metastatic tumors): Kaplan-Meier survival plot # specifically for metastatic tumors, showing the impact of miR-342-3p expression on survival # in this patient subgroup. # # Input data files: # - TCGA_phenotype_denseDataOnlyDownload.tsv.gz: TCGA patient phenotype data (metadata), # downloaded from TCGA or Xena, contains clinical and demographic information. # - Survival_SupplementalTable_S1_20171025_xena_sp: TCGA patient survival data, provides overall survival (OS) time and status. # - pancanMiRs_EBadjOnProtocolPlatformWithoutRepsWithUnCorrectMiRs_08_04_16.xena.gz: # TCGA pancancer miRNA expression data (FPKM values), downloaded from Xena, # contains miRNA expression levels across different tumor samples. # # Note: # - The script filters tumor types based on p-value significance (p < 0.1 for HR plot, p < 0.05 for survival curves) # and minimum patient number (n > 100 for HR plot). These thresholds can be adjusted within the script. ################################################################################ # Load dataset metaD <- data.frame(fread(paste0(input_dir, "/TCGA_phenotype_denseDataOnlyDownload.tsv.gz"), sep='\t', colClasses=c("character"), data.table=FALSE)) Surv01 <- data.frame(fread(paste0(input_dir, "/Survival_SupplementalTable_S1_20171025_xena_sp"), sep='\t', colClasses=c("character"), data.table=FALSE)) # FPKM01 <- fread(paste0(input_dir, "/tcga_RSEM_Hugo_norm_count")) FPKM01 <- fread(paste0(input_dir, "/pancanMiRs_EBadjOnProtocolPlatformWithoutRepsWithUnCorrectMiRs_08_04_16.xena.gz")) FPKM01 <- as.data.frame(FPKM01) # Annot <- data.frame(fread(paste0(input_dir, "/probeMap_gencode.v23.annotation.gene.probemap"), sep='\t',colClasses=c("character"),data.table=FALSE)) allGenes <- FPKM01[,1] ##################################################################### #####PLOT HR Score of signature for multiple tumor types############# ##################################################################### humanGenes <- list(c("hsa-miR-342-3p")) # Create xlsx file with genes, patients and tumor types Pan.data3 <- metaD[,] Pan.data4 <- merge(Surv01,Pan.data3,by=1) Genes_selected <- FPKM01[FPKM01[,1]%in%humanGenes,] rownames(Genes_selected)<-Genes_selected[,1] Genes_selected <- Genes_selected[,-1] Genes_selected <- data.frame(t(Genes_selected)) Genes_selected$sample<-rownames(Genes_selected) Pan.data5 <- merge(Genes_selected, Pan.data4, by.y="sample") colnames(Pan.data5) write.xlsx(Pan.data5, paste0(output_dir, "/miR342_patients_survival.xlsx")) # Regression with defined k2 k2 <- 0.5 # define quartile for selecting low/high score df2 <- NULL for (i in unique(Pan.data5$cancer.type.abbreviation)){ Pan.data6 <- Pan.data5[Pan.data5$cancer.type.abbreviation%in%i,] if(nrow(Pan.data6)>2){ P.OS <- Surv(as.numeric(Pan.data6$OS.time), as.numeric(Pan.data6$OS)) cohort.score<-rep(0,nrow(P.OS)) M1 <- quantile(Pan.data6$hsa.miR.342.3p,k2) cohort.score[Pan.data6$hsa.miR.342.3p>=M1]<-1 a1 <- coxph(P.OS ~ as.numeric(cohort.score)) df1 <- data.frame(Tumor = unique(Pan.data6$cancer.type.abbreviation),npatients=length(Pan.data6$X_PATIENT), HR = summary(a1)$coefficients[2], SE=summary(a1)$coefficients[3], p = summary(a1)$coefficients[5] ) df2 <- rbind(df2,df1) df2 <- df2[(order(df2$p)),] }else{}} write.xlsx(df2, paste0(output_dir, "/miR342_HR_by_tumortype.xlsx")) # Filters selected tumors df3<-df2 df3<-df3[df3$p<0.1,] df3<-df3[df3$npatients>100,] df3$significance <- ifelse(df3$p < 0.05, "p<0.05", "NS") a<-ggplot(df3, aes(x = reorder(Tumor, HR), y = HR, color = significance, label = significance)) + geom_point(size = 4) + geom_errorbar(aes(ymin = pmax(HR-SE, 0), ymax = HR + SE), width = 0.2) + scale_color_manual(name = "Color", values = c("red", "blue")) + labs(title = "HR by tumor type", x = "Tumor Type", y = "HR") + geom_rect(aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = 1), fill = "lightblue", alpha = 0.01) + geom_rect(aes(xmin = -Inf, xmax = Inf, ymin = 1, ymax = Inf), fill = "pink", alpha = 0.01)+ theme_minimal() + theme(axis.text.x = element_text(angle = 45, hjust= 1, color = "Black")) + geom_hline(yintercept = 1, linetype = "dashed", color = "gray") ggsave(filename = paste0(output_dir, "/HR_alltumors_342.pdf"), plot=a, width=9, height=4) # Plot survival of chosen tumors # Select type of tumor df3 <- df3[order(df3$HR),] df3 <- df3[df3$p<0.05,] pdf(paste0(output_dir, "/Survival_in_chosen_tumors_342.pdf"), width=10 ,height=6) par(mfrow = c(2,4)) survival<-list() for (i in df3$Tumor){ tumor.type<-i Pan.data6 <- Pan.data5[Pan.data5$cancer.type.abbreviation%in%i,] P.OS <- Surv(as.numeric(Pan.data6$OS.time), as.numeric(Pan.data6$OS)) cohort.score<-rep(0,nrow(P.OS)) M1 <- quantile(Pan.data6$hsa.miR.342.3p,k2) cohort.score[Pan.data6$hsa.miR.342.3p>=M1]<-1 # Plotting survival fit.score <- survfit(P.OS ~ cohort.score) cox_model <- coxph(P.OS ~ as.numeric(cohort.score)) p_value <- summary(cox_model)$coefficients["as.numeric(cohort.score)", "Pr(>|z|)"] formatted_p_value <- ifelse(p_value < 0.0001, "<0.0001", as.numeric(round(p_value,5))) par(las = 0) plot(fit.score, lty = c(1, 1), col = c("blue", "red"), xlab = "Time (d)", ylab = "Overall Survival", lwd = 2, bty = "n", main = paste0("Survival in ", i)) # Add legend legend(x = max(fit.score$time)/2, y = 1, legend = c("miRNA high", "miRNA low"), lty = c(1, 1), col = c("red", "blue"), lwd = 2,bty = "n") text(x = max(fit.score$time)/1.8, y = 0.70, label = paste("p=", (formatted_p_value)), adj = c(0, 1)) text(x = 10, y = 0.05, label = paste("n =", length(cohort.score)), adj = c(0, 1)) survival[[i]] <- recordPlot() } dev.off() # Plot survival of metastastic/primary cancers #k2<-0.5 Pan.data7 <- Pan.data5[Pan.data5$sample_type == "Metastatic",] #Pan.data7 <- Pan.data5[Pan.data5$sample_type == "Primary Tumor",] P.OS <- Surv(as.numeric(Pan.data7$OS.time), as.numeric(Pan.data7$OS)) cohort.score<-rep(0,nrow(P.OS)) M1 <- quantile(Pan.data7$hsa.miR.342.3p,k2) cohort.score[Pan.data7$hsa.miR.342.3p>M1]<-1 fit.score <- survfit(P.OS ~ cohort.score) cox_model <- coxph(P.OS ~ as.numeric(cohort.score)) p_value <- summary(cox_model)$coefficients["as.numeric(cohort.score)", "Pr(>|z|)"] formatted_p_value <- ifelse(p_value < 0.0001, "<0.0001", as.numeric(round(p_value,5))) par(las = 0) pdf(paste0(output_dir, "/Survival_metastatic_342.pdf"), width=5, height=5) plot(fit.score, lty = c(1, 1), col = c("blue", "red"), xlab = "Time (d)", ylab = "Overall Survival", lwd = 2, bty = "n", main = "Metastatic tumors") # Add legend legend(x = max(fit.score$time)/2, y = 1, legend = c("miRNA high", "miRNA low"), lty = c(1, 1), col = c("red", "blue"), lwd = 2,bty = "n") text(x = max(fit.score$time)/1.8, y = 0.70, label = paste("p=", (formatted_p_value)), adj = c(0, 1)) text(x = 10, y = 0.05, label = paste("n =", length(cohort.score)), adj = c(0, 1)) dev.off()