I updated the scripts.

65cf9977 · Marco Monti · cad0b7de · 65cf9977 · 65cf9977 · 65cf9977
Commit 65cf9977 authored Mar 27, 2025 by Marco Monti
--- a/CB2025_figure_1_RNAseq.R
+++ b/CB2025_figure_1_RNAseq.R
@@ -11,7 +11,7 @@ library("dplyr")
 # wdir1510 <-paste0(us, "/Dropbox (HSR Global)/SquadritoM_1510_RNA_miRNA_QIAseq_UPX")
 # output_d <- paste0(us, "/Dropbox (HSR Global)/CancerGeneTherapy/Cancer Gene Therapy/MS/2024 Bresesti et al/Scripts/plots and tables used in figures")

-input_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/reference"
+input_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/Input"
 output_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/Output"

 ################################################################################
@@ -34,7 +34,7 @@ output_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq


 #### Figure 1B ####
-df1 <- as.data.frame(read_excel(paste0(input_dir, "/QIAseqUltraplexRNA_181342.xlsx"), sheet=3, col_names = T, skip=1)) #sheet: umis.genes.polyA-mouse
+df1 <- as.data.frame(read_excel(paste0(input_dir, "/QIAseqUltraplexRNA_181342.xlsx"), sheet=3, col_names = T, skip=1)) # sheet: umis.genes.polyA-mouse
 df1 <-df1[,-c(1,3:6)] # keep only gene name and UMI counts
 df1[,-1] <-apply(df1[,-1],2,function(x){x/sum(x)*1000000}) # UMI normalized by CPM
 gene_vector <- c("Cd19", "Ms4a1", "Fcer2a", "Ighm", "Cd8a", "Xcr1", "Itgae", "Itgax", 
@@ -48,14 +48,13 @@ openxlsx::write.xlsx(df2.scaled, paste0(output_dir, "/Figure_1B_table.xlsx"), ro
 # the df was exported and used to create a heatmap using Graphpad

 #### Figure 1C ####
-# "edgeR_DGE_res_volcano.pdf" (in the 'wdir1509' folder) was imported in illustrator 
-# and merged with the plot generated by the code below
-df1 <- as.data.frame(read_excel(paste0(input_dir, "/173308.all_samples.summary.xlsx"), sheet=2, col_names=T)) #sheet: miRNA_piRNA
-df1 <-df1[,1:7] #UMI 
+# "edgeR_DGE_res_volcano.pdf" (in the 'wdir1509' folder) was imported in illustrator and merged with the plot generated by the code below
+df1 <- as.data.frame(read_excel(paste0(input_dir, "/173308.all_samples.summary.xlsx"), sheet=2, col_names=T)) # sheet: miRNA_piRNA
+df1 <- df1[,1:7] # UMI 
 rownames(df1)<-df1$miRNA
-df2 <- apply(df1[,-1], 2, function(x) log(x)) #Log counts
-df2 <- as.data.frame(limma::normalizeCyclicLoess(df2, weights = NULL, span=0.7, iterations = 5, method = "pairs")) #Cyclic loess normalization
-colnames(df2)<-c("B cell","RPM","cDC1","cDC2","LSEC","KC") #Rename columns
+df2 <- apply(df1[,-1], 2, function(x) log(x)) # Log counts
+df2 <- as.data.frame(limma::normalizeCyclicLoess(df2, weights = NULL, span=0.7, iterations = 5, method = "pairs")) # Cyclic loess normalization
+colnames(df2)<-c("B cell","RPM","cDC1","cDC2","LSEC","KC") # Rename columns
 upper.panel<-function(x, y, ...){
  points(((x+y)/2),(x-y), cex=0.6, pch=19, col="grey")
  above <- (x-y-1)*((x+y)/2-5) > 5 & ((x+y)/2)>5
@@ -63,19 +62,21 @@ upper.panel<-function(x, y, ...){
  below <- (x-y+1)*((x+y)/2-5) < -5 & ((x+y)/2)>5
  points(((x+y)/2)[below],(x-y)[below], col="blue", cex=0.6, pch=19)
 } # function for MA plot
+pdf(file = paste(output_dir, "Fig_1C.pdf", sep = "/"), width=9, height=9)
 pairs(df2[,1:6], lower.panel = NULL, upper.panel = upper.panel, 
      ylim=c(-8.5,8.5), xlim=c(2,14), cex.labels = 2) # pairwise plot
+dev.off()

 #### Figure 1D ####
-df1 <- as.data.frame(read_excel(paste0(input_dir, "/173308.all_samples.summary.xlsx"), sheet=2, col_names=T)) #sheet: miRNA_piRNA
+df1 <- as.data.frame(read_excel(paste0(input_dir, "/173308.all_samples.summary.xlsx"), sheet=2, col_names=T)) # sheet: miRNA_piRNA
 df1 <- df1[-which(grepl("piR",df1[,1])),1:7] # UMI and piRNA removing
 df1[,1] <- gsub("/.*","",df1[,1]) # leave only first miRNA for ambiguous entries
-df1[,-1] <- apply(df1[,-1],2,function(x){x/sum(x,na.rm=T)*1000000}) #UMI normalized by CPM
-df.families <- as.data.frame(read_excel("Analysis CB/miRNA_Family.xlsx",sheet=1,col_names = T))#miRNA families from miRBase
-df.families <- df.families[df.families[,3]==10090,c(4,1)] #select mouse entries
+df1[,-1] <- apply(df1[,-1],2,function(x){x/sum(x,na.rm=T)*1000000}) # UMI normalized by CPM
+df.families <- as.data.frame(read_excel(paste0(input_dir, "/miRNA_Family.xlsx"), sheet=1 ,col_names=T)) # miRNA families from miRBase
+df.families <- df.families[df.families[,3]==10090,c(4,1)] # select mouse entries
 df.families[,1] <- sub(pattern = "p.*",replacement ="p",x = df.families[,1])
 df2 <- merge(df.families,df1,by=1)
-df2 <- aggregate(df2[,-c(1:2)],by = df2["miR family"],FUN = sum,na.rm=T) #sum counts by family
+df2 <- aggregate(df2[,-c(1:2)],by = df2["miR family"],FUN = sum,na.rm=T) # sum counts by family
 gene_vector <- c("miR-150-5p","miR-25-3p/32-5p/92-3p/363-3p/367-3p","miR-142-3p.1",
                 "miR-17-5p/20-5p/93-5p/106-5p","miR-191-5p",
                 "miR-15-5p/16-5p/195-5p/322-5p/497-5p","miR-26-5p","miR-138-5p",
@@ -83,6 +84,6 @@ gene_vector <- c("miR-150-5p","miR-25-3p/32-5p/92-3p/363-3p/367-3p","miR-142-3p.
                 "miR-125-5p/351-5p","miR-126-3p.1","miR-199-3p")
 df3 <- subset(df2, df2$`miR family`%in% gene_vector) %>% arrange(factor(`miR family`, levels=gene_vector))
 rownames(df3) <- df3[,1]
-df3.scaled <- as.data.frame(t(scale(t(df3[-1])))) #Zscore normalization. Scaled only works on columns, so need to transform
+df3.scaled <- as.data.frame(t(scale(t(df3[-1])))) # Zscore normalization. Scaled only works on columns, so need to transform
 openxlsx::write.xlsx(df2.scaled, paste0(output_dir, "/Figure_1D_table.xlsx"), rowNames=T)
-#the df was exported and used to create a heatmap using Graphpad
+# The df was exported and used to create a heatmap using Graphpad
--- a/CB2025_figure_3_RNAseq.R
+++ b/CB2025_figure_3_RNAseq.R
@@ -12,7 +12,7 @@ library(enrichplot)
 # wdir_CB<-paste0(us, "/Dropbox (HSR Global)/90-857433247_RNAseq_Squadrito/Analysis CB_v2")
 # fdir <- paste0(us,"/Dropbox (HSR Global)/CancerGeneTherapy/Cancer Gene Therapy/MS/2024 Bresesti et al/Scripts/plots and tables used in figures")

-input_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/reference"
+input_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/Input"
 output_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/Output"

 ################################################################################
@@ -32,7 +32,7 @@ output_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq
 # Input data files:
 # - Figure 3A & 3B: "edgeR_results.xlsx" (Sheet "miR342-control" and "spongeBT-control")
 # (Output from differential gene expression analysis, likely using edgeR)
-# - Figure 3A & 3B: "TargetScan8.0__miR-342-3p.predicted_targets.xlsx" (Sheet 1)
+# - Figure 3A & 3B: "TargetScan8.0_miR-342-3p.predicted_targets.xlsx" (Sheet 1)
 # (List of predicted miR-342-3p target genes from TargetScan)
 # - Figure 3C & 3D: "EdgeR_results.xlsx" (Sheet "miR342-control")
 # (Same as input for Figure 3A & 3B, used for GSEA)
@@ -44,11 +44,11 @@ output_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq

 #### Figure 3A&B ####

-#Import df
+# Import df
 miR_ctrl <- read_excel(paste0(input_dir, "/edgeR_results.xlsx"), sheet = "miR342-control")
 sponge_ctrl <- read_excel(paste0(input_dir, "/edgeR_results.xlsx"), sheet = "spongeBT-control")

-#Add DEG color and label for volcano plot
+# Add DEG color and label for volcano plot
 miR_ctrl$DEG <- "NO"
 miR_ctrl$DEG[miR_ctrl$logFC > 1 & miR_ctrl$PValue < 0.05] <- "UP"
 miR_ctrl$DEG[miR_ctrl$logFC < (-1) & miR_ctrl$PValue < 0.05] <- "DOWN"
@@ -61,9 +61,9 @@ sponge_ctrl$DEG[sponge_ctrl$logFC < (-1) & sponge_ctrl$PValue < 0.05] <- "DOWN"
 sponge_ctrl$DEG_label <- NA
 sponge_ctrl$DEG_label[sponge_ctrl$DEG != "NO"] <- sponge_ctrl$...1[sponge_ctrl$DEG != "NO"]

-#Volcano plot (left panel)
+# Volcano plot (left panel)
 data <- miR_ctrl
-ggplot(data=data, aes(x=logFC, y=-log10(PValue), col=DEG, label=DEG_label)) +
+p1 <- ggplot(data=data, aes(x=logFC, y=-log10(PValue), col=DEG, label=DEG_label)) +
  geom_point() + 
  theme_minimal() +
  geom_text_repel() +
@@ -74,9 +74,10 @@ ggplot(data=data, aes(x=logFC, y=-log10(PValue), col=DEG, label=DEG_label)) +
  scale_y_continuous(limits = c(0,20)) +
  xlab(bquote(Log[2](FC))) +
  ylab(bquote(-Log[10](PVal)))
+ggsave(filename = paste0(output_dir, "/Volcano_plot_3A.pdf"), plot=p1, width=7, height=7)

 data <- sponge_ctrl
-ggplot(data=data, aes(x=logFC, y=-log10(PValue), col=DEG, label=DEG_label)) +
+p2 <- ggplot(data=data, aes(x=logFC, y=-log10(PValue), col=DEG, label=DEG_label)) +
  geom_point() + 
  theme_minimal() +
  geom_text_repel() +
@@ -87,49 +88,54 @@ ggplot(data=data, aes(x=logFC, y=-log10(PValue), col=DEG, label=DEG_label)) +
  scale_y_continuous(limits = c(0,20)) +
  xlab(bquote(Log[2](FC))) +
  ylab(bquote(-Log[10](PVal)))
+ggsave(filename = paste0(output_dir, "/Volcano_plot_3B.pdf"), plot=p2, width=7, height=7)

 # Import miR-342-3p target list (from TargetScan)
-miR342_targets <-  read_excel(paste0(input_dir, "/TargetScan8.0__miR-342-3p.predicted_targets.xlsx"))
+miR342_targets <-  read_excel(paste0(input_dir, "/TargetScan8.0_miR-342-3p.predicted_targets.xlsx"))
 miR342_targets <-  filter(miR342_targets, miR342_targets$`Cumulative weighted context++ score`< (-0.3))

-#ecdf plot (right panel)
+# ecdf plot (right panel)
 data <- miR_ctrl
-test <-filter(data, data$...1 %in% miR342_targets$'Target gene')
+test <- filter(data, data$...1 %in% miR342_targets$'Target gene')
+pdf(file = paste(output_dir, "Fig_3A_dx.pdf", sep = "/"), width=7, height=7)
 plot(ecdf(data$logFC), lwd = 2, do.points=F, verticals=T,
     ylab = "Fraction of DEG",
     xlab = bquote(Log[2](FC)),
     xlim=c(-0.5,0.5),
     ylim=c(0,1),
-     main="ECDF of gene expression: miR vs mut") +
-  lines(ecdf(test$logFC), col= "blue", do.points=F, lwd = 2, verticals=T) +
-  abline(v=0, col="black") + 
-  abline(h=0.5, col="black") +
-  legend("bottomright", c("All genes","miR-342-3p targets"),
-         col = c("black","blue"), lwd=2, cex = 0.7)
-ks.test(test$logFC, data$logFC, alternative = "g") #Kolmogorov-Smirnov test to calculate p-val of miR target distribution *not* greater than average data
+     main="ECDF of gene expression: miR vs mut")
+lines(ecdf(test$logFC), col= "blue", do.points=F, lwd = 2, verticals=T)
+abline(v=0, col="black")
+abline(h=0.5, col="black")
+legend("bottomright", c("All genes","miR-342-3p targets"), col = c("black","blue"), lwd=2, cex = 0.7)
+ks.test(test$logFC, data$logFC, alternative = "g") # Kolmogorov-Smirnov test to calculate p-val of miR target distribution *not* greater than average data
+dev.off()
+

 data <- sponge_ctrl
 test <-filter(data, data$...1 %in% miR342_targets$'Target gene')
+pdf(file = paste(output_dir, "Fig_3B_dx.pdf", sep = "/"), width=7, height=7)
 plot(ecdf(data$logFC), lwd = 2, do.points=F, verticals=T,
     ylab = "Fraction of DEG",
     xlab = bquote(Log[2](FC)),
     xlim=c(-0.3,0.3),
     ylim=c(0,1),
-     main="ECDF of gene expression: sponge vs scr") +
-  lines(ecdf(test$logFC), col= "red", do.points=F, lwd = 2, verticals=T) +
-  abline(v=0, col="black") + 
-  abline(h=0.5, col="black") +
-  legend("bottomright", c("All genes","miR-342-3p targets"),
-         col = c("black","red"), lwd=2, cex = 0.7)
+     main="ECDF of gene expression: sponge vs scr")
+lines(ecdf(test$logFC), col= "red", do.points=F, lwd = 2, verticals=T)
+abline(v=0, col="black")
+abline(h=0.5, col="black")
+legend("bottomright", c("All genes","miR-342-3p targets"), col = c("black","red"), lwd=2, cex = 0.7)
 ks.test(test$logFC, data$logFC, alternative = "l") # Kolmogorov-Smirnov test to calculate p-val of miR target distribution *not* less than average data
+dev.off()
+

 #### Figure 3C&D ####

 # Load df
-df1<- read_excel(paste0(input_dir, "/EdgeR_results.xlsx"), sheet = "miR342-control")
-miDB_sig5 <- readRDS(paste0(input_dir, "/miDB_sig5.MLS.rds")) #GO terms db
+df1 <- read_excel(paste0(input_dir, "/edgeR_results.xlsx"), sheet = "miR342-control")
+miDB_sig5 <- readRDS(paste0(input_dir, "/miDB_sig5.MLS.rds")) # GO terms db

-#Rename pathways of interest in miDB_sig5
+# Rename pathways of interest in miDB_sig5
 names(miDB_sig5)[names(miDB_sig5) == "HALLMARK_OXIDATIVE_PHOSPHORYLATION"] <- "Oxydative phosphorylation"
 names(miDB_sig5)[names(miDB_sig5) == "GOBP_REGULATION_OF_CHOLESTEROL_METABOLIC_PROCESS"] <- "Regulation of cholesterol metabolic process"
 names(miDB_sig5)[names(miDB_sig5) == "GOBP_RESPONSE_TO_INTERLEUKIN_12"] <- "Response to IL12"
@@ -189,20 +195,23 @@ pathways <- c("Oxydative phosphorylation",
              "PGE2 response genes")

 genelist <- as.data.frame(PvalResult[PvalResult$pathway %in% pathways,])
-ggplot(genelist, aes(x=NES, y=reorder(pathway,NES), size=size ,color=padj)) + 
+p3 <- ggplot(genelist, aes(x=NES, y=reorder(pathway,NES), size=size ,color=padj)) + 
  geom_point() +
  scale_size_area(limits=c(10,450), max_size = 15) +
-  scale_colour_gradient(low="red",high="blue") +
+  scale_colour_gradient(low="red", high="blue") +
  labs(y='Pathway', x='NES')
+ggsave(filename = paste0(output_dir, "/gsea_DotPlot_Fig3C.pdf"), plot=p3, width=7, height=7)

-#Run GSEA with ClusterProfiler
+# Run GSEA with ClusterProfiler
 genelist <- data.frame(term = rep(names(miDB_sig5), sapply(miDB_sig5, length)),
                       gene = unlist(miDB_sig5))
 df2 = sort(df2, decreasing = TRUE)
-test2 <- GSEA(df2,TERM2GENE = genelist)
+test2 <- GSEA(df2, TERM2GENE = genelist)

-#Enrichment map (Fig.3D)
+# Enrichment map (Fig.3D)
 test3 <- filter(test2, ID %in% pathways)
 test3 <- pairwise_termsim(test3)
-emapplot(test3, min_edge = 0.01, color = "NES", layout="fr", repel =T) +
-  scale_fill_gradient2(name=bquote(NES),low="blue", high="red") 
\ No newline at end of file
+pdf(file = paste(output_dir, "emapplot_Fig.3D.pdf", sep = "/"), width=7, height=7)
+emapplot(test3, min_edge = 0.01, color = "NES", layout="fr") + # repel=T
+  scale_fill_gradient2(name=bquote(NES), low="blue", high="red")
+dev.off()
--- a/CB2025_figure_5_scRNAseq.R
+++ b/CB2025_figure_5_scRNAseq.R
@@ -20,13 +20,16 @@ library(cetcolor)

 wdir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/results2/CB1_CB3_CB4"
 wdir_CB <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/CB_scripts/results2"
-plot_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/CB_scripts_final/plots and tables used in figures"
+#plot_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/CB_scripts_final/plots and tables used in figures"
 dirCB <-"/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/CB_scripts"

-dir.create(plot_dir, showWarnings=F, recursive=T)
+#dir.create(plot_dir, showWarnings=F, recursive=T)
+
+input_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/Input"
+output_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/Output"

 # 16357 GO terms db. used for GSEA
-sig <- readRDS("/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/reference/miDB_sig5.MLS.rds")
+sig <- readRDS(paste0(input_dir, "/miDB_sig5.MLS.rds"))


 ################################################################################
@@ -68,8 +71,6 @@ obs1@meta.data$Sample[obs1@meta.data$orig.ident=="CB4"&obs1@meta.data$hash.ID=="
 obs1@meta.data$Sample[obs1@meta.data$orig.ident=="CB4"&obs1@meta.data$hash.ID=="Mouse2"] <- "miR_02"
 obs1@meta.data$Sample[obs1@meta.data$orig.ident=="CB4"&obs1@meta.data$hash.ID=="Mouse3"] <- "miR_05"

-obs1<-FindClusters(obs1, graph.name="RNA_snn_h.orig.ident", resolution=4)
-
 # Manually annotate the clusters
 obs1@meta.data$Annotation.final.CB<-"Undefined"
 obs1@meta.data$Annotation.final.CB[obs1@meta.data$RNA_snn_h.orig.ident_res.0.6==0]<- "Monocytes"
@@ -98,21 +99,28 @@ obs1@meta.data$Annotation.final.CB[obs1@meta.data$RNA_snn_h.orig.ident_res.4==51

 # Visualize final annotation in Umap (Fig.5A)
 obs1 <- SetIdent(obs1,value="Annotation.final.CB")
-pdf(file = paste(plot_dir, "CB_Annotation.final.CB_Fig.5A.pdf", sep = "/"), width=9, height=7)
+pdf(file = paste(output_dir, "CB_Annotation.final.CB_Fig.5A.pdf", sep = "/"), width=9, height=7)
 DimPlot(obs1, reduction="umap.harmony.orig.ident", group.by="Annotation.final.CB", label=T, repel=T)
 dev.off()

 # Adding a simplified annotation
-obs1@meta.data$Shortlables<-obs1@meta.data$Annotation.final.CB
+obs1@meta.data$Shortlables <- obs1@meta.data$Annotation.final.CB
 DCs<-c("Mig cDCs","pDCs","pre DCs", "cDC1", "cDC2_MoDCs")
-obs1@meta.data$Shortlables[obs1@meta.data$Annotation.final.CB%in%DCs]<- "DCs"
+obs1@meta.data$Shortlables[obs1@meta.data$Annotation.final.CB%in%DCs] <- "DCs"
 granu<-c("Neutrophils","Basophils")
-obs1@meta.data$Shortlables[obs1@meta.data$Annotation.final.CB%in%granu]<- "Granulocytes"
+obs1@meta.data$Shortlables[obs1@meta.data$Annotation.final.CB%in%granu] <- "Granulocytes"
 TAMs<-c("TAMs_1", "TAMs_2", "TAMs_3", "TAMs_4", "TAMs_5")
-obs1@meta.data$Shortlables[obs1@meta.data$Annotation.final.CB%in%TAMs]<- "TAMs"
+obs1@meta.data$Shortlables[obs1@meta.data$Annotation.final.CB%in%TAMs] <- "TAMs"
+
+# Export DimPlot as excel
+var_list <- c("UMAPh_1", "UMAPh_2","PROP.Group","orig.ident", "Sample","RNA_snn_h.orig.ident_res.0.6", "mOrange_union","Annotation.final.CB", "Shortlables")
+obs1_df <- FetchData(obs1, vars = var_list)
+obs1_df$cell_ID <- rownames(obs1_df)
+write.xlsx(obs1_df, file=paste0(output_dir, "/CB_Annotation.final.CB_Fig.5A.xlsx"), overwrite=T)
+

 # Dotplot with markers for population (Suppl.Fig.5A)
-obs1 <- SetIdent(obs1,value="Shortlables")
+obs1 <- SetIdent(obs1, value="Shortlables")
 obs1$Shortlables <- factor(x = obs1$Shortlables, 
                           levels = c("Cancer cells","Cholangiocytes",
                                      "Hepatocytes", "LSECs", "KC-LSEC doublet", 
@@ -130,7 +138,7 @@ gene <- c("Vil1", "Cdx2","Cldn4","Cldn7","Epcam",
          "Cd209a","Itgax", "Flt3","H2-Aa","Xcr1", "Clec9a",
          "Cd19", "Cd22", "Cd79a", "Ms4a1", "Pax5",
          "Cd3e", "Cd4", "Cd8a", "Ncr1", "Gzmb")
-pdf(file = paste(plot_dir, "CB_DotPlot_Shortlables_Fig.S5A.pdf", sep="/"), width=12, height=6)
+pdf(file = paste(output_dir, "CB_DotPlot_Shortlables_Fig.S5A.pdf", sep="/"), width=12, height=6)
 DotPlot(obs1,features = unique(gene), cols = "RdBu") + 
  theme(axis.text.x = element_text(angle=45, vjust=1, hjust=1))
 dev.off()
@@ -141,8 +149,8 @@ dev.off()
 cells_labels <- table(obs1@meta.data$Annotation.final.CB, obs1@meta.data$Sample)
 cells_labels_p <- round(prop.table(table(obs1@meta.data$Annotation.final.CB, obs1@meta.data$Sample), 2) * 100, 1)

-write.csv(cells_labels, paste0(plot_dir, "/number_cells_sample_Annotation.final.CB_Fig.S5B.csv"), row.names=T)
-write.csv(cells_labels_p, paste0(plot_dir, "/number_cells_sample_Annotation.final.CB_percentage_Fig.S5B.csv"), row.names=T)
+write.csv(cells_labels, paste0(output_dir, "/number_cells_sample_Annotation.final.CB_Fig.S5B.csv"), row.names=T)
+write.csv(cells_labels_p, paste0(output_dir, "/number_cells_sample_Annotation.final.CB_percentage_Fig.S5B.csv"), row.names=T)


 #### Analysis of mOrange+ fraction (Fig.5B, Suppl.Fig.5C,D) ####
@@ -156,8 +164,8 @@ obs2 <- subset(obs1, subset = mOrange_union == "mOrange+")
 cells_labels_orange <- table(obs2@meta.data$Annotation.final.CB, obs2@meta.data$Sample)
 cells_labels_orange_p <- round(prop.table(table(obs2@meta.data$Annotation.final.CB, obs2@meta.data$Sample), 2) * 100, 1)

-write.csv(cells_labels_orange, paste0(plot_dir, "/number_cells_sample_Annotation.final.CB_mOrange_fraction_Fig.S5C.csv"), row.names=T)
-write.csv(cells_labels_orange_p, paste0(plot_dir, "/number_cells_sample_Annotation.final.CB_percentage_mOrange_fraction_Fig.S5C.csv"), row.names=T)
+write.csv(cells_labels_orange, paste0(output_dir, "/number_cells_sample_Annotation.final.CB_mOrange_fraction_Fig.S5C.csv"), row.names=T)
+write.csv(cells_labels_orange_p, paste0(output_dir, "/number_cells_sample_Annotation.final.CB_percentage_mOrange_fraction_Fig.S5C.csv"), row.names=T)


 # DimPlot with overlayed mOrange density (Fig.5B)
@@ -165,7 +173,7 @@ data <- Embeddings(object = obs2[["umap.harmony.orig.ident"]])[(colnames(obs2)),
 data <- as.data.frame(data)

 obs1 <- SetIdent(obs1, value="cells")
-pdf(file = paste(plot_dir, "UMAP_mOrange_cells_Fig.5B.pdf", sep = "/"), width=9, height=7)
+pdf(file = paste(output_dir, "UMAP_mOrange_cells_Fig.5B.pdf", sep = "/"), width=9, height=7)
 DimPlot(obs1, reduction = "umap.harmony.orig.ident", pt.size = 0.1, cols = "azure3")  + 
  stat_density_2d(aes_string(x = "UMAPh_1", y = "UMAPh_2", fill = "after_stat(level)"), data=data,
                  linewidth = 0.4, geom = "density_2d_filled", 
@@ -183,9 +191,9 @@ obs3 <- subset(obs1, subset = Shortlables %in% selected_pop)
 # Dotplot of Slc7a11 expression by cluster and experimental group (Fig.5C, groups divided in Illustrator)
 obs3 <- SetIdent(obs3, value="Shortlables")
 obs3$Shortlables <- factor(x = obs3$Shortlables, levels = c("LSECs", "Cancer cells", "Monocytes", "TAMs", "KCs", "DCs"))
-pdf(file = paste(plot_dir, "CB_DotPlot_Shortlables_Slc7a11_Fig.5C.pdf", sep = "/"), width=9, height=5)
+pdf(file = paste(output_dir, "CB_DotPlot_Shortlables_Slc7a11_Fig.5C.pdf", sep = "/"), width=9, height=5)
 DotPlot(obs3, features = "Slc7a11", dot.scale=20, cols = "RdBu", split.by = "PROP.Group") + 
-  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
+  theme(axis.text.x = element_text(angle=45, vjust=1, hjust=1)) +
  coord_flip()
 dev.off()

@@ -203,7 +211,7 @@ for(i in selected_pop){
  addWorksheet(simpl_DEG, name)
  writeData(simpl_DEG, name, markers, rowNames = T)
 }
-saveWorkbook(simpl_DEG, file = paste0(plot_dir, "/DEG_simpl_Shortlables.xlsx"), overwrite = T)
+saveWorkbook(simpl_DEG, file = paste0(output_dir, "/DEG_simpl_Shortlables.xlsx"), overwrite = T)

 # Run GSEA for each population
 GSEA_simpl <- data.frame(matrix(nrow = 0, ncol = 9))
@@ -217,7 +225,7 @@ for(i in selected_pop){
  colnames(GSEA_simpl) <- colnames(test)
  GSEA_simpl <- rbind(GSEA_simpl, test)
 }
-write.xlsx(GSEA_simpl, file = paste0(plot_dir, "/GSEA_simpl_Shortlables.xlsx"))
+write.xlsx(GSEA_simpl, file = paste0(output_dir, "/GSEA_simpl_Shortlables.xlsx"))

 # Plot barplot for selected gene sets (Fig.5D, names and colors edited on illustrator)
 gene_sets <- c("HALLMARK_TNFA_SIGNALING_VIA_NFKB",
@@ -262,7 +270,7 @@ gene_sets <- gsub(x=gene_sets, pattern="_", replacement=" ")
 # Reorder the terms
 plot$pathway <- factor(plot$pathway, levels = gene_sets)

-write.xlsx(plot, file = paste0(plot_dir, "/GSEA_plotted.xlsx"))
+write.xlsx(plot, file = paste0(output_dir, "/GSEA_plotted.xlsx"))

 # Plot the GSEA results (Fig5D)
 p <- ggplot(data=plot, aes(x=NES, y=pathway, fill=log10(pval))) + 
@@ -278,12 +286,13 @@ p <- ggplot(data=plot, aes(x=NES, y=pathway, fill=log10(pval))) +
        axis.text.y = element_text(colour = "black"))+
  geom_vline(xintercept=0)+
  facet_wrap(~cluster, ncol = length(unique(plot$cluster)))
-ggsave(filename = paste0(plot_dir, "/GSEA_Fig5D.pdf"), plot=p, width=12, height=7)
+ggsave(filename = paste0(output_dir, "/GSEA_Fig5D.pdf"), plot=p, width=12, height=7)



 # Create signature file with only signature cytokines for the more stringent GSEA in Fig.5E
-sig.cyto <- sig[15061:16357]
+#sig.cyto <- sig[15061:16357]
+sig.cyto <- readRDS(paste0(input_dir, "/miDB_sig5.MLS_cytokines.rds"))

 # Define the populations we want to use for this GSEA
 selected_pop2 <- c("B cells","cDC1","cDC2_MoDCs","KCs","Macrophages","Mig cDCs","Monocytes","Neutrophils","pDCs","T and NK cells", "TAMs_1","TAMs_2","TAMs_3","TAMs_4","TAMs_5")
@@ -301,7 +310,7 @@ for(i in selected_pop2){
  addWorksheet(simpl_DEG2, name)
  writeData(simpl_DEG2, name, markers, rowNames = T)
 }
-saveWorkbook(simpl_DEG2, file = paste0(plot_dir, "/DEG_Annotation.final.CB.xlsx"), overwrite = T)
+saveWorkbook(simpl_DEG2, file = paste0(output_dir, "/DEG_Annotation.final.CB.xlsx"), overwrite = T)


 # Repeat GSEA for cytokines signatures only
@@ -316,7 +325,7 @@ for(i in selected_pop2){
  colnames(GSEA_cyto) <- colnames(test)
  GSEA_cyto <- rbind(GSEA_cyto,test)
 }
-write.xlsx(GSEA_cyto, file = paste0(plot_dir, "/GSEA_cyto.xlsx"))
+write.xlsx(GSEA_cyto, file = paste0(output_dir, "/GSEA_cyto.xlsx"))

 # Subset the GSEA_cyto to match the cytokine signature calculated in the populations with the same populations that we have in our scRNAseq.
 # This is necessary because each cytokine was tested and the effect in each population was saved as a different signature.
@@ -363,4 +372,4 @@ pheatmap(mat.df, scale = "none", display_numbers = mat.label,
         clustering_method = "ward.D2",
         cutree_rows = 2, cutree_cols = 3,
         col=myColor, breaks = myBreaks, na_col = "grey",
-         filename = paste0(plot_dir, "/Heatmap_cytokine_sig_Fig.5E_new.pdf"), cellwidth=10, cellheight=10) # width=12, height=7
+         filename = paste0(output_dir, "/Heatmap_cytokine_sig_Fig.5E.pdf"), cellwidth=10, cellheight=10) # width=12, height=7
--- a/TCGA_analysis.R
+++ b/TCGA_analysis.R
 library(data.table)
-library(survival)
 library(dplyr)
-library(babelgene)
 library(openxlsx)
 library(readxl)
-library(survminer)
 library(gridExtra)
 library(ggplot2)
+library(survival)
+library(survminer)
+#library(babelgene)

-##Load dataset
+# Set directories
 wdir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts"
 setwd(wdir)

-input_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/reference"
+input_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/Input"
 output_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq_Bresesti/Analysis_MM/GitLab_scripts/Output"


@@ -47,37 +47,37 @@ output_dir <- "/beegfs/scratch/ric.squadrito/ric.squadrito/90-935462466_scRNAseq
 #     and minimum patient number (n > 100 for HR plot). These thresholds can be adjusted within the script.
 ################################################################################

-
-metaD<-data.frame(fread(paste0(input_dir, "/TCGA_phenotype_denseDataOnlyDownload.tsv.gz"), sep='\t', colClasses=c("character"), data.table=FALSE))
-Surv01<-data.frame(fread(paste0(input_dir, "/Survival_SupplementalTable_S1_20171025_xena_sp"), sep='\t', colClasses=c("character"), data.table=FALSE))
-# FPKM01<-fread("tcga_RSEM_Hugo_norm_count")
-FPKM01<-fread(paste0(input_dir, "/pancanMiRs_EBadjOnProtocolPlatformWithoutRepsWithUnCorrectMiRs_08_04_16.xena.gz"))
-FPKM01<-as.data.frame(FPKM01)
-# Annot<-data.frame(fread(paste0(input_dir, "/probeMap_gencode.v23.annotation.gene.probemap"), sep='\t',colClasses=c("character"),data.table=FALSE))
+# Load dataset
+metaD <- data.frame(fread(paste0(input_dir, "/TCGA_phenotype_denseDataOnlyDownload.tsv.gz"), sep='\t', colClasses=c("character"), data.table=FALSE))
+Surv01 <- data.frame(fread(paste0(input_dir, "/Survival_SupplementalTable_S1_20171025_xena_sp"), sep='\t', colClasses=c("character"), data.table=FALSE))
+# FPKM01 <- fread(paste0(input_dir, "/tcga_RSEM_Hugo_norm_count"))
+FPKM01 <- fread(paste0(input_dir, "/pancanMiRs_EBadjOnProtocolPlatformWithoutRepsWithUnCorrectMiRs_08_04_16.xena.gz"))
+FPKM01 <- as.data.frame(FPKM01)
+# Annot <- data.frame(fread(paste0(input_dir, "/probeMap_gencode.v23.annotation.gene.probemap"), sep='\t',colClasses=c("character"),data.table=FALSE))
 allGenes <- FPKM01[,1]

-#######################################################################
-#####PLOT HR Score of signature for multiple tumor types###############
 #####################################################################
-humanGenes<-list(c("hsa-miR-342-3p"))
+#####PLOT HR Score of signature for multiple tumor types#############
+#####################################################################
+humanGenes <- list(c("hsa-miR-342-3p"))

-####create xlsx file with genes, patients and tumor types
+# Create xlsx file with genes, patients and tumor types
 Pan.data3 <- metaD[,]
 Pan.data4 <- merge(Surv01,Pan.data3,by=1)
 Genes_selected <- FPKM01[FPKM01[,1]%in%humanGenes,]

 rownames(Genes_selected)<-Genes_selected[,1]
-Genes_selected<-Genes_selected[,-1]
+Genes_selected <- Genes_selected[,-1]
 Genes_selected <- data.frame(t(Genes_selected))

 Genes_selected$sample<-rownames(Genes_selected)
-Pan.data5 <- merge(Genes_selected,Pan.data4,by.y="sample")
+Pan.data5 <- merge(Genes_selected, Pan.data4, by.y="sample")
 colnames(Pan.data5)
-write.xlsx(Pan.data5, paste0(ourdir, "/miR342_patients_survival.xlsx"))
+write.xlsx(Pan.data5, paste0(output_dir, "/miR342_patients_survival.xlsx"))


-#############Regression with defined k2
-k2<-0.5 #define quartile for selecting low/high score
+# Regression with defined k2
+k2 <- 0.5 # define quartile for selecting low/high score
 df2 <- NULL
 for (i in unique(Pan.data5$cancer.type.abbreviation)){
  Pan.data6 <- Pan.data5[Pan.data5$cancer.type.abbreviation%in%i,]
@@ -86,16 +86,16 @@ for (i in unique(Pan.data5$cancer.type.abbreviation)){
    cohort.score<-rep(0,nrow(P.OS))
    M1 <- quantile(Pan.data6$hsa.miR.342.3p,k2)
    cohort.score[Pan.data6$hsa.miR.342.3p>=M1]<-1
-    a1<-coxph(P.OS ~ as.numeric(cohort.score))
+    a1 <- coxph(P.OS ~ as.numeric(cohort.score))
    df1 <- data.frame(Tumor = unique(Pan.data6$cancer.type.abbreviation),npatients=length(Pan.data6$X_PATIENT), HR = summary(a1)$coefficients[2], SE=summary(a1)$coefficients[3],
                      p = summary(a1)$coefficients[5] )
    df2 <- rbind(df2,df1)
-    df2<-df2[(order(df2$p)),]
+    df2 <- df2[(order(df2$p)),]
  }else{}}
 write.xlsx(df2, paste0(output_dir, "/miR342_HR_by_tumortype.xlsx"))


-###Filters selected tumors
+# Filters selected tumors
 df3<-df2
 df3<-df3[df3$p<0.1,]
 df3<-df3[df3$npatients>100,]
@@ -113,13 +113,12 @@ a<-ggplot(df3, aes(x = reorder(Tumor, HR), y = HR, color = significance, label =
 ggsave(filename = paste0(output_dir, "/HR_alltumors_342.pdf"), plot=a, width=9, height=4)


-####Plot survival of chosen tumors
-###Select type of tumor
-df3<-df3[order(df3$HR),]
-df3<-df3[df3$p<0.05,]
-
+# Plot survival of chosen tumors
+# Select type of tumor
+df3 <- df3[order(df3$HR),]
+df3 <- df3[df3$p<0.05,]

-#pdf(paste0(output_dir, "/Survival_***_342"), width=10,height = 6)
+pdf(paste0(output_dir, "/Survival_in_chosen_tumors_342.pdf"), width=10 ,height=6)
 par(mfrow = c(2,4))
 survival<-list()
 for (i in df3$Tumor){
@@ -129,7 +128,7 @@ for (i in df3$Tumor){
  cohort.score<-rep(0,nrow(P.OS))
  M1 <- quantile(Pan.data6$hsa.miR.342.3p,k2)
  cohort.score[Pan.data6$hsa.miR.342.3p>=M1]<-1
-  ##plotting survival
+  # Plotting survival
  fit.score <- survfit(P.OS ~ cohort.score)
  cox_model <- coxph(P.OS ~ as.numeric(cohort.score))
  p_value <- summary(cox_model)$coefficients["as.numeric(cohort.score)", "Pr(>|z|)"]
@@ -143,11 +142,10 @@ for (i in df3$Tumor){
  text(x = 10, y = 0.05, label = paste("n =", length(cohort.score)), adj = c(0, 1))
  survival[[i]] <- recordPlot()
 }
-# dev.off()
-
+dev.off()


-###Plot survival of metastastic/primary cancers
+# Plot survival of metastastic/primary cancers
 #k2<-0.5
 Pan.data7 <- Pan.data5[Pan.data5$sample_type == "Metastatic",]
 #Pan.data7 <- Pan.data5[Pan.data5$sample_type == "Primary Tumor",]
@@ -162,10 +160,10 @@ formatted_p_value <- ifelse(p_value < 0.0001, "<0.0001",  as.numeric(round(p_val
 par(las = 0)


-pdf(paste0(output_dir, "/Survival_metastatic_342.pdf"), width=5,height =5)
+pdf(paste0(output_dir, "/Survival_metastatic_342.pdf"), width=5, height=5)
 plot(fit.score, lty = c(1, 1), col = c("blue", "red"), xlab = "Time (d)", ylab = "Overall Survival", lwd = 2, bty = "n",
     main = "Metastatic tumors") 
-#Add legend
+# Add legend
 legend(x = max(fit.score$time)/2, y = 1, legend = c("miRNA high", "miRNA low"), lty = c(1, 1), col = c("red", "blue"), lwd = 2,bty = "n")
 text(x = max(fit.score$time)/1.8, y = 0.70, label = paste("p=", (formatted_p_value)), adj = c(0, 1))
 text(x = 10, y = 0.05, label = paste("n =", length(cohort.score)), adj = c(0, 1))

--- a/reference/miDB_sig5.MLS.rds
+++ b/reference/miDB_sig5.MLS.rds
--- a/reference/miDB_sig5.MLS_cytokines.rds
+++ b/reference/miDB_sig5.MLS_cytokines.rds
--- a/reference/regev_lab_cell_cycle_mouse.rds
+++ b/reference/regev_lab_cell_cycle_mouse.rds