initialize and repopulate git

fe5789a9 · Matteo Barcella · fe5789a9 · fe5789a9 · fe5789a9 · fe5789a9
Commit fe5789a9 authored Jul 25, 2023 by Matteo Barcella
--- a/BulkRNAseq/.gitkeep
+++ b/BulkRNAseq/.gitkeep
--- a/BulkRNAseq/00-Interaction.R
+++ b/BulkRNAseq/00-Interaction.R
+# Interaction analysis
+
+library(DESeq2)
+library(ggplot2)
+library(openxlsx)
+library(pheatmap)
+library(RColorBrewer)
+library(clusterProfiler)
+library(DOSE)          
+library(AnnotationHub)
+library(org.Hs.eg.db)
+
+
+out_dir <- "../00-Interaction/"
+
+read.counts <- read.table(file = "../04-counts/featureCounts_results_corrected.txt", header = T)
+rownames(read.counts) <- read.counts$Geneid
+read.counts <- read.counts[,c(7:ncol(read.counts))]
+colnames(read.counts) <-  unlist(strsplit(x = colnames(read.counts),split = "\\.", ))[seq.int(2, 42, by = 3)]
+sample_info <- read.table(file = "SampleInfo.txt", header = T)
+sample_info$OFP <- substr(x = sample_info$Group, start = 1, stop = 4)
+sample_info$Treatment <- substr(x = sample_info$Group, start = 5, stop = length(sample_info$Group))
+rownames(sample_info) <- sample_info$SampleID
+
+dds <- DESeqDataSetFromMatrix(countData = read.counts,
+                              colData = sample_info,
+                              design = as.formula("~ OFP + Treatment + OFP:Treatment"))
+
+keep <- rowSums(counts(dds)) >= 10
+dds <- dds[keep,]
+
+dds$OFP <- relevel(dds$OFP, ref = "OFPn")
+dds$Treatment <- relevel(dds$Treatment, ref = "NT")
+
+dds <- DESeq(dds)
+sizeFactors(dds)
+resultsNames(dds)
+
+rlog <- rlog(dds)
+
+for (contr in c("OFPOFPp.TreatmentT")) {
+  res <- results(dds, name = contr)
+  res <- res[!is.na(res$padj), ]
+  res$padj <- as.numeric(res$padj)
+  res <- res[res$padj < 0.05 & res$baseMean > 5 & abs(res$log2FoldChange) > 1, ]
+  write.table(x = as.data.frame(res), file = paste(out_dir, paste0("DESeq2_", contr, "_interaction.tsv"), sep = "/"),
+              sep = "\t", quote = FALSE, col.names = NA)
+  res$Gene <- rownames(res)
+  write.xlsx(x = res, file = paste(out_dir, paste0("DESeq2_", contr, "_interaction.xlsx"), sep = "/"))
+}
+
+
+sample_info$SampleID <- NULL
+
+pheatmap(mat = as.matrix(assay(rlog)[res[res$log2FoldChange > 0,]$Gene,]), 
+         color = rev(brewer.pal(9, "RdBu")), 
+         scale = "row", annotation_col = sample_info, 
+         show_rownames = F, cluster_cols = F)
+
+saveRDS(object = list(dds = dds, 
+                      read.counts = read.counts, 
+                      res = res, 
+                      rlog = rlog,
+                      sample_info = sample_info), file = "00-Interaction.rds")
+
+### Performing downstream analysis
+
+
+interaction_genes <- list()
+
+interaction_genes[["up"]] <- subset(res, log2FoldChange > 0)
+interaction_genes[["down"]] <- subset(res, log2FoldChange < 0)
+
+gene_list <- list()
+gene_list[["up"]] <- interaction_genes$up$log2FoldChange
+names(gene_list[["up"]]) <- interaction_genes$up$Gene
+gene_list[["down"]] <- interaction_genes$down$log2FoldChange
+names(gene_list[["down"]]) <- interaction_genes$down$Gene
+
+
+contr.enrich_go_up <- enrichGO(gene = names(gene_list[["up"]]),
+                            universe = rownames(dds),
+                            OrgDb = 'org.Hs.eg.db',
+                            ont = "BP",
+                            pAdjustMethod = "BH",
+                            pvalueCutoff = 0.05,
+                            keyType = 'SYMBOL')
+
+contr.enrich_go_down <- enrichGO(gene = names(gene_list[["down"]]),
+                               universe = rownames(dds),
+                               OrgDb = 'org.Hs.eg.db',
+                               ont = "BP",
+                               pAdjustMethod = "BH",
+                               pvalueCutoff = 0.05,
+                               keyType = 'SYMBOL')
+
+contr.enrich_go <- list(UP = contr.enrich_go_up@result, 
+                        DOWN = contr.enrich_go_down@result)
+
+write.xlsx(contr.enrich_go,file = "GO_BP_Enrichment_interaction_OFP_Treatment.xlsx", 
+           asTable = T, firstRow = T, colWidths = "auto") 
+
+# GSEA 
+
+
+for (contr in c("OFPOFPp.TreatmentT")) {
+  res_all <- results(dds, name = contr)
+  res_all <- res_all[!is.na(res_all$padj), ]
+  res_all$padj <- as.numeric(res_all$padj)
+ # res_all <- res[res$padj < 0.05 & res$baseMean > 5 & abs(res$log2FoldChange) > 1, ]
+  write.table(x = as.data.frame(res_all), file = paste(out_dir, paste0("DESeq2_", contr, "_interaction_full.tsv"), sep = "/"),
+              sep = "\t", quote = FALSE, col.names = NA)
+  res_all$Gene <- rownames(res_all)
+  write.xlsx(x = res_all, file = paste(out_dir, paste0("DESeq2_", contr, "_interaction_full.xlsx"), sep = "/"))
+}
+
+gene_list[["full"]] <- res_all$log2FoldChange
+names(gene_list[["full"]]) <- res_all$Gene
+gene_list[["full"]] <- sort(gene_list[["full"]], decreasing = T)
+
+t2gene <- read.gmt(gmtfile = paste0("/opt/common/tools/ric.tiget/GSEA/GSEA_human_v7.4/h.all.v7.4.symbols.gmt"))
+
+gsea_res <- GSEA(gene_list[["full"]], TERM2GENE = t2gene,
+                                      verbose = FALSE,minGSSize = 10,maxGSSize = 1000,
+                                      pvalueCutoff = 0.05)
+
+gseaplot2(gsea_res, geneSetID = 1:3, pvalue_table = TRUE,
+          color = c("#E495A5", "#86B875", "#7DB0DD"), ES_geom = "dot")
+
+a <- ridgeplot(gsea_res, fill = "NES", showCategory = 50) + scale_fill_gradient2() 
+a$theme$axis.text.y$size <- 10
+a$theme$axis.text.x$size <- 10
+
+png(filename = "GSEA_ridges_enrich_core_distribution.png",width = 9, height = 6, units = "in", res = 200)
+a
+dev.off()
+
+
+
--- a/BulkRNAseq/20210617_151420_DGE.rds
+++ b/BulkRNAseq/20210617_151420_DGE.rds
--- a/BulkRNAseq/GGRidgesPaperPanel.R
+++ b/BulkRNAseq/GGRidgesPaperPanel.R
+#  GGridges
+
+library(DESeq2)
+library(ggplot2)
+library(openxlsx)
+library(pheatmap)
+library(RColorBrewer)
+library(clusterProfiler)
+library(DOSE)          
+library(AnnotationHub)
+library(org.Hs.eg.db)
+library(gridExtra)
+
+load(file = "../.RData")
+
+gsea_res_interaction <- gsea_res
+gsea_bulk <- readRDS(file = "/beegfs/scratch/ric.gentner/ric.gentner/BALL/BulkRNA_BALL_Integration/results/GSEA_All_Experiments/GG_ridges/Hallmark_GSEA.rds")
+gsea_res <- gsea_bulk
+
+a <- ridgeplot(gsea_res$GFP_L_vs_GFP_H, fill = "NES", 
+               showCategory = nrow(gsea_res$GFP_L_vs_GFP_H@result[gsea_res$GFP_L_vs_GFP_H@result$p.adjust < 0.2,])) + 
+  scale_fill_gradient2(low = "#0066E7", high= "#D4070F") + 
+  ggtitle("miR-126 High vs miR-126 Low") + #, subtitle = "Human dataset") +
+  xlab("logFC") +
+  theme(legend.position = "none", plot.margin=unit(c(0.5,0.5,0.5,0.5), 'cm'))
+a$theme$axis.text.y$size <- 8
+a$theme$axis.text.x$size <- 8
+a$theme$axis.title.x$size <- 10
+
+a_2 <- a
+
+a_2$data$category <- gsub(a$data$category, pattern = "HALLMARK_", replacement = "")
+a_2$data$category <- factor(x = a_2$data$category, 
+                            levels = gsub(x = levels(a$data$category), replacement = "", pattern = "HALLMARK_"))
+
+b <- ridgeplot(gsea_res$data_OE_vs_SM, fill = "NES", 
+               showCategory = nrow(gsea_res$data_OE_vs_SM@result[gsea_res$data_OE_vs_SM@result$p.adjust < 0.2,])) + 
+  scale_fill_gradient2(low = "#0066E7", high= "#D4070F") + 
+  ggtitle("miR-126 OE vs miR-126 SM") + #, subtitle = "Murine dataset") +
+  xlab("logFC") + 
+  theme(legend.position = "none", plot.margin=unit(c(0.5,0.5,0.5,0.5), 'cm'))
+b$theme$axis.text.y$size <- 8
+b$theme$axis.text.x$size <- 8
+b$theme$axis.title.x$size <- 10
+
+b_2 <- b
+
+b_2$data$category <- gsub(b$data$category, pattern = "HALLMARK_", replacement = "")
+b_2$data$category <- factor(x = b_2$data$category, 
+                            levels = gsub(x = levels(b$data$category), replacement = "", pattern = "HALLMARK_"))
+
+c <- ridgeplot(gsea_res_interaction, fill = "NES", 
+               showCategory = nrow(gsea_res_interaction@result[gsea_res_interaction@result$p.adjust < 0.2,])) +
+scale_fill_gradient2(low = "#0066E7", high= "#D4070F") + 
+  ggtitle("Interaction miR-126 lvl - Treatment") + 
+  xlab("logFC") + 
+  theme(legend.position = "none", plot.margin=unit(c(0.5,0.5,0.5,0.5), 'cm'))
+c$theme$axis.text.y$size <- 8
+c$theme$axis.text.x$size <- 8
+c$theme$axis.title.x$size <- 10
+
+c_2 <- c 
+
+c_2$data$category <- gsub(c$data$category, pattern = "HALLMARK_", replacement = "")
+c_2$data$category <- factor(x = c_2$data$category, 
+                            levels = gsub(x = levels(c$data$category), replacement = "", pattern = "HALLMARK_"))
+
+# Panel 
+
+png(filename = "Panel_gsea_hallmakrs_nes_distribution.png", width = 18, height = 6, units = "in", res = 300)
+grid.arrange(a_2,b_2,c_2,widths = c(0.30,0.30,0.35), ncol = 3)
+dev.off()
+
+svg(filename = "Panel_gsea_hallmakrs_nes_distribution.svg", width = 18, height = 6)
+grid.arrange(a_2,b_2,c_2,widths = c(0.30,0.30,0.35), ncol = 3)
+dev.off()
+
+
--- a/BulkRNAseq/Panel_gsea_hallmakrs_nes_distribution__1_.png
+++ b/BulkRNAseq/Panel_gsea_hallmakrs_nes_distribution__1_.png
--- a/BulkRNAseq/README.md
+++ b/BulkRNAseq/README.md
+## Introduction. 
+
+BulkRNAseq analysis was performed both for single-diseases separately and the by combining all samples in an integrated analysis (GFP_L vs GFP_H).  
+For the latter, the starting point of the analysis is the genes expression counts matrix deposited at [GSE236138](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE236138).    
+Moreover, for the second dataset present at GEO id [GSE236141](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE236141), we performed an additional analysis focus on interaction analysis between condition and treatment.   
+
+## Workflow and steps. 
+
+Below the most important steps:  
+1. Quality control by [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)
+2. Trimming of bad quality reads with [TrimGalore](https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/)<details><summary>Running command</summary>trim_galore --quality 20 --fastqc --length 25 --output_dir {outdir} --paired {input.r1} {inout.r2}</details>
+3. Alignment with [STAR](https://github.com/alexdobin/STAR)
+    <details><summary>Running command</summary>
+            "STAR " +
+            "--runThreadN {threads} " +
+            "--genomeDir {input.genome} " +
+            "--readFilesIn {params.trim_seq} " +
+            "--outSAMstrandField intronMotif " +
+            "--outFileNamePrefix {params.aln_seq_prefix} " +
+            "--outSAMtype BAM SortedByCoordinate " +
+            "--outSAMmultNmax 1 " +
+            "--outFilterMismatchNmax 10 " +
+            "--outReadsUnmapped Fastx " +
+            "--readFilesCommand zcat "
+    </details>
+4. Gene expression quantification with [FeatureCounts](https://academic.oup.com/bioinformatics/article/30/7/923/232889)
+    <details><summary>Running command</summary>
+            "featureCounts " +
+            "-a {input.annot} " +
+            "-o {output.fcount} " +
+            "-g gene_name " +
+            "-p -B -C " +
+            "-s {params.strand} " +
+            "--minOverlap 10 " +
+            "-T {threads} " +
+            "{input.bams} "
+    </details>
+5. Differential Expression analysis with [Deseq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html).  
+    For Differential Gene Expression analysis we followed the standard workflow provided by package.  
+    <details><summary>Detail</summary>
+    results(DESeq.ds, pAdjustMethod = "BH", independentFiltering = TRUE, contrast = c("groups", Group1, Group2), alpha = 0.05)
+    </details>
+    For interaction analysis we apply the design according to Deseq2 vignette:  
+    <details><summary>Interaction</summary>
+    design = as.formula("~ Condition + Treatment + Condition:Treatment")
+    </details>
+6. Dowstream functional Analysis with [ClusterProfiler](https://bioconductor.org/packages/release/bioc/html/clusterProfiler.html).  
+    In order to retrieve functional annotation from DE analysis, we performed **O**ver **R**epresentation **A**nalysis and **G**ene **S**et        **E**nrichment **A**nalysis by using the functions EnrichGO and GSEA provided by the package.  
+
+
+
+    
--- a/BulkRNAseq/config_integration.yaml
+++ b/BulkRNAseq/config_integration.yaml
+resources:
+  directories:
+    root: /XXX/XXX/XXX/XXXX/BulkRNA_BALL_Integration
+    conda_dir: . # if not required set to "."
+    reference: reference
+    data: input-fastq
+    progs: progs
+    results: results
+    qual_check: 01-qcheck
+    trimming: 02-trim
+    alignments: 03-aln
+    counts: 04-counts_bypatient
+    dge: 05-DGE_bypatient_0.1
+    post: 06-post-analysis_bypatient_0.1
+    cluster: 07-cluster-analysis_bypatient_0.1
+  organism: human # choices: human mouse ...
+  genome: GRCh38.primary_assembly.genome.fa # 
+  genome_annot: gencode.v34.primary_assembly.gtf # skipped - counts already present
+  design: patient,condition # cov1,cov2,...,condition
+  seq_type: single-end # choices: single-end paired-end # skipped - counts already present
+  strand_type: unstranded # choices: unstranded stranded reverse # skipped - counts already present
+  fdr_thr: 0.1
+  logfc_thr: 0
+  rand_walk: 1 # choices: 0 (do not print random walks) 1 (print random walks)
+  post_analysis_tool: DESeq # choices: edgeR DESeq limma
+  fastqfiles:
+    - PT10_395C0H: C0_HIGH_S13
+    - PT10_395C0L: C0_LOW_S14
+    - PT10_395C1H: C1_HIGH_S9
+    - PT10_395C1L: C1_LOW_S10
+    - PT10_395C3H: C3_HIGH_S11
+    - PT10_395C3L: C3_LOW_S12
+    - PT12_27C3R3L: C3_R3_S1
+    - PT12_27C3R4H: C3_R4_S2
+    - PT12_27D1R3L: D1_R3_S7
+    - PT12_27D1R4H: D1_R4_S8
+    - PT12_27D2R3L: D2_R3_S9
+    - PT12_27D2R4H: D2_R4_S10
+    - PT13_37C0R4H: 1_S1_L001,1_S1_L002
+    - PT13_37C0R5L: 2_S2_L001,2_S2_L002
+    - PT13_37C1R4H: 3_S3_L001,3_S3_L002
+    - PT13_37C1R5L: 4_S4_L001,4_S4_L002
+#NON USARE TRATTINI
+  samples:
+    - PT10_395C0H: PT10_395,GFP_HIGH
+    - PT10_395C0L: PT10_395,GFP_LOW
+    - PT10_395C1H: PT10_395,GFP_HIGH
+    - PT10_395C1L: PT10_395,GFP_LOW
+    - PT10_395C3H: PT10_395,GFP_HIGH
+    - PT10_395C3L: PT10_395,GFP_LOW
+    - PT12_27C3R3L: PT12_27,GFP_LOW
+    - PT12_27C3R4H: PT12_27,GFP_HIGH
+    - PT12_27D1R3L: PT12_27,GFP_LOW
+    - PT12_27D1R4H: PT12_27,GFP_HIGH
+    - PT12_27D2R3L: PT12_27,GFP_LOW
+    - PT12_27D2R4H: PT12_27,GFP_HIGH
+    - PT13_37C0R4H: PT13_37,GFP_HIGH
+    - PT13_37C0R5L: PT13_37,GFP_LOW
+    - PT13_37C1R4H: PT13_37,GFP_HIGH
+    - PT13_37C1R5L: PT13_37,GFP_LOW
+  contrasts:
+    - GFP_LOW:GFP_HIGH
+  enrichergmtfiles:
+    - c1.all.v7.0.symbols.gmt
+    - c2.all.v7.0.symbols.gmt
+    - c3.all.v7.0.symbols.gmt
+    - c4.all.v7.0.symbols.gmt
+    - c5.all.v7.0.symbols.gmt
+    - c6.all.v7.0.symbols.gmt
+    - c7.all.v7.0.symbols.gmt
+    - h.all.v7.0.symbols.gmt
+  read1_mask: "_R1_001.fastq.gz"
+  read2_mask: "_R2_001.fastq.gz"
+  read_mask: "_R1_001.fastq.gz"
--- a/Methylation/.gitkeep
+++ b/Methylation/.gitkeep
--- a/Methylation/MethylAnalysis_Adults_samples.R
+++ b/Methylation/MethylAnalysis_Adults_samples.R
+
+MethylAnalysis <- function(methcall.folder = NULL, # input folder with .cov files (bismark)
+                           outfolder = NULL, # output folder (if not exist it will be created)
+                           proj.id = "myproject", # project id to prefix in putput files
+                           samplesheet = NULL,
+                           sheetnum = 1,
+                           design.var = "Disease", # variable to subset samplesheet
+                           case.var = "Case", # 1 in treatment vector
+                           control.var = "Control", # 0 in treatment vector
+                           idcol = "SampleID",
+                           mincoverage = 10,
+                           lowperc = 5,
+                           lowcount = 10,
+                           assem="hg38",
+                           do.subset = T,
+                           chr.subset = "chr9",
+                           start.subset = 136668000,
+                           end.subset = 136671000,
+                           pipeline.meth = "bismarkCoverage",
+                           plot.covariates = c("Condition","Batch","MRD",
+                                               "Torelapse","Chemorefractory",
+                                               "Sex","CytoA","Tissue"),
+                           idstoremove = NULL,
+                           delta.meth = 10,
+                           plot.categorical.vars = c("Condition","Batch","MRD","Torelapse","Chemorefractory","Sex","CytoA"),
+                           plot.continuous.vars = c("miR-126"),
+                           plot.height = 9,
+                           plot.width = 12,
+                           cellw = 15, 
+                           cellh = 15,
+                           fontnumsize = 5,
+                           fontsize = 8
+                           ){
+  
+  # load libraries
+  
+  require(methylKit)
+  require(ggplot2)
+  require(reshape2)
+  require(plyr)
+  require(ggrepel)
+  library(GenomicRanges)
+  library(openxlsx)
+  library(pheatmap)
+  
+  # check vars
+  
+  if(is.null(methcall.folder)){
+    stop("Please set methcall.folder")
+  }
+  
+  if(is.null(outfolder)){
+    stop("Please set outfolder")
+  }
+  
+  if(is.null(samplesheet)){
+    stop("Please set samplesheet")
+  }
+  
+  # Setup variables
+  
+  infolder = paste0(methcall.folder, "/")
+  dir.create(infolder, showWarnings = F)
+  
+  outfolder = paste0(outfolder, "/")
+  dir.create(outfolder, showWarnings = F)
+  
+  qcfolder <- paste0(outfolder, "QC/")
+  dir.create(qcfolder, showWarnings = F)
+  
+  
+  # reading sample sheet with metadata
+  ssheet <- read.xlsx(samplesheet, sheet = sheetnum)
+  
+  if(!is.null(idstoremove)){
+    ssheet <- subset.data.frame(x = ssheet, subset = !ssheet[[idcol]] %in% idstoremove)
+  }
+
+  # defining design vector according to variable
+  
+  ssheet <- subset.data.frame(x = ssheet, subset = ssheet[[design.var]] %in% c(case.var, control.var))
+  ssheet$mir126 <- as.numeric(ssheet$mir126)
+  
+  d.vec <- ssheet[[design.var]]
+  d.vector <- ifelse(d.vec == case.var, 1, 0)
+  
+  # initialzing coverage .cov files list
+  covs <- list()
+  sampleids <- as.list(ssheet[[idcol]])
+  names(sampleids) <- ssheet[[idcol]]
+  
+  for (i in ssheet[[idcol]]) {
+    covs[[i]] <- paste0(infolder, "/", i, ".bismark.cov")
+  }
+  
+  saveRDS(object = covs, file = paste0(outfolder,"covs_object.rds"))
+  
+  # creating object 
+  
+  myobj <- methRead(location = covs,
+                    sample.id = sampleids,
+                    assembly = assem,
+                    pipeline = pipeline.meth,
+                    treatment = d.vector,
+                    context="CpG", 
+                    mincov = mincoverage)
+  
+  saveRDS(myobj, file = paste0(outfolder,"Myobject.rds"))
+  
+  names(myobj) <- ssheet[[idcol]]
+  
+  saveRDS(myobj, file = paste0(outfolder,"Myobject_with_names.rds"))
+
+  # subsetting if declared
+  
+  if(isTRUE(do.subset)){
+    my.win = GRanges(seqnames = chr.subset, ranges = IRanges(start = start.subset, end = end.subset))
+    myobj <- selectByOverlap(myobj,my.win)
+  }
+
+  
+  saveRDS(myobj, file = paste0(outfolder,"Myobject_with_names_aftersubset.rds"))
+  
+  # filtering on minimum coverage 
+  myobj <- filterByCoverage(methylObj = myobj, lo.count=lowcount, lo.perc = lowperc)
+  
+  # Normalization
+  myobj <- normalizeCoverage(obj = myobj)
+  
+ # saveRDS(object = myobj, file = "Initial_object.rds")
+  
+  # Calculate basic stats and PCs
+
+  metricsfolder <- paste0(qcfolder, "Metrics/")
+  dir.create(path = metricsfolder, showWarnings = F)
+    
+  for (id in ssheet[[idcol]]) {
+    
+    png(filename = paste0(metricsfolder,proj.id,"_CpG_pct_methylation_sample_", id, ".png"), 
+        width = 9, height = 6, units = "in", res = 96)
+    print(getMethylationStats(myobj[[id]],plot=TRUE,both.strands=FALSE))
+    dev.off()
+    png(filename = paste0(metricsfolder, proj.id,"_Coverage_stats_sample_", id, ".png"), 
+        width = 9, height = 6, units = "in", res = 96)
+    print(getCoverageStats(myobj[[id]],plot=TRUE,both.strands=FALSE))
+    dev.off()
+  }
+  
+  # create meth obj 
+  
+  #meth <- unite(object = myobj, destrand=FALSE)
+  meth <- unite(object = myobj, destrand=FALSE) # for debugging
+  saveRDS(meth, paste0(outfolder,"savemeth.tmp.rds"))
+  # Perform correlation
+  
+  sink(paste0(qcfolder, proj.id, "_Correlations.txt"))
+  getCorrelation(meth,plot=FALSE)
+  sink()
+  
+  if(length(ssheet[[idcol]]) < 15){
+    png(filename = paste0(qcfolder,proj.id, "_Correlations_pearson_pairwise.png"), 
+        width = 9, height = 6, units = "in", res = 96)
+    print(getCorrelation(meth,plot=TRUE))
+    dev.off()
+  }
+  
+  png(filename = paste0(qcfolder, proj.id, "_Clustering.png"), 
+      width = 9, height = 6, units = "in", res = 96)
+  clusterSamples(meth, dist="euclidean", plot=TRUE, method = "ward.D2")
+  dev.off()
+  
+  # Re-plotting PCs (custom chart)
+  
+  # compute PCs and store in object
+  
+  pca_compt <- PCASamples(meth, obj.return = T, screeplot = F)
+  
+  # extract PCs components
+  
+  pcafolder <- paste0(qcfolder, "PCA/")
+  dir.create(path = pcafolder, showWarnings = F)
+  
+  pca_pc1_2 <- as.data.frame(x = pca_compt$x[,1:2])
+  
+  for(myvars in plot.covariates){
+    pca_pc1_2$condition <- as.factor(ssheet[[myvars]])
+    png(filename = paste0(pcafolder, proj.id, "_PCA_",myvars,".png"), 
+        width = 9, height = 6, units = "in",res=96)
+    print(ggplot(data = pca_pc1_2, 
+                 mapping = aes(x = PC1, y=PC2, col=condition, label=rownames(pca_pc1_2))) + 
+            geom_point(size=3) + geom_text_repel(size=3) + ggtitle(label = "Principal component analysis", subtitle = myvars) + 
+            theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5)) +
+            theme(plot.subtitle=element_text(size=12, hjust=0.5, face="italic", color="black")) +
+            theme(axis.title = element_text(size=12, hjust=0.5, face="bold", color="black")) +
+            theme(legend.text = element_text(size=8, hjust=0.5)) +
+            theme(legend.title = element_blank()) +
+            theme(axis.text = element_text(size=12, hjust=0.5, color="black")))
+    dev.off()
+    
+  }
+  
+  # retrieve and store % of methylation 
+  
+  perc.meth <- percMethylation(meth)
+  
+  saveRDS(perc.meth,paste0(outfolder,"pctmethly.rds"))
+  
+  base::rownames(perc.meth) <- paste0(meth$chr, "_", meth$start)
+  
+  # Perform diff methylation
+  
+  myDiff=calculateDiffMeth(meth)
+  
+  write.table(myDiff,paste0(outfolder,proj.id,"_DiffMeth_single_CpG.txt"), row.names = F)
+  difftest <- read.table(paste0(outfolder,proj.id,"_DiffMeth_single_CpG.txt"), header = T)
+
+  difftest$comparison <- proj.id
+  difftest$qvalue_r <- as.character(cut(x = difftest$qvalue, 
+                                         breaks = c(-1, 1e-100, 1e-10, 1e-02, 1), 
+                                         labels = c("***","**","*","ns")))
+  
+  myindex <- abs(difftest$meth.diff) < delta.meth
+  difftest$meth.diff <- abs(difftest$meth.diff)
+  difftest$qvalue_r[myindex] <- "ns"
+  write.table(difftest,paste0(outfolder,proj.id,"_DiffMeth_single_CpG.txt"), row.names = F)
+  write.table(difftest,paste0(outfolder,proj.id,"_DiffMeth_single_CpG.txt"), row.names = F)
+  
+
+  # Adding color list and annotations
+
+  library(RColorBrewer)
+
+  color_list <- list()
+  annrows <- subset.data.frame(x = ssheet, select = c("Risk", "relapse","immunophenotype","sex","mir126"))
+  rownames(annrows) <- ssheet$SampleID
+  print(ssheet)
+  print(annrows)
+  anncols <- subset.data.frame(difftest, select = c("qvalue_r","meth.diff"))
+  base::rownames(anncols) <- difftest$start
+
+  print(anncols)
+  
+  color_list[["qvalue_r"]] = c("*" = "#6497b1", "**" = "#03396c", "***" = "#011f4b", ns= "#c4cacf")
+  color_list[["meth.diff"]] = colorRampPalette(brewer.pal(n = 9, "Reds"))(100)
+  color_list[["mir126"]] <- brewer.pal(n = 9, "PuRd")
+  set1cols <- alpha(colour = RColorBrewer::brewer.pal(n = 9, "Set1"), alpha = 0.6)
+  color_list[["Risk"]] = c(VHR = set1cols[1], SR = set1cols[2], HR = set1cols[3], 'NA' = "white")
+  color_list[["immunophenotype"]] = c('B-I' = set1cols[4], 'B-II' = set1cols[5], 'B-III' = set1cols[6], 'NA' = "white")
+  color_list[["relapse"]] = c('yes' = set1cols[7], 'no' = set1cols[8], 'NA' = "white")
+  color_list[["sex"]] = c('M' = set1cols[9], 'F' = set1cols[1])
+  
+  print(color_list)
+  
+  
+  
+  # Heatmap
+  
+  pctmeth_matrix <- t(perc.meth)
+  base::colnames(pctmeth_matrix) <- gsub(x = base::colnames(pctmeth_matrix), pattern = "chr[0-9]+_", replacement = "")
+  
+  pheatmap(mat = pctmeth_matrix, main = gsub(x = proj.id, pattern = "_", replacement = " "), 
+           filename = paste0(outfolder, proj.id,"_CpG_percent_methylation_matrix_pheatmap.pdf"), 
+           width = plot.width, height = plot.height,
+           na_col = "pink", 
+           cluster_cols = FALSE, 
+           cluster_rows = TRUE, 
+           annotation_row = annrows, 
+           cellwidth = cellw, 
+           cellheight = cellh, 
+           display_numbers = T, 
+           fontsize = fontsize,
+           fontsize_number = fontnumsize, 
+           number_format = "%.0f",
+           border_color = NA, 
+           annotation_col = anncols, 
+           annotation_colors = color_list,
+           color = c("#F5F5F5","#EEEEEE","#CCCCCC","#999999", "#666666","#333333","#000000"), breaks = c(0,10,20,30,50,70,90,100)
+  )
+  
+  saveRDS(object = pctmeth_matrix, paste0(outfolder, proj.id,"_CpG_percent_methylation_matrix_pheatmap.rds"))
+  saveRDS(object = list(anncol = anncols, annrow = annrows, anncolors = color_list), paste0(outfolder, proj.id,"_annotations_matrix_pheatmap.rds"))
+  saveRDS(object = difftest, paste0(outfolder, proj.id,"_CpG_differential_methylation.rds"))
+  saveRDS(object = perc.meth, paste0(outfolder, proj.id,"_CpG_percent_methylation.rds"))
+  
+  write.table(x = perc.meth, file = paste0(outfolder, proj.id,"_CpG_percent_methylation.txt"))
+  write.table(x = difftest, file = paste0(outfolder, proj.id,"_CpG_differential_methylation.txt"))
+
+  saveRDS(myobj, file = paste0(outfolder,proj.id,"_methylkit.rds"))
+  saveRDS(myobj, file = paste0(outfolder,proj.id,"_methylkit_meth.rds"))
+
+}
\ No newline at end of file
--- a/Methylation/README.md
+++ b/Methylation/README.md
+# Introduction 
+
+1-2ug of DNA was converted using EpiTect Bisulfite Kit (Qiagen, ID 59104) and following manufacturer’s instructions. After purification, 100ng or 11ul of each sample was amplified, by PCR, for the TSS region and the EGFL7 intron 7. A High Fidelity DNA polymerase, KAPA HiFi HotStart Uracil+ Kit (Roche, Basel, CH), was used, and primer sequences and thermocycler settings are shown below.  
+Methylation profile analysis and differential methylation analysis (according to groups of interest) was performed only in the PCR amplified region. 
+
+# Data analysis workflow
+
+The data analysis consists in 3 main steps:
+- QC
+- Mapping and CpG coverage quantification
+- Differential methylation analysis
+
+## QC
+
+Raw sequences were quality checked using fastqc (v.0.11.8) and trimmed for adapters and base quality using cutadapt v.1.16 (minimum base quality 20; minimum length 25bp).  
+Both tails of the reads were subjected to trimming if necessary by using different clipping parameters settings according to runs batches.  
+Very bad quality samples were discarded from downstream analyses.  
+
+## Mapping and CpG coverage quantification
+
+Passing quality checks reads were then mapped to the human reference genome GRCh38 using bismark v.0.22.1 (--local mode).
+Coverage files (.cov) obtained from bismark output were then used as input for differential methylation analysis by using the MethylKit R package (v1.10.0).  
+
+## Differential methylation analysis
+
+Differential methylation analysis was performed primarly at single CpG considering the small length of the region.
+To get rid of CpGs not included in the targeted regions, we tailored the analysis only to genomic coordinates (chr9: 136668000-136671000) mapping to (TSS, 5’ and 3’ intron-7 subregions) with selectByOverlap function.  
+Hence, we discarded reads with less than 10 counts and reads with coverage lower than the 5th percentile. In this way we were able to evaluate methylation percentage in most of the samples under analysis.  
+We then normalized coverage by using the normalizeCoverage function.  
+Exploratory data analysis was performed by using built-in functions provided by the package, including PCA, clustering, and correlations.  
+CpGs with q.value < 0.01 (logistic regression test) and absolute delta methylation (negative or positive) percentage of at least 10 between the compared groups, were considered differentially methylated.  
+Percent methylation matrix was used as input to produce heatmaps (pheatmap R package).  
+Color palette presets from RColorBrewer package were used for % methylation and row annotation tiles coloring.
+
+
--- a/README.md
+++ b/README.md
+# miR-126 and minimal residual disease in B-ALL
+
+Complete elimination of B-cell acute lymphoblastic leukemia (B-ALL) by a risk-adapted primary treatment approach remains a clinical key objective, which fails in up to a third of patients. Recent evidence has implicated subpopulations of B-ALL cells with stem-like features in disease persistence. We hypothesized that microRNA-126, a core regulator of hematopoietic and leukemic stem cells, may resolve intra-tumor heterogeneity in B-ALL and uncover therapy-resistant subpopulations. We exploited patient-derived xenograft (PDX) models with B-ALL cells transduced with a miR-126 reporter allowing the prospective isolation of miR-126(high) cells for their functional and transcriptional characterization. Discrete miR-126(high) populations, often characterized by MIR126 locus de-methylation, were identified in 8/9 PDX models and showed increased repopulation potential, in vivo chemotherapy resistance and hallmarks of quiescence, inflammation and stress-response pathway activation. Cells with a miR-126(high) transcriptional profile were identified as distinct disease subpopulations by single cell RNA sequencing in diagnosis samples from adult and pediatric B-ALL. Expression of miR-126 and locus methylation were tested in several pediatric and adult B-ALL cohorts, which received standardized treatment. High microRNA-126 levels and locus de-methylation at diagnosis associate with sub-optimal response to induction chemotherapy (MRD > 0.05% at day +33 or MRD+ at day +78).
+
+# Code and workflows
+
+This repository is structured in different folders according to the different omics covered in the paper.  
+The main categories are:  
+
+- scRNAseq
+- BulkRNAseq
+- Methylation
+- Whole Exome Sequencing
+
+Each folder will include a bried description of the main data analysis steps and the most important scripts used for producing data present in the 
+manuscript.  
+
+
+
--- a/WES/.gitkeep
+++ b/WES/.gitkeep
--- a/scRNAseq/.gitkeep
+++ b/scRNAseq/.gitkeep
--- a/scRNAseq/GObiopro_enrichments_miRNA_high_low_singlecell_markers.png
+++ b/scRNAseq/GObiopro_enrichments_miRNA_high_low_singlecell_markers.png
--- a/scRNAseq/Panel_cell_type_distribution_across_diseases_res_400.png
+++ b/scRNAseq/Panel_cell_type_distribution_across_diseases_res_400.png
--- a/scRNAseq/PaperPanels.R
+++ b/scRNAseq/PaperPanels.R
--- a/scRNAseq/README.md
+++ b/scRNAseq/README.md
+## Introduction
+
+**scRNAseq** analysis was performed using a standard pipeline that includes the following steps:
+
+Most of the single-cell RNAseq analyses were performed with [Seurat](https://satijalab.org/seurat/).  
+Below the main steps of the basic data analysis workflow that start from a minimal object after loading of 10X data to markers identification:  
+
+1. Normalization (default seurat settings)
+2. Scaling (with following variables to regress out: percent.mt + nCount_RNA and CC.Difference calculated as show in [vignette](https://satijalab.org/seurat/articles/cell_cycle_vignette.html#alternate-workflow-1))
+3. Dimensionality reduction: PCA
+4. Harmony batch removal (patientID - only for integrated analysis with all Diagnoses)
+5. Clustering
+6. Markers identification (different resolution according to datasets / subsets)
+7. Evaluation of signatures according to module score (AddModuleScore function - default settings)
+
+Data analysis was performed both at single and integrated level.  
+Parameters and metrics are described in supplementary tables.  
+Raw counts and complete metadata can be found on GEO repository [GSE236136](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE236136).  
+Seurat objects for each disease are also available in seurat_objects folder for convenience.
+
+
+
--- a/scRNAseq/high_GObiopro.csv
+++ b/scRNAseq/high_GObiopro.csv
--- a/scRNAseq/low_GObiopro.csv
+++ b/scRNAseq/low_GObiopro.csv