Commit fe5789a9 authored by Matteo Barcella's avatar Matteo Barcella
Browse files

initialize and repopulate git

parents
# Interaction analysis
library(DESeq2)
library(ggplot2)
library(openxlsx)
library(pheatmap)
library(RColorBrewer)
library(clusterProfiler)
library(DOSE)
library(AnnotationHub)
library(org.Hs.eg.db)
out_dir <- "../00-Interaction/"
read.counts <- read.table(file = "../04-counts/featureCounts_results_corrected.txt", header = T)
rownames(read.counts) <- read.counts$Geneid
read.counts <- read.counts[,c(7:ncol(read.counts))]
colnames(read.counts) <- unlist(strsplit(x = colnames(read.counts),split = "\\.", ))[seq.int(2, 42, by = 3)]
sample_info <- read.table(file = "SampleInfo.txt", header = T)
sample_info$OFP <- substr(x = sample_info$Group, start = 1, stop = 4)
sample_info$Treatment <- substr(x = sample_info$Group, start = 5, stop = length(sample_info$Group))
rownames(sample_info) <- sample_info$SampleID
dds <- DESeqDataSetFromMatrix(countData = read.counts,
colData = sample_info,
design = as.formula("~ OFP + Treatment + OFP:Treatment"))
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
dds$OFP <- relevel(dds$OFP, ref = "OFPn")
dds$Treatment <- relevel(dds$Treatment, ref = "NT")
dds <- DESeq(dds)
sizeFactors(dds)
resultsNames(dds)
rlog <- rlog(dds)
for (contr in c("OFPOFPp.TreatmentT")) {
res <- results(dds, name = contr)
res <- res[!is.na(res$padj), ]
res$padj <- as.numeric(res$padj)
res <- res[res$padj < 0.05 & res$baseMean > 5 & abs(res$log2FoldChange) > 1, ]
write.table(x = as.data.frame(res), file = paste(out_dir, paste0("DESeq2_", contr, "_interaction.tsv"), sep = "/"),
sep = "\t", quote = FALSE, col.names = NA)
res$Gene <- rownames(res)
write.xlsx(x = res, file = paste(out_dir, paste0("DESeq2_", contr, "_interaction.xlsx"), sep = "/"))
}
sample_info$SampleID <- NULL
pheatmap(mat = as.matrix(assay(rlog)[res[res$log2FoldChange > 0,]$Gene,]),
color = rev(brewer.pal(9, "RdBu")),
scale = "row", annotation_col = sample_info,
show_rownames = F, cluster_cols = F)
saveRDS(object = list(dds = dds,
read.counts = read.counts,
res = res,
rlog = rlog,
sample_info = sample_info), file = "00-Interaction.rds")
### Performing downstream analysis
interaction_genes <- list()
interaction_genes[["up"]] <- subset(res, log2FoldChange > 0)
interaction_genes[["down"]] <- subset(res, log2FoldChange < 0)
gene_list <- list()
gene_list[["up"]] <- interaction_genes$up$log2FoldChange
names(gene_list[["up"]]) <- interaction_genes$up$Gene
gene_list[["down"]] <- interaction_genes$down$log2FoldChange
names(gene_list[["down"]]) <- interaction_genes$down$Gene
contr.enrich_go_up <- enrichGO(gene = names(gene_list[["up"]]),
universe = rownames(dds),
OrgDb = 'org.Hs.eg.db',
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
keyType = 'SYMBOL')
contr.enrich_go_down <- enrichGO(gene = names(gene_list[["down"]]),
universe = rownames(dds),
OrgDb = 'org.Hs.eg.db',
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
keyType = 'SYMBOL')
contr.enrich_go <- list(UP = contr.enrich_go_up@result,
DOWN = contr.enrich_go_down@result)
write.xlsx(contr.enrich_go,file = "GO_BP_Enrichment_interaction_OFP_Treatment.xlsx",
asTable = T, firstRow = T, colWidths = "auto")
# GSEA
for (contr in c("OFPOFPp.TreatmentT")) {
res_all <- results(dds, name = contr)
res_all <- res_all[!is.na(res_all$padj), ]
res_all$padj <- as.numeric(res_all$padj)
# res_all <- res[res$padj < 0.05 & res$baseMean > 5 & abs(res$log2FoldChange) > 1, ]
write.table(x = as.data.frame(res_all), file = paste(out_dir, paste0("DESeq2_", contr, "_interaction_full.tsv"), sep = "/"),
sep = "\t", quote = FALSE, col.names = NA)
res_all$Gene <- rownames(res_all)
write.xlsx(x = res_all, file = paste(out_dir, paste0("DESeq2_", contr, "_interaction_full.xlsx"), sep = "/"))
}
gene_list[["full"]] <- res_all$log2FoldChange
names(gene_list[["full"]]) <- res_all$Gene
gene_list[["full"]] <- sort(gene_list[["full"]], decreasing = T)
t2gene <- read.gmt(gmtfile = paste0("/opt/common/tools/ric.tiget/GSEA/GSEA_human_v7.4/h.all.v7.4.symbols.gmt"))
gsea_res <- GSEA(gene_list[["full"]], TERM2GENE = t2gene,
verbose = FALSE,minGSSize = 10,maxGSSize = 1000,
pvalueCutoff = 0.05)
gseaplot2(gsea_res, geneSetID = 1:3, pvalue_table = TRUE,
color = c("#E495A5", "#86B875", "#7DB0DD"), ES_geom = "dot")
a <- ridgeplot(gsea_res, fill = "NES", showCategory = 50) + scale_fill_gradient2()
a$theme$axis.text.y$size <- 10
a$theme$axis.text.x$size <- 10
png(filename = "GSEA_ridges_enrich_core_distribution.png",width = 9, height = 6, units = "in", res = 200)
a
dev.off()
# GGridges
library(DESeq2)
library(ggplot2)
library(openxlsx)
library(pheatmap)
library(RColorBrewer)
library(clusterProfiler)
library(DOSE)
library(AnnotationHub)
library(org.Hs.eg.db)
library(gridExtra)
load(file = "../.RData")
gsea_res_interaction <- gsea_res
gsea_bulk <- readRDS(file = "/beegfs/scratch/ric.gentner/ric.gentner/BALL/BulkRNA_BALL_Integration/results/GSEA_All_Experiments/GG_ridges/Hallmark_GSEA.rds")
gsea_res <- gsea_bulk
a <- ridgeplot(gsea_res$GFP_L_vs_GFP_H, fill = "NES",
showCategory = nrow(gsea_res$GFP_L_vs_GFP_H@result[gsea_res$GFP_L_vs_GFP_H@result$p.adjust < 0.2,])) +
scale_fill_gradient2(low = "#0066E7", high= "#D4070F") +
ggtitle("miR-126 High vs miR-126 Low") + #, subtitle = "Human dataset") +
xlab("logFC") +
theme(legend.position = "none", plot.margin=unit(c(0.5,0.5,0.5,0.5), 'cm'))
a$theme$axis.text.y$size <- 8
a$theme$axis.text.x$size <- 8
a$theme$axis.title.x$size <- 10
a_2 <- a
a_2$data$category <- gsub(a$data$category, pattern = "HALLMARK_", replacement = "")
a_2$data$category <- factor(x = a_2$data$category,
levels = gsub(x = levels(a$data$category), replacement = "", pattern = "HALLMARK_"))
b <- ridgeplot(gsea_res$data_OE_vs_SM, fill = "NES",
showCategory = nrow(gsea_res$data_OE_vs_SM@result[gsea_res$data_OE_vs_SM@result$p.adjust < 0.2,])) +
scale_fill_gradient2(low = "#0066E7", high= "#D4070F") +
ggtitle("miR-126 OE vs miR-126 SM") + #, subtitle = "Murine dataset") +
xlab("logFC") +
theme(legend.position = "none", plot.margin=unit(c(0.5,0.5,0.5,0.5), 'cm'))
b$theme$axis.text.y$size <- 8
b$theme$axis.text.x$size <- 8
b$theme$axis.title.x$size <- 10
b_2 <- b
b_2$data$category <- gsub(b$data$category, pattern = "HALLMARK_", replacement = "")
b_2$data$category <- factor(x = b_2$data$category,
levels = gsub(x = levels(b$data$category), replacement = "", pattern = "HALLMARK_"))
c <- ridgeplot(gsea_res_interaction, fill = "NES",
showCategory = nrow(gsea_res_interaction@result[gsea_res_interaction@result$p.adjust < 0.2,])) +
scale_fill_gradient2(low = "#0066E7", high= "#D4070F") +
ggtitle("Interaction miR-126 lvl - Treatment") +
xlab("logFC") +
theme(legend.position = "none", plot.margin=unit(c(0.5,0.5,0.5,0.5), 'cm'))
c$theme$axis.text.y$size <- 8
c$theme$axis.text.x$size <- 8
c$theme$axis.title.x$size <- 10
c_2 <- c
c_2$data$category <- gsub(c$data$category, pattern = "HALLMARK_", replacement = "")
c_2$data$category <- factor(x = c_2$data$category,
levels = gsub(x = levels(c$data$category), replacement = "", pattern = "HALLMARK_"))
# Panel
png(filename = "Panel_gsea_hallmakrs_nes_distribution.png", width = 18, height = 6, units = "in", res = 300)
grid.arrange(a_2,b_2,c_2,widths = c(0.30,0.30,0.35), ncol = 3)
dev.off()
svg(filename = "Panel_gsea_hallmakrs_nes_distribution.svg", width = 18, height = 6)
grid.arrange(a_2,b_2,c_2,widths = c(0.30,0.30,0.35), ncol = 3)
dev.off()
## Introduction.
BulkRNAseq analysis was performed both for single-diseases separately and the by combining all samples in an integrated analysis (GFP_L vs GFP_H).
For the latter, the starting point of the analysis is the genes expression counts matrix deposited at [GSE236138](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE236138).
Moreover, for the second dataset present at GEO id [GSE236141](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE236141), we performed an additional analysis focus on interaction analysis between condition and treatment.
## Workflow and steps.
Below the most important steps:
1. Quality control by [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)
2. Trimming of bad quality reads with [TrimGalore](https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/)<details><summary>Running command</summary>trim_galore --quality 20 --fastqc --length 25 --output_dir {outdir} --paired {input.r1} {inout.r2}</details>
3. Alignment with [STAR](https://github.com/alexdobin/STAR)
<details><summary>Running command</summary>
"STAR " +
"--runThreadN {threads} " +
"--genomeDir {input.genome} " +
"--readFilesIn {params.trim_seq} " +
"--outSAMstrandField intronMotif " +
"--outFileNamePrefix {params.aln_seq_prefix} " +
"--outSAMtype BAM SortedByCoordinate " +
"--outSAMmultNmax 1 " +
"--outFilterMismatchNmax 10 " +
"--outReadsUnmapped Fastx " +
"--readFilesCommand zcat "
</details>
4. Gene expression quantification with [FeatureCounts](https://academic.oup.com/bioinformatics/article/30/7/923/232889)
<details><summary>Running command</summary>
"featureCounts " +
"-a {input.annot} " +
"-o {output.fcount} " +
"-g gene_name " +
"-p -B -C " +
"-s {params.strand} " +
"--minOverlap 10 " +
"-T {threads} " +
"{input.bams} "
</details>
5. Differential Expression analysis with [Deseq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html).
For Differential Gene Expression analysis we followed the standard workflow provided by package.
<details><summary>Detail</summary>
results(DESeq.ds, pAdjustMethod = "BH", independentFiltering = TRUE, contrast = c("groups", Group1, Group2), alpha = 0.05)
</details>
For interaction analysis we apply the design according to Deseq2 vignette:
<details><summary>Interaction</summary>
design = as.formula("~ Condition + Treatment + Condition:Treatment")
</details>
6. Dowstream functional Analysis with [ClusterProfiler](https://bioconductor.org/packages/release/bioc/html/clusterProfiler.html).
In order to retrieve functional annotation from DE analysis, we performed **O**ver **R**epresentation **A**nalysis and **G**ene **S**et **E**nrichment **A**nalysis by using the functions EnrichGO and GSEA provided by the package.
resources:
directories:
root: /XXX/XXX/XXX/XXXX/BulkRNA_BALL_Integration
conda_dir: . # if not required set to "."
reference: reference
data: input-fastq
progs: progs
results: results
qual_check: 01-qcheck
trimming: 02-trim
alignments: 03-aln
counts: 04-counts_bypatient
dge: 05-DGE_bypatient_0.1
post: 06-post-analysis_bypatient_0.1
cluster: 07-cluster-analysis_bypatient_0.1
organism: human # choices: human mouse ...
genome: GRCh38.primary_assembly.genome.fa #
genome_annot: gencode.v34.primary_assembly.gtf # skipped - counts already present
design: patient,condition # cov1,cov2,...,condition
seq_type: single-end # choices: single-end paired-end # skipped - counts already present
strand_type: unstranded # choices: unstranded stranded reverse # skipped - counts already present
fdr_thr: 0.1
logfc_thr: 0
rand_walk: 1 # choices: 0 (do not print random walks) 1 (print random walks)
post_analysis_tool: DESeq # choices: edgeR DESeq limma
fastqfiles:
- PT10_395C0H: C0_HIGH_S13
- PT10_395C0L: C0_LOW_S14
- PT10_395C1H: C1_HIGH_S9
- PT10_395C1L: C1_LOW_S10
- PT10_395C3H: C3_HIGH_S11
- PT10_395C3L: C3_LOW_S12
- PT12_27C3R3L: C3_R3_S1
- PT12_27C3R4H: C3_R4_S2
- PT12_27D1R3L: D1_R3_S7
- PT12_27D1R4H: D1_R4_S8
- PT12_27D2R3L: D2_R3_S9
- PT12_27D2R4H: D2_R4_S10
- PT13_37C0R4H: 1_S1_L001,1_S1_L002
- PT13_37C0R5L: 2_S2_L001,2_S2_L002
- PT13_37C1R4H: 3_S3_L001,3_S3_L002
- PT13_37C1R5L: 4_S4_L001,4_S4_L002
#NON USARE TRATTINI
samples:
- PT10_395C0H: PT10_395,GFP_HIGH
- PT10_395C0L: PT10_395,GFP_LOW
- PT10_395C1H: PT10_395,GFP_HIGH
- PT10_395C1L: PT10_395,GFP_LOW
- PT10_395C3H: PT10_395,GFP_HIGH
- PT10_395C3L: PT10_395,GFP_LOW
- PT12_27C3R3L: PT12_27,GFP_LOW
- PT12_27C3R4H: PT12_27,GFP_HIGH
- PT12_27D1R3L: PT12_27,GFP_LOW
- PT12_27D1R4H: PT12_27,GFP_HIGH
- PT12_27D2R3L: PT12_27,GFP_LOW
- PT12_27D2R4H: PT12_27,GFP_HIGH
- PT13_37C0R4H: PT13_37,GFP_HIGH
- PT13_37C0R5L: PT13_37,GFP_LOW
- PT13_37C1R4H: PT13_37,GFP_HIGH
- PT13_37C1R5L: PT13_37,GFP_LOW
contrasts:
- GFP_LOW:GFP_HIGH
enrichergmtfiles:
- c1.all.v7.0.symbols.gmt
- c2.all.v7.0.symbols.gmt
- c3.all.v7.0.symbols.gmt
- c4.all.v7.0.symbols.gmt
- c5.all.v7.0.symbols.gmt
- c6.all.v7.0.symbols.gmt
- c7.all.v7.0.symbols.gmt
- h.all.v7.0.symbols.gmt
read1_mask: "_R1_001.fastq.gz"
read2_mask: "_R2_001.fastq.gz"
read_mask: "_R1_001.fastq.gz"
MethylAnalysis <- function(methcall.folder = NULL, # input folder with .cov files (bismark)
outfolder = NULL, # output folder (if not exist it will be created)
proj.id = "myproject", # project id to prefix in putput files
samplesheet = NULL,
sheetnum = 1,
design.var = "Disease", # variable to subset samplesheet
case.var = "Case", # 1 in treatment vector
control.var = "Control", # 0 in treatment vector
idcol = "SampleID",
mincoverage = 10,
lowperc = 5,
lowcount = 10,
assem="hg38",
do.subset = T,
chr.subset = "chr9",
start.subset = 136668000,
end.subset = 136671000,
pipeline.meth = "bismarkCoverage",
plot.covariates = c("Condition","Batch","MRD",
"Torelapse","Chemorefractory",
"Sex","CytoA","Tissue"),
idstoremove = NULL,
delta.meth = 10,
plot.categorical.vars = c("Condition","Batch","MRD","Torelapse","Chemorefractory","Sex","CytoA"),
plot.continuous.vars = c("miR-126"),
plot.height = 9,
plot.width = 12,
cellw = 15,
cellh = 15,
fontnumsize = 5,
fontsize = 8
){
# load libraries
require(methylKit)
require(ggplot2)
require(reshape2)
require(plyr)
require(ggrepel)
library(GenomicRanges)
library(openxlsx)
library(pheatmap)
# check vars
if(is.null(methcall.folder)){
stop("Please set methcall.folder")
}
if(is.null(outfolder)){
stop("Please set outfolder")
}
if(is.null(samplesheet)){
stop("Please set samplesheet")
}
# Setup variables
infolder = paste0(methcall.folder, "/")
dir.create(infolder, showWarnings = F)
outfolder = paste0(outfolder, "/")
dir.create(outfolder, showWarnings = F)
qcfolder <- paste0(outfolder, "QC/")
dir.create(qcfolder, showWarnings = F)
# reading sample sheet with metadata
ssheet <- read.xlsx(samplesheet, sheet = sheetnum)
if(!is.null(idstoremove)){
ssheet <- subset.data.frame(x = ssheet, subset = !ssheet[[idcol]] %in% idstoremove)
}
# defining design vector according to variable
ssheet <- subset.data.frame(x = ssheet, subset = ssheet[[design.var]] %in% c(case.var, control.var))
ssheet$mir126 <- as.numeric(ssheet$mir126)
d.vec <- ssheet[[design.var]]
d.vector <- ifelse(d.vec == case.var, 1, 0)
# initialzing coverage .cov files list
covs <- list()
sampleids <- as.list(ssheet[[idcol]])
names(sampleids) <- ssheet[[idcol]]
for (i in ssheet[[idcol]]) {
covs[[i]] <- paste0(infolder, "/", i, ".bismark.cov")
}
saveRDS(object = covs, file = paste0(outfolder,"covs_object.rds"))
# creating object
myobj <- methRead(location = covs,
sample.id = sampleids,
assembly = assem,
pipeline = pipeline.meth,
treatment = d.vector,
context="CpG",
mincov = mincoverage)
saveRDS(myobj, file = paste0(outfolder,"Myobject.rds"))
names(myobj) <- ssheet[[idcol]]
saveRDS(myobj, file = paste0(outfolder,"Myobject_with_names.rds"))
# subsetting if declared
if(isTRUE(do.subset)){
my.win = GRanges(seqnames = chr.subset, ranges = IRanges(start = start.subset, end = end.subset))
myobj <- selectByOverlap(myobj,my.win)
}
saveRDS(myobj, file = paste0(outfolder,"Myobject_with_names_aftersubset.rds"))
# filtering on minimum coverage
myobj <- filterByCoverage(methylObj = myobj, lo.count=lowcount, lo.perc = lowperc)
# Normalization
myobj <- normalizeCoverage(obj = myobj)
# saveRDS(object = myobj, file = "Initial_object.rds")
# Calculate basic stats and PCs
metricsfolder <- paste0(qcfolder, "Metrics/")
dir.create(path = metricsfolder, showWarnings = F)
for (id in ssheet[[idcol]]) {
png(filename = paste0(metricsfolder,proj.id,"_CpG_pct_methylation_sample_", id, ".png"),
width = 9, height = 6, units = "in", res = 96)
print(getMethylationStats(myobj[[id]],plot=TRUE,both.strands=FALSE))
dev.off()
png(filename = paste0(metricsfolder, proj.id,"_Coverage_stats_sample_", id, ".png"),
width = 9, height = 6, units = "in", res = 96)
print(getCoverageStats(myobj[[id]],plot=TRUE,both.strands=FALSE))
dev.off()
}
# create meth obj
#meth <- unite(object = myobj, destrand=FALSE)
meth <- unite(object = myobj, destrand=FALSE) # for debugging
saveRDS(meth, paste0(outfolder,"savemeth.tmp.rds"))
# Perform correlation
sink(paste0(qcfolder, proj.id, "_Correlations.txt"))
getCorrelation(meth,plot=FALSE)
sink()
if(length(ssheet[[idcol]]) < 15){
png(filename = paste0(qcfolder,proj.id, "_Correlations_pearson_pairwise.png"),
width = 9, height = 6, units = "in", res = 96)
print(getCorrelation(meth,plot=TRUE))
dev.off()
}
png(filename = paste0(qcfolder, proj.id, "_Clustering.png"),
width = 9, height = 6, units = "in", res = 96)
clusterSamples(meth, dist="euclidean", plot=TRUE, method = "ward.D2")
dev.off()
# Re-plotting PCs (custom chart)
# compute PCs and store in object
pca_compt <- PCASamples(meth, obj.return = T, screeplot = F)
# extract PCs components
pcafolder <- paste0(qcfolder, "PCA/")
dir.create(path = pcafolder, showWarnings = F)
pca_pc1_2 <- as.data.frame(x = pca_compt$x[,1:2])
for(myvars in plot.covariates){
pca_pc1_2$condition <- as.factor(ssheet[[myvars]])
png(filename = paste0(pcafolder, proj.id, "_PCA_",myvars,".png"),
width = 9, height = 6, units = "in",res=96)
print(ggplot(data = pca_pc1_2,
mapping = aes(x = PC1, y=PC2, col=condition, label=rownames(pca_pc1_2))) +
geom_point(size=3) + geom_text_repel(size=3) + ggtitle(label = "Principal component analysis", subtitle = myvars) +
theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5)) +
theme(plot.subtitle=element_text(size=12, hjust=0.5, face="italic", color="black")) +
theme(axis.title = element_text(size=12, hjust=0.5, face="bold", color="black")) +
theme(legend.text = element_text(size=8, hjust=0.5)) +
theme(legend.title = element_blank()) +
theme(axis.text = element_text(size=12, hjust=0.5, color="black")))
dev.off()
}
# retrieve and store % of methylation
perc.meth <- percMethylation(meth)
saveRDS(perc.meth,paste0(outfolder,"pctmethly.rds"))
base::rownames(perc.meth) <- paste0(meth$chr, "_", meth$start)
# Perform diff methylation
myDiff=calculateDiffMeth(meth)
write.table(myDiff,paste0(outfolder,proj.id,"_DiffMeth_single_CpG.txt"), row.names = F)
difftest <- read.table(paste0(outfolder,proj.id,"_DiffMeth_single_CpG.txt"), header = T)
difftest$comparison <- proj.id
difftest$qvalue_r <- as.character(cut(x = difftest$qvalue,
breaks = c(-1, 1e-100, 1e-10, 1e-02, 1),
labels = c("***","**","*","ns")))
myindex <- abs(difftest$meth.diff) < delta.meth
difftest$meth.diff <- abs(difftest$meth.diff)
difftest$qvalue_r[myindex] <- "ns"
write.table(difftest,paste0(outfolder,proj.id,"_DiffMeth_single_CpG.txt"), row.names = F)
write.table(difftest,paste0(outfolder,proj.id,"_DiffMeth_single_CpG.txt"), row.names = F)
# Adding color list and annotations
library(RColorBrewer)
color_list <- list()
annrows <- subset.data.frame(x = ssheet, select = c("Risk", "relapse","immunophenotype","sex","mir126"))
rownames(annrows) <- ssheet$SampleID
print(ssheet)
print(annrows)
anncols <- subset.data.frame(difftest, select = c("qvalue_r","meth.diff"))
base::rownames(anncols) <- difftest$start
print(anncols)
color_list[["qvalue_r"]] = c("*" = "#6497b1", "**" = "#03396c", "***" = "#011f4b", ns= "#c4cacf")
color_list[["meth.diff"]] = colorRampPalette(brewer.pal(n = 9, "Reds"))(100)
color_list[["mir126"]] <- brewer.pal(n = 9, "PuRd")
set1cols <- alpha(colour = RColorBrewer::brewer.pal(n = 9, "Set1"), alpha = 0.6)
color_list[["Risk"]] = c(VHR = set1cols[1], SR = set1cols[2], HR = set1cols[3], 'NA' = "white")
color_list[["immunophenotype"]] = c('B-I' = set1cols[4], 'B-II' = set1cols[5], 'B-III' = set1cols[6], 'NA' = "white")
color_list[["relapse"]] = c('yes' = set1cols[7], 'no' = set1cols[8], 'NA' = "white")
color_list[["sex"]] = c('M' = set1cols[9], 'F' = set1cols[1])
print(color_list)
# Heatmap
pctmeth_matrix <- t(perc.meth)
base::colnames(pctmeth_matrix) <- gsub(x = base::colnames(pctmeth_matrix), pattern = "chr[0-9]+_", replacement = "")
pheatmap(mat = pctmeth_matrix, main = gsub(x = proj.id, pattern = "_", replacement = " "),
filename = paste0(outfolder, proj.id,"_CpG_percent_methylation_matrix_pheatmap.pdf"),
width = plot.width, height = plot.height,
na_col = "pink",
cluster_cols = FALSE,
cluster_rows = TRUE,
annotation_row = annrows,
cellwidth = cellw,
cellheight = cellh,
display_numbers = T,
fontsize = fontsize,
fontsize_number = fontnumsize,
number_format = "%.0f",
border_color = NA,
annotation_col = anncols,
annotation_colors = color_list,
color = c("#F5F5F5","#EEEEEE","#CCCCCC","#999999", "#666666","#333333","#000000"), breaks = c(0,10,20,30,50,70,90,100)
)
saveRDS(object = pctmeth_matrix, paste0(outfolder, proj.id,"_CpG_percent_methylation_matrix_pheatmap.rds"))
saveRDS(object = list(anncol = anncols, annrow = annrows, anncolors = color_list), paste0(outfolder, proj.id,"_annotations_matrix_pheatmap.rds"))
saveRDS(object = difftest, paste0(outfolder, proj.id,"_CpG_differential_methylation.rds"))
saveRDS(object = perc.meth, paste0(outfolder, proj.id,"_CpG_percent_methylation.rds"))
write.table(x = perc.meth, file = paste0(outfolder, proj.id,"_CpG_percent_methylation.txt"))
write.table(x = difftest, file = paste0(outfolder, proj.id,"_CpG_differential_methylation.txt"))
saveRDS(myobj, file = paste0(outfolder,proj.id,"_methylkit.rds"))
saveRDS(myobj, file = paste0(outfolder,proj.id,"_methylkit_meth.rds"))
}
\ No newline at end of file
# Introduction
1-2ug of DNA was converted using EpiTect Bisulfite Kit (Qiagen, ID 59104) and following manufacturer’s instructions. After purification, 100ng or 11ul of each sample was amplified, by PCR, for the TSS region and the EGFL7 intron 7. A High Fidelity DNA polymerase, KAPA HiFi HotStart Uracil+ Kit (Roche, Basel, CH), was used, and primer sequences and thermocycler settings are shown below.
Methylation profile analysis and differential methylation analysis (according to groups of interest) was performed only in the PCR amplified region.
# Data analysis workflow
The data analysis consists in 3 main steps:
- QC
- Mapping and CpG coverage quantification
- Differential methylation analysis
## QC
Raw sequences were quality checked using fastqc (v.0.11.8) and trimmed for adapters and base quality using cutadapt v.1.16 (minimum base quality 20; minimum length 25bp).
Both tails of the reads were subjected to trimming if necessary by using different clipping parameters settings according to runs batches.
Very bad quality samples were discarded from downstream analyses.
## Mapping and CpG coverage quantification
Passing quality checks reads were then mapped to the human reference genome GRCh38 using bismark v.0.22.1 (--local mode).
Coverage files (.cov) obtained from bismark output were then used as input for differential methylation analysis by using the MethylKit R package (v1.10.0).
## Differential methylation analysis
Differential methylation analysis was performed primarly at single CpG considering the small length of the region.
To get rid of CpGs not included in the targeted regions, we tailored the analysis only to genomic coordinates (chr9: 136668000-136671000) mapping to (TSS, 5’ and 3’ intron-7 subregions) with selectByOverlap function.
Hence, we discarded reads with less than 10 counts and reads with coverage lower than the 5th percentile. In this way we were able to evaluate methylation percentage in most of the samples under analysis.
We then normalized coverage by using the normalizeCoverage function.
Exploratory data analysis was performed by using built-in functions provided by the package, including PCA, clustering, and correlations.
CpGs with q.value < 0.01 (logistic regression test) and absolute delta methylation (negative or positive) percentage of at least 10 between the compared groups, were considered differentially methylated.
Percent methylation matrix was used as input to produce heatmaps (pheatmap R package).
Color palette presets from RColorBrewer package were used for % methylation and row annotation tiles coloring.
# miR-126 and minimal residual disease in B-ALL
Complete elimination of B-cell acute lymphoblastic leukemia (B-ALL) by a risk-adapted primary treatment approach remains a clinical key objective, which fails in up to a third of patients. Recent evidence has implicated subpopulations of B-ALL cells with stem-like features in disease persistence. We hypothesized that microRNA-126, a core regulator of hematopoietic and leukemic stem cells, may resolve intra-tumor heterogeneity in B-ALL and uncover therapy-resistant subpopulations. We exploited patient-derived xenograft (PDX) models with B-ALL cells transduced with a miR-126 reporter allowing the prospective isolation of miR-126(high) cells for their functional and transcriptional characterization. Discrete miR-126(high) populations, often characterized by MIR126 locus de-methylation, were identified in 8/9 PDX models and showed increased repopulation potential, in vivo chemotherapy resistance and hallmarks of quiescence, inflammation and stress-response pathway activation. Cells with a miR-126(high) transcriptional profile were identified as distinct disease subpopulations by single cell RNA sequencing in diagnosis samples from adult and pediatric B-ALL. Expression of miR-126 and locus methylation were tested in several pediatric and adult B-ALL cohorts, which received standardized treatment. High microRNA-126 levels and locus de-methylation at diagnosis associate with sub-optimal response to induction chemotherapy (MRD > 0.05% at day +33 or MRD+ at day +78).
# Code and workflows
This repository is structured in different folders according to the different omics covered in the paper.
The main categories are:
- scRNAseq
- BulkRNAseq
- Methylation
- Whole Exome Sequencing
Each folder will include a bried description of the main data analysis steps and the most important scripts used for producing data present in the
manuscript.
This diff is collapsed.
## Introduction
**scRNAseq** analysis was performed using a standard pipeline that includes the following steps:
Most of the single-cell RNAseq analyses were performed with [Seurat](https://satijalab.org/seurat/).
Below the main steps of the basic data analysis workflow that start from a minimal object after loading of 10X data to markers identification:
1. Normalization (default seurat settings)
2. Scaling (with following variables to regress out: percent.mt + nCount_RNA and CC.Difference calculated as show in [vignette](https://satijalab.org/seurat/articles/cell_cycle_vignette.html#alternate-workflow-1))
3. Dimensionality reduction: PCA
4. Harmony batch removal (patientID - only for integrated analysis with all Diagnoses)
5. Clustering
6. Markers identification (different resolution according to datasets / subsets)
7. Evaluation of signatures according to module score (AddModuleScore function - default settings)
Data analysis was performed both at single and integrated level.
Parameters and metrics are described in supplementary tables.
Raw counts and complete metadata can be found on GEO repository [GSE236136](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE236136).
Seurat objects for each disease are also available in seurat_objects folder for convenience.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment