From 38a09ebbccb738e1618bbce5ca65aa182e870ffc Mon Sep 17 00:00:00 2001 From: Matteo Barcella Date: Thu, 3 Aug 2023 16:23:29 +0200 Subject: [PATCH] Adding scripts for Allele Fractions variants --- WES/AF_comparison.R | 301 ++++++++++++++++++++++++++++++++++ WES/Filter_variants_v3.R | 20 +++ WES/MethylAnalysis_bulk_v3.R | 304 +++++++++++++++++++++++++++++++++++ 3 files changed, 625 insertions(+) create mode 100755 WES/AF_comparison.R create mode 100644 WES/Filter_variants_v3.R create mode 100644 WES/MethylAnalysis_bulk_v3.R diff --git a/WES/AF_comparison.R b/WES/AF_comparison.R new file mode 100755 index 0000000..08bf6c1 --- /dev/null +++ b/WES/AF_comparison.R @@ -0,0 +1,301 @@ +# Script for analyzing AF + +library(openxlsx) + +source(file = "Filter_variants_v3.R") + +inputdata <- list() + +for (mysample in c("12_27_1","12_27_2", "10_395_3", "10_395_4")) { + + inputdata[["rawcall"]][[mysample]] <- filter_variants_v3(infile = paste0(mysample, "_dbsnp_cc_cnc_dbnsfp_refseq_light_raw_filtered.fields.txt"), sampleid = mysample, dp = 50) + inputdata[["passonly"]][[mysample]] <- filter_variants_v3(infile = paste0(mysample, "_dbsnp_cc_cnc_dbnsfp_refseq_light.fields.txt"), sampleid = mysample, dp = 50) +} + +source(file = "/home/mbarcella/scripts/bitbucket/exometools/Compare_variants_AF_v3.R") + +Comparison_12_27 <- list() +Comparison_10_395 <- list() + +for(i in names(inputdata)){ + + Comparison_12_27[[i]] <- Compare_variants_AF_v3(dblist = inputdata[[i]], sampleid_1 = "12_27_1", sampleid_2 = "12_27_2", + my.width = 8, my.height = 4, my.res = 120, + tag = i, outfolder = "12_27/") + + Comparison_10_395[[i]] <- Compare_variants_AF_v3(dblist = inputdata[[i]], sampleid_1 = "10_395_3", sampleid_2 = "10_395_4", + my.width = 8, my.height = 4, my.res = 120, + tag = i, outfolder = "10_395/") +} + +# plot AF density in H and L (full list) + +AF_density_raw <- list() + +library(cowplot) + +tmpdf_1_raw <- subset.data.frame(x = Comparison_12_27$rawcall$unionannot, select = c("AF_GFP_H"), subset = AF_GFP_H > 0) +tmpdf_2_raw <- subset.data.frame(x = Comparison_12_27$rawcall$unionannot, select = c("AF_GFP_L"), subset = AF_GFP_L > 0) + +tmpdf_3_raw <- subset.data.frame(x = Comparison_10_395$rawcall$unionannot, select = c("AF_GFP_H"), subset = AF_GFP_H > 0) +tmpdf_4_raw <- subset.data.frame(x = Comparison_10_395$rawcall$unionannot, select = c("AF_GFP_L"), subset = AF_GFP_L > 0) + +AF_density_raw[["12-27_miRNA_L"]] <- ggplot(data = tmpdf_1_raw, + mapping = aes(AF_GFP_H)) + ylim(0,600) + + geom_histogram(bins = 50, fill = alpha("#003366",0.8)) + xlab("miRNA_low") + + theme(axis.text = element_text(size = 14), legend.title = element_blank()) + + ggtitle("12-27_miRNA_L") + +AF_density_raw[["12-27_miRNA_H"]] <- ggplot(data = tmpdf_2_raw, + mapping = aes(AF_GFP_L)) + ylim(0,600) + + geom_histogram(bins = 50, fill = alpha("#fc2803",0.8))+ xlab("miRNA_high") + + theme(axis.text = element_text(size = 14), legend.title = element_blank()) + + ggtitle("12-27_miRNA_H") + +AF_density_raw[["10-395_miRNA_L"]] <- ggplot(data = tmpdf_3_raw, + mapping = aes(AF_GFP_H)) + ylim(0,600) + + geom_histogram(bins = 50, fill = alpha("#003366",0.8)) + xlab("miRNA_low") + + theme(axis.text = element_text(size = 14), legend.title = element_blank()) + + ggtitle("10-395_miRNA_L") + +AF_density_raw[["10-395_miRNA_H"]] <- ggplot(data = tmpdf_4_raw, + mapping = aes(AF_GFP_L)) + ylim(0,600) + + geom_histogram(bins = 50, fill = alpha("#fc2803",0.8)) + xlab("miRNA_high") + + theme(axis.text = element_text(size = 14), legend.title = element_blank()) + + ggtitle("10-395_miRNA_H") + +png(filename = "AF_density_raw_variants.png", width = 8, height = 6, units = "in", res = 300) +plot_grid(AF_density_raw[["12-27_miRNA_L"]], AF_density_raw[["12-27_miRNA_H"]], + AF_density_raw[["10-395_miRNA_L"]] , AF_density_raw[["10-395_miRNA_H"]], + nrow = 2) +dev.off() + +## PLOTTING PASS-ONLY + +tmpdf_1_pass <- subset.data.frame(x = Comparison_12_27$passonly$unionannot, select = c("AF_GFP_H"), subset = AF_GFP_H > 0) +tmpdf_2_pass <- subset.data.frame(x = Comparison_12_27$passonly$unionannot, select = c("AF_GFP_L"), subset = AF_GFP_L > 0) + +tmpdf_3_pass <- subset.data.frame(x = Comparison_10_395$passonly$unionannot, select = c("AF_GFP_H"), subset = AF_GFP_H > 0) +tmpdf_4_pass <- subset.data.frame(x = Comparison_10_395$passonly$unionannot, select = c("AF_GFP_L"), subset = AF_GFP_L > 0) + +AF_density_pass <- list() + +AF_density_pass[["12-27_miRNA_L"]] <- ggplot(data = tmpdf_1_pass, + mapping = aes(AF_GFP_H)) + ylim(0,300) + + geom_histogram(bins = 50, fill = alpha("#003366",0.8)) + xlab("miRNA_low") + + theme(axis.text = element_text(size = 14), legend.title = element_blank()) + + ggtitle("12-27_miRNA_L") + +AF_density_pass[["12-27_miRNA_H"]] <- ggplot(data = tmpdf_2_pass, + mapping = aes(AF_GFP_L)) + ylim(0,300) + + geom_histogram(bins = 50, fill = alpha("#fc2803",0.8))+ xlab("miRNA_high") + + theme(axis.text = element_text(size = 14), legend.title = element_blank()) + + ggtitle("12-27_miRNA_H") + +AF_density_pass[["10-395_miRNA_L"]] <- ggplot(data = tmpdf_3_pass, + mapping = aes(AF_GFP_H)) + ylim(0,300) + + geom_histogram(bins = 50, fill = alpha("#003366",0.8)) + xlab("miRNA_low") + + theme(axis.text = element_text(size = 14), legend.title = element_blank()) + + ggtitle("10-395_miRNA_L") + +AF_density_pass[["10-395_miRNA_H"]] <- ggplot(data = tmpdf_4_pass, + mapping = aes(AF_GFP_L)) + ylim(0,300) + + geom_histogram(bins = 50, fill = alpha("#fc2803",0.8)) + xlab("miRNA_high") + + theme(axis.text = element_text(size = 14), legend.title = element_blank()) + + ggtitle("10-395_miRNA_H") + +png(filename = "AF_density_pass_variants.png", width = 8, height = 6, units = "in", res = 300) +plot_grid(AF_density_pass[["12-27_miRNA_L"]], AF_density_pass[["12-27_miRNA_H"]], + AF_density_pass[["10-395_miRNA_L"]] , AF_density_pass[["10-395_miRNA_H"]], + nrow = 2) +dev.off() + +# Histograms with PASS or clustered_events + +AF_density_mildfilt <- list() +Mild_vars_12_27 <- Comparison_12_27$rawcall$unionannot + +# Recoding filter variable, PASS / no PASS +Mild_vars_12_27$FILTER_GFP_H_simple <- ifelse(test = Mild_vars_12_27$FILTER_GFP_H == "PASS", "PASS","NOPASS") +Mild_vars_12_27$FILTER_GFP_L_simple <- ifelse(test = Mild_vars_12_27$FILTER_GFP_L == "PASS", "PASS","NOPASS") + +tmpdf_1_mild <- subset.data.frame(x = Mild_vars_12_27, select = c("AF_GFP_H"), + subset = AF_GFP_H > 0 & (FILTER_GFP_H_simple == "PASS" | FILTER_GFP_L_simple == "PASS") & !HGVS_P == "") +tmpdf_2_mild <- subset.data.frame(x = Mild_vars_12_27, select = c("AF_GFP_L"), + subset = AF_GFP_L > 0 & (FILTER_GFP_H_simple == "PASS" | FILTER_GFP_L_simple == "PASS") & !HGVS_P == "") + +Mild_vars_10_395 <- Comparison_10_395$rawcall$unionannot +Mild_vars_10_395$FILTER_GFP_H_simple <- ifelse(test = Mild_vars_10_395$FILTER_GFP_H == "PASS", "PASS","NOPASS") +Mild_vars_10_395$FILTER_GFP_L_simple <- ifelse(test = Mild_vars_10_395$FILTER_GFP_L == "PASS", "PASS","NOPASS") + +tmpdf_3_mild <- subset.data.frame(x = Mild_vars_10_395, select = c("AF_GFP_H"), + subset = AF_GFP_H > 0 & (FILTER_GFP_H_simple == "PASS" | FILTER_GFP_L_simple == "PASS") & !HGVS_P == "") +tmpdf_4_mild <- subset.data.frame(x = Mild_vars_10_395, select = c("AF_GFP_L"), + subset = AF_GFP_L > 0 & (FILTER_GFP_H_simple == "PASS" | FILTER_GFP_L_simple == "PASS") & !HGVS_P == "") + + +AF_density_mildfilt[["12-27_miRNA_L"]] <- ggplot(data = tmpdf_1_mild, + mapping = aes(AF_GFP_H)) + ylim(0,300) + + geom_histogram(bins = 50, fill = alpha("#003366",0.8)) + xlab("miRNA_low") + + theme(axis.text = element_text(size = 14), legend.title = element_blank()) + + ggtitle("12-27_miRNA_L") + +AF_density_mildfilt[["12-27_miRNA_H"]] <- ggplot(data = tmpdf_2_mild, + mapping = aes(AF_GFP_L)) + ylim(0,300) + + geom_histogram(bins = 50, fill = alpha("#fc2803",0.8))+ xlab("miRNA_high") + + theme(axis.text = element_text(size = 14), legend.title = element_blank()) + + ggtitle("12-27_miRNA_H") + +AF_density_mildfilt[["10-395_miRNA_L"]] <- ggplot(data = tmpdf_3_mild, + mapping = aes(AF_GFP_H)) + ylim(0,300) + + geom_histogram(bins = 50, fill = alpha("#003366",0.8)) + xlab("miRNA_low") + + theme(axis.text = element_text(size = 14), legend.title = element_blank()) + + ggtitle("10-395_miRNA_L") + +AF_density_mildfilt[["10-395_miRNA_H"]] <- ggplot(data = tmpdf_4_mild, + mapping = aes(AF_GFP_L)) + ylim(0,300) + + geom_histogram(bins = 50, fill = alpha("#fc2803",0.8)) + xlab("miRNA_high") + + theme(axis.text = element_text(size = 14), legend.title = element_blank()) + + ggtitle("10-395_miRNA_H") + +png(filename = "AF_density_mildfilt_variants.png", width = 8, height = 6, units = "in", res = 300) +plot_grid(AF_density_mildfilt[["12-27_miRNA_L"]], AF_density_mildfilt[["12-27_miRNA_H"]], + AF_density_mildfilt[["10-395_miRNA_L"]] , AF_density_mildfilt[["10-395_miRNA_H"]], + nrow = 2) +dev.off() + +# removing not exonic by using & !HGVS_P == "" + +mild_12_27 <- subset.data.frame(x = Mild_vars_12_27, subset = (FILTER_GFP_H_simple == "PASS" | FILTER_GFP_L_simple == "PASS") & !HGVS_P == "") +mild_12_27$FILTER <- paste0(mild_12_27$FILTER_GFP_H_simple, "_", mild_12_27$FILTER_GFP_L_simple) +png(filename = "12_27_variants_PASS_H_or_L_scatterplot_mild.png", width = 12, height = 9, units = "in", res = 900) +ggplot(data = mild_12_27, mapping = aes(x = AF_GFP_H, y = AF_GFP_L, col=FILTER)) + geom_point() + + theme(legend.text = element_text(size = 7), legend.title = element_blank(), legend.position = "top", + legend.key.height = unit(1, units = "mm"), legend.key.width = unit(1, units = "mm"), + plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5), + legend.justification = 0.5) + ggtitle("12-27 variants", subtitle = "PASS in at least one the 2 populations") +dev.off() + +mild_10_395 <- subset.data.frame(x = Mild_vars_10_395, subset = (FILTER_GFP_H_simple == "PASS" | FILTER_GFP_L_simple == "PASS") & !HGVS_P == "") +mild_10_395$FILTER <- paste0(mild_10_395$FILTER_GFP_H_simple, "_", mild_10_395$FILTER_GFP_L_simple) +png(filename = "10_395_variants_PASS_H_or_L_scatterplot_mild.png", width = 12, height = 9, units = "in", res = 900) +ggplot(data = mild_10_395, mapping = aes(x = AF_GFP_H, y = AF_GFP_L, col=FILTER)) + geom_point() + + theme(legend.text = element_text(size = 7), legend.title = element_blank(), legend.position = "top", + legend.key.height = unit(1, units = "mm"), legend.key.width = unit(1, units = "mm"), + plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5), + legend.justification = 0.5) + ggtitle("10_395 variants", subtitle = "PASS in at least one the 2 populations") +dev.off() + +# Variants NA in miRNAlow 12-27 and PASS in miRNA-H and viceversa + +library(openxlsx) + +write.xlsx(x = mild_12_27[is.na(mild_12_27$FILTER_GFP_H) & mild_12_27$FILTER_GFP_L == "PASS",], + file = "mild_12_27_PASS_GFP_L_NA_GFP_H.xlsx", asTable = T) +write.xlsx(x = mild_12_27[is.na(mild_12_27$FILTER_GFP_L) & mild_12_27$FILTER_GFP_H == "PASS",], + file = "mild_12_27_PASS_GFP_H_NA_GFP_L.xlsx", asTable = T) + +write.xlsx(x = mild_10_395[is.na(mild_10_395$FILTER_GFP_H) & mild_10_395$FILTER_GFP_L == "PASS",], + file = "mild_10_395_PASS_GFP_L_NA_GFP_H.xlsx", asTable = T) +write.xlsx(x = mild_10_395[is.na(mild_10_395$FILTER_GFP_L) & mild_10_395$FILTER_GFP_H == "PASS",], + file = "mild_10_395_PASS_GFP_H_NA_GFP_L.xlsx", asTable = T) + +# saving also only annotated (COSMIC or dbSNP) + +mild_12_27_pass_GFP_L_NA_GFP_H <- subset.data.frame(x = mild_12_27, subset = !ID == "" & is.na(FILTER_GFP_H) & FILTER_GFP_L == "PASS") +writeLines(text = mild_12_27_pass_GFP_L_NA_GFP_H$variantkey, con = "mild_12_27_pass_GFP_L_NA_GFP_H.txt") + +mild_12_27_pass_GFP_H_NA_GFP_L <- subset.data.frame(x = mild_12_27, subset = !ID == "" & is.na(FILTER_GFP_L) & FILTER_GFP_H == "PASS") +writeLines(text = mild_12_27_pass_GFP_H_NA_GFP_L$variantkey, con = "mild_12_27_pass_GFP_H_NA_GFP_L.txt") + +write.xlsx(x = mild_12_27_pass_GFP_L_NA_GFP_H, + file = "mild_12_27_PASS_GFP_L_NA_GFP_H_cosmic_dbsnp.xlsx", asTable = T) +write.xlsx(x = mild_12_27_pass_GFP_H_NA_GFP_L, + file = "mild_12_27_PASS_GFP_H_NA_GFP_L_cosmic_dbsnp.xlsx", asTable = T) + +mild_10_395_pass_GFP_L_NA_GFP_H <- subset.data.frame(x = mild_10_395, subset = !ID == "" & is.na(FILTER_GFP_H) & FILTER_GFP_L == "PASS") +writeLines(text = mild_10_395_pass_GFP_L_NA_GFP_H$variantkey, con = "mild_10_395_pass_GFP_L_NA_GFP_H.txt") + +mild_10_395_pass_GFP_H_NA_GFP_L <- subset.data.frame(x = mild_10_395, subset = !ID == "" & is.na(FILTER_GFP_L) & FILTER_GFP_H == "PASS") +writeLines(text = mild_10_395_pass_GFP_H_NA_GFP_L$variantkey, con = "mild_10_395_pass_GFP_H_NA_GFP_L.txt") + +write.xlsx(x = mild_10_395_pass_GFP_L_NA_GFP_H, + file = "mild_10_395_PASS_GFP_L_NA_GFP_H_cosmic_dbsnp.xlsx", asTable = T) +write.xlsx(x = mild_10_395_pass_GFP_H_NA_GFP_L, + file = "mild_10_395_PASS_GFP_H_NA_GFP_L_cosmic_dbsnp.xlsx", asTable = T) + + +# plotting CR vs DGN variants +# importing variants + +dgn_vs_cr_vars <- read.xlsx(xlsxFile = "12_27_Somatic_DGN_only_variants.xlsx") +mykeys <- dgn_vs_cr_vars$variantkey + +raw_vars_12_27_1 <- read.table(file = "12_27_1_dbsnp_cc_cnc_dbnsfp_refseq_light_raw_filtered.fields.txt", + sep = "\t" ,header = T, stringsAsFactors = F) + +raw_vars_12_27_2 <- read.table(file = "12_27_2_dbsnp_cc_cnc_dbnsfp_refseq_light_raw_filtered.fields.txt", + sep = "\t" ,header = T, stringsAsFactors = F) + +colnames(raw_vars_12_27_1) <- c("Chromosome","Position","Reference","Alternative","AF","DP","AD.R","AD.A","GT","ID","FILTER","HGVS_P","Gene","Biotype", + "Rank","Effect","Impact","Common","G5",colnames(raw_vars_12_27_1)[20:35]) + +raw_vars_12_27_1$sampleid <- "12_27_1" +raw_vars_12_27_1$variantkey <- paste(raw_vars_12_27_1$Chromosome, + raw_vars_12_27_1$Position, + raw_vars_12_27_1$Reference, + raw_vars_12_27_1$Alternative,sep = "_") + +colnames(raw_vars_12_27_2) <- c("Chromosome","Position","Reference","Alternative","AF","DP","AD.R","AD.A","GT","ID","FILTER","HGVS_P","Gene","Biotype", + "Rank","Effect","Impact","Common","G5",colnames(raw_vars_12_27_2)[20:35]) + +raw_vars_12_27_2$sampleid <- "12_27_2" +raw_vars_12_27_2$variantkey <- paste(raw_vars_12_27_2$Chromosome, + raw_vars_12_27_2$Position, + raw_vars_12_27_2$Reference, + raw_vars_12_27_2$Alternative,sep = "_") + + +# subsetting dfs + +df_1 <- subset.data.frame(x = raw_vars_12_27_1, subset = variantkey %in% mykeys, select = c("variantkey","AF","DP","Gene","Effect","FILTER")) +colnames(df_1) <- c("variantkey","AF_GFP_H","DP_GFP_H","Gene", "Effect","FILTER") +df_2 <- subset.data.frame(x = raw_vars_12_27_2, subset = variantkey %in% mykeys, select = c("variantkey","AF","DP","Gene","Effect","FILTER")) +colnames(df_2) <- c("variantkey","AF_GFP_L","DP_GFP_L","Gene", "Effect","FILTER") + + +union_df <- merge.data.frame(x = df_1, y=df_2, by = c("variantkey","Gene","Effect"), all = T) + + +union_df$FILTER.x[is.na(union_df$FILTER.x)] <- "NOPASS" +union_df$FILTER.y[is.na(union_df$FILTER.y)] <- "NOPASS" + +union_df$AF_1[is.na(union_df$AF_GFP_H)] <- 0 +union_df$AF_2[is.na(union_df$AF_GFP_L)] <- 0 + +union_df$DP_1[is.na(union_df$DP_GFP_H)] <- 0 +union_df$DP_2[is.na(union_df$DP_GFP_L)] <- 0 + +union_df$FILTER <- ifelse(test = (union_df$FILTER.x == "" | union_df$FILTER.y == ""), + yes = paste0(union_df$FILTER.x, "_", union_df$FILTER.y), + no = union_df$FILTER.y) + +union_df$FILTER.x <- NULL +union_df$FILTER.y <- NULL + +# subsetting only variants in exons + +union_df_exonic <- union_df[union_df$Effect %in% c("frameshift_variant","missense_variant","synonymous_variant","splice_region_variant&intron_variant"),] +write.xlsx(x = union_df_exonic, file = "DGN_vs_CR_variants_exonic.xlsx") + +library(ggrepel) + +png(filename = paste0("AF_Comparison_12_27_1_vs_12_27_2_DGN_vs_CR_variants.png"), width = 8, height = 6, units = "in", res = 200) +print(ggplot(data = union_df_exonic, mapping = aes(x = AF_GFP_H, y = AF_GFP_L, label = Gene)) + xlim(0,1) + ylim(0,1) + + geom_point(mapping = aes(col = "black"), size = 2) + #geom_abline(intercept = 0, slope = 1) + + xlab(label = "GFP_HIGH") + ylab(label = "GFP_LOW") + ggtitle(label = "12_27 Somatic DGN only variants") + + #geom_point(data=union_df[union_df$AF_GFP_H == 0, ], aes(x=AF_GFP_H, y=AF_GFP_L), colour="red",size = 2) + + geom_text_repel(size=3, show.legend = FALSE, segment.alpha = 0.4) + + theme(plot.title = element_text(hjust = 0.5), legend.title = element_blank(), legend.position = "none")) +dev.off() + diff --git a/WES/Filter_variants_v3.R b/WES/Filter_variants_v3.R new file mode 100644 index 0000000..902fef1 --- /dev/null +++ b/WES/Filter_variants_v3.R @@ -0,0 +1,20 @@ +filter_variants_v3 <- function(infile, sampleid = "Sample01", dp = 50){ + + library(openxlsx) + + df <- read.table(file = infile, sep = "\t" ,header = T, stringsAsFactors = F) + write.xlsx(x = df, file = paste0(sampleid,"_full_variants_annotated.xlsx"), asTable = T) + + colnames(df) <- c("Chromosome","Position","Reference","Alternative","AF","DP","AD.R","AD.A","GT","ID","FILTER","HGVS_P","Gene","Biotype", + "Rank","Effect","Impact","Common","G5",colnames(df)[20:35]) + + df$Common[is.na(df$Common)] <- 0 + df <- subset.data.frame(x = df, subset = DP > dp & Common == 0 & + Biotype == "protein_coding" & !Impact %in% c("LOW", "MODIFIER") + ) + + df$sampleid <- sampleid + df$variantkey <- paste(df$Chromosome, df$Position, df$Reference, df$Alternative,sep = "_") + write.xlsx(x = df, file = paste0(sampleid,"_rawvariants_dp_",dp,"_coding_lowimpact_notcommon",".xlsx"), asTable = T) + return(df) +} diff --git a/WES/MethylAnalysis_bulk_v3.R b/WES/MethylAnalysis_bulk_v3.R new file mode 100644 index 0000000..b3e3c93 --- /dev/null +++ b/WES/MethylAnalysis_bulk_v3.R @@ -0,0 +1,304 @@ + +MethylAnalysis_bulk_v3 <- function(methcall.folder = NULL, # input folder with .cov files (bismark) + outfolder = NULL, # output folder (if not exist it will be created) + proj.id = "myproject", # project id to prefix in putput files + samplesheet = NULL, + sheetnum = 1, + design.var = "Disease", # variable to subset samplesheet + case.var = "Case", # 1 in treatment vector + control.var = "Control", # 0 in treatment vector + idcol = "SampleID", + mincoverage = 10, + lowperc = 5, + lowcount = 10, + assem="hg38", + do.subset = T, + chr.subset = "chr9", + start.subset = 136668000, + end.subset = 136671000, + pipeline.meth = "bismarkCoverage", + plot.covariates = c("Condition","Batch","MRD", + "Torelapse","Chemorefractory", + "Sex","CytoA","Tissue"), + idstoremove = NULL, + delta.meth = 10, + plot.categorical.vars = c("Condition","Batch","MRD","Torelapse","Chemorefractory","Sex","CytoA"), + plot.continuous.vars = c("miR-126","egfl7"), + plot.height = 9, + plot.width = 12, + cellw = 15, + cellh = 15, + fontnumsize = 5, + fontsize = 8 + ){ + + # load libraries + + require(methylKit) + require(ggplot2) + require(reshape2) + require(plyr) + require(ggrepel) + library(GenomicRanges) + library(openxlsx) + library(pheatmap) + + # check vars + + if(is.null(methcall.folder)){ + stop("Please set methcall.folder") + } + + if(is.null(outfolder)){ + stop("Please set outfolder") + } + + if(is.null(samplesheet)){ + stop("Please set samplesheet") + } + + # Setup variables + + infolder = paste0(methcall.folder, "/") + dir.create(infolder, showWarnings = F) + + outfolder = paste0(outfolder, "/") + dir.create(outfolder, showWarnings = F) + + qcfolder <- paste0(outfolder, "QC/") + dir.create(qcfolder, showWarnings = F) + + + # reading sample sheet with metadata + ssheet <- read.xlsx(samplesheet, sheet = sheetnum) + + if(!is.null(idstoremove)){ + ssheet <- subset.data.frame(x = ssheet, subset = !ssheet[[idcol]] %in% idstoremove) + } + + # defining design vector according to variable + + ssheet <- subset.data.frame(x = ssheet, subset = ssheet[[design.var]] %in% c(case.var, control.var)) + + d.vec <- ssheet[[design.var]] + d.vector <- ifelse(d.vec == case.var, 1, 0) + + # initialzing coverage .cov files list + covs <- list() + sampleids <- as.list(ssheet[[idcol]]) + names(sampleids) <- ssheet[[idcol]] + + for (i in ssheet[[idcol]]) { + covs[[i]] <- paste0(infolder, "/", i, ".bismark.cov") + } + + saveRDS(object = covs, file = "covs_object.rds") + + # creating object + + myobj <- methRead(location = covs, + sample.id = sampleids, + assembly = assem, + pipeline = pipeline.meth, + treatment = d.vector, + context="CpG", + mincov = mincoverage) + + saveRDS(myobj, file = "Myobject.rds") + + names(myobj) <- ssheet[[idcol]] + + saveRDS(myobj, file = "Myobject_with_names.rds") + + # subsetting if declared + + if(isTRUE(do.subset)){ + my.win = GRanges(seqnames = chr.subset, ranges = IRanges(start = start.subset, end = end.subset)) + myobj <- selectByOverlap(myobj,my.win) + } + + + saveRDS(myobj, file = "Myobject_with_names_aftersubset.rds") + + # filtering on minimum coverage + myobj <- filterByCoverage(methylObj = myobj, lo.count=lowcount, lo.perc = lowperc) + + # Normalization + myobj <- normalizeCoverage(obj = myobj) + + # saveRDS(object = myobj, file = "Initial_object.rds") + + # Calculate basic stats and PCs + + metricsfolder <- paste0(qcfolder, "Metrics/") + dir.create(path = metricsfolder, showWarnings = F) + + for (id in ssheet[[idcol]]) { + + png(filename = paste0(metricsfolder,proj.id,"_CpG_pct_methylation_sample_", id, ".png"), + width = 9, height = 6, units = "in", res = 96) + print(getMethylationStats(myobj[[id]],plot=TRUE,both.strands=FALSE)) + dev.off() + png(filename = paste0(metricsfolder, proj.id,"_Coverage_stats_sample_", id, ".png"), + width = 9, height = 6, units = "in", res = 96) + print(getCoverageStats(myobj[[id]],plot=TRUE,both.strands=FALSE)) + dev.off() + } + + # create meth obj + + #meth <- unite(object = myobj, destrand=FALSE) + meth <- unite(object = myobj, destrand=FALSE) # for debugging + saveRDS(meth, "savemeth.tmp.rds") + # Perform correlation + + sink(paste0(qcfolder, proj.id, "_Correlations.txt")) + getCorrelation(meth,plot=FALSE) + sink() + + if(length(ssheet[[idcol]]) < 15){ + png(filename = paste0(qcfolder,proj.id, "_Correlations_pearson_pairwise.png"), + width = 9, height = 6, units = "in", res = 96) + print(getCorrelation(meth,plot=TRUE)) + dev.off() + } + + png(filename = paste0(qcfolder, proj.id, "_Clustering.png"), + width = 9, height = 6, units = "in", res = 96) + clusterSamples(meth, dist="euclidean", plot=TRUE, method = "ward.D2") + dev.off() + + # Re-plotting PCs (custom chart) + + # compute PCs and store in object + + pca_compt <- PCASamples(meth, obj.return = T, screeplot = F) + + # extract PCs components + + pcafolder <- paste0(qcfolder, "PCA/") + dir.create(path = pcafolder, showWarnings = F) + + pca_pc1_2 <- as.data.frame(x = pca_compt$x[,1:2]) + + for(myvars in plot.covariates){ + pca_pc1_2$condition <- as.factor(ssheet[[myvars]]) + png(filename = paste0(pcafolder, proj.id, "_PCA_",myvars,".png"), + width = 9, height = 6, units = "in",res=96) + print(ggplot(data = pca_pc1_2, + mapping = aes(x = PC1, y=PC2, col=condition, label=rownames(pca_pc1_2))) + + geom_point(size=3) + geom_text_repel(size=3) + ggtitle(label = "Principal component analysis", subtitle = myvars) + + theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5)) + + theme(plot.subtitle=element_text(size=12, hjust=0.5, face="italic", color="black")) + + theme(axis.title = element_text(size=12, hjust=0.5, face="bold", color="black")) + + theme(legend.text = element_text(size=8, hjust=0.5)) + + theme(legend.title = element_blank()) + + theme(axis.text = element_text(size=12, hjust=0.5, color="black"))) + dev.off() + + } + + # retrieve and store % of methylation + + perc.meth <- percMethylation(meth) + + saveRDS(perc.meth,"pctmethly.rds") + + base::rownames(perc.meth) <- paste0(meth$chr, "_", meth$start) + + # Perform diff methylation + + myDiff=calculateDiffMeth(meth) + + write.table(myDiff,paste0(outfolder,proj.id,"_DiffMeth_single_CpG.txt"), row.names = F) + difftest <- read.table(paste0(outfolder,proj.id,"_DiffMeth_single_CpG.txt"), header = T) + + difftest$comparison <- proj.id + difftest$qvalue_r <- as.character(cut(x = difftest$qvalue, + breaks = c(-1, 1e-100, 1e-10, 1e-02, 1), + labels = c("***","**","*","ns"))) + + myindex <- abs(difftest$meth.diff) < delta.meth + difftest$meth.diff <- abs(difftest$meth.diff) + difftest$qvalue_r[myindex] <- "ns" + write.table(difftest,paste0(outfolder,proj.id,"_DiffMeth_single_CpG.txt"), row.names = F) + + + # Adding color list and annotations + + library(RColorBrewer) + + color_list <- list() + annrows <- NULL + anncols <- subset.data.frame(difftest, select = c("qvalue_r","meth.diff")) + base::rownames(anncols) <- difftest$start + + rows.annot.vars.cat = plot.categorical.vars + rows.annot.vars.con = plot.continuous.vars + + # if(!is.null(rows.annot.vars.cat) | !is.null(rows.annot.vars.con)){ + # rownames(ssheet) <- ssheet$SampleID + # annrows <- subset.data.frame(x = ssheet, select = c(rows.annot.vars.cat, rows.annot.vars.con)) + # concolors <- RColorBrewer::brewer.pal(n = 9, name = "Set1") + # catcolors <- NULL + # for (varcon in 1:length(rows.annot.vars.con)) { + # color_list[[rows.annot.vars.con[varcon]]] <- colorRampPalette(c("lightgrey", concolors[varcon]))(10) + # } + # for (varcat in 1:length(rows.annot.vars.cat)) { + # ncolor <- 1 + # for (val in unique(ssheet[[rows.annot.vars.cat[varcat]]])[order(unique(ssheet[[rows.annot.vars.cat[varcat]]]))]){ + # if(length(unique(ssheet[[rows.annot.vars.cat[varcat]]])) > 9){ + # catcolors <- colorRampPalette(RColorBrewer::brewer.pal(n = 9, name = "Set3"))(length(unique(ssheet[[rows.annot.vars.cat[varcat]]]))) + # } + # else{ + # catcolors <- RColorBrewer::brewer.pal(n = 9, name = "Set3") + # } + # color_list[[rows.annot.vars.cat[varcat]]][[val]] <- catcolors[ncolor] + # ncolor <- ncolor + 1 + # } + # } + # } + + color_list[["qvalue_r"]] = c("*" = "#6497b1", "**" = "#03396c", "***" = "#011f4b", ns= "#c4cacf") + color_list[["meth.diff"]] = colorRampPalette(brewer.pal(n = 11, "Reds"))(100) + color_list[["miR-126"]] <- brewer.pal(n = 9, "PuRd") + + # Heatmap + + pctmeth_matrix <- t(perc.meth) + base::colnames(pctmeth_matrix) <- gsub(x = base::colnames(pctmeth_matrix), pattern = "chr[0-9]+_", replacement = "") + + pheatmap(mat = pctmeth_matrix, main = gsub(x = proj.id, pattern = "_", replacement = " "), + filename = paste0(outfolder, proj.id,"_CpG_percent_methylation_matrix_pheatmap.pdf"), width = plot.width, height = plot.height, + na_col = "black", + cluster_cols = FALSE, + cluster_rows = TRUE, + annotation_row = annrows, + cellwidth = cellw, + cellheight = cellh, + display_numbers = T, + fontsize = fontsize, + fontsize_number = fontnumsize, + number_format = "%.0f", + #border_color = "#CBBEB5", + annotation_col = anncols, + #labels_row = lrow, + #labels_col = lcol, + #gaps_row = gaps.row, + gaps_col = c(8), + annotation_colors = color_list, + color = c("#F5F5F5","#EEEEEE","#CCCCCC","#999999", "#666666","#333333","#000000"), breaks = c(0,10,20,30,50,70,90,100) + ) + + saveRDS(object = pctmeth_matrix, paste0(outfolder, proj.id,"_CpG_percent_methylation_matrix_pheatmap.rds")) + saveRDS(object = list(anncol = anncols, annrow = annrows, anncolors = color_list), paste0(outfolder, proj.id,"_annotations_matrix_pheatmap.rds")) + saveRDS(object = difftest, paste0(outfolder, proj.id,"_CpG_differential_methylation.rds")) + saveRDS(object = perc.meth, paste0(outfolder, proj.id,"_CpG_percent_methylation.rds")) + + write.table(x = perc.meth, file = paste0(outfolder, proj.id,"_CpG_percent_methylation.txt")) + write.table(x = difftest, file = paste0(outfolder, proj.id,"_CpG_differential_methylation.txt")) + + saveRDS(myobj, file = paste0(outfolder,proj.id,"_methylkit.rds")) + saveRDS(myobj, file = paste0(outfolder,proj.id,"_methylkit_meth.rds")) + +} \ No newline at end of file -- GitLab