# Script for analyzing AF library(openxlsx) source(file = "Filter_variants_v3.R") inputdata <- list() for (mysample in c("12_27_1","12_27_2", "10_395_3", "10_395_4")) { inputdata[["rawcall"]][[mysample]] <- filter_variants_v3(infile = paste0(mysample, "_dbsnp_cc_cnc_dbnsfp_refseq_light_raw_filtered.fields.txt"), sampleid = mysample, dp = 50) } source(file = "Compare_variants_AF_v3.R") Comparison_12_27 <- list() Comparison_10_395 <- list() for(i in names(inputdata)){ Comparison_12_27[[i]] <- Compare_variants_AF_v3(dblist = inputdata[[i]], sampleid_1 = "12_27_1", sampleid_2 = "12_27_2", my.width = 8, my.height = 4, my.res = 120, tag = i, outfolder = "12_27/") Comparison_10_395[[i]] <- Compare_variants_AF_v3(dblist = inputdata[[i]], sampleid_1 = "10_395_3", sampleid_2 = "10_395_4", my.width = 8, my.height = 4, my.res = 120, tag = i, outfolder = "10_395/") } # plot AF density in H and L (full list) AF_density_raw <- list() library(cowplot) tmpdf_1_raw <- subset.data.frame(x = Comparison_12_27$rawcall$unionannot, select = c("AF_GFP_H"), subset = AF_GFP_H > 0) tmpdf_2_raw <- subset.data.frame(x = Comparison_12_27$rawcall$unionannot, select = c("AF_GFP_L"), subset = AF_GFP_L > 0) tmpdf_3_raw <- subset.data.frame(x = Comparison_10_395$rawcall$unionannot, select = c("AF_GFP_H"), subset = AF_GFP_H > 0) tmpdf_4_raw <- subset.data.frame(x = Comparison_10_395$rawcall$unionannot, select = c("AF_GFP_L"), subset = AF_GFP_L > 0) AF_density_raw[["12-27_miRNA_L"]] <- ggplot(data = tmpdf_1_raw, mapping = aes(AF_GFP_H)) + ylim(0,600) + geom_histogram(bins = 50, fill = alpha("#003366",0.8)) + xlab("miRNA_low") + theme(axis.text = element_text(size = 14), legend.title = element_blank()) + ggtitle("12-27_miRNA_L") AF_density_raw[["12-27_miRNA_H"]] <- ggplot(data = tmpdf_2_raw, mapping = aes(AF_GFP_L)) + ylim(0,600) + geom_histogram(bins = 50, fill = alpha("#fc2803",0.8))+ xlab("miRNA_high") + theme(axis.text = element_text(size = 14), legend.title = element_blank()) + ggtitle("12-27_miRNA_H") AF_density_raw[["10-395_miRNA_L"]] <- ggplot(data = tmpdf_3_raw, mapping = aes(AF_GFP_H)) + ylim(0,600) + geom_histogram(bins = 50, fill = alpha("#003366",0.8)) + xlab("miRNA_low") + theme(axis.text = element_text(size = 14), legend.title = element_blank()) + ggtitle("10-395_miRNA_L") AF_density_raw[["10-395_miRNA_H"]] <- ggplot(data = tmpdf_4_raw, mapping = aes(AF_GFP_L)) + ylim(0,600) + geom_histogram(bins = 50, fill = alpha("#fc2803",0.8)) + xlab("miRNA_high") + theme(axis.text = element_text(size = 14), legend.title = element_blank()) + ggtitle("10-395_miRNA_H") png(filename = "AF_density_raw_variants.png", width = 8, height = 6, units = "in", res = 300) plot_grid(AF_density_raw[["12-27_miRNA_L"]], AF_density_raw[["12-27_miRNA_H"]], AF_density_raw[["10-395_miRNA_L"]] , AF_density_raw[["10-395_miRNA_H"]], nrow = 2) dev.off() AF_density_mildfilt <- list() Mild_vars_12_27 <- Comparison_12_27$rawcall$unionannot # Recoding filter variable, PASS / no PASS Mild_vars_12_27$FILTER_GFP_H_simple <- ifelse(test = Mild_vars_12_27$FILTER_GFP_H == "PASS", "PASS","NOPASS") Mild_vars_12_27$FILTER_GFP_L_simple <- ifelse(test = Mild_vars_12_27$FILTER_GFP_L == "PASS", "PASS","NOPASS") tmpdf_1_mild <- subset.data.frame(x = Mild_vars_12_27, select = c("AF_GFP_H"), subset = AF_GFP_H > 0 & (FILTER_GFP_H_simple == "PASS" | FILTER_GFP_L_simple == "PASS") & !HGVS_P == "") tmpdf_2_mild <- subset.data.frame(x = Mild_vars_12_27, select = c("AF_GFP_L"), subset = AF_GFP_L > 0 & (FILTER_GFP_H_simple == "PASS" | FILTER_GFP_L_simple == "PASS") & !HGVS_P == "") Mild_vars_10_395 <- Comparison_10_395$rawcall$unionannot Mild_vars_10_395$FILTER_GFP_H_simple <- ifelse(test = Mild_vars_10_395$FILTER_GFP_H == "PASS", "PASS","NOPASS") Mild_vars_10_395$FILTER_GFP_L_simple <- ifelse(test = Mild_vars_10_395$FILTER_GFP_L == "PASS", "PASS","NOPASS") tmpdf_3_mild <- subset.data.frame(x = Mild_vars_10_395, select = c("AF_GFP_H"), subset = AF_GFP_H > 0 & (FILTER_GFP_H_simple == "PASS" | FILTER_GFP_L_simple == "PASS") & !HGVS_P == "") tmpdf_4_mild <- subset.data.frame(x = Mild_vars_10_395, select = c("AF_GFP_L"), subset = AF_GFP_L > 0 & (FILTER_GFP_H_simple == "PASS" | FILTER_GFP_L_simple == "PASS") & !HGVS_P == "") AF_density_mildfilt[["12-27_miRNA_L"]] <- ggplot(data = tmpdf_1_mild, mapping = aes(AF_GFP_H)) + ylim(0,300) + geom_histogram(bins = 50, fill = alpha("#003366",0.8)) + xlab("miRNA_low") + theme(axis.text = element_text(size = 14), legend.title = element_blank()) + ggtitle("12-27_miRNA_L") AF_density_mildfilt[["12-27_miRNA_H"]] <- ggplot(data = tmpdf_2_mild, mapping = aes(AF_GFP_L)) + ylim(0,300) + geom_histogram(bins = 50, fill = alpha("#fc2803",0.8))+ xlab("miRNA_high") + theme(axis.text = element_text(size = 14), legend.title = element_blank()) + ggtitle("12-27_miRNA_H") AF_density_mildfilt[["10-395_miRNA_L"]] <- ggplot(data = tmpdf_3_mild, mapping = aes(AF_GFP_H)) + ylim(0,300) + geom_histogram(bins = 50, fill = alpha("#003366",0.8)) + xlab("miRNA_low") + theme(axis.text = element_text(size = 14), legend.title = element_blank()) + ggtitle("10-395_miRNA_L") AF_density_mildfilt[["10-395_miRNA_H"]] <- ggplot(data = tmpdf_4_mild, mapping = aes(AF_GFP_L)) + ylim(0,300) + geom_histogram(bins = 50, fill = alpha("#fc2803",0.8)) + xlab("miRNA_high") + theme(axis.text = element_text(size = 14), legend.title = element_blank()) + ggtitle("10-395_miRNA_H") png(filename = "AF_density_mildfilt_variants.png", width = 8, height = 6, units = "in", res = 300) plot_grid(AF_density_mildfilt[["12-27_miRNA_L"]], AF_density_mildfilt[["12-27_miRNA_H"]], AF_density_mildfilt[["10-395_miRNA_L"]] , AF_density_mildfilt[["10-395_miRNA_H"]], nrow = 2) dev.off() # removing not exonic by using & !HGVS_P == "" mild_12_27 <- subset.data.frame(x = Mild_vars_12_27, subset = (FILTER_GFP_H_simple == "PASS" | FILTER_GFP_L_simple == "PASS") & !HGVS_P == "") mild_12_27$FILTER <- paste0(mild_12_27$FILTER_GFP_H_simple, "_", mild_12_27$FILTER_GFP_L_simple) png(filename = "12_27_variants_PASS_H_or_L_scatterplot_mild.png", width = 12, height = 9, units = "in", res = 900) ggplot(data = mild_12_27, mapping = aes(x = AF_GFP_H, y = AF_GFP_L, col=FILTER)) + geom_point() + theme(legend.text = element_text(size = 7), legend.title = element_blank(), legend.position = "top", legend.key.height = unit(1, units = "mm"), legend.key.width = unit(1, units = "mm"), plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5), legend.justification = 0.5) + ggtitle("12-27 variants", subtitle = "PASS in at least one the 2 populations") dev.off() mild_10_395 <- subset.data.frame(x = Mild_vars_10_395, subset = (FILTER_GFP_H_simple == "PASS" | FILTER_GFP_L_simple == "PASS") & !HGVS_P == "") mild_10_395$FILTER <- paste0(mild_10_395$FILTER_GFP_H_simple, "_", mild_10_395$FILTER_GFP_L_simple) png(filename = "10_395_variants_PASS_H_or_L_scatterplot_mild.png", width = 12, height = 9, units = "in", res = 900) ggplot(data = mild_10_395, mapping = aes(x = AF_GFP_H, y = AF_GFP_L, col=FILTER)) + geom_point() + theme(legend.text = element_text(size = 7), legend.title = element_blank(), legend.position = "top", legend.key.height = unit(1, units = "mm"), legend.key.width = unit(1, units = "mm"), plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5), legend.justification = 0.5) + ggtitle("10_395 variants", subtitle = "PASS in at least one the 2 populations") dev.off() # Variants NA in miRNAlow 12-27 and PASS in miRNA-H and viceversa library(openxlsx) write.xlsx(x = mild_12_27[is.na(mild_12_27$FILTER_GFP_H) & mild_12_27$FILTER_GFP_L == "PASS",], file = "mild_12_27_PASS_GFP_L_NA_GFP_H.xlsx", asTable = T) write.xlsx(x = mild_12_27[is.na(mild_12_27$FILTER_GFP_L) & mild_12_27$FILTER_GFP_H == "PASS",], file = "mild_12_27_PASS_GFP_H_NA_GFP_L.xlsx", asTable = T) write.xlsx(x = mild_10_395[is.na(mild_10_395$FILTER_GFP_H) & mild_10_395$FILTER_GFP_L == "PASS",], file = "mild_10_395_PASS_GFP_L_NA_GFP_H.xlsx", asTable = T) write.xlsx(x = mild_10_395[is.na(mild_10_395$FILTER_GFP_L) & mild_10_395$FILTER_GFP_H == "PASS",], file = "mild_10_395_PASS_GFP_H_NA_GFP_L.xlsx", asTable = T) # saving also only annotated (COSMIC or dbSNP) mild_12_27_pass_GFP_L_NA_GFP_H <- subset.data.frame(x = mild_12_27, subset = !ID == "" & is.na(FILTER_GFP_H) & FILTER_GFP_L == "PASS") writeLines(text = mild_12_27_pass_GFP_L_NA_GFP_H$variantkey, con = "mild_12_27_pass_GFP_L_NA_GFP_H.txt") mild_12_27_pass_GFP_H_NA_GFP_L <- subset.data.frame(x = mild_12_27, subset = !ID == "" & is.na(FILTER_GFP_L) & FILTER_GFP_H == "PASS") writeLines(text = mild_12_27_pass_GFP_H_NA_GFP_L$variantkey, con = "mild_12_27_pass_GFP_H_NA_GFP_L.txt") write.xlsx(x = mild_12_27_pass_GFP_L_NA_GFP_H, file = "mild_12_27_PASS_GFP_L_NA_GFP_H_cosmic_dbsnp.xlsx", asTable = T) write.xlsx(x = mild_12_27_pass_GFP_H_NA_GFP_L, file = "mild_12_27_PASS_GFP_H_NA_GFP_L_cosmic_dbsnp.xlsx", asTable = T) mild_10_395_pass_GFP_L_NA_GFP_H <- subset.data.frame(x = mild_10_395, subset = !ID == "" & is.na(FILTER_GFP_H) & FILTER_GFP_L == "PASS") writeLines(text = mild_10_395_pass_GFP_L_NA_GFP_H$variantkey, con = "mild_10_395_pass_GFP_L_NA_GFP_H.txt") mild_10_395_pass_GFP_H_NA_GFP_L <- subset.data.frame(x = mild_10_395, subset = !ID == "" & is.na(FILTER_GFP_L) & FILTER_GFP_H == "PASS") writeLines(text = mild_10_395_pass_GFP_H_NA_GFP_L$variantkey, con = "mild_10_395_pass_GFP_H_NA_GFP_L.txt") write.xlsx(x = mild_10_395_pass_GFP_L_NA_GFP_H, file = "mild_10_395_PASS_GFP_L_NA_GFP_H_cosmic_dbsnp.xlsx", asTable = T) write.xlsx(x = mild_10_395_pass_GFP_H_NA_GFP_L, file = "mild_10_395_PASS_GFP_H_NA_GFP_L_cosmic_dbsnp.xlsx", asTable = T) # plotting CR vs DGN variants # importing variants dgn_vs_cr_vars <- read.xlsx(xlsxFile = "12_27_Somatic_DGN_only_variants.xlsx") mykeys <- dgn_vs_cr_vars$variantkey raw_vars_12_27_1 <- read.table(file = "12_27_1_dbsnp_cc_cnc_dbnsfp_refseq_light_raw_filtered.fields.txt", sep = "\t" ,header = T, stringsAsFactors = F) raw_vars_12_27_2 <- read.table(file = "12_27_2_dbsnp_cc_cnc_dbnsfp_refseq_light_raw_filtered.fields.txt", sep = "\t" ,header = T, stringsAsFactors = F) colnames(raw_vars_12_27_1) <- c("Chromosome","Position","Reference","Alternative","AF","DP","AD.R","AD.A","GT","ID","FILTER","HGVS_P","Gene","Biotype", "Rank","Effect","Impact","Common","G5",colnames(raw_vars_12_27_1)[20:35]) raw_vars_12_27_1$sampleid <- "12_27_1" raw_vars_12_27_1$variantkey <- paste(raw_vars_12_27_1$Chromosome, raw_vars_12_27_1$Position, raw_vars_12_27_1$Reference, raw_vars_12_27_1$Alternative,sep = "_") colnames(raw_vars_12_27_2) <- c("Chromosome","Position","Reference","Alternative","AF","DP","AD.R","AD.A","GT","ID","FILTER","HGVS_P","Gene","Biotype", "Rank","Effect","Impact","Common","G5",colnames(raw_vars_12_27_2)[20:35]) raw_vars_12_27_2$sampleid <- "12_27_2" raw_vars_12_27_2$variantkey <- paste(raw_vars_12_27_2$Chromosome, raw_vars_12_27_2$Position, raw_vars_12_27_2$Reference, raw_vars_12_27_2$Alternative,sep = "_") # subsetting dfs df_1 <- subset.data.frame(x = raw_vars_12_27_1, subset = variantkey %in% mykeys, select = c("variantkey","AF","DP","Gene","Effect","FILTER")) colnames(df_1) <- c("variantkey","AF_GFP_H","DP_GFP_H","Gene", "Effect","FILTER") df_2 <- subset.data.frame(x = raw_vars_12_27_2, subset = variantkey %in% mykeys, select = c("variantkey","AF","DP","Gene","Effect","FILTER")) colnames(df_2) <- c("variantkey","AF_GFP_L","DP_GFP_L","Gene", "Effect","FILTER") union_df <- merge.data.frame(x = df_1, y=df_2, by = c("variantkey","Gene","Effect"), all = T) union_df$FILTER.x[is.na(union_df$FILTER.x)] <- "NOPASS" union_df$FILTER.y[is.na(union_df$FILTER.y)] <- "NOPASS" union_df$AF_1[is.na(union_df$AF_GFP_H)] <- 0 union_df$AF_2[is.na(union_df$AF_GFP_L)] <- 0 union_df$DP_1[is.na(union_df$DP_GFP_H)] <- 0 union_df$DP_2[is.na(union_df$DP_GFP_L)] <- 0 union_df$FILTER <- ifelse(test = (union_df$FILTER.x == "" | union_df$FILTER.y == ""), yes = paste0(union_df$FILTER.x, "_", union_df$FILTER.y), no = union_df$FILTER.y) union_df$FILTER.x <- NULL union_df$FILTER.y <- NULL # subsetting only variants in exons union_df_exonic <- union_df[union_df$Effect %in% c("frameshift_variant","missense_variant","synonymous_variant","splice_region_variant&intron_variant"),] write.xlsx(x = union_df_exonic, file = "DGN_vs_CR_variants_exonic.xlsx") library(ggrepel) png(filename = paste0("AF_Comparison_12_27_1_vs_12_27_2_DGN_vs_CR_variants.png"), width = 8, height = 6, units = "in", res = 200) print(ggplot(data = union_df_exonic, mapping = aes(x = AF_GFP_H, y = AF_GFP_L, label = Gene)) + xlim(0,1) + ylim(0,1) + geom_point(mapping = aes(col = "black"), size = 2) + #geom_abline(intercept = 0, slope = 1) + xlab(label = "GFP_HIGH") + ylab(label = "GFP_LOW") + ggtitle(label = "12_27 Somatic DGN only variants") + #geom_point(data=union_df[union_df$AF_GFP_H == 0, ], aes(x=AF_GFP_H, y=AF_GFP_L), colour="red",size = 2) + geom_text_repel(size=3, show.legend = FALSE, segment.alpha = 0.4) + theme(plot.title = element_text(hjust = 0.5), legend.title = element_blank(), legend.position = "none")) dev.off()