Upload New File

90f1cd57 · Stefano Beretta · 911e890d · 90f1cd57
Commit 90f1cd57 authored Jul 31, 2023 by Stefano Beretta
--- a/scripts/WESstd_plot_results.R
+++ b/scripts/WESstd_plot_results.R
+library(dplyr)
+library(ggplot2)
+library(RColorBrewer)
+library(CMplot)
+library(gtable)
+library(circlize)
+library(grid)
+library(gridExtra)
+library(stringr)
+library(e1071)
+library(openxlsx)
+library(VennDiagram)
+library(scales)
+library(ggVennDiagram)
+library(reshape2)
+
+#########################
+### Utility Functions ###
+#########################
+# A helper function to define a region on the layout
+define_region <- function(row, col){
+  viewport(layout.pos.row = row, layout.pos.col = col)
+}
+
+# Function to define colors for SNVs
+snv_colors2 <- function(del = FALSE, alpha = 1) {
+  nuc <- c("A", "C", "G", "T")
+  # Point deletions
+  dels <- alpha(brewer.pal(5, "Greys")[2:5], 1)
+  names(dels) <- paste0(nuc, "*")
+  # Transitions
+  ts <- alpha(brewer.pal(5, "Blues")[2:5], 1)
+  names(ts) <- c("CT", "GA", "AG", "TC")
+  # Tansversions
+  tv1 <- alpha(brewer.pal(5, "YlGn")[2:5], 1)
+  names(tv1) <- c("AC", "TG", "AT", "TA")
+  tv2 <- alpha(brewer.pal(5, "OrRd")[2:5], 1)
+  names(tv2) <- c("CA", "GT", "CG", "GC")
+  var_cols <- c(ts, tv1, tv2)
+  if (del) {
+    var_cols <- c(var_cols, dels)
+  }
+  return(var_cols)
+}
+
+# Circos + Legend
+legend_plot = function(t, sample) {
+  t <- mutate(t, TYPE = case_when(nchar(REF) == 1 & nchar(ALT) == 1 & ALT != "*" ~ "Substitution",
+                                  nchar(REF) == 1 & nchar(ALT) == 1 & ALT == "*" ~ "Deletion",
+                                  nchar(REF) > nchar(ALT) & nchar(REF) - nchar(ALT) >= 1 ~ "Deletion",
+                                  nchar(REF) < nchar(ALT) & nchar(ALT) - nchar(REF) >= 1 ~ "Insertion",
+                                  TRUE ~ "Other"))
+  # Substitutions
+  t.subs <- t %>% filter(TYPE == "Substitution") %>% tbl_df()
+  t.subs$SUB <- paste0(t.subs$REF, t.subs$ALT)
+  t.subs$SUB <- factor(t.subs$SUB, levels = sort(unique(t.subs$SUB)))
+  cols_names <- levels(t.subs$SUB)
+  cols <- snv_colors2(del = F)
+  t.subs <- merge(t.subs, data.frame(cols, SUB = names(cols)), all.x = TRUE)
+  tt.subs <- t.subs %>% filter(Sample == sample) %>% tbl_df()
+  # Deletions / Insertions
+  t.other <- t %>% filter(TYPE != "Substitution") %>% tbl_df()
+  tt.other <- t.other %>% filter(Sample == sample) %>% tbl_df()
+  if(nrow(tt.other) > 0) {
+    tt.other$AF <- 1
+  }
+  tt.other$col <- ifelse(tt.other$TYPE == "Deletion", "Red", "Blue")
+  # Circos
+  par(mar=rep(0,4))
+  chr.names <- paste0("chr", c(1:22, "X", "Y"))
+  circos.par("start.degree" = 90)
+  circos.initializeWithIdeogram(species = "hg38",
+                                chromosome.index = chr.names,
+                                plotType = c())
+  
+  # Legends (center)
+  legend(-0.5, 0.85, lty = 1, lwd = 2, seg.len = 1, col = c("Red", "Blue"),
+         legend = c("Deletion", "Insertion"), bty = 'n',
+         #xjust = 0,
+         x.intersp = 0.4,
+         y.intersp = 1,
+         inset = c(0.05, 0),
+         title.adj = 0.1,
+         title = "Small Indels")
+  legend(-0.5, 0.45, pch = 16, col = cols,
+         legend = names(cols), pt.cex = 1.2, bty = 'n',
+         ncol = 3,
+         #xjust = 0,
+         x.intersp = 0.4,
+         y.intersp = 1,
+         inset = c(0.05, 0),
+         border = "NA",
+         title.adj = 0.1,
+         title = "Substitution")
+  if ("snpEff_Impact" %in% colnames(tt.subs)) {
+    legend(-0.5, -0.2, pch = c(10, 13), col = "red",
+           legend = c("Relevant Impact", "Target Gene"), pt.cex = 1.8, bty = 'n',
+           ncol = 1,
+           #xjust = 0,
+           x.intersp = 0.6,
+           y.intersp = 1,
+           inset = c(0.2, 0),
+           border = "NA",
+           title.adj = 0,
+           title = "Highlight Variants")
+  }
+  circos.clear()
+}
+
+plot_circos <- function(t, sample) {
+  # Substitutions
+  t <- mutate(t, TYPE = case_when(nchar(REF) == 1 & nchar(ALT) == 1 & ALT != "*" ~ "Substitution",
+                                  nchar(REF) == 1 & nchar(ALT) == 1 & ALT == "*" ~ "Deletion",
+                                  nchar(REF) > nchar(ALT) & nchar(REF) - nchar(ALT) >= 1 ~ "Deletion",
+                                  nchar(REF) < nchar(ALT) & nchar(ALT) - nchar(REF) >= 1 ~ "Insertion",
+                                  TRUE ~ "Other"))
+  t.subs <- t %>% filter(TYPE == "Substitution") %>% tbl_df()
+  t.subs$SUB <- paste0(t.subs$REF, t.subs$ALT)
+  t.subs$SUB <- factor(t.subs$SUB, levels = sort(unique(t.subs$SUB)))
+  cols_names <- levels(t.subs$SUB)
+  cols <- snv_colors2(del = F, alpha = 0.8)
+  t.subs <- merge(t.subs, data.frame(cols, SUB = names(cols)), all.x = TRUE)
+  tt.subs <- t.subs %>% filter(Sample == sample) %>% tbl_df()
+  # Deletions / Insertions
+  t.other <- t %>% filter(TYPE != "Substitution") %>% tbl_df()
+  tt.other <- t.other %>% filter(Sample == sample) %>% tbl_df()
+  if(nrow(tt.other) > 0) {
+    tt.other$VAF <- 1
+  }
+  tt.other$col <- ifelse(tt.other$TYPE == "Deletion", "Red", "Blue")
+  if ("IMPACT" %in% colnames(tt.other)) {
+    hio <- subset(tt.other, IMPACT %in% c("HIGH", "MODERATE"))
+    if (nrow(hio) > 0) {
+      print(hio)
+    }
+  }
+  # Circos
+  par(mar=rep(0,4))
+  chr.names <- paste0("chr", c(1:22, "X", "Y"))
+  circos.par("start.degree" = 90)
+  circos.initializeWithIdeogram(species = "hg38",
+                                chromosome.index = chr.names,
+                                plotType = c("ideogram", "labels"))
+  
+  lines <- seq(0, 1, 0.2)
+  line_factors <- expand.grid(x = get.all.sector.index(), y = lines)
+  circos.trackPlotRegion(factors = line_factors$x,
+                         y = line_factors$y,
+                         track.height = 0.4,
+                         panel.fun = function(x,y) {
+                           xl <- get.cell.meta.data("xlim")
+                           for(i in lines) {
+                             circos.lines(xl, c(i,i), col = "grey")
+                           }
+                         })
+  for(i in lines) {
+    circos.text(0, i, i, col = "black", sector.index = "chrY", track.index = 3,
+                cex = 0.5, adj = c(0.01, 0.01))
+  }
+  circos.trackPoints(tt.subs$CHROM,
+                     tt.subs$POS,
+                     tt.subs$VAF,
+                     pch = 16,
+                     cex = 1.5,
+                     col = as.character(tt.subs$cols))
+  if ("snpEff_Impact" %in% colnames(tt.subs)) {
+    hi <- subset(tt.subs, snpEff_Impact %in% c("HIGH", "MODERATE"))
+    if (nrow(hi) > 0) {
+      circos.trackPoints(hi$CHROM,
+                         hi$POS,
+                         hi$VAF,
+                         pch = 10,
+                         cex = 1.8,
+                         col = "red",
+                         track.index = 3)
+    }
+    b2m_df <- data.frame("CHROM" = c("chr15"), "POS" = c(44715702), "VAF" = c(0.7))
+    circos.trackPoints(b2m_df$CHROM,
+                       b2m_df$POS,
+                       b2m_df$VAF,
+                       pch = 13,
+                       cex = 2.5,
+                       col = "red",
+                       track.index = 3)
+  }
+  circos.trackPlotRegion(factors = line_factors$x,
+                         ylim = c(0, 1), track.height = 0.1)
+  if(nrow(tt.other) > 0) {
+    circos.trackLines(tt.other$CHROM,
+                      tt.other$POS,
+                      tt.other$VAF,
+                      track.index = 4,
+                      col = as.character(tt.other$col),
+                      lwd = 2,
+                      type = 'h')
+  }
+  text(0,0, sample, cex = 1.5)
+  circos.clear()
+}
+
+# Variant Plots
+plot_variants <- function(full.t, out_dir, plot_prefix, fill_by) {
+  dir.create(path = out_dir, showWarnings = F)
+  tt <- full.t %>%
+    filter(grepl("chr", CHROM)) %>%
+    filter(!CHROM %in% c("chrY", "chrM")) %>%
+    group_by(Sample, !!!syms(fill_by)) %>%
+    summarise(Count = n())
+  write.xlsx(x = list("NumVariants" = tt), file = paste(out_dir, paste0(plot_prefix, "_VariantCounts.xlsx"), sep = "/"))
+  
+  p <- ggplot(tt, aes(x = Sample, y = Count, fill = get(fill_by))) +
+    theme_bw(base_size = 12) +
+    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
+          legend.position = "none") +
+    geom_bar(stat = "identity") +
+    scale_y_continuous(labels = scales::comma, n.breaks = 8) +
+    xlab("") +
+    facet_grid(.~get(fill_by), scales = "free_x", space = "free")
+  ggsave(filename = paste(out_dir, paste0(plot_prefix, "_VariantCounts.pdf"), sep = "/"), plot = p, width = 6, height = 6)
+  
+  tt <- full.t %>%
+    filter(grepl("chr", CHROM)) %>%
+    filter(!CHROM %in% c("chrY", "chrM")) %>%
+    group_by(Sample, !!!syms(fill_by), REF, ALT) %>%
+    summarise(Count = n())
+  tt <- mutate(tt, TYPE = case_when(nchar(REF) == 1 & nchar(ALT) == 1 & ALT != "*" ~ "SNV",
+                                    nchar(REF) == 1 & nchar(ALT) == 1 & ALT == "*" ~ "DEL",
+                                    nchar(REF) > nchar(ALT) & nchar(REF) - nchar(ALT) >= 1 ~ "DEL",
+                                    nchar(REF) < nchar(ALT) & nchar(ALT) - nchar(REF) >= 1 ~ "INS",
+                                    TRUE ~ "Other"))
+  write.xlsx(x = list("VariantClass" = tt), file = paste(out_dir, paste0(plot_prefix, "_VariantClassification.xlsx"), sep = "/"))
+  tt_type <- tt %>%
+    group_by(Sample, !!!syms(fill_by), TYPE) %>%
+    summarise(SumCount = sum(Count)) %>%
+    arrange(desc(SumCount)) %>%
+    group_by(Sample, !!!syms(fill_by)) %>%
+    mutate(CountPerc =  SumCount/sum(SumCount))
+  write.xlsx(x = list("VariantType" = tt_type), file = paste(out_dir, paste0(plot_prefix, "_VariantType.xlsx"), sep = "/"))
+  
+  p <- ggplot(tt_type, aes(x = Sample, y = SumCount, fill = TYPE)) +
+    theme_bw(base_size = 12) +
+    theme(axis.text.x = element_text(angle = 30, hjust = 1),
+          legend.position = "top") +
+    guides(fill = guide_legend(ncol = 6)) +
+    xlab("") +
+    ylab("Count") +
+    scale_fill_brewer(palette = "Set1", name = "") +
+    geom_bar(stat = "identity", position = "stack") +
+    scale_y_continuous(labels = scales::comma, n.breaks = 8) +
+    facet_grid(.~get(fill_by), scales = "free_x", space = "free")
+  ggsave(filename = paste(out_dir, paste0(plot_prefix, "_VariantTypes.pdf"), sep = "/"), plot = p, width = 6, height = 6)
+  
+  p <- ggplot(tt_type, aes(x = Sample, y = CountPerc, fill = TYPE)) +
+    theme_bw(base_size = 12) +
+    theme(axis.text.x = element_text(angle = 30, hjust = 1),
+          legend.position = "top") +
+    guides(fill = guide_legend(ncol = 6)) +
+    xlab("") +
+    ylab("Count") +
+    scale_fill_brewer(palette = "Set1", name = "") +
+    geom_bar(stat = "identity", position = "stack") +
+    scale_y_continuous(labels = scales::percent_format(accuracy = 2), n.breaks = 10) +
+    facet_grid(.~get(fill_by), scales = "free_x", space = "free")
+  ggsave(filename = paste(out_dir, paste0(plot_prefix, "_VariantTypesPerc.pdf"), sep = "/"), plot = p, width = 6, height = 6)
+  
+  tt <- full.t %>%
+    filter(nchar(REF) == 1 & nchar(ALT) == 1 & grepl("chr", CHROM)) %>%
+    filter(!CHROM %in% c("chrY", "chrM")) %>%
+    group_by(Sample, !!!syms(fill_by), REF, ALT) %>%
+    summarise(Count = n()) %>%
+    group_by(Sample, !!!syms(fill_by)) %>%
+    mutate(CountPerc =  Count/sum(Count))
+  tt$Variant <- paste0(tt$REF, tt$ALT)
+  tt$Variant <- factor(tt$Variant, levels = sort(unique(tt$Variant)))
+  write.xlsx(x = list("SNV" = tt), file = paste(out_dir, paste0(plot_prefix, "_SNVcounts.xlsx"), sep = "/"))
+  
+  tt$Variant <- factor(tt$Variant, levels = rev(names(snv_colors2())))
+  p2 <- ggplot(tt, aes(x = Sample, y = Count, fill = Variant)) +
+    theme_bw(base_size = 12) +
+    theme(axis.text.x = element_text(angle = 30, hjust = 1)) +
+    geom_bar(stat = "identity") +
+    scale_fill_manual(values = snv_colors2(del = F)) +
+    xlab("") +
+    scale_y_continuous(labels = scales::comma, n.breaks = 8) +
+    facet_grid(.~get(fill_by), scales = "free_x", space = "free")
+  ggsave(filename = paste(out_dir, paste0(plot_prefix, "_SNVcounts.pdf"), sep = "/"), plot = p2, width = 7, height = 7)
+  
+  p2p <- ggplot(tt, aes(x = Sample, y = CountPerc, fill = Variant)) +
+    theme_bw(base_size = 12) +
+    theme(axis.text.x = element_text(angle = 30, hjust = 1)) +
+    geom_bar(stat = "identity") +
+    scale_fill_manual(values = snv_colors2(del = F)) +
+    scale_y_continuous(labels = scales::percent_format(accuracy = 2), n.breaks = 10) +
+    xlab("") +
+    facet_grid(.~get(fill_by), scales = "free_x", space = "free")
+  ggsave(filename = paste(out_dir, paste0(plot_prefix, "_SNVcountsPerc.pdf"), sep = "/"), plot = p2p, width = 7, height = 7)
+}
+
+
+###############
+### General ###
+###############
+wdir <- "Fiumara_BasePrimeEd2022_WES/WESstd"
+dp_sing <- "50"
+
+out_dir <- paste(wdir, "results", sep = "/")
+dir.create(out_dir, showWarnings = F)
+
+# Samples
+samples <- c("227-BM-A0-5","227-BM-A0-6","227-BM-A2-5","227-BM-A2-6","227-BM-A3-5","227-BM-A3-6",
+             "227-BM-B0-5","227-BM-B1-5","227-BM-B3-5","227-BM-B4-5",
+             "227-BM-C0-5","227-BM-C1-5","227-BM-D0-6","227-BM-D1-6","227-BM-D3-6",
+             "227-vitro-A","227-vitro-B","227-vitro-C","227-vitro-D")
+
+################
+### Variants ###
+################
+full.t <- data.frame()
+for (ss in samples) {
+  print(ss)
+  t <- read.table(gzfile(paste(wdir, "data", paste0(ss, "-GQ80_DP", dp_sing, "_PASS.ANNOT.vcf.gz"), sep = "/")), comment.char = "#")
+  colnames(t) <- c("CHROM","POS","ID","REF","ALT","QUAL","FILTER", "ANNOT", "FORMAT", "SAMPLE")
+  t$Vars <- paste(t$CHROM, t$POS, t$REF, t$ALT, sep = "-")
+  t$Sample <- ss
+  t$Treatment <- substr(as.character(str_split_fixed(ss, "-", 3)[,3]), 1, 1)
+  t$Group <- gsub("BM", "Sample", as.character(str_split_fixed(ss, "-", 3)[,2]))
+  t$Vars <- paste(t$CHROM, t$POS, t$REF, t$ALT, sep = "-")
+  t$DP <- as.numeric(str_split_fixed(t$SAMPLE, ":", 5)[,3])
+  t$AD <- as.numeric(str_split_fixed(str_split_fixed(t$SAMPLE, ":", 5)[,2], ",", 2)[,1])
+  t$GQ <- as.numeric(str_split_fixed(t$SAMPLE, ":", 5)[,4])
+  t$VAF <- (t$DP - t$AD) / t$DP
+  full.t <- rbind(full.t, t)
+}
+
+chr <- c(paste0("chr", seq(1,22)), "chrX")
+full.t <- subset(full.t, CHROM %in% chr)
+full.t$CHROM <- factor(full.t$CHROM, levels = chr)
+full.t$Sample <- factor(full.t$Sample, levels = unique(full.t$Sample))
+
+write.table(x = full.t, file = paste(out_dir, "Full_res_PASS.tsv", sep = "/"),
+            sep = "\t", quote = F, row.names = F, col.names = T)
+
+# Vitro
+vitros <- list()
+for (vv in c("227-vitro-A","227-vitro-B","227-vitro-C","227-vitro-D")) {
+  print(vv)
+  t <- subset(full.t, Sample == vv)
+  vitros[[vv]] <- t$Vars
+}
+
+venn <- Venn(vitros)
+data <- process_data(venn)
+vreg <- venn_region(data)
+vreg$NumInters <- as.character(str_count(venn_region(data)$name, "[..]"))
+cc <- brewer.pal(name = "Accent", n = 8)
+cc_cat <- cc[1:length(unique(vreg$NumInters))]
+names(cc_cat) <- unique(vreg$NumInters)
+vp <- ggplot() +
+  # change mapping of color filling
+  geom_sf(aes(fill = NumInters), data = vreg, show.legend = FALSE) +
+  scale_fill_manual(values = cc_cat) +
+  # adjust edge size and color
+  geom_sf(color = "black", size = 1, data = venn_setedge(data), show.legend = FALSE) +
+  # show set label in bold
+  geom_sf_text(aes(label = name), fontface = "bold", data = venn_setlabel(data), size = 8, nudge_x = c(0.05, 0.06, -0.06, -0.06)) +
+  # add a alternative region name
+  geom_sf_label(aes(label = paste0(count, "\n", "(", round(count/sum(count)*100, 1), "%)")), data = vreg, alpha = 0.5, size = 5) +
+  theme_void()
+ggsave(filename = paste(out_dir, paste0("Vitro_DP", dp_sing, "_intersection_Venn.pdf"), sep = "/"),
+       plot = vp, width = 9, height = 7)
+
+
+# Remove Multivars
+vitros <- list()
+for (vv in c("227-vitro-A","227-vitro-B","227-vitro-C","227-vitro-D")) {
+  print(vv)
+  t <- subset(full.t, Sample == vv)
+  t <- t[grep(pattern = ",", x = t$ALT, invert = T),]
+  vitros[[vv]] <- t$Vars
+}
+
+venn <- Venn(vitros)
+data <- process_data(venn)
+vreg <- venn_region(data)
+vreg$NumInters <- as.character(str_count(venn_region(data)$name, "[..]"))
+cc <- brewer.pal(name = "Accent", n = 8)
+cc_cat <- cc[1:length(unique(vreg$NumInters))]
+names(cc_cat) <- unique(vreg$NumInters)
+vp <- ggplot() +
+  # change mapping of color filling
+  geom_sf(aes(fill = NumInters), data = vreg, show.legend = FALSE) +
+  scale_fill_manual(values = cc_cat) +
+  # adjust edge size and color
+  geom_sf(color = "black", size = 1, data = venn_setedge(data), show.legend = FALSE) +
+  # show set label in bold
+  geom_sf_text(aes(label = name), fontface = "bold", data = venn_setlabel(data), size = 8, nudge_x = c(0.05, 0.06, -0.06, -0.06)) +
+  # add a alternative region name
+  geom_sf_label(aes(label = paste0(count, "\n", "(", round(count/sum(count)*100, 1), "%)")), data = vreg, alpha = 0.5, size = 5) +
+  theme_void()
+ggsave(filename = paste(out_dir, paste0("Vitro_DP", dp_sing, "NoMulti_intersection_Venn.pdf"), sep = "/"),
+       plot = vp, width = 9, height = 7)
+
+plot_variants(full.t = subset(full.t, Group == "vitro"), out_dir = out_dir, plot_prefix = paste0("Vitro_DP", dp_sing), fill_by = "Treatment")
+
+# Control
+vitro_ctrl <- subset(full.t, Sample == "227-vitro-D")
+vitro_ctrl_vars <- vitro_ctrl$Vars
+
+# Remove Control variants from vitro
+full.t_noctrl <- subset(full.t, !Vars %in% vitro_ctrl_vars)
+full.t_noctrl %>% group_by(Sample) %>% summarise(n())
+plot_variants(full.t = subset(full.t_noctrl, Group == "vitro"), out_dir = out_dir, plot_prefix = paste0("Vitro_DP", dp_sing, "_NoVitroCtrl"), fill_by = "Treatment")
+
+vitros_noctrl <- list()
+for (vv in c("227-vitro-A","227-vitro-B","227-vitro-C")) {
+  print(vv)
+  t <- subset(full.t_noctrl, Sample == vv)
+  vitros_noctrl[[vv]] <- t$Vars
+}
+venn <- Venn(vitros_noctrl)
+data <- process_data(venn)
+vreg <- venn_region(data)
+vreg$NumInters <- as.character(str_count(venn_region(data)$name, "[..]"))
+cc <- brewer.pal(name = "Accent", n = 8)
+cc_cat <- cc[1:length(unique(vreg$NumInters))]
+names(cc_cat) <- unique(vreg$NumInters)
+vp <- ggplot() +
+  # change mapping of color filling
+  geom_sf(aes(fill = NumInters), data = vreg, show.legend = FALSE) +
+  scale_fill_manual(values = cc_cat) +
+  # adjust edge size and color
+  geom_sf(color = "black", size = 1, data = venn_setedge(data), show.legend = FALSE) +
+  # show set label in bold
+  geom_sf_text(aes(label = name), fontface = "bold", data = venn_setlabel(data), size = 8, nudge_y = c(-450, 0, -450), nudge_x = c(100, 0, -100)) +
+  # add a alternative region name
+  geom_sf_label(aes(label = paste0(count, "\n", "(", round(count/sum(count)*100, 1), "%)")), data = vreg, alpha = 0.5, size = 5) +
+  theme_void()
+ggsave(filename = paste(out_dir, paste0("Vitro_DP", dp_sing, "_NoVitroCtrl_intersection_Venn.pdf"), sep = "/"),
+       plot = vp, width = 9, height = 7)
+
+
+# Samples
+plot_variants(full.t = subset(full.t, Group == "Sample"), out_dir = out_dir, plot_prefix = paste0("Sample_DP", dp_sing), fill_by = "Treatment")
+
+plot_variants(full.t = subset(full.t_noctrl, Group == "Sample"), out_dir = out_dir, plot_prefix = paste0("Sample_DP", dp_sing, "_NoVitroCtrl"), fill_by = "Treatment")
+
+full.t_noctrl_nomulti <- full.t_noctrl[grep(pattern = ",", x = full.t_noctrl$ALT, invert = T),]
+
+plot_variants(full.t = subset(full.t_noctrl_nomulti, Group == "Sample"), out_dir = out_dir, plot_prefix = paste0("Sample_DP", dp_sing, "NoMulti_NoVitroCtrl"), fill_by = "Treatment")
+
+plot_variants(full.t = full.t_noctrl, out_dir = out_dir, plot_prefix = paste0("Full_DP", dp_sing, "_NoVitroCtrl"), fill_by = "Treatment")
+
+## Parse snpEff Annotation
+full.t_noctrl_nomulti$snpEff_tmp <- lapply(full.t_noctrl_nomulti$ANNOT, function(x){
+  tmp <- strsplit(x, ";")[[1]]
+  ann_pos <- length(tmp)
+  if (!startsWith(tmp[length(tmp)], "ANN")) {
+    for (i in seq(1,length(tmp))) {
+      if (startsWith(tmp[i], "ANN")) {
+        ann_pos <- i
+      }
+    }
+  }
+  tmp <- tmp[ann_pos]
+})
+full.t_noctrl_nomulti$snpEff_Effect <- sapply(full.t_noctrl_nomulti$snpEff_tmp, function(tmp){
+  strsplit(tmp, "|", fixed = T)[[1]][2]
+}, USE.NAMES = F)
+full.t_noctrl_nomulti$snpEff_Impact <- sapply(full.t_noctrl_nomulti$snpEff_tmp, function(tmp){
+  strsplit(tmp, "|", fixed = T)[[1]][3]
+}, USE.NAMES = F)
+full.t_noctrl_nomulti$snpEff_Gene <- sapply(full.t_noctrl_nomulti$snpEff_tmp, function(tmp){
+  strsplit(tmp, "|", fixed = T)[[1]][4]
+}, USE.NAMES = F)
+full.t_noctrl_nomulti$snpEff_GeneName <- sapply(full.t_noctrl_nomulti$snpEff_tmp, function(tmp){
+  strsplit(tmp, "|", fixed = T)[[1]][5]
+}, USE.NAMES = F)
+full.t_noctrl_nomulti$snpEff_BioType <- sapply(full.t_noctrl_nomulti$snpEff_tmp, function(tmp){
+  strsplit(tmp, "|", fixed = T)[[1]][8]
+}, USE.NAMES = F)
+full.t_noctrl_nomulti$snpEff_tmp <- NULL
+
+write.table(x = full.t_noctrl_nomulti, file = paste(out_dir, "Full_NoMulti_noGerm_res.tsv", sep = "/"),
+            sep = "\t", quote = F, row.names = F, col.names = T)
+
+snpeff_df <- full.t_noctrl_nomulti %>% group_by(Sample, Group, Treatment, snpEff_Effect, snpEff_Impact) %>% summarise(Count = n())
+snpeff_df <- melt(reshape2::dcast(snpeff_df, Sample + Group + Treatment ~ snpEff_Effect + snpEff_Impact, value.var = "Count"), id.vars = c("Sample", "Group", "Treatment"))
+snpeff_df$snpEff_Effect <- sapply(snpeff_df$variable, function(x){
+  tmp <- strsplit(as.character(x), "_")[[1]] # only consider the nearest match
+  tmp <- tmp[1:(length(tmp)-1)]
+  paste(tmp, collapse = "_")
+}, USE.NAMES = F)
+snpeff_df$snpEff_Impact <- sapply(snpeff_df$variable, function(x){
+  tmp <- strsplit(as.character(x), "_")[[1]] # only consider the nearest match
+  tmp <- tmp[length(tmp)]
+}, USE.NAMES = F)
+snpeff_df$variable <- NULL
+snpeff_df$snpEff_Impact <- factor(snpeff_df$snpEff_Impact, levels = c("HIGH", "MODERATE", "LOW", "MODIFIER"))
+mpoint <- (max(snpeff_df$value, na.rm = T) - min(snpeff_df$value, na.rm = T)) / 2
+p <- ggplot(snpeff_df, aes(x = snpEff_Effect, y = Sample, fill = value)) +
+  theme_bw(base_size = 14) +
+  theme(axis.text.x = element_text(angle = 70, hjust = 1)) +
+  geom_tile(aes(height=.98, width=.98)) +
+  geom_text(data = subset(snpeff_df, value < 1000), aes(label = value), color = "white", size = 4) +
+  ylab("") +
+  xlab("") +
+  ggtitle("snpEff Variant Classification") +
+  scale_fill_gradient2(low = "blue", high = "red", mid = "yellow", midpoint = mpoint, na.value = "grey") +
+  facet_grid(Treatment~snpEff_Impact, scales = "free", space = "free")
+ggsave(filename = paste(out_dir, paste0("Full_DP", dp_sing, "NoMulti_NoVitroCtrl_snpEff.pdf"), sep = "/"),
+       plot = p, width = 15, height = 12)
+
+# Samples
+samples.t_noctrl_nomulti <- subset(full.t_noctrl_nomulti, Group == "Sample")
+
+
+############################
+### Cancer-related Genes ###
+############################
+offt_gene <- c("ABCB1","ABCC2","ABL1","ABL2","AKT1","AKT2","AKT3","ALK","ANGPTL7","APC","ASXL1","ATM","ATRX","BCYRN1","BRAF","BRCA1","BRCA2","CBL","CDA",
+               "CDH1","CDKN2A","CDKN2B","CEBPA","CHD7","CHIC2","CREBBP","CRLF2","CSF1R","CTNNB1","CYP19A1","CYP2A13","CYP2A6","CYP2A7","CYP2B6","CYP2B7P",
+							 "CYP2C19","CYP2C9","CYP2D6","CYP2D7","DACH1","DDR1","DDR2","DDX3X","DDX54","DNMT3A","DPYD","DPYD-AS1","EGFR","EGFR-AS1","ERBB2","ERBB3","ERBB4",
+							 "ERG","ESR1","EVI2A","EVI2B","EZH2","FBXW7","FGFR1","FGFR2","FGFR3","FGFR4","FLT1","FLT3","FLT4","FSTL5","GNA11","GNAQ","GNAS","GNAS-AS1",
+							 "GSTP1","GTF2IP1","H3F3A","HNF1A","HRAS","IDH1","IDH2","IKZF1","IL1RAPL1","IL2RA","IL2RB","IL2RG","INPP4B","JAK1","JAK2","JAK3","KDM6A",
+							 "KDR","KIT","KMT2A","KRAS","LAMA2","LCK","LOC100287072","LOC101928052","LPAR6","LTK","MAP2K1","MAP2K2","MAP2K4","MAP3K1","MAPK1","MED13",
+							 "MEIKIN","MET","MGC32805","MGST2","MIR548AZ","MLH1","MPL","MST1R","MTOR","MTOR-AS1","MYC","MYD88","NELL2","NF1","NOTCH1","NPM1","NRAS","OMG",
+							 "PDGFRA","PDGFRB","PHF6","PIK3CA","PIK3R1","PSMB1","PSMB2","PSMB5","PSMD1","PSMD2","PTCH1","PTEN","PTENP1","PTPN11","PVT1","RAF1","RARA",
+							 "RARA-AS1","RARB","RARG","RB1","RET","ROS1","RPS6KB1","RUNDC3B","RUNX1","RXRA","RXRB","RXRG","SDCCAG8","SHH","SHOC2","SLC22A1","SLC22A2",
+							 "SLC31A1","SLC34A2","SLC45A3","SLCO1B1","SMAD4","SMARCA4","SMARCB1","SMO","SNCAIP","SOS1","SPRED1","SRC","STK11","SUFU","TAS2R38","TET2",
+							 "TMEM75","TMPRSS2","TP53","TPX2","TRRAP","TYK2","UGT1A9","UTY","VHL","WT1","YES1")
+offt_vars <- subset(full.t_noctrl_nomulti, snpEff_GeneName %in% offt_gene)
+pdf(paste(out_dir, paste0("Full_DP", dp_sing, "NoMulti_NoVitroCtrl_Circos_Pannello_Genes.pdf"), sep = "/"), width = 12, height = 9)
+par(mfrow = c(3,4))
+for (gg in unique(offt_vars$Treatment)) {
+  offt_vars_gg <- subset(offt_vars, Treatment == gg)
+  print(gg)
+  for (ss in unique(offt_vars_gg$Sample)) {
+    plot_circos(t = offt_vars, sample = ss)
+  }
+}
+legend_plot(t = offt_vars, sample = ss)
+dev.off()
+write.xlsx(x = list("Pannello" = offt_vars), file = paste(out_dir, paste0("Full_DP", dp_sing, "NoMulti_NoVitroCtrl_Circos_Pannello_Genes.xlsx"), sep = "/"))
+
+offt_vars <- subset(samples.t_noctrl_nomulti, snpEff_GeneName %in% offt_gene)
+pdf(paste(out_dir, paste0("Sample_DP", dp_sing, "NoMulti_NoVitroCtrl_Circos_Pannello_Genes.pdf"), sep = "/"), width = 12, height = 9)
+par(mfrow = c(3,4))
+for (gg in unique(offt_vars$Treatment)) {
+  offt_vars_gg <- subset(offt_vars, Treatment == gg)
+  print(gg)
+  for (ss in unique(offt_vars_gg$Sample)) {
+    plot_circos(t = offt_vars, sample = ss)
+  }
+}
+legend_plot(t = offt_vars, sample = ss)
+dev.off()
+write.xlsx(x = list("Pannello" = offt_vars), file = paste(out_dir, paste0("Sample_DP", dp_sing, "NoMulti_NoVitroCtrl_Circos_Pannello_Genes.xlsx"), sep = "/"))
+
+
+offt_vars <- subset(samples.t_noctrl_nomulti, snpEff_GeneName %in% offt_gene & snpEff_Impact %in% c("HIGH", "MODERATE","MODIFIER"))
+pdf(paste(out_dir, "Groups_Circos_Pannello_Genes_HighModMod.pdf", sep = "/"), width = 12, height = 9)
+par(mfrow = c(2,3))
+for (gg in unique(offt_vars$Treatment)) {
+  offt_vars_gg <- subset(offt_vars, Treatment == gg)
+  offt_vars_gg$Sample <- gg
+  plot_circos(t = offt_vars_gg, sample = gg)
+}
+legend_plot(t = offt_vars, sample = gg)
+dev.off()
+write.xlsx(x = list("Pannello" = offt_vars), file = paste(out_dir, "Groups_Circos_Pannello_Genes_HighModMod.xlsx", sep = "/"))
+
+# Merge samples 5 and 6 of group A
+tt_A <- data.frame()
+for (contr in c("227-BM-A0", "227-BM-A2", "227-BM-A3")) {
+  tt5_name <- paste0(contr, "-5")
+  tt5 <- subset(full.t, Sample == tt5_name)
+  tt6_name <- paste0(contr, "-6")
+  tt6 <- subset(full.t, Sample == tt6_name)
+  vl <- list(tt5$Vars, tt6$Vars)
+  names(vl) <- c(tt5_name, tt6_name)
+  venn <- Venn(vl)
+  data <- process_data(venn)
+  vreg <- venn_region(data)
+  vreg$NumInters <- as.character(str_count(venn_region(data)$name, "[..]"))
+  cc <- brewer.pal(name = "Accent", n = 8)
+  cc_cat <- cc[1:length(unique(vreg$NumInters))]
+  names(cc_cat) <- unique(vreg$NumInters)
+  vp <- ggplot() +
+    # change mapping of color filling
+    geom_sf(aes(fill = NumInters), data = vreg, show.legend = FALSE) +
+    scale_fill_manual(values = cc_cat) +
+    # adjust edge size and color
+    geom_sf(color = "black", size = 1, data = venn_setedge(data), show.legend = FALSE) +
+    # show set label in bold
+    geom_sf_text(aes(label = name), fontface = "bold", data = venn_setlabel(data), size = 6) +
+    # add a alternative region name
+    geom_sf_label(aes(label = paste0(count, "\n", "(", round(count/sum(count)*100, 1), "%)")), data = vreg, alpha = 0.5, size = 5) +
+    theme_void()
+  ggsave(filename = paste(out_dir, paste0(contr, "_Samples_DP", dp_sing, "_intersection_Venn.pdf"), sep = "/"),
+         plot = vp, width = 5, height = 3)
+  
+  tt5_only <- subset(tt5, Vars %in% setdiff(tt5$Vars, tt6$Vars))
+  tt5_only$Sample <- contr
+  tt6_only <- subset(tt6, Vars %in% setdiff(tt6$Vars, tt5$Vars))
+  tt6_only$Sample <- contr
+  tt5_common <- subset(tt5, Vars %in% intersect(tt5$Vars, tt6$Vars))
+  tt6_common <- subset(tt6, Vars %in% intersect(tt5$Vars, tt6$Vars))
+  tt_common <- merge(tt5_common, tt6_common, by = c("CHROM","POS","ID","REF","ALT","FILTER","Group","Treatment","Vars"))
+  tt_common$Sample.x <- NULL
+  tt_common$Sample.y <- NULL
+  tt_common$Sample <- contr
+  tt_common$VAF <- rowMeans(tt_common[, c("VAF.x", "VAF.y")])
+  tt_common$VAF.x <- NULL
+  tt_common$VAF.y <- NULL
+  tt_common$DP <- rowMeans(tt_common[, c("DP.x", "DP.y")])
+  tt_common$DP.x <- NULL
+  tt_common$DP.y <- NULL
+  tt_common$GQ <- rowMeans(tt_common[, c("GQ.x", "GQ.y")])
+  tt_common$GQ.x <- NULL
+  tt_common$GQ.y <- NULL
+  tt_common$QUAL <- rowMeans(tt_common[, c("QUAL.x", "QUAL.y")])
+  tt_common$QUAL.x <- NULL
+  tt_common$QUAL.y <- NULL
+  tt_common$ANNOT <- tt_common$ANNOT.x
+  tt_common$ANNOT.x <- NULL
+  tt_common$ANNOT.y <- NULL
+  tt_common$FORMAT <- tt_common$FORMAT.x
+  tt_common$FORMAT.x <- NULL
+  tt_common$FORMAT.y <- NULL
+  tt_common$SAMPLE <- tt_common$SAMPLE.x
+  tt_common$SAMPLE.x <- NULL
+  tt_common$SAMPLE.y <- NULL
+  tt_common <- tt_common[,colnames(tt5_only)]
+  tt_contr <- rbind(tt_common, tt5_only, tt6_only)
+  if (nrow(tt_A) == 0) {
+    tt_A <- tt_contr
+  } else {
+    tt_A <- rbind(tt_A, tt_contr)
+  }
+}
+
+table(full.t$Sample)
+full.t.unionA <- subset(full.t, !Sample %in% c(paste0(c("227-BM-A0", "227-BM-A2", "227-BM-A3"), "-5"), paste0(c("227-BM-A0", "227-BM-A2", "227-BM-A3"), "-6")))
+full.t.unionA <- rbind(full.t.unionA, tt_A)
+plot_variants(full.t = subset(full.t.unionA, Group == "Sample"), out_dir = out_dir, plot_prefix = paste0("SampleUnionA_DP", dp_sing), fill_by = "Treatment")
+
+full.t.unionA_noctrl <- subset(full.t.unionA, !Vars %in% vitro_ctrl_vars)
+plot_variants(full.t = subset(full.t.unionA_noctrl, Group == "Sample"), out_dir = out_dir, 
+              plot_prefix = paste0("SampleUnionA_DP", dp_sing, "_NoVitroCtrl"), fill_by = "Treatment")
+
+full.t.unionA_noctrl_nomulti <- full.t.unionA_noctrl[grep(pattern = ",", x = full.t.unionA_noctrl$ALT, invert = T),]
+
+plot_variants(full.t = subset(full.t.unionA_noctrl_nomulti, Group == "Sample"), out_dir = out_dir,
+              plot_prefix = paste0("SampleUnionA_DP", dp_sing, "NoMulti_NoVitroCtrl"), fill_by = "Treatment")