#import libraries
library(ggVennDiagram)
library(stringr)
library(RColorBrewer)
library(ggplot2)
library(dplyr)
library(openxlsx)
library(gtable)
library(reshape2)
library(grid)
library(gridExtra)
library(CMplot)
library(circlize)
library(e1071)
library(VennDiagram)
library(scales)
library(lme4)
library(ggrepel)
library(readxl)


################################################### Functions #############################################################
snv_colors2 <- function(del = FALSE, alpha = 1) {
  nuc <- c("A", "C", "G", "T")
  # Point deletions
  dels <- alpha(brewer.pal(5, "Greys")[2:5], 1)
  names(dels) <- paste0(nuc, "*")
  # Transitions
  ts <- alpha(brewer.pal(5, "Blues")[2:5], 1)
  names(ts) <- c("CT", "GA", "AG", "TC")
  # Tansversions
  tv1 <- alpha(brewer.pal(5, "YlGn")[2:5], 1)
  names(tv1) <- c("AC", "TG", "AT", "TA")
  tv2 <- alpha(brewer.pal(5, "OrRd")[2:5], 1)
  names(tv2) <- c("CA", "GT", "CG", "GC")
  var_cols <- c(ts, tv1, tv2)
  if (del) {
    var_cols <- c(var_cols, dels)
  }
  return(var_cols)
}


### Circos + Legend ###
legend_plot = function(t, sample) {
  t <- mutate(t, TYPE = case_when(nchar(REF) == 1 & nchar(ALT) == 1 & ALT != "*" ~ "Substitution",
                                  nchar(REF) == 1 & nchar(ALT) == 1 & ALT == "*" ~ "Deletion",
                                  nchar(REF) > nchar(ALT) & nchar(REF) - nchar(ALT) >= 1 ~ "Deletion",
                                  nchar(REF) < nchar(ALT) & nchar(ALT) - nchar(REF) >= 1 ~ "Insertion",
                                  TRUE ~ "Other"))
  # Substitutions
  t.subs <- t %>% filter(TYPE == "Substitution") %>% tbl_df()
  t.subs$SUB <- paste0(t.subs$REF, t.subs$ALT)
  t.subs$SUB <- factor(t.subs$SUB, levels = sort(unique(t.subs$SUB)))
  cols_names <- levels(t.subs$SUB)
  cols <- snv_colors2(del = F)
  t.subs <- merge(t.subs, data.frame(cols, SUB = names(cols)), all.x = TRUE)
  tt.subs <- t.subs %>% filter(Sample == sample) %>% tbl_df()
  # Deletions / Insertions
  t.other <- t %>% filter(TYPE != "Substitution") %>% tbl_df()
  tt.other <- t.other %>% filter(Sample == sample) %>% tbl_df()
  if(nrow(tt.other) > 0) {
    tt.other$AF <- 1
  }
  tt.other$col <- ifelse(tt.other$TYPE == "Deletion", "Red", "Blue")
  # Circos
  #par(mar = c(1,1,1,1))
  par(mar=rep(0,4), cex = 0.7)
  chr.names <- paste0("chr", c(1:22, "X", "Y"))
  circos.par("start.degree" = 90)
  circos.initializeWithIdeogram(species = "hg38",
                                chromosome.index = chr.names,
                                plotType = c())
  
  # Legends (center)
  legend(-0.5, 0.85, lty = 1, lwd = 2, seg.len = 1, col = c("Red", "Blue"),
         legend = c("Deletion", "Insertion"), bty = 'n',
         #xjust = 0,
         x.intersp = 0.4,
         y.intersp = 1,
         inset = c(0.05, 0),
         title.adj = 0.1,
         title = "Small Indels")
  legend(-0.5, 0.45, pch = 16, col = cols,
         legend = names(cols), pt.cex = 1.2, bty = 'n',
         ncol = 3,
         #xjust = 0,
         x.intersp = 0.4,
         y.intersp = 1,
         inset = c(0.05, 0),
         border = "NA",
         title.adj = 0.1,
         title = "Substitution")
  if ("snpEff_Impact" %in% colnames(tt.subs)) {
    legend(-0.5, -0.2, pch = c(10, 13), col = "red",
           legend = c("Relevant Impact", "Target Gene"), pt.cex = 1.8, bty = 'n',
           ncol = 1,
           #xjust = 0,
           x.intersp = 0.6,
           y.intersp = 1,
           inset = c(0.2, 0),
           border = "NA",
           title.adj = 0,
           title = "Highlight Variants")
  }
  circos.clear()
}

plot_circos <- function(t, sample) {
  # Substitutions
  t <- mutate(t, TYPE = case_when(nchar(REF) == 1 & nchar(ALT) == 1 & ALT != "*" ~ "Substitution",
                                  nchar(REF) == 1 & nchar(ALT) == 1 & ALT == "*" ~ "Deletion",
                                  nchar(REF) > nchar(ALT) & nchar(REF) - nchar(ALT) >= 1 ~ "Deletion",
                                  nchar(REF) < nchar(ALT) & nchar(ALT) - nchar(REF) >= 1 ~ "Insertion",
                                  TRUE ~ "Other"))
  t.subs <- t %>% filter(TYPE == "Substitution") %>% tbl_df()
  t.subs$SUB <- paste0(t.subs$REF, t.subs$ALT)
  t.subs$SUB <- factor(t.subs$SUB, levels = sort(unique(t.subs$SUB)))
  cols_names <- levels(t.subs$SUB)
  cols <- snv_colors2(del = F, alpha = 0.8)
  t.subs <- merge(t.subs, data.frame(cols, SUB = names(cols)), all.x = TRUE)
  tt.subs <- t.subs %>% filter(Sample == sample) %>% tbl_df()
  # Deletions / Insertions
  t.other <- t %>% filter(TYPE != "Substitution") %>% tbl_df()
  tt.other <- t.other %>% filter(Sample == sample) %>% tbl_df()
  if(nrow(tt.other) > 0) {
    tt.other$VAF <- 1
  }
  tt.other$col <- ifelse(tt.other$TYPE == "Deletion", "Red", "Blue")
  if ("IMPACT" %in% colnames(tt.other)) {
    hio <- subset(tt.other, IMPACT %in% c("HIGH"))
    if (nrow(hio) > 0) {
      print(hio)
    }
  }
  # Circos
  par(mar=rep(0,4), cex = 0.9)
  chr.names <- paste0("chr", c(1:22, "X", "Y"))
  circos.par("start.degree" = 90)
  circos.par("track.height" = 0.3)
  circos.initializeWithIdeogram(species = "hg38",
                                chromosome.index = chr.names,
                                plotType = c("ideogram", "labels"))
  
  lines <- seq(0, 1, 0.2)
  line_factors <- expand.grid(x = get.all.sector.index(), y = lines)
  circos.trackPlotRegion(factors = line_factors$x,
                         y = line_factors$y,
                         track.height = 0.4,
                         panel.fun = function(x,y) {
                           xl <- get.cell.meta.data("xlim")
                           for(i in lines) {
                             circos.lines(xl, c(i,i), col = "grey")
                           }
                         })
  for(i in lines) {
    circos.text(0, i, i, col = "black", sector.index = "chrY", track.index = 3,
                cex = 0.5, adj = c(0.01, 0.01))
  }
  circos.trackPoints(tt.subs$CHROM,
                     tt.subs$POS,
                     tt.subs$VAF,
                     pch = 16,
                     cex = 0.9,
                     col = as.character(tt.subs$cols))
  if ("snpEff_Impact" %in% colnames(tt.subs)) {
    hi <- subset(tt.subs, snpEff_Impact %in% c("HIGH"))
    if (nrow(hi) > 0) {
      circos.trackPoints(hi$CHROM,
                         hi$POS,
                         hi$VAF,
                         pch = 10,
                         cex = 1.2,
                         col = "red",
                         track.index = 3)
    }
    aavs_df <- data.frame("CHROM" = c("chr19"), "POS" = c(44715702), "VAF" = c(0.7))
    circos.trackPoints(aavs_df$CHROM,
                       aavs_df$POS,
                       aavs_df$VAF,
                       pch = 13,
                       cex = 1.5,
                       col = "red",
                       track.index = 3)
  }
  circos.trackPlotRegion(factors = line_factors$x,
                         ylim = c(0, 1), track.height = 0.1)
  if(nrow(tt.other) > 0) {
    circos.trackLines(tt.other$CHROM,
                      tt.other$POS,
                      tt.other$VAF,
                      track.index = 4,
                      col = as.character(tt.other$col),
                      lwd = 2,
                      type = 'h')
  }
  text(0,0, sample, cex = 1.5)
  circos.clear()
}

#Define Samples name for analysis
# Samples list to include in analysis
samples <- c("TIP02_A1","TIP02_A3","TIP02_A5","TIP02_AV",
             "TIP02_B10","TIP02_B8","TIP02_B9","TIP02_BV",
             "TIP02_C16","TIP02_C18","TIP02_CV",
             "TIP02_E29","TIP02_E32","TIP02_E33","TIP02_EV", 
             "A2", "A4", "A6", "AV-96h-1", "B12", "B7", "B-Dx", "BV-96h-2",
             "C14", "C15", "C19", "CV-96h-3", "E30", "EV-96h-5")

#Samples order for Circos plot
samples_order_circos <- c("A1","A2","A3","A4","A5","A6",
                          "B7","B8","B9","B10","B-Dx","B12",
                          "C14","C15","C16","C18","C19","",
                          "E29","E30","E32","E33", "", "")


############################################## Analysis #################################################################
#Directory containing ".vcf.gz" files, if vcf file are available, if not skip to line 348-349 and directly read full.t.nomulti.noGerm
wdir <- "VCF_DIR"

#Create dataframe with variants
full.t <- data.frame()
for (ss in samples) {
  orig.name = ss
  t <- read.table(gzfile(paste(wdir, ss, paste0(ss, "-DP500_PASS_sGQ80_sDP50.ANNOT.dbSNP.vcf.gz"), sep = "/")), comment.char = "#", sep = "\t")
  colnames(t) <- c("CHROM","POS","ID","REF","ALT","QUAL","FILTER", "ANNOT", "FORMAT", "SAMPLE")
  if (grepl("_", ss)){
    ss = unlist(strsplit(ss, "_"))[2]
  }
  t$Sample <- ss
  t$Vars <- paste(t$CHROM, t$POS, t$REF, t$ALT, sep = "-")
  t$Group <- substr(ss, 1, 1)
  t$Type <- ifelse( substr(ss, 2, 2) == "V", "Vitro", "Sample")
  t$DP <- as.numeric(str_split_fixed(t$SAMPLE, ":", 5)[,3])
  t$AD <- as.numeric(str_split_fixed(str_split_fixed(t$SAMPLE, ":", 5)[,2], ",", 2)[,1])
  t$VAF <- (t$DP - t$AD) / t$DP
  t$orig.name = orig.name
  
  t$Common = unlist(lapply(t$ANNOT, function(x) grepl("COMMON", x)))
  
  if (grepl("TIP02", orig.name)){
    t$Replica = "A"
  }
  else{
    t$Replica = "B"
  }
  full.t <- rbind(full.t, t)
}

# Filter CHR only
chr <- c(paste0("chr", seq(1,22)), "chrX")
full.t <- subset(full.t, CHROM %in% chr)
full.t$CHROM <- factor(full.t$CHROM, levels = chr)

# Remove Multi
full.t.nomulti <- full.t[grep(",", full.t$ALT, value = F, invert = T), ]

# Select germline variants from "Vitro" samples
vs_counts <- subset(full.t.nomulti, Type == "Vitro") %>% group_by(Vars) %>% summarise(Count = n())
germ_vars <- subset(vs_counts, Count >= 4)

#Remove Germline vraints shared by vitro samples
full.t.nomulti.noGerm <- subset(full.t.nomulti, !(Vars %in% germ_vars$Vars)  )
plot_variants(full.t = full.t.nomulti.noGerm, out_dir = out_dir, plot_prefix = "Full_NoMulti_noGerm", fill_by = "Group")

## Parse snpEff Annotation
full.t.nomulti.noGerm$snpEff_tmp <- lapply(full.t.nomulti.noGerm$ANNOT, function(x){
  tmp <- strsplit(x, ";")[[1]]
  ann_pos <- length(tmp)
  if (!startsWith(tmp[length(tmp)], "ANN")) {
    for (i in seq(1,length(tmp))) {
      if (startsWith(tmp[i], "ANN")) {
        ann_pos <- i
      }
    }
  }
  tmp <- tmp[ann_pos]
})
full.t.nomulti.noGerm$snpEff_Effect <- sapply(full.t.nomulti.noGerm$snpEff_tmp, function(tmp){
  strsplit(tmp, "|", fixed = T)[[1]][2]
}, USE.NAMES = F)
full.t.nomulti.noGerm$snpEff_Impact <- sapply(full.t.nomulti.noGerm$snpEff_tmp, function(tmp){
  strsplit(tmp, "|", fixed = T)[[1]][3]
}, USE.NAMES = F)
full.t.nomulti.noGerm$snpEff_Gene <- sapply(full.t.nomulti.noGerm$snpEff_tmp, function(tmp){
  strsplit(tmp, "|", fixed = T)[[1]][4]
}, USE.NAMES = F)
full.t.nomulti.noGerm$snpEff_GeneName <- sapply(full.t.nomulti.noGerm$snpEff_tmp, function(tmp){
  strsplit(tmp, "|", fixed = T)[[1]][5]
}, USE.NAMES = F)
full.t.nomulti.noGerm$snpEff_BioType <- sapply(full.t.nomulti.noGerm$snpEff_tmp, function(tmp){
  strsplit(tmp, "|", fixed = T)[[1]][8]
}, USE.NAMES = F)
full.t.nomulti.noGerm$snpEff_tmp <- NULL

#Parse SnpEff annotation
snpeff_df <- full.t.nomulti.noGerm %>% group_by(Sample, Group, Type, snpEff_Effect, snpEff_Impact) %>% summarise(Count = n())
snpeff_df <- melt(reshape2::dcast(snpeff_df, Sample + Group + Type ~ snpEff_Effect + snpEff_Impact, value.var = "Count"), id.vars = c("Sample", "Group", "Type"))
snpeff_df$snpEff_Effect <- sapply(snpeff_df$variable, function(x){
  tmp <- strsplit(as.character(x), "_")[[1]] # only consider the nearest match
  tmp <- tmp[1:(length(tmp)-1)]
  paste(tmp, collapse = "_")
}, USE.NAMES = F)
snpeff_df$snpEff_Impact <- sapply(snpeff_df$variable, function(x){
  tmp <- strsplit(as.character(x), "_")[[1]] # only consider the nearest match
  tmp <- tmp[length(tmp)]
}, USE.NAMES = F)
snpeff_df$variable <- NULL
snpeff_df$snpEff_Impact <- factor(snpeff_df$snpEff_Impact, levels = c("HIGH", "MODERATE", "LOW", "MODIFIER"))


#Mantain only "Sample", remove Vitro samples
samples.t.nomulti.noGerm <- subset(full.t.nomulti.noGerm, Type == "Sample")

#Circos plots for Variants in ClonalHema and CancerRelated Genes
#Directory containing gene list .txt files
glist_dir <- "genePanels/"

#Create dataframe for circos
full.circos <- data.frame()
for (fl in c("CancerRelated.txt", "ClonalHema.txt")) {
  offt <- read.table(paste(glist_dir, fl, sep = "/"), header = F, sep = "\t")
  colnames(offt) <- c("gene")
  lname <- gsub(".txt", "", fl)
  print(lname)
  
  offt_res <- list()
  offt_res_circos <- list()
  for (ss in unique(samples.t.nomulti.noGerm$Sample)) {
    samples_filt <- subset(samples.t.nomulti.noGerm, Sample == ss)
    offt_var_cond <- subset(samples_filt, snpEff_GeneName %in% offt$gene)
    if (nrow(offt_var_cond) > 0) {
      full.t.filt <- offt_var_cond %>% filter(grepl("chr", CHROM)) %>% filter(!CHROM %in% c("chrM"))
      full.t.filt <- mutate(full.t.filt, TYPE = case_when(nchar(REF) == 1 & nchar(ALT) == 1 & ALT != "*" ~ "Substitution",
                                                          nchar(REF) == 1 & nchar(ALT) == 1 & ALT == "*" ~ "Deletion",
                                                          nchar(REF) > nchar(ALT) & nchar(REF) - nchar(ALT) >= 1 ~ "Deletion",
                                                          nchar(REF) < nchar(ALT) & nchar(ALT) - nchar(REF) >= 1 ~ "Insertion",
                                                          TRUE ~ "Other"))
      offt_res_circos[[ss]] <- full.t.filt
      offt_res[[ss]] <- offt_var_cond
      full.t.filt$GL <- lname
      full.circos <- rbind(full.circos, full.t.filt)
    }
  }
}


#Or directly Read dataframe for circos plot
full.circos = read.table("Variants/Full_circos.txt", header = T)

glist_dir <- "genePanels/"
#Plot Circos
for (gl in c("CancerRelated", "ClonalHema")) {
  full.circos.gl <- subset(full.circos, GL == gl)
  
  #Per sample
  par(mfrow=c(4,6))
  samples_order_circos
  unique(full.circos.gl$Sample)
  for (ss in samples_order_circos ) {
    if (ss == ""){
      plot(1, type = "n", axes = FALSE, xlab = "", ylab = "")
    }
    else{
      plot_circos(full.circos.gl, ss)
    }
  }
  legend_plot(full.circos.gl, ss)
  
  #per Group
  full.circos.gl$Sample <- full.circos.gl$Group
  par(mfrow=c(2,3))
  for (ss in unique(full.circos.gl$Sample)) {
    if (ss == ""){
      plot(1, type = "n", axes = FALSE, xlab = "", ylab = "")
    }
    else{
    plot_circos(full.circos.gl, ss)
  }
  legend_plot(full.circos.gl, ss)
  dev.off()
  }
}


#Count number of variant per groups, or direclty read the file in line 399-400
full.t.nomulti.noGerm <- mutate(full.t.nomulti.noGerm,
                                TYPE = case_when(nchar(REF) == 1 & nchar(ALT) == 1 & ALT != "*" ~ "SNV",
                                                 nchar(REF) == 1 & nchar(ALT) == 1 & ALT == "*" ~ "MNV",
                                                 nchar(REF) > nchar(ALT) & nchar(REF) - nchar(ALT) >= 1 ~ "DEL",
                                                 nchar(REF) < nchar(ALT) & nchar(ALT) - nchar(REF) >= 1 ~ "INS",
                                                 TRUE ~ "Other"))

full.t.nomulti.noGerm$Mut.type = paste(full.t.nomulti.noGerm$REF,full.t.nomulti.noGerm$ALT, sep = "")
VarCount = data.frame(table(full.t.nomulti.noGerm[full.t.nomulti.noGerm$Type == "Sample", "Sample"]))
VarCount$Group = substring(VarCount$Var1,1,1)

#Or directly read var Counts
VarCount = read.table("Variants/Variant_counts.txt", header = T)

#Read Callable nucleotides table
callable = read.table("callable_nt/callable_nt.txt", header = T)
#Merge with Variant count dataframe
VarCount = merge(VarCount, callable, by = "Var1")

#Create Sample to Group mapping
replicate = data.frame(t(data.frame(list("A1" = "A",
                                         "A2" = "B",
                                         "A3" = "A",
                                         "A4" = "B",
                                         "A5" = "A",
                                         "A6" = "B", 
                                         "B-Dx" = "B", 
                                         "B10" = "A", 
                                         "B12" = "B", 
                                         "B7" = "B", 
                                         "B9" = "A", 
                                         "B8" = "A",
                                         "C14" = "B",
                                         "C15" = "B", 
                                         "C16" = "A", 
                                         "C18" = "A", 
                                         "C19" = "B", 
                                         "E29" = "A", 
                                         "E30" = "B", 
                                         "E32" = "A", 
                                         "E33" = "A", 
                                         "F34" = "A", 
                                         "F35" = "A", 
                                         "F36" = "B", 
                                         "F37" = "A", 
                                         "F38" = "B"))))
colnames(replicate) = "Replica"
replicate$Sample = gsub("\\.", "-" , rownames(replicate))

#Create dataframe with PCR cycles
pcrcycle = data.frame(t(data.frame(list("A1" = 15,
                                        "A2" = 15,
                                        "A3" = 15,
                                        "A4" = 16,
                                        "A5" = 15,
                                        "A6" = 15, 
                                        "B-Dx" = 15, 
                                        "B10" = 15, 
                                        "B12" = 16, 
                                        "B7" = 16, 
                                        "B9" = 15, 
                                        "B8" = 15,
                                        "C14" = 16,
                                        "C15" = 16, 
                                        "C16" = 15, 
                                        "C18" = 15, 
                                        "C19" = 17, 
                                        "E29" = 15, 
                                        "E30" = 20, 
                                        "E32" = 15, 
                                        "E33" = 15, 
                                        "F34" = 15, 
                                        "F35" = 15, 
                                        "F36" = 15, 
                                        "F37" = 15, 
                                        "F38" = 15))))
colnames(pcrcycle) = "PCRCycle"
pcrcycle$Sample = gsub("\\.", "-" , rownames(pcrcycle))
VarCount = merge(VarCount, replicate, by.x = "Var1", by.y ="Sample", all.x = T)
VarCount = merge(VarCount, pcrcycle, by.x = "Var1", by.y ="Sample", all.x = T)
VarCount$PCRCycle.amp = 2^VarCount$PCRCycle
VarCount$PCRCycle.log = log2(VarCount$PCRCycle)
VarCount$Group = as.factor(VarCount$Group)


#Build lmer Model
model = lmer( Freq ~ PCRCycle.amp + (1 | Group ),
              data = VarCount)

#Predict
predicted = data.frame(predict(model, VarCount))
colnames(predicted) = "Predicted"
rownames(predicted) = VarCount$Var1
predicted$Group = VarCount$Group
predicted$Replica = VarCount$Replica
predicted$Sample = rownames(predicted)
predicted = merge(predicted, VarCount, by.x = "Sample", by.y = "Var1")


#Plot number of predicted variants normalized by callable nt
ggplot(predicted, aes(x = Group.x, y = Predicted / Callable)) +
  geom_boxplot() +
  geom_point()