#import libraries library(ggVennDiagram) library(stringr) library(RColorBrewer) library(ggplot2) library(dplyr) library(openxlsx) library(gtable) library(reshape2) library(grid) library(gridExtra) library(CMplot) library(circlize) library(e1071) library(VennDiagram) library(scales) library(lme4) library(ggrepel) library(readxl) ################################################### Functions ############################################################# snv_colors2 <- function(del = FALSE, alpha = 1) { nuc <- c("A", "C", "G", "T") # Point deletions dels <- alpha(brewer.pal(5, "Greys")[2:5], 1) names(dels) <- paste0(nuc, "*") # Transitions ts <- alpha(brewer.pal(5, "Blues")[2:5], 1) names(ts) <- c("CT", "GA", "AG", "TC") # Tansversions tv1 <- alpha(brewer.pal(5, "YlGn")[2:5], 1) names(tv1) <- c("AC", "TG", "AT", "TA") tv2 <- alpha(brewer.pal(5, "OrRd")[2:5], 1) names(tv2) <- c("CA", "GT", "CG", "GC") var_cols <- c(ts, tv1, tv2) if (del) { var_cols <- c(var_cols, dels) } return(var_cols) } ### Circos + Legend ### legend_plot = function(t, sample) { t <- mutate(t, TYPE = case_when(nchar(REF) == 1 & nchar(ALT) == 1 & ALT != "*" ~ "Substitution", nchar(REF) == 1 & nchar(ALT) == 1 & ALT == "*" ~ "Deletion", nchar(REF) > nchar(ALT) & nchar(REF) - nchar(ALT) >= 1 ~ "Deletion", nchar(REF) < nchar(ALT) & nchar(ALT) - nchar(REF) >= 1 ~ "Insertion", TRUE ~ "Other")) # Substitutions t.subs <- t %>% filter(TYPE == "Substitution") %>% tbl_df() t.subs$SUB <- paste0(t.subs$REF, t.subs$ALT) t.subs$SUB <- factor(t.subs$SUB, levels = sort(unique(t.subs$SUB))) cols_names <- levels(t.subs$SUB) cols <- snv_colors2(del = F) t.subs <- merge(t.subs, data.frame(cols, SUB = names(cols)), all.x = TRUE) tt.subs <- t.subs %>% filter(Sample == sample) %>% tbl_df() # Deletions / Insertions t.other <- t %>% filter(TYPE != "Substitution") %>% tbl_df() tt.other <- t.other %>% filter(Sample == sample) %>% tbl_df() if(nrow(tt.other) > 0) { tt.other$AF <- 1 } tt.other$col <- ifelse(tt.other$TYPE == "Deletion", "Red", "Blue") # Circos #par(mar = c(1,1,1,1)) par(mar=rep(0,4), cex = 0.7) chr.names <- paste0("chr", c(1:22, "X", "Y")) circos.par("start.degree" = 90) circos.initializeWithIdeogram(species = "hg38", chromosome.index = chr.names, plotType = c()) # Legends (center) legend(-0.5, 0.85, lty = 1, lwd = 2, seg.len = 1, col = c("Red", "Blue"), legend = c("Deletion", "Insertion"), bty = 'n', #xjust = 0, x.intersp = 0.4, y.intersp = 1, inset = c(0.05, 0), title.adj = 0.1, title = "Small Indels") legend(-0.5, 0.45, pch = 16, col = cols, legend = names(cols), pt.cex = 1.2, bty = 'n', ncol = 3, #xjust = 0, x.intersp = 0.4, y.intersp = 1, inset = c(0.05, 0), border = "NA", title.adj = 0.1, title = "Substitution") if ("snpEff_Impact" %in% colnames(tt.subs)) { legend(-0.5, -0.2, pch = c(10, 13), col = "red", legend = c("Relevant Impact", "Target Gene"), pt.cex = 1.8, bty = 'n', ncol = 1, #xjust = 0, x.intersp = 0.6, y.intersp = 1, inset = c(0.2, 0), border = "NA", title.adj = 0, title = "Highlight Variants") } circos.clear() } plot_circos <- function(t, sample) { # Substitutions t <- mutate(t, TYPE = case_when(nchar(REF) == 1 & nchar(ALT) == 1 & ALT != "*" ~ "Substitution", nchar(REF) == 1 & nchar(ALT) == 1 & ALT == "*" ~ "Deletion", nchar(REF) > nchar(ALT) & nchar(REF) - nchar(ALT) >= 1 ~ "Deletion", nchar(REF) < nchar(ALT) & nchar(ALT) - nchar(REF) >= 1 ~ "Insertion", TRUE ~ "Other")) t.subs <- t %>% filter(TYPE == "Substitution") %>% tbl_df() t.subs$SUB <- paste0(t.subs$REF, t.subs$ALT) t.subs$SUB <- factor(t.subs$SUB, levels = sort(unique(t.subs$SUB))) cols_names <- levels(t.subs$SUB) cols <- snv_colors2(del = F, alpha = 0.8) t.subs <- merge(t.subs, data.frame(cols, SUB = names(cols)), all.x = TRUE) tt.subs <- t.subs %>% filter(Sample == sample) %>% tbl_df() # Deletions / Insertions t.other <- t %>% filter(TYPE != "Substitution") %>% tbl_df() tt.other <- t.other %>% filter(Sample == sample) %>% tbl_df() if(nrow(tt.other) > 0) { tt.other$VAF <- 1 } tt.other$col <- ifelse(tt.other$TYPE == "Deletion", "Red", "Blue") if ("IMPACT" %in% colnames(tt.other)) { hio <- subset(tt.other, IMPACT %in% c("HIGH")) if (nrow(hio) > 0) { print(hio) } } # Circos par(mar=rep(0,4), cex = 0.9) chr.names <- paste0("chr", c(1:22, "X", "Y")) circos.par("start.degree" = 90) circos.par("track.height" = 0.3) circos.initializeWithIdeogram(species = "hg38", chromosome.index = chr.names, plotType = c("ideogram", "labels")) lines <- seq(0, 1, 0.2) line_factors <- expand.grid(x = get.all.sector.index(), y = lines) circos.trackPlotRegion(factors = line_factors$x, y = line_factors$y, track.height = 0.4, panel.fun = function(x,y) { xl <- get.cell.meta.data("xlim") for(i in lines) { circos.lines(xl, c(i,i), col = "grey") } }) for(i in lines) { circos.text(0, i, i, col = "black", sector.index = "chrY", track.index = 3, cex = 0.5, adj = c(0.01, 0.01)) } circos.trackPoints(tt.subs$CHROM, tt.subs$POS, tt.subs$VAF, pch = 16, cex = 0.9, col = as.character(tt.subs$cols)) if ("snpEff_Impact" %in% colnames(tt.subs)) { hi <- subset(tt.subs, snpEff_Impact %in% c("HIGH")) if (nrow(hi) > 0) { circos.trackPoints(hi$CHROM, hi$POS, hi$VAF, pch = 10, cex = 1.2, col = "red", track.index = 3) } aavs_df <- data.frame("CHROM" = c("chr19"), "POS" = c(44715702), "VAF" = c(0.7)) circos.trackPoints(aavs_df$CHROM, aavs_df$POS, aavs_df$VAF, pch = 13, cex = 1.5, col = "red", track.index = 3) } circos.trackPlotRegion(factors = line_factors$x, ylim = c(0, 1), track.height = 0.1) if(nrow(tt.other) > 0) { circos.trackLines(tt.other$CHROM, tt.other$POS, tt.other$VAF, track.index = 4, col = as.character(tt.other$col), lwd = 2, type = 'h') } text(0,0, sample, cex = 1.5) circos.clear() } #Define Samples name for analysis # Samples list to include in analysis samples <- c("TIP02_A1","TIP02_A3","TIP02_A5","TIP02_AV", "TIP02_B10","TIP02_B8","TIP02_B9","TIP02_BV", "TIP02_C16","TIP02_C18","TIP02_CV", "TIP02_E29","TIP02_E32","TIP02_E33","TIP02_EV", "A2", "A4", "A6", "AV-96h-1", "B12", "B7", "B-Dx", "BV-96h-2", "C14", "C15", "C19", "CV-96h-3", "E30", "EV-96h-5") #Samples order for Circos plot samples_order_circos <- c("A1","A2","A3","A4","A5","A6", "B7","B8","B9","B10","B-Dx","B12", "C14","C15","C16","C18","C19","", "E29","E30","E32","E33", "", "") ############################################## Analysis ################################################################# #Directory containing ".vcf.gz" files, if vcf file are available, if not skip to line 348-349 and directly read full.t.nomulti.noGerm wdir <- "VCF_DIR" #Create dataframe with variants full.t <- data.frame() for (ss in samples) { orig.name = ss t <- read.table(gzfile(paste(wdir, ss, paste0(ss, "-DP500_PASS_sGQ80_sDP50.ANNOT.dbSNP.vcf.gz"), sep = "/")), comment.char = "#", sep = "\t") colnames(t) <- c("CHROM","POS","ID","REF","ALT","QUAL","FILTER", "ANNOT", "FORMAT", "SAMPLE") if (grepl("_", ss)){ ss = unlist(strsplit(ss, "_"))[2] } t$Sample <- ss t$Vars <- paste(t$CHROM, t$POS, t$REF, t$ALT, sep = "-") t$Group <- substr(ss, 1, 1) t$Type <- ifelse( substr(ss, 2, 2) == "V", "Vitro", "Sample") t$DP <- as.numeric(str_split_fixed(t$SAMPLE, ":", 5)[,3]) t$AD <- as.numeric(str_split_fixed(str_split_fixed(t$SAMPLE, ":", 5)[,2], ",", 2)[,1]) t$VAF <- (t$DP - t$AD) / t$DP t$orig.name = orig.name t$Common = unlist(lapply(t$ANNOT, function(x) grepl("COMMON", x))) if (grepl("TIP02", orig.name)){ t$Replica = "A" } else{ t$Replica = "B" } full.t <- rbind(full.t, t) } # Filter CHR only chr <- c(paste0("chr", seq(1,22)), "chrX") full.t <- subset(full.t, CHROM %in% chr) full.t$CHROM <- factor(full.t$CHROM, levels = chr) # Remove Multi full.t.nomulti <- full.t[grep(",", full.t$ALT, value = F, invert = T), ] # Select germline variants from "Vitro" samples vs_counts <- subset(full.t.nomulti, Type == "Vitro") %>% group_by(Vars) %>% summarise(Count = n()) germ_vars <- subset(vs_counts, Count >= 4) #Remove Germline vraints shared by vitro samples full.t.nomulti.noGerm <- subset(full.t.nomulti, !(Vars %in% germ_vars$Vars) ) plot_variants(full.t = full.t.nomulti.noGerm, out_dir = out_dir, plot_prefix = "Full_NoMulti_noGerm", fill_by = "Group") ## Parse snpEff Annotation full.t.nomulti.noGerm$snpEff_tmp <- lapply(full.t.nomulti.noGerm$ANNOT, function(x){ tmp <- strsplit(x, ";")[[1]] ann_pos <- length(tmp) if (!startsWith(tmp[length(tmp)], "ANN")) { for (i in seq(1,length(tmp))) { if (startsWith(tmp[i], "ANN")) { ann_pos <- i } } } tmp <- tmp[ann_pos] }) full.t.nomulti.noGerm$snpEff_Effect <- sapply(full.t.nomulti.noGerm$snpEff_tmp, function(tmp){ strsplit(tmp, "|", fixed = T)[[1]][2] }, USE.NAMES = F) full.t.nomulti.noGerm$snpEff_Impact <- sapply(full.t.nomulti.noGerm$snpEff_tmp, function(tmp){ strsplit(tmp, "|", fixed = T)[[1]][3] }, USE.NAMES = F) full.t.nomulti.noGerm$snpEff_Gene <- sapply(full.t.nomulti.noGerm$snpEff_tmp, function(tmp){ strsplit(tmp, "|", fixed = T)[[1]][4] }, USE.NAMES = F) full.t.nomulti.noGerm$snpEff_GeneName <- sapply(full.t.nomulti.noGerm$snpEff_tmp, function(tmp){ strsplit(tmp, "|", fixed = T)[[1]][5] }, USE.NAMES = F) full.t.nomulti.noGerm$snpEff_BioType <- sapply(full.t.nomulti.noGerm$snpEff_tmp, function(tmp){ strsplit(tmp, "|", fixed = T)[[1]][8] }, USE.NAMES = F) full.t.nomulti.noGerm$snpEff_tmp <- NULL #Parse SnpEff annotation snpeff_df <- full.t.nomulti.noGerm %>% group_by(Sample, Group, Type, snpEff_Effect, snpEff_Impact) %>% summarise(Count = n()) snpeff_df <- melt(reshape2::dcast(snpeff_df, Sample + Group + Type ~ snpEff_Effect + snpEff_Impact, value.var = "Count"), id.vars = c("Sample", "Group", "Type")) snpeff_df$snpEff_Effect <- sapply(snpeff_df$variable, function(x){ tmp <- strsplit(as.character(x), "_")[[1]] # only consider the nearest match tmp <- tmp[1:(length(tmp)-1)] paste(tmp, collapse = "_") }, USE.NAMES = F) snpeff_df$snpEff_Impact <- sapply(snpeff_df$variable, function(x){ tmp <- strsplit(as.character(x), "_")[[1]] # only consider the nearest match tmp <- tmp[length(tmp)] }, USE.NAMES = F) snpeff_df$variable <- NULL snpeff_df$snpEff_Impact <- factor(snpeff_df$snpEff_Impact, levels = c("HIGH", "MODERATE", "LOW", "MODIFIER")) #Mantain only "Sample", remove Vitro samples samples.t.nomulti.noGerm <- subset(full.t.nomulti.noGerm, Type == "Sample") #Circos plots for Variants in ClonalHema and CancerRelated Genes #Directory containing gene list .txt files glist_dir <- "genePanels/" #Create dataframe for circos full.circos <- data.frame() for (fl in c("CancerRelated.txt", "ClonalHema.txt")) { offt <- read.table(paste(glist_dir, fl, sep = "/"), header = F, sep = "\t") colnames(offt) <- c("gene") lname <- gsub(".txt", "", fl) print(lname) offt_res <- list() offt_res_circos <- list() for (ss in unique(samples.t.nomulti.noGerm$Sample)) { samples_filt <- subset(samples.t.nomulti.noGerm, Sample == ss) offt_var_cond <- subset(samples_filt, snpEff_GeneName %in% offt$gene) if (nrow(offt_var_cond) > 0) { full.t.filt <- offt_var_cond %>% filter(grepl("chr", CHROM)) %>% filter(!CHROM %in% c("chrM")) full.t.filt <- mutate(full.t.filt, TYPE = case_when(nchar(REF) == 1 & nchar(ALT) == 1 & ALT != "*" ~ "Substitution", nchar(REF) == 1 & nchar(ALT) == 1 & ALT == "*" ~ "Deletion", nchar(REF) > nchar(ALT) & nchar(REF) - nchar(ALT) >= 1 ~ "Deletion", nchar(REF) < nchar(ALT) & nchar(ALT) - nchar(REF) >= 1 ~ "Insertion", TRUE ~ "Other")) offt_res_circos[[ss]] <- full.t.filt offt_res[[ss]] <- offt_var_cond full.t.filt$GL <- lname full.circos <- rbind(full.circos, full.t.filt) } } } #Or directly Read dataframe for circos plot full.circos = read.table("Variants/Full_circos.txt", header = T) glist_dir <- "genePanels/" #Plot Circos for (gl in c("CancerRelated", "ClonalHema")) { full.circos.gl <- subset(full.circos, GL == gl) #Per sample par(mfrow=c(4,6)) samples_order_circos unique(full.circos.gl$Sample) for (ss in samples_order_circos ) { if (ss == ""){ plot(1, type = "n", axes = FALSE, xlab = "", ylab = "") } else{ plot_circos(full.circos.gl, ss) } } legend_plot(full.circos.gl, ss) #per Group full.circos.gl$Sample <- full.circos.gl$Group par(mfrow=c(2,3)) for (ss in unique(full.circos.gl$Sample)) { if (ss == ""){ plot(1, type = "n", axes = FALSE, xlab = "", ylab = "") } else{ plot_circos(full.circos.gl, ss) } legend_plot(full.circos.gl, ss) dev.off() } } #Count number of variant per groups, or direclty read the file in line 399-400 full.t.nomulti.noGerm <- mutate(full.t.nomulti.noGerm, TYPE = case_when(nchar(REF) == 1 & nchar(ALT) == 1 & ALT != "*" ~ "SNV", nchar(REF) == 1 & nchar(ALT) == 1 & ALT == "*" ~ "MNV", nchar(REF) > nchar(ALT) & nchar(REF) - nchar(ALT) >= 1 ~ "DEL", nchar(REF) < nchar(ALT) & nchar(ALT) - nchar(REF) >= 1 ~ "INS", TRUE ~ "Other")) full.t.nomulti.noGerm$Mut.type = paste(full.t.nomulti.noGerm$REF,full.t.nomulti.noGerm$ALT, sep = "") VarCount = data.frame(table(full.t.nomulti.noGerm[full.t.nomulti.noGerm$Type == "Sample", "Sample"])) VarCount$Group = substring(VarCount$Var1,1,1) #Or directly read var Counts VarCount = read.table("Variants/Variant_counts.txt", header = T) #Read Callable nucleotides table callable = read.table("callable_nt/callable_nt.txt", header = T) #Merge with Variant count dataframe VarCount = merge(VarCount, callable, by = "Var1") #Create Sample to Group mapping replicate = data.frame(t(data.frame(list("A1" = "A", "A2" = "B", "A3" = "A", "A4" = "B", "A5" = "A", "A6" = "B", "B-Dx" = "B", "B10" = "A", "B12" = "B", "B7" = "B", "B9" = "A", "B8" = "A", "C14" = "B", "C15" = "B", "C16" = "A", "C18" = "A", "C19" = "B", "E29" = "A", "E30" = "B", "E32" = "A", "E33" = "A", "F34" = "A", "F35" = "A", "F36" = "B", "F37" = "A", "F38" = "B")))) colnames(replicate) = "Replica" replicate$Sample = gsub("\\.", "-" , rownames(replicate)) #Create dataframe with PCR cycles pcrcycle = data.frame(t(data.frame(list("A1" = 15, "A2" = 15, "A3" = 15, "A4" = 16, "A5" = 15, "A6" = 15, "B-Dx" = 15, "B10" = 15, "B12" = 16, "B7" = 16, "B9" = 15, "B8" = 15, "C14" = 16, "C15" = 16, "C16" = 15, "C18" = 15, "C19" = 17, "E29" = 15, "E30" = 20, "E32" = 15, "E33" = 15, "F34" = 15, "F35" = 15, "F36" = 15, "F37" = 15, "F38" = 15)))) colnames(pcrcycle) = "PCRCycle" pcrcycle$Sample = gsub("\\.", "-" , rownames(pcrcycle)) VarCount = merge(VarCount, replicate, by.x = "Var1", by.y ="Sample", all.x = T) VarCount = merge(VarCount, pcrcycle, by.x = "Var1", by.y ="Sample", all.x = T) VarCount$PCRCycle.amp = 2^VarCount$PCRCycle VarCount$PCRCycle.log = log2(VarCount$PCRCycle) VarCount$Group = as.factor(VarCount$Group) #Build lmer Model model = lmer( Freq ~ PCRCycle.amp + (1 | Group ), data = VarCount) #Predict predicted = data.frame(predict(model, VarCount)) colnames(predicted) = "Predicted" rownames(predicted) = VarCount$Var1 predicted$Group = VarCount$Group predicted$Replica = VarCount$Replica predicted$Sample = rownames(predicted) predicted = merge(predicted, VarCount, by.x = "Sample", by.y = "Var1") #Plot number of predicted variants normalized by callable nt ggplot(predicted, aes(x = Group.x, y = Predicted / Callable)) + geom_boxplot() + geom_point()