Commit 911e890d authored by Stefano Beretta's avatar Stefano Beretta
Browse files

Upload New File

parent a1aa39cb
library(dplyr)
library(ggplot2)
library(RColorBrewer)
library(CMplot)
library(gtable)
library(circlize)
library(grid)
library(gridExtra)
library(stringr)
library(e1071)
library(openxlsx)
library(VennDiagram)
library(scales)
library(ggVennDiagram)
library(reshape2)
# A helper function to define a region on the layout
define_region <- function(row, col){
viewport(layout.pos.row = row, layout.pos.col = col)
}
# Function to define colors for SNVs
snv_colors2 <- function(del = FALSE, alpha = 1) {
nuc <- c("A", "C", "G", "T")
# Point deletions
dels <- alpha(brewer.pal(5, "Greys")[2:5], 1)
names(dels) <- paste0(nuc, "*")
# Transitions
ts <- alpha(brewer.pal(5, "Blues")[2:5], 1)
names(ts) <- c("CT", "GA", "AG", "TC")
# Tansversions
tv1 <- alpha(brewer.pal(5, "YlGn")[2:5], 1)
names(tv1) <- c("AC", "TG", "AT", "TA")
tv2 <- alpha(brewer.pal(5, "OrRd")[2:5], 1)
names(tv2) <- c("CA", "GT", "CG", "GC")
var_cols <- c(ts, tv1, tv2)
if (del) {
var_cols <- c(var_cols, dels)
}
return(var_cols)
}
# Variant Plots
plot_variants <- function(full.t, out_dir, plot_prefix, fill_by) {
dir.create(path = out_dir, showWarnings = F)
# Per-Sample
tt <- full.t %>%
filter(grepl("chr", CHROM)) %>%
filter(!CHROM %in% c("chrY", "chrM")) %>%
group_by(Sample, !!!syms(fill_by)) %>%
summarise(Count = n())
write.xlsx(x = list("NumVariants" = tt), file = paste(out_dir, paste0(plot_prefix, "_VariantCounts.xlsx"), sep = "/"))
p <- ggplot(tt, aes(x = Sample, y = Count, fill = get(fill_by))) +
theme_bw(base_size = 12) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
legend.position = "none") +
geom_bar(stat = "identity") +
scale_y_continuous(labels = scales::comma, n.breaks = 8) +
xlab("") +
facet_grid(.~get(fill_by), scales = "free_x", space = "free")
ggsave(filename = paste(out_dir, paste0(plot_prefix, "_VariantCounts.pdf"), sep = "/"), plot = p, width = 6, height = 6)
tt <- full.t %>%
filter(grepl("chr", CHROM)) %>%
filter(!CHROM %in% c("chrY", "chrM")) %>%
group_by(Sample, !!!syms(fill_by), REF, ALT) %>%
summarise(Count = n())
tt <- mutate(tt, TYPE = case_when(nchar(REF) == 1 & nchar(ALT) == 1 & ALT != "*" ~ "SNV",
nchar(REF) == 1 & nchar(ALT) == 1 & ALT == "*" ~ "DEL",
nchar(REF) > nchar(ALT) & nchar(REF) - nchar(ALT) >= 1 ~ "DEL",
nchar(REF) < nchar(ALT) & nchar(ALT) - nchar(REF) >= 1 ~ "INS",
TRUE ~ "Other"))
write.xlsx(x = list("VariantClass" = tt), file = paste(out_dir, paste0(plot_prefix, "_VariantClassification.xlsx"), sep = "/"))
tt_type <- tt %>%
group_by(Sample, !!!syms(fill_by), TYPE) %>%
summarise(SumCount = sum(Count)) %>%
arrange(desc(SumCount)) %>%
group_by(Sample, !!!syms(fill_by)) %>%
mutate(CountPerc = SumCount/sum(SumCount))
write.xlsx(x = list("VariantType" = tt_type), file = paste(out_dir, paste0(plot_prefix, "_VariantType.xlsx"), sep = "/"))
p <- ggplot(tt_type, aes(x = Sample, y = SumCount, fill = TYPE)) +
theme_bw(base_size = 12) +
theme(axis.text.x = element_text(angle = 30, hjust = 1),
legend.position = "top") +
guides(fill = guide_legend(ncol = 6)) +
xlab("") +
ylab("Count") +
scale_fill_brewer(palette = "Set1", name = "") +
geom_bar(stat = "identity", position = "stack") +
scale_y_continuous(labels = scales::comma, n.breaks = 8) +
facet_grid(.~get(fill_by), scales = "free_x", space = "free")
ggsave(filename = paste(out_dir, paste0(plot_prefix, "_VariantTypes.pdf"), sep = "/"), plot = p, width = 6, height = 6)
p <- ggplot(tt_type, aes(x = Sample, y = CountPerc, fill = TYPE)) +
theme_bw(base_size = 12) +
theme(axis.text.x = element_text(angle = 30, hjust = 1),
legend.position = "top") +
guides(fill = guide_legend(ncol = 6)) +
xlab("") +
ylab("Count") +
scale_fill_brewer(palette = "Set1", name = "") +
geom_bar(stat = "identity", position = "stack") +
scale_y_continuous(labels = scales::percent_format(accuracy = 2), n.breaks = 10) +
facet_grid(.~get(fill_by), scales = "free_x", space = "free")
ggsave(filename = paste(out_dir, paste0(plot_prefix, "_VariantTypesPerc.pdf"), sep = "/"), plot = p, width = 6, height = 6)
tt <- full.t %>%
filter(nchar(REF) == 1 & nchar(ALT) == 1 & grepl("chr", CHROM)) %>%
filter(!CHROM %in% c("chrY", "chrM")) %>%
group_by(Sample, !!!syms(fill_by), REF, ALT) %>%
summarise(Count = n()) %>%
group_by(Sample, !!!syms(fill_by)) %>%
mutate(CountPerc = Count/sum(Count))
tt$Variant <- paste0(tt$REF, tt$ALT)
tt$Variant <- factor(tt$Variant, levels = sort(unique(tt$Variant)))
write.xlsx(x = list("SNV" = tt), file = paste(out_dir, paste0(plot_prefix, "_SNVcounts.xlsx"), sep = "/"))
tt$Variant <- factor(tt$Variant, levels = rev(names(snv_colors2())))
p2 <- ggplot(tt, aes(x = Sample, y = Count, fill = Variant)) +
theme_bw(base_size = 12) +
theme(axis.text.x = element_text(angle = 30, hjust = 1)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = snv_colors2(del = F)) +
xlab("") +
scale_y_continuous(labels = scales::comma, n.breaks = 8) +
facet_grid(.~get(fill_by), scales = "free_x", space = "free")
ggsave(filename = paste(out_dir, paste0(plot_prefix, "_SNVcounts.pdf"), sep = "/"), plot = p2, width = 7, height = 7)
p2p <- ggplot(tt, aes(x = Sample, y = CountPerc, fill = Variant)) +
theme_bw(base_size = 12) +
theme(axis.text.x = element_text(angle = 30, hjust = 1)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = snv_colors2(del = F)) +
scale_y_continuous(labels = scales::percent_format(accuracy = 2), n.breaks = 10) +
xlab("") +
facet_grid(.~get(fill_by), scales = "free_x", space = "free")
ggsave(filename = paste(out_dir, paste0(plot_prefix, "_SNVcountsPerc.pdf"), sep = "/"), plot = p2p, width = 7, height = 7)
}
###############
### General ###
###############
wdir <- "Fiumara_BasePrimeEd2022_WES/WEScolonies"
out_dir <- paste(wdir, "results", sep = "/")
dir.create(out_dir, showWarnings = F)
# Samples
samples <- c("S1-BE4CC", "S2-BE4CCw-osgRNA", "S3-ABE8CC", "S4-BE4arca", "S5-mockcolonies", "S6-mock24h")
full.t <- data.frame()
# Mutect2
for (ss in samples) {
print(ss)
t <- read.table(gzfile(paste(wdir, "data", paste0(ss, ".Mutect2.vcf.gz"), sep = "/")), comment.char = "#", sep = "\t")
colnames(t) <- c("CHROM","POS","ID","REF","ALT","QUAL","FILTER", "ANNOT", "FORMAT", "SAMPLE")
t$Sample <- ss
t$Vars <- paste(t$CHROM, t$POS, t$REF, t$ALT, sep = "-")
t$Group <- str_split_fixed(ss, "-", 2)[,2]
t$Tool <- "Mutect2"
t$DP <- as.numeric(str_split_fixed(t$SAMPLE, ":", 5)[,4])
t$GT <- gsub("[|]", "/", str_split_fixed(t$SAMPLE, ":", 5)[,1])
t$AD <- as.numeric(str_split_fixed(str_split_fixed(t$SAMPLE, ":", 5)[,2], ",", 2)[,1])
t$VAF <- (t$DP - t$AD) / t$DP
t <- subset(t, DP >= 10)
full.t <- rbind(full.t, t)
}
# Freebayes
for (ss in samples) {
print(ss)
t <- read.table(gzfile(paste(wdir, "data", paste0(ss, ".Freebayes.vcf.gz"), sep = "/")), comment.char = "#", sep = "\t")
colnames(t) <- c("CHROM","POS","ID","REF","ALT","QUAL","FILTER", "ANNOT", "FORMAT", "SAMPLE")
t$Sample <- ss
t$Vars <- paste(t$CHROM, t$POS, t$REF, t$ALT, sep = "-")
t$Group <- str_split_fixed(ss, "-", 2)[,2]
t$Tool <- "Freebayes"
t$DP <- as.numeric(str_split_fixed(t$SAMPLE, ":", 9)[,3])
t$GT <- gsub("[|]", "/", str_split_fixed(t$SAMPLE, ":", 9)[,1])
t$AD <- as.numeric(str_split_fixed(str_split_fixed(t$SAMPLE, ":", 9)[,4], ",", 2)[,1])
t$VAF <- (t$DP - t$AD) / t$DP
t <- subset(t, DP >= 10)
full.t <- rbind(full.t, t)
}
# Filter CHR only
chr <- c(paste0("chr", seq(1,22)), "chrX")
full.t <- subset(full.t, CHROM %in% chr)
full.t$CHROM <- factor(full.t$CHROM, levels = chr)
full.t$Sample <- factor(full.t$Sample, levels = unique(full.t$Sample))
full.t.nomulti <- full.t[grep(",", full.t$ALT, value = F, invert = T), ]
## Tool overlap on all (nomulti) variants
full.venn <- list()
for (ss in samples) {
print(ss)
full.venn[[ss]] <- list()
full.t.nomulti.ss <- subset(full.t.nomulti, Sample == ss)
for (tt in unique(full.t.nomulti.ss$Tool)) {
full.venn[[ss]][[tt]] <- full.t.nomulti.ss[full.t.nomulti.ss$Tool == tt, "Vars"]
}
}
pdf(paste(out_dir, "Full_NoMulti_venn_Tools.pdf", sep = "/"), width = 16, height = 8)
plot.new()
pushViewport(viewport(layout = grid.layout(2, 3)))
i <- 1
j <- 1
for (ss in names(full.venn)) {
venn <- Venn(full.venn[[ss]])
data <- process_data(venn)
vreg <- venn_region(data)
vreg$NumInters <- as.character(str_count(venn_region(data)$name, "[..]"))
cc <- brewer.pal(name = "Set2", n = 8)
cc_cat <- cc[1:length(unique(vreg$NumInters))]
names(cc_cat) <- unique(vreg$NumInters)
s.plot <- ggplot() +
# change mapping of color filling
geom_sf(aes(fill = NumInters), data = vreg, show.legend = FALSE) +
scale_fill_manual(values = cc_cat) +
# adjust edge size and color
geom_sf(color="grey", size = 1, data = venn_setedge(data), show.legend = FALSE) +
# show set label in bold
geom_sf_text(aes(label = name), fontface = "bold", data = venn_setlabel(data), size = 6) +
# add a alternative region name
geom_sf_label(aes(label = paste0(count, "\n", "(", round(count/sum(count)*100, 1), "%)")), data = vreg, alpha = 0.5, size = 5) +
ggtitle(paste0("Sample", ss)) +
theme_void()
print(s.plot, vp = define_region(row = i, col = j))
j <- j + 1
if (j > 3) {
j <- 1
i <- i + 1
}
}
dev.off()
## Tool overlap on NO 0/0 (nomulti) variants
full.t.nomulti.no00 <- subset(full.t.nomulti, GT != "0/0")
full.venn <- list()
for (ss in samples) {
print(ss)
full.venn[[ss]] <- list()
full.t.nomulti.ss <- subset(full.t.nomulti.no00, Sample == ss)
for (tt in unique(full.t.nomulti.ss$Tool)) {
full.venn[[ss]][[tt]] <- full.t.nomulti.ss[full.t.nomulti.ss$Tool == tt, "Vars"]
}
}
pdf(paste(out_dir, "Full_NoMulti_no00_venn_Tools.pdf", sep = "/"), width = 16, height = 8)
plot.new()
pushViewport(viewport(layout = grid.layout(2, 3)))
i <- 1
j <- 1
for (ss in names(full.venn)) {
venn <- Venn(full.venn[[ss]])
data <- process_data(venn)
vreg <- venn_region(data)
vreg$NumInters <- as.character(str_count(venn_region(data)$name, "[..]"))
cc <- brewer.pal(name = "Set2", n = 8)
cc_cat <- cc[1:length(unique(vreg$NumInters))]
names(cc_cat) <- unique(vreg$NumInters)
s.plot <- ggplot() +
# change mapping of color filling
geom_sf(aes(fill = NumInters), data = vreg, show.legend = FALSE) +
scale_fill_manual(values = cc_cat) +
# adjust edge size and color
geom_sf(color="grey", size = 1, data = venn_setedge(data), show.legend = FALSE) +
# show set label in bold
geom_sf_text(aes(label = name), fontface = "bold", data = venn_setlabel(data), size = 6) +
# add a alternative region name
geom_sf_label(aes(label = paste0(count, "\n", "(", round(count/sum(count)*100, 1), "%)")), data = vreg, alpha = 0.5, size = 5) +
ggtitle(paste0("Sample", ss)) +
theme_void()
print(s.plot, vp = define_region(row = i, col = j))
j <- j + 1
if (j > 3) {
j <- 1
i <- i + 1
}
}
dev.off()
# Check overlap among samples
full.venn <- list()
for (vs in unique(full.t.nomulti.no00$Sample)) {
print(vs)
full.venn[[vs]] <- full.t.nomulti.no00[full.t.nomulti.no00$Sample == vs & full.t.nomulti.no00$Tool == "Mutect2", "Vars"]
}
venn <- Venn(full.venn)
data <- process_data(venn)
vreg <- venn_region(data)
vreg$NumInters <- as.character(str_count(venn_region(data)$name, "[..]"))
cc <- brewer.pal(name = "Set2", n = 8)
cc_cat <- cc[1:length(unique(vreg$NumInters))]
names(cc_cat) <- unique(vreg$NumInters)
s.plot <- ggplot() +
# change mapping of color filling
geom_sf(aes(fill = NumInters), data = vreg, show.legend = FALSE) +
scale_fill_manual(values = cc_cat) +
# adjust edge size and color
geom_sf(color="grey", size = 1, data = venn_setedge(data), show.legend = FALSE) +
# show set label in bold
geom_sf_text(aes(label = name), fontface = "bold", data = venn_setlabel(data), size = 6) +
# add a alternative region name
geom_sf_label(aes(label = paste0(count, "\n", "(", round(count/sum(count)*100, 1), "%)")), data = vreg, alpha = 0.5, size = 3) +
theme_void()
ggsave(filename = paste(out_dir, "Full_NoMulti_no00_vars_venn_Mutect2.pdf", sep = "/"), plot = s.plot, width = 10, height = 10)
# Control
vitro_ctrl <- subset(full.t.nomulti.no00, Sample == "S6-mock24h")
vitro_ctrl_vars <- unique(vitro_ctrl_vars$Vars)
full.t.nomulti.no00.noGerm <- subset(full.t.nomulti.no00, !(Vars %in% vitro_ctrl_vars))
# Check Tool overlap
full.venn <- list()
for (ss in samples) {
if(ss == "S6-mock24h") {next}
print(ss)
full.venn[[ss]] <- list()
full.t.nomulti.no00.noGerm.ss <- subset(full.t.nomulti.no00.noGerm, Sample == ss)
for (tt in unique(full.t.nomulti.no00.noGerm.ss$Tool)) {
full.venn[[ss]][[tt]] <- full.t.nomulti.no00.noGerm.ss[full.t.nomulti.no00.noGerm.ss$Tool == tt, "Vars"]
}
}
pdf(paste(out_dir, "Full_NoMulti_no00_noGerm_venn_Tools.pdf", sep = "/"), width = 16, height = 8)
plot.new()
pushViewport(viewport(layout = grid.layout(2, 3)))
i <- 1
j <- 1
for (ss in names(full.venn)) {
venn <- Venn(full.venn[[ss]])
data <- process_data(venn)
vreg <- venn_region(data)
vreg$NumInters <- as.character(str_count(venn_region(data)$name, "[..]"))
cc <- brewer.pal(name = "Set2", n = 8)
cc_cat <- cc[1:length(unique(vreg$NumInters))]
names(cc_cat) <- unique(vreg$NumInters)
s.plot <- ggplot() +
# change mapping of color filling
geom_sf(aes(fill = NumInters), data = vreg, show.legend = FALSE) +
scale_fill_manual(values = cc_cat) +
# adjust edge size and color
geom_sf(color="grey", size = 1, data = venn_setedge(data), show.legend = FALSE) +
# show set label in bold
geom_sf_text(aes(label = name), fontface = "bold", data = venn_setlabel(data), size = 6) +
# add a alternative region name
geom_sf_label(aes(label = paste0(count, "\n", "(", round(count/sum(count)*100, 1), "%)")), data = vreg, alpha = 0.5, size = 5) +
ggtitle(paste0("Sample", ss)) +
theme_void()
print(s.plot, vp = define_region(row = i, col = j))
j <- j + 1
if (j > 3) {
j <- 1
i <- i + 1
}
}
dev.off()
plot_variants(full.t = subset(full.t.nomulti.no00.noGerm, Tool == "Mutect2"),
out_dir = out_dir, plot_prefix = "Full_NoMulti_no00_noGerm_Mutect2", fill_by = "Group")
#### Low Freq ####
full.t.nomulti.no00.noGerm.lowfreq <- subset(full.t.nomulti.no00.noGerm, VAF > 0.05 & VAF < 0.2)
table(full.t.nomulti.no00.noGerm.lowfreq$Tool, full.t.nomulti.no00.noGerm.lowfreq$Sample)
full.venn <- list()
for (ss in samples) {
if(ss == "S6-mock24h") {next}
print(ss)
full.venn[[ss]] <- list()
full.t.nomulti.no00.noGerm.lowfreq.ss <- subset(full.t.nomulti.no00.noGerm.lowfreq, Sample == ss)
for (tt in unique(full.t.nomulti.no00.noGerm.lowfreq.ss$Tool)) {
full.venn[[ss]][[tt]] <- full.t.nomulti.no00.noGerm.lowfreq.ss[full.t.nomulti.no00.noGerm.lowfreq.ss$Tool == tt, "Vars"]
}
}
pdf(paste(out_dir, "Full_NoMulti_no00_noGerm_lowfreq_venn_Tools.pdf", sep = "/"), width = 16, height = 8)
plot.new()
pushViewport(viewport(layout = grid.layout(2, 3)))
i <- 1
j <- 1
for (ss in names(full.venn)) {
venn <- Venn(full.venn[[ss]])
data <- process_data(venn)
vreg <- venn_region(data)
vreg$NumInters <- as.character(str_count(venn_region(data)$name, "[..]"))
cc <- brewer.pal(name = "Set2", n = 8)
cc_cat <- cc[1:length(unique(vreg$NumInters))]
names(cc_cat) <- unique(vreg$NumInters)
s.plot <- ggplot() +
# change mapping of color filling
geom_sf(aes(fill = NumInters), data = vreg, show.legend = FALSE) +
scale_fill_manual(values = cc_cat) +
# adjust edge size and color
geom_sf(color="grey", size = 1, data = venn_setedge(data), show.legend = FALSE) +
# show set label in bold
geom_sf_text(aes(label = name), fontface = "bold", data = venn_setlabel(data), size = 6) +
# add a alternative region name
geom_sf_label(aes(label = paste0(count, "\n", "(", round(count/sum(count)*100, 1), "%)")), data = vreg, alpha = 0.5, size = 5) +
ggtitle(paste0("Sample", ss)) +
theme_void()
print(s.plot, vp = define_region(row = i, col = j))
j <- j + 1
if (j > 3) {
j <- 1
i <- i + 1
}
}
dev.off()
plot_variants(full.t = subset(full.t.nomulti.no00.noGerm.lowfreq, Tool == "Mutect2"),
out_dir = out_dir, plot_prefix = "Full_NoMulti_no00_noGerm_lowfreq_Mutect2", fill_by = "Group")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment