WESopt_plot_results.R 13.7 KB
Newer Older
Stefano Beretta's avatar
Stefano Beretta committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
library(dplyr)
library(ggplot2)
library(RColorBrewer)
library(CMplot)
library(gtable)
library(circlize)
library(grid)
library(gridExtra)
library(stringr)
library(e1071)
library(openxlsx)
library(VennDiagram)
library(scales)
library(ggVennDiagram)
library(reshape2)

#########################
### Utility Functions ###
#########################
# A helper function to define a region on the layout
define_region <- function(row, col){
  viewport(layout.pos.row = row, layout.pos.col = col)
}

# Function to define colors for SNVs
snv_colors2 <- function(del = FALSE, alpha = 1) {
  nuc <- c("A", "C", "G", "T")
  # Point deletions
  dels <- alpha(brewer.pal(5, "Greys")[2:5], 1)
  names(dels) <- paste0(nuc, "*")
  # Transitions
  ts <- alpha(brewer.pal(5, "Blues")[2:5], 1)
  names(ts) <- c("CT", "GA", "AG", "TC")
  # Tansversions
  tv1 <- alpha(brewer.pal(5, "YlGn")[2:5], 1)
  names(tv1) <- c("AC", "TG", "AT", "TA")
  tv2 <- alpha(brewer.pal(5, "OrRd")[2:5], 1)
  names(tv2) <- c("CA", "GT", "CG", "GC")
  var_cols <- c(ts, tv1, tv2)
  if (del) {
    var_cols <- c(var_cols, dels)
  }
  return(var_cols)
}

# Variant Plot function
plot_variants <- function(full.t, out_dir, plot_prefix, fill_by) {
  dir.create(path = out_dir, showWarnings = F)
  tt <- full.t %>%
    filter(grepl("chr", CHROM)) %>%
    filter(!CHROM %in% c("chrY", "chrM")) %>%
    group_by(Sample, !!!syms(fill_by)) %>%
    summarise(Count = n())
  write.xlsx(x = list("NumVariants" = tt), file = paste(out_dir, paste0(plot_prefix, "_VariantCounts.xlsx"), sep = "/"))
  
  p <- ggplot(tt, aes(x = Sample, y = Count, fill = get(fill_by))) +
    theme_bw(base_size = 12) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
          legend.position = "none") +
    geom_bar(stat = "identity") +
    scale_y_continuous(labels = scales::comma, n.breaks = 8) +
    xlab("") +
    facet_grid(.~get(fill_by), scales = "free_x", space = "free")
  ggsave(filename = paste(out_dir, paste0(plot_prefix, "_VariantCounts.pdf"), sep = "/"), plot = p, width = 6, height = 6)
  
  tt <- full.t %>%
    filter(grepl("chr", CHROM)) %>%
    filter(!CHROM %in% c("chrY", "chrM")) %>%
    group_by(Sample, !!!syms(fill_by), REF, ALT) %>%
    summarise(Count = n())
  tt <- mutate(tt, TYPE = case_when(nchar(REF) == 1 & nchar(ALT) == 1 & ALT != "*" ~ "SNV",
                                    nchar(REF) == 1 & nchar(ALT) == 1 & ALT == "*" ~ "DEL",
                                    nchar(REF) > nchar(ALT) & nchar(REF) - nchar(ALT) >= 1 ~ "DEL",
                                    nchar(REF) < nchar(ALT) & nchar(ALT) - nchar(REF) >= 1 ~ "INS",
                                    TRUE ~ "Other"))
  write.xlsx(x = list("VariantClass" = tt), file = paste(out_dir, paste0(plot_prefix, "_VariantClassification.xlsx"), sep = "/"))
  tt_type <- tt %>%
    group_by(Sample, !!!syms(fill_by), TYPE) %>%
    summarise(SumCount = sum(Count)) %>%
    arrange(desc(SumCount)) %>%
    group_by(Sample, !!!syms(fill_by)) %>%
    mutate(CountPerc =  SumCount/sum(SumCount))
  write.xlsx(x = list("VariantType" = tt_type), file = paste(out_dir, paste0(plot_prefix, "_VariantType.xlsx"), sep = "/"))
  
  p <- ggplot(tt_type, aes(x = Sample, y = SumCount, fill = TYPE)) +
    theme_bw(base_size = 12) +
    theme(axis.text.x = element_text(angle = 30, hjust = 1),
          legend.position = "top") +
    guides(fill = guide_legend(ncol = 6)) +
    xlab("") +
    ylab("Count") +
    scale_fill_brewer(palette = "Set1", name = "") +
    geom_bar(stat = "identity", position = "stack") +
    scale_y_continuous(labels = scales::comma, n.breaks = 8) +
    facet_grid(.~get(fill_by), scales = "free_x", space = "free")
  ggsave(filename = paste(out_dir, paste0(plot_prefix, "_VariantTypes.pdf"), sep = "/"), plot = p, width = 6, height = 6)
  
  p <- ggplot(tt_type, aes(x = Sample, y = CountPerc, fill = TYPE)) +
    theme_bw(base_size = 12) +
    theme(axis.text.x = element_text(angle = 30, hjust = 1),
          legend.position = "top") +
    guides(fill = guide_legend(ncol = 6)) +
    xlab("") +
    ylab("Count") +
    scale_fill_brewer(palette = "Set1", name = "") +
    geom_bar(stat = "identity", position = "stack") +
    scale_y_continuous(labels = scales::percent_format(accuracy = 2), n.breaks = 10) +
    facet_grid(.~get(fill_by), scales = "free_x", space = "free")
  ggsave(filename = paste(out_dir, paste0(plot_prefix, "_VariantTypesPerc.pdf"), sep = "/"), plot = p, width = 6, height = 6)
  
  tt <- full.t %>%
    filter(nchar(REF) == 1 & nchar(ALT) == 1 & grepl("chr", CHROM)) %>%
    filter(!CHROM %in% c("chrY", "chrM")) %>%
    group_by(Sample, !!!syms(fill_by), REF, ALT) %>%
    summarise(Count = n()) %>%
    group_by(Sample, !!!syms(fill_by)) %>%
    mutate(CountPerc =  Count/sum(Count))
  tt$Variant <- paste0(tt$REF, tt$ALT)
  tt$Variant <- factor(tt$Variant, levels = sort(unique(tt$Variant)))
  write.xlsx(x = list("SNV" = tt), file = paste(out_dir, paste0(plot_prefix, "_SNVcounts.xlsx"), sep = "/"))
  
  tt$Variant <- factor(tt$Variant, levels = rev(names(snv_colors2())))
  p2 <- ggplot(tt, aes(x = Sample, y = Count, fill = Variant)) +
    theme_bw(base_size = 12) +
    theme(axis.text.x = element_text(angle = 30, hjust = 1)) +
    geom_bar(stat = "identity") +
    scale_fill_manual(values = snv_colors2(del = F)) +
    xlab("") +
    scale_y_continuous(labels = scales::comma, n.breaks = 8) +
    facet_grid(.~get(fill_by), scales = "free_x", space = "free")
  ggsave(filename = paste(out_dir, paste0(plot_prefix, "_SNVcounts.pdf"), sep = "/"), plot = p2, width = 7, height = 7)
  
  p2p <- ggplot(tt, aes(x = Sample, y = CountPerc, fill = Variant)) +
    theme_bw(base_size = 12) +
    theme(axis.text.x = element_text(angle = 30, hjust = 1)) +
    geom_bar(stat = "identity") +
    scale_fill_manual(values = snv_colors2(del = F)) +
    scale_y_continuous(labels = scales::percent_format(accuracy = 2), n.breaks = 10) +
    xlab("") +
    facet_grid(.~get(fill_by), scales = "free_x", space = "free")
  ggsave(filename = paste(out_dir, paste0(plot_prefix, "_SNVcountsPerc.pdf"), sep = "/"), plot = p2p, width = 7, height = 7)
}


###############
### General ###
###############
wdir <- "WESopt"

out_dir <- paste(wdir, "results", sep = "/")
dir.create(out_dir, showWarnings = F)

# Samples
samples <- c("S1_B1", "S2_B2", "S3_B3",
             "S4_D0", "S5_D1", "S6_D3",
             "S7_E0", "S8_E1", "S9_E2",
             "S10_vitroB", "S11_vitroD", "S12_vitroE")

################
### Variants ###
################
full.t <- data.frame()
for (ss in samples) {
  print(ss)
  t <- read.table(gzfile(paste(wdir, "data", ss, paste0(ss, "-DP500_PASS_sGQ80_sDP10.ANNOT.vcf.gz"), sep = "/")), comment.char = "#", sep = "\t")
  colnames(t) <- c("CHROM","POS","ID","REF","ALT","QUAL","FILTER", "ANNOT", "FORMAT", "SAMPLE")
  t$Sample <- ss
  t$Vars <- paste(t$CHROM, t$POS, t$REF, t$ALT, sep = "-")
  t$Group <- ifelse(startsWith(str_split_fixed(ss, "_", 2)[,2], "vitro"),
                    gsub("vitro", "", str_split_fixed(ss, "_", 2)[,2]),
                    str_sub(str_split_fixed(ss, "_", 2)[,2],1,1))
  t$Type <- ifelse(startsWith(str_split_fixed(ss, "_", 2)[,2], "vitro"), "Vitro", "Sample")
  t$DP <- as.numeric(str_split_fixed(t$SAMPLE, ":", 5)[,3])
  t$AD <- as.numeric(str_split_fixed(str_split_fixed(t$SAMPLE, ":", 5)[,2], ",", 2)[,1])
  t$VAF <- (t$DP - t$AD) / t$DP
  full.t <- rbind(full.t, t)
}

# Filter CHR only
chr <- c(paste0("chr", seq(1,22)), "chrX")
full.t <- subset(full.t, CHROM %in% chr)
full.t$CHROM <- factor(full.t$CHROM, levels = chr)
full.t$Sample <- factor(full.t$Sample, levels = unique(full.t$Sample))

# Remove Multi
full.t.nomulti <- full.t[grep(",", full.t$ALT, value = F, invert = T), ]
write.table(x = full.t.nomulti, file = paste(out_dir, "Full_NoMulti_res.tsv", sep = "/"),
            sep = "\t", quote = F, row.names = F, col.names = T)

plot_variants(full.t = full.t.nomulti, out_dir = out_dir, plot_prefix = "Full_NoMulti", fill_by = "Group")

# Check Vitro overlap (for germline)
full.venn <- list()
for (vs in subset(unique(full.t.nomulti[ ,c("Sample", "Type")]), Type == "Vitro")$Sample) {
  print(vs)
  full.venn[[vs]] <- full.t.nomulti[full.t.nomulti$Sample == vs, "Vars"]
}
venn <- Venn(full.venn)
data <- process_data(venn)
vreg <- venn_region(data)
vreg$NumInters <- as.character(str_count(venn_region(data)$name, "[..]"))
cc <- brewer.pal(name = "Set2", n = 8)
cc_cat <- cc[1:length(unique(vreg$NumInters))]
names(cc_cat) <- unique(vreg$NumInters)
s.plot <- ggplot() +
  # change mapping of color filling
  geom_sf(aes(fill = NumInters), data = vreg, show.legend = FALSE) +
  scale_fill_manual(values = cc_cat) +
  # adjust edge size and color
  geom_sf(color="grey", size = 1, data = venn_setedge(data), show.legend = FALSE) +
  # show set label in bold
  geom_sf_text(aes(label = name), fontface = "bold", data = venn_setlabel(data), size = 8, nudge_y = c(-400, -50, -400), nudge_x = c(150, 0, -150)) +
  # add a alternative region name
  geom_sf_label(aes(label = paste0(count, "\n", "(", round(count/sum(count)*100, 1), "%)")), data = vreg, alpha = 0.5, size = 5) +
  theme_void()
ggsave(filename = paste(out_dir, "Vitro_vars_venn.pdf", sep = "/"), plot = s.plot, width = 5, height = 5)

# Control
vitro_ctrl <- subset(full.t.nomulti, Sample == "S12_vitroE")
vitro_ctrl_vars <- vitro_ctrl$Vars

# Remove germline
full.t.nomulti.noGerm <- subset(full.t.nomulti, !(Vars %in% vitro_ctrl_vars))
plot_variants(full.t = full.t.nomulti.noGerm, out_dir = out_dir, plot_prefix = "Full_NoMulti_noGerm", fill_by = "Group")

# Parse snpEff Annotation
full.t.nomulti.noGerm$snpEff_tmp <- lapply(full.t.nomulti.noGerm$ANNOT, function(x){
  tmp <- strsplit(x, ";")[[1]]
  ann_pos <- length(tmp)
  if (!startsWith(tmp[length(tmp)], "ANN")) {
    for (i in seq(1,length(tmp))) {
      if (startsWith(tmp[i], "ANN")) {
        ann_pos <- i
      }
    }
  }
  tmp <- tmp[ann_pos]
})
full.t.nomulti.noGerm$snpEff_Effect <- sapply(full.t.nomulti.noGerm$snpEff_tmp, function(tmp){
  strsplit(tmp, "|", fixed = T)[[1]][2]
}, USE.NAMES = F)
full.t.nomulti.noGerm$snpEff_Impact <- sapply(full.t.nomulti.noGerm$snpEff_tmp, function(tmp){
  strsplit(tmp, "|", fixed = T)[[1]][3]
}, USE.NAMES = F)
full.t.nomulti.noGerm$snpEff_Gene <- sapply(full.t.nomulti.noGerm$snpEff_tmp, function(tmp){
  strsplit(tmp, "|", fixed = T)[[1]][4]
}, USE.NAMES = F)
full.t.nomulti.noGerm$snpEff_GeneName <- sapply(full.t.nomulti.noGerm$snpEff_tmp, function(tmp){
  strsplit(tmp, "|", fixed = T)[[1]][5]
}, USE.NAMES = F)
full.t.nomulti.noGerm$snpEff_BioType <- sapply(full.t.nomulti.noGerm$snpEff_tmp, function(tmp){
  strsplit(tmp, "|", fixed = T)[[1]][8]
}, USE.NAMES = F)
full.t.nomulti.noGerm$snpEff_tmp <- NULL

# Save results 
write.table(x = full.t.nomulti.noGerm, file = paste(out_dir, "Full_NoMulti_noGerm_res.tsv", sep = "/"),
            sep = "\t", quote = F, row.names = F, col.names = T)

## Samples only ##
samples.t.nomulti.noGerm <- subset(full.t.nomulti.noGerm, Type == "Sample")
plot_variants(full.t = samples.t.nomulti.noGerm, out_dir = out_dir, plot_prefix = "Samples_NoMulti_noGerm", fill_by = "Group")

write.table(x = samples.t.nomulti.noGerm, file = paste(out_dir, "Samples_NoMulti_noGerm_res.tsv", sep = "/"),
            sep = "\t", quote = F, row.names = F, col.names = T)


############################
### Cancer-related Genes ###
############################
offt_gene <- c("ABCB1","ABCC2","ABL1","ABL2","AKT1","AKT2","AKT3","ALK","ANGPTL7","APC","ASXL1","ATM","ATRX","BCYRN1","BRAF","BRCA1","BRCA2","CBL","CDA",
               "CDH1","CDKN2A","CDKN2B","CEBPA","CHD7","CHIC2","CREBBP","CRLF2","CSF1R","CTNNB1","CYP19A1","CYP2A13","CYP2A6","CYP2A7","CYP2B6","CYP2B7P",
							 "CYP2C19","CYP2C9","CYP2D6","CYP2D7","DACH1","DDR1","DDR2","DDX3X","DDX54","DNMT3A","DPYD","DPYD-AS1","EGFR","EGFR-AS1","ERBB2","ERBB3","ERBB4",
							 "ERG","ESR1","EVI2A","EVI2B","EZH2","FBXW7","FGFR1","FGFR2","FGFR3","FGFR4","FLT1","FLT3","FLT4","FSTL5","GNA11","GNAQ","GNAS","GNAS-AS1",
							 "GSTP1","GTF2IP1","H3F3A","HNF1A","HRAS","IDH1","IDH2","IKZF1","IL1RAPL1","IL2RA","IL2RB","IL2RG","INPP4B","JAK1","JAK2","JAK3","KDM6A",
							 "KDR","KIT","KMT2A","KRAS","LAMA2","LCK","LOC100287072","LOC101928052","LPAR6","LTK","MAP2K1","MAP2K2","MAP2K4","MAP3K1","MAPK1","MED13",
							 "MEIKIN","MET","MGC32805","MGST2","MIR548AZ","MLH1","MPL","MST1R","MTOR","MTOR-AS1","MYC","MYD88","NELL2","NF1","NOTCH1","NPM1","NRAS","OMG",
							 "PDGFRA","PDGFRB","PHF6","PIK3CA","PIK3R1","PSMB1","PSMB2","PSMB5","PSMD1","PSMD2","PTCH1","PTEN","PTENP1","PTPN11","PVT1","RAF1","RARA",
							 "RARA-AS1","RARB","RARG","RB1","RET","ROS1","RPS6KB1","RUNDC3B","RUNX1","RXRA","RXRB","RXRG","SDCCAG8","SHH","SHOC2","SLC22A1","SLC22A2",
							 "SLC31A1","SLC34A2","SLC45A3","SLCO1B1","SMAD4","SMARCA4","SMARCB1","SMO","SNCAIP","SOS1","SPRED1","SRC","STK11","SUFU","TAS2R38","TET2",
							 "TMEM75","TMPRSS2","TP53","TPX2","TRRAP","TYK2","UGT1A9","UTY","VHL","WT1","YES1")
offt_vars <- subset(full.t.nomulti.noGerm, snpEff_GeneName %in% offt_gene)
pdf(paste(out_dir, "Samples_Circos_Cancer_Genes.pdf", sep = "/"), width = 12, height = 9)
par(mfrow = c(3,4))
for (gg in unique(offt_vars$Group)) {
  offt_vars_gg <- subset(offt_vars, Group == gg)
  print(gg)
  for (ss in unique(offt_vars_gg$Sample)) {
    plot_circos(t = offt_vars, sample = ss)
  }
}
legend_plot(t = offt_vars, sample = ss)
dev.off()
write.xlsx(x = list("CancerGenes" = offt_vars), file = paste(out_dir, "Samples_Circos_Cancer_Genes.xlsx", sep = "/"))

offt_vars <- subset(samples.t.nomulti.noGerm, snpEff_GeneName %in% offt_gene)
pdf(paste(out_dir, "Groups_Circos_Cancer_Genes.pdf", sep = "/"), width = 12, height = 9)
par(mfrow = c(2,2))
for (gg in unique(offt_vars$Group)) {
  offt_vars_gg <- subset(offt_vars, Group == gg)
  offt_vars_gg$Sample <- gg
  plot_circos(t = offt_vars_gg, sample = gg)
}
legend_plot(t = offt_vars, sample = gg)
dev.off()
write.xlsx(x = list("CancerGenes" = offt_vars), file = paste(out_dir, "Groups_Circos_Cancer_Genes.xlsx", sep = "/"))