AF_comparison.R 16.5 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
# Script for analyzing AF 

library(openxlsx)

source(file = "Filter_variants_v3.R")

inputdata <- list()

for (mysample in c("12_27_1","12_27_2", "10_395_3", "10_395_4")) {
  
  inputdata[["rawcall"]][[mysample]] <- filter_variants_v3(infile = paste0(mysample, "_dbsnp_cc_cnc_dbnsfp_refseq_light_raw_filtered.fields.txt"), sampleid = mysample, dp = 50)
  inputdata[["passonly"]][[mysample]] <- filter_variants_v3(infile = paste0(mysample, "_dbsnp_cc_cnc_dbnsfp_refseq_light.fields.txt"), sampleid = mysample, dp = 50)
}

source(file = "/home/mbarcella/scripts/bitbucket/exometools/Compare_variants_AF_v3.R")

Comparison_12_27 <- list()
Comparison_10_395 <- list()

for(i in names(inputdata)){
  
  Comparison_12_27[[i]] <- Compare_variants_AF_v3(dblist = inputdata[[i]], sampleid_1 = "12_27_1", sampleid_2 = "12_27_2",
                                               my.width = 8, my.height = 4, my.res = 120, 
                                               tag = i, outfolder = "12_27/")
  
  Comparison_10_395[[i]] <- Compare_variants_AF_v3(dblist = inputdata[[i]], sampleid_1 = "10_395_3", sampleid_2 = "10_395_4",
                                                my.width = 8, my.height = 4, my.res = 120,
                                                tag = i, outfolder = "10_395/")
}

# plot AF density in H and L (full list)

AF_density_raw <- list()

library(cowplot)

tmpdf_1_raw <- subset.data.frame(x = Comparison_12_27$rawcall$unionannot, select = c("AF_GFP_H"), subset = AF_GFP_H > 0)
tmpdf_2_raw <- subset.data.frame(x = Comparison_12_27$rawcall$unionannot, select = c("AF_GFP_L"), subset = AF_GFP_L > 0)

tmpdf_3_raw <- subset.data.frame(x = Comparison_10_395$rawcall$unionannot, select = c("AF_GFP_H"), subset = AF_GFP_H > 0)
tmpdf_4_raw <- subset.data.frame(x = Comparison_10_395$rawcall$unionannot, select = c("AF_GFP_L"), subset = AF_GFP_L > 0)

AF_density_raw[["12-27_miRNA_L"]] <- ggplot(data = tmpdf_1_raw,  
                                        mapping = aes(AF_GFP_H)) + ylim(0,600) + 
  geom_histogram(bins = 50, fill = alpha("#003366",0.8)) + xlab("miRNA_low") + 
  theme(axis.text = element_text(size = 14), legend.title = element_blank()) +
  ggtitle("12-27_miRNA_L")

AF_density_raw[["12-27_miRNA_H"]] <- ggplot(data = tmpdf_2_raw,  
                                        mapping = aes(AF_GFP_L)) +  ylim(0,600) + 
  geom_histogram(bins = 50, fill = alpha("#fc2803",0.8))+ xlab("miRNA_high") + 
  theme(axis.text = element_text(size = 14), legend.title = element_blank()) + 
  ggtitle("12-27_miRNA_H")

AF_density_raw[["10-395_miRNA_L"]] <- ggplot(data = tmpdf_3_raw,  
                                         mapping = aes(AF_GFP_H)) +  ylim(0,600) + 
  geom_histogram(bins = 50, fill = alpha("#003366",0.8)) + xlab("miRNA_low") + 
  theme(axis.text = element_text(size = 14), legend.title = element_blank()) + 
  ggtitle("10-395_miRNA_L")

AF_density_raw[["10-395_miRNA_H"]] <- ggplot(data = tmpdf_4_raw,  
                                         mapping = aes(AF_GFP_L)) +  ylim(0,600) + 
  geom_histogram(bins = 50, fill = alpha("#fc2803",0.8)) + xlab("miRNA_high") + 
  theme(axis.text = element_text(size = 14), legend.title = element_blank()) + 
  ggtitle("10-395_miRNA_H")

png(filename = "AF_density_raw_variants.png", width = 8, height = 6, units = "in", res = 300)
plot_grid(AF_density_raw[["12-27_miRNA_L"]], AF_density_raw[["12-27_miRNA_H"]],
          AF_density_raw[["10-395_miRNA_L"]] , AF_density_raw[["10-395_miRNA_H"]],
          nrow = 2)
dev.off()

## PLOTTING PASS-ONLY

tmpdf_1_pass <- subset.data.frame(x = Comparison_12_27$passonly$unionannot, select = c("AF_GFP_H"), subset = AF_GFP_H > 0)
tmpdf_2_pass <- subset.data.frame(x = Comparison_12_27$passonly$unionannot, select = c("AF_GFP_L"), subset = AF_GFP_L > 0)

tmpdf_3_pass <- subset.data.frame(x = Comparison_10_395$passonly$unionannot, select = c("AF_GFP_H"), subset = AF_GFP_H > 0)
tmpdf_4_pass <- subset.data.frame(x = Comparison_10_395$passonly$unionannot, select = c("AF_GFP_L"), subset = AF_GFP_L > 0)

AF_density_pass <- list()

AF_density_pass[["12-27_miRNA_L"]] <- ggplot(data = tmpdf_1_pass,
                                        mapping = aes(AF_GFP_H)) + ylim(0,300) +
  geom_histogram(bins = 50, fill = alpha("#003366",0.8)) + xlab("miRNA_low") +
  theme(axis.text = element_text(size = 14), legend.title = element_blank()) +
  ggtitle("12-27_miRNA_L")

AF_density_pass[["12-27_miRNA_H"]] <- ggplot(data = tmpdf_2_pass,
                                        mapping = aes(AF_GFP_L)) + ylim(0,300) +
  geom_histogram(bins = 50, fill = alpha("#fc2803",0.8))+ xlab("miRNA_high") +
  theme(axis.text = element_text(size = 14), legend.title = element_blank()) +
  ggtitle("12-27_miRNA_H")

AF_density_pass[["10-395_miRNA_L"]] <- ggplot(data = tmpdf_3_pass,
                                         mapping = aes(AF_GFP_H)) + ylim(0,300) +
  geom_histogram(bins = 50, fill = alpha("#003366",0.8)) + xlab("miRNA_low") +
  theme(axis.text = element_text(size = 14), legend.title = element_blank()) +
  ggtitle("10-395_miRNA_L")

AF_density_pass[["10-395_miRNA_H"]] <- ggplot(data = tmpdf_4_pass,
                                         mapping = aes(AF_GFP_L)) + ylim(0,300) +
  geom_histogram(bins = 50, fill = alpha("#fc2803",0.8)) + xlab("miRNA_high") +
  theme(axis.text = element_text(size = 14), legend.title = element_blank()) +
  ggtitle("10-395_miRNA_H")

png(filename = "AF_density_pass_variants.png", width = 8, height = 6, units = "in", res = 300)
plot_grid(AF_density_pass[["12-27_miRNA_L"]], AF_density_pass[["12-27_miRNA_H"]],
          AF_density_pass[["10-395_miRNA_L"]] , AF_density_pass[["10-395_miRNA_H"]],
          nrow = 2)
dev.off()

# Histograms with PASS or clustered_events

AF_density_mildfilt <- list()
Mild_vars_12_27 <- Comparison_12_27$rawcall$unionannot

# Recoding filter variable, PASS / no PASS
Mild_vars_12_27$FILTER_GFP_H_simple <- ifelse(test = Mild_vars_12_27$FILTER_GFP_H == "PASS", "PASS","NOPASS")
Mild_vars_12_27$FILTER_GFP_L_simple <- ifelse(test = Mild_vars_12_27$FILTER_GFP_L == "PASS", "PASS","NOPASS")

tmpdf_1_mild <- subset.data.frame(x = Mild_vars_12_27, select = c("AF_GFP_H"), 
                                  subset = AF_GFP_H > 0 & (FILTER_GFP_H_simple == "PASS" | FILTER_GFP_L_simple == "PASS") & !HGVS_P == "")
tmpdf_2_mild <- subset.data.frame(x = Mild_vars_12_27, select = c("AF_GFP_L"), 
                                  subset = AF_GFP_L > 0 & (FILTER_GFP_H_simple == "PASS" | FILTER_GFP_L_simple == "PASS") & !HGVS_P == "")

Mild_vars_10_395 <- Comparison_10_395$rawcall$unionannot
Mild_vars_10_395$FILTER_GFP_H_simple <- ifelse(test = Mild_vars_10_395$FILTER_GFP_H == "PASS", "PASS","NOPASS")
Mild_vars_10_395$FILTER_GFP_L_simple <- ifelse(test = Mild_vars_10_395$FILTER_GFP_L == "PASS", "PASS","NOPASS")

tmpdf_3_mild <- subset.data.frame(x = Mild_vars_10_395, select = c("AF_GFP_H"), 
                                  subset = AF_GFP_H > 0 & (FILTER_GFP_H_simple == "PASS" | FILTER_GFP_L_simple == "PASS") & !HGVS_P == "")
tmpdf_4_mild <- subset.data.frame(x = Mild_vars_10_395, select = c("AF_GFP_L"), 
                                  subset = AF_GFP_L > 0 & (FILTER_GFP_H_simple == "PASS" | FILTER_GFP_L_simple == "PASS") & !HGVS_P == "")


AF_density_mildfilt[["12-27_miRNA_L"]] <- ggplot(data = tmpdf_1_mild,  
                                            mapping = aes(AF_GFP_H)) + ylim(0,300) + 
  geom_histogram(bins = 50, fill = alpha("#003366",0.8)) + xlab("miRNA_low") + 
  theme(axis.text = element_text(size = 14), legend.title = element_blank()) +
  ggtitle("12-27_miRNA_L")

AF_density_mildfilt[["12-27_miRNA_H"]] <- ggplot(data = tmpdf_2_mild,  
                                            mapping = aes(AF_GFP_L)) + ylim(0,300) + 
  geom_histogram(bins = 50,  fill = alpha("#fc2803",0.8))+ xlab("miRNA_high") + 
  theme(axis.text = element_text(size = 14), legend.title = element_blank()) + 
  ggtitle("12-27_miRNA_H")

AF_density_mildfilt[["10-395_miRNA_L"]] <- ggplot(data = tmpdf_3_mild,  
                                             mapping = aes(AF_GFP_H)) + ylim(0,300) + 
  geom_histogram(bins = 50,  fill = alpha("#003366",0.8)) + xlab("miRNA_low") + 
  theme(axis.text = element_text(size = 14), legend.title = element_blank()) + 
  ggtitle("10-395_miRNA_L")

AF_density_mildfilt[["10-395_miRNA_H"]] <- ggplot(data = tmpdf_4_mild,  
                                             mapping = aes(AF_GFP_L)) + ylim(0,300) + 
  geom_histogram(bins = 50, fill = alpha("#fc2803",0.8)) + xlab("miRNA_high") + 
  theme(axis.text = element_text(size = 14), legend.title = element_blank()) + 
  ggtitle("10-395_miRNA_H")

png(filename = "AF_density_mildfilt_variants.png", width = 8, height = 6, units = "in", res = 300)
plot_grid(AF_density_mildfilt[["12-27_miRNA_L"]], AF_density_mildfilt[["12-27_miRNA_H"]],
          AF_density_mildfilt[["10-395_miRNA_L"]] , AF_density_mildfilt[["10-395_miRNA_H"]],
          nrow = 2)
dev.off()

# removing not exonic by using & !HGVS_P == ""

mild_12_27 <- subset.data.frame(x = Mild_vars_12_27, subset = (FILTER_GFP_H_simple == "PASS" | FILTER_GFP_L_simple == "PASS") & !HGVS_P == "")
mild_12_27$FILTER <- paste0(mild_12_27$FILTER_GFP_H_simple, "_", mild_12_27$FILTER_GFP_L_simple)
png(filename = "12_27_variants_PASS_H_or_L_scatterplot_mild.png", width = 12, height = 9, units = "in", res = 900)
ggplot(data = mild_12_27, mapping = aes(x = AF_GFP_H, y = AF_GFP_L, col=FILTER)) + geom_point() +
  theme(legend.text = element_text(size = 7), legend.title = element_blank(), legend.position = "top",
        legend.key.height = unit(1, units = "mm"), legend.key.width = unit(1, units = "mm"),
        plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5),
        legend.justification = 0.5) + ggtitle("12-27 variants", subtitle = "PASS in at least one the 2 populations")
dev.off()

mild_10_395 <- subset.data.frame(x = Mild_vars_10_395, subset = (FILTER_GFP_H_simple == "PASS" | FILTER_GFP_L_simple == "PASS") & !HGVS_P == "")
mild_10_395$FILTER <- paste0(mild_10_395$FILTER_GFP_H_simple, "_", mild_10_395$FILTER_GFP_L_simple)
png(filename = "10_395_variants_PASS_H_or_L_scatterplot_mild.png", width = 12, height = 9, units = "in", res = 900)
ggplot(data = mild_10_395, mapping = aes(x = AF_GFP_H, y = AF_GFP_L, col=FILTER)) + geom_point() +
  theme(legend.text = element_text(size = 7), legend.title = element_blank(), legend.position = "top",
        legend.key.height = unit(1, units = "mm"), legend.key.width = unit(1, units = "mm"),
        plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5),
        legend.justification = 0.5) + ggtitle("10_395 variants", subtitle = "PASS in at least one the 2 populations")
dev.off()

# Variants NA in miRNAlow 12-27 and PASS in miRNA-H and viceversa

library(openxlsx)

write.xlsx(x = mild_12_27[is.na(mild_12_27$FILTER_GFP_H) & mild_12_27$FILTER_GFP_L == "PASS",],
           file = "mild_12_27_PASS_GFP_L_NA_GFP_H.xlsx", asTable = T) 
write.xlsx(x = mild_12_27[is.na(mild_12_27$FILTER_GFP_L) & mild_12_27$FILTER_GFP_H == "PASS",],
           file = "mild_12_27_PASS_GFP_H_NA_GFP_L.xlsx", asTable = T) 

write.xlsx(x = mild_10_395[is.na(mild_10_395$FILTER_GFP_H) & mild_10_395$FILTER_GFP_L == "PASS",],
           file = "mild_10_395_PASS_GFP_L_NA_GFP_H.xlsx", asTable = T) 
write.xlsx(x = mild_10_395[is.na(mild_10_395$FILTER_GFP_L) & mild_10_395$FILTER_GFP_H == "PASS",],
           file = "mild_10_395_PASS_GFP_H_NA_GFP_L.xlsx", asTable = T) 

# saving also only annotated (COSMIC or dbSNP)

mild_12_27_pass_GFP_L_NA_GFP_H <- subset.data.frame(x = mild_12_27, subset = !ID == "" & is.na(FILTER_GFP_H) & FILTER_GFP_L == "PASS")
writeLines(text = mild_12_27_pass_GFP_L_NA_GFP_H$variantkey, con = "mild_12_27_pass_GFP_L_NA_GFP_H.txt")

mild_12_27_pass_GFP_H_NA_GFP_L <- subset.data.frame(x = mild_12_27, subset = !ID == "" & is.na(FILTER_GFP_L) & FILTER_GFP_H == "PASS")
writeLines(text = mild_12_27_pass_GFP_H_NA_GFP_L$variantkey, con = "mild_12_27_pass_GFP_H_NA_GFP_L.txt")

write.xlsx(x = mild_12_27_pass_GFP_L_NA_GFP_H,
           file = "mild_12_27_PASS_GFP_L_NA_GFP_H_cosmic_dbsnp.xlsx", asTable = T) 
write.xlsx(x = mild_12_27_pass_GFP_H_NA_GFP_L,
           file = "mild_12_27_PASS_GFP_H_NA_GFP_L_cosmic_dbsnp.xlsx", asTable = T) 

mild_10_395_pass_GFP_L_NA_GFP_H <- subset.data.frame(x = mild_10_395, subset = !ID == "" & is.na(FILTER_GFP_H) & FILTER_GFP_L == "PASS")
writeLines(text = mild_10_395_pass_GFP_L_NA_GFP_H$variantkey, con = "mild_10_395_pass_GFP_L_NA_GFP_H.txt")

mild_10_395_pass_GFP_H_NA_GFP_L <- subset.data.frame(x = mild_10_395, subset = !ID == "" & is.na(FILTER_GFP_L) & FILTER_GFP_H == "PASS")
writeLines(text = mild_10_395_pass_GFP_H_NA_GFP_L$variantkey, con = "mild_10_395_pass_GFP_H_NA_GFP_L.txt")

write.xlsx(x = mild_10_395_pass_GFP_L_NA_GFP_H,
           file = "mild_10_395_PASS_GFP_L_NA_GFP_H_cosmic_dbsnp.xlsx", asTable = T) 
write.xlsx(x = mild_10_395_pass_GFP_H_NA_GFP_L,
           file = "mild_10_395_PASS_GFP_H_NA_GFP_L_cosmic_dbsnp.xlsx", asTable = T) 


# plotting CR vs DGN variants
# importing variants

dgn_vs_cr_vars <- read.xlsx(xlsxFile = "12_27_Somatic_DGN_only_variants.xlsx")
mykeys <- dgn_vs_cr_vars$variantkey

raw_vars_12_27_1 <- read.table(file = "12_27_1_dbsnp_cc_cnc_dbnsfp_refseq_light_raw_filtered.fields.txt",
                               sep = "\t" ,header = T, stringsAsFactors = F)

raw_vars_12_27_2 <- read.table(file = "12_27_2_dbsnp_cc_cnc_dbnsfp_refseq_light_raw_filtered.fields.txt",
                               sep = "\t" ,header = T, stringsAsFactors = F)

colnames(raw_vars_12_27_1) <- c("Chromosome","Position","Reference","Alternative","AF","DP","AD.R","AD.A","GT","ID","FILTER","HGVS_P","Gene","Biotype",
                  "Rank","Effect","Impact","Common","G5",colnames(raw_vars_12_27_1)[20:35])

raw_vars_12_27_1$sampleid <- "12_27_1"
raw_vars_12_27_1$variantkey <- paste(raw_vars_12_27_1$Chromosome, 
                                     raw_vars_12_27_1$Position, 
                                     raw_vars_12_27_1$Reference, 
                                     raw_vars_12_27_1$Alternative,sep = "_")

colnames(raw_vars_12_27_2) <- c("Chromosome","Position","Reference","Alternative","AF","DP","AD.R","AD.A","GT","ID","FILTER","HGVS_P","Gene","Biotype",
                  "Rank","Effect","Impact","Common","G5",colnames(raw_vars_12_27_2)[20:35])

raw_vars_12_27_2$sampleid <- "12_27_2"
raw_vars_12_27_2$variantkey <- paste(raw_vars_12_27_2$Chromosome, 
                                     raw_vars_12_27_2$Position, 
                                     raw_vars_12_27_2$Reference, 
                                     raw_vars_12_27_2$Alternative,sep = "_")


# subsetting dfs

df_1 <- subset.data.frame(x = raw_vars_12_27_1, subset = variantkey %in% mykeys, select = c("variantkey","AF","DP","Gene","Effect","FILTER"))
colnames(df_1) <- c("variantkey","AF_GFP_H","DP_GFP_H","Gene", "Effect","FILTER")
df_2 <- subset.data.frame(x = raw_vars_12_27_2, subset = variantkey %in% mykeys, select = c("variantkey","AF","DP","Gene","Effect","FILTER"))
colnames(df_2) <- c("variantkey","AF_GFP_L","DP_GFP_L","Gene", "Effect","FILTER")


union_df <- merge.data.frame(x = df_1, y=df_2, by = c("variantkey","Gene","Effect"), all = T)


union_df$FILTER.x[is.na(union_df$FILTER.x)] <- "NOPASS"
union_df$FILTER.y[is.na(union_df$FILTER.y)] <- "NOPASS"

union_df$AF_1[is.na(union_df$AF_GFP_H)] <- 0
union_df$AF_2[is.na(union_df$AF_GFP_L)] <- 0

union_df$DP_1[is.na(union_df$DP_GFP_H)] <- 0
union_df$DP_2[is.na(union_df$DP_GFP_L)] <- 0

union_df$FILTER <- ifelse(test = (union_df$FILTER.x == "" | union_df$FILTER.y == ""), 
                          yes = paste0(union_df$FILTER.x, "_", union_df$FILTER.y),
                          no = union_df$FILTER.y)

union_df$FILTER.x <- NULL
union_df$FILTER.y <- NULL

# subsetting only variants in exons

union_df_exonic <- union_df[union_df$Effect %in% c("frameshift_variant","missense_variant","synonymous_variant","splice_region_variant&intron_variant"),]
write.xlsx(x = union_df_exonic, file = "DGN_vs_CR_variants_exonic.xlsx")

library(ggrepel)

png(filename = paste0("AF_Comparison_12_27_1_vs_12_27_2_DGN_vs_CR_variants.png"), width = 8, height = 6, units = "in", res = 200)
print(ggplot(data = union_df_exonic, mapping = aes(x = AF_GFP_H, y = AF_GFP_L, label = Gene)) + xlim(0,1) + ylim(0,1) + 
        geom_point(mapping = aes(col = "black"), size = 2) + #geom_abline(intercept = 0, slope = 1) + 
        xlab(label = "GFP_HIGH") + ylab(label = "GFP_LOW") + ggtitle(label = "12_27 Somatic DGN only variants") + 
        #geom_point(data=union_df[union_df$AF_GFP_H == 0, ], aes(x=AF_GFP_H, y=AF_GFP_L), colour="red",size = 2) + 
        geom_text_repel(size=3, show.legend = FALSE, segment.alpha = 0.4) +
        theme(plot.title = element_text(hjust = 0.5), legend.title = element_blank(), legend.position = "none"))
dev.off()