Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
custom
casertanucera_leukemia2023
Commits
38a09ebb
Commit
38a09ebb
authored
Aug 03, 2023
by
Matteo Barcella
Browse files
Adding scripts for Allele Fractions variants
parent
8d31118e
Changes
3
Hide whitespace changes
Inline
Side-by-side
WES/AF_comparison.R
0 → 100755
View file @
38a09ebb
# Script for analyzing AF
library
(
openxlsx
)
source
(
file
=
"Filter_variants_v3.R"
)
inputdata
<-
list
()
for
(
mysample
in
c
(
"12_27_1"
,
"12_27_2"
,
"10_395_3"
,
"10_395_4"
))
{
inputdata
[[
"rawcall"
]][[
mysample
]]
<-
filter_variants_v3
(
infile
=
paste0
(
mysample
,
"_dbsnp_cc_cnc_dbnsfp_refseq_light_raw_filtered.fields.txt"
),
sampleid
=
mysample
,
dp
=
50
)
inputdata
[[
"passonly"
]][[
mysample
]]
<-
filter_variants_v3
(
infile
=
paste0
(
mysample
,
"_dbsnp_cc_cnc_dbnsfp_refseq_light.fields.txt"
),
sampleid
=
mysample
,
dp
=
50
)
}
source
(
file
=
"/home/mbarcella/scripts/bitbucket/exometools/Compare_variants_AF_v3.R"
)
Comparison_12_27
<-
list
()
Comparison_10_395
<-
list
()
for
(
i
in
names
(
inputdata
)){
Comparison_12_27
[[
i
]]
<-
Compare_variants_AF_v3
(
dblist
=
inputdata
[[
i
]],
sampleid_1
=
"12_27_1"
,
sampleid_2
=
"12_27_2"
,
my.width
=
8
,
my.height
=
4
,
my.res
=
120
,
tag
=
i
,
outfolder
=
"12_27/"
)
Comparison_10_395
[[
i
]]
<-
Compare_variants_AF_v3
(
dblist
=
inputdata
[[
i
]],
sampleid_1
=
"10_395_3"
,
sampleid_2
=
"10_395_4"
,
my.width
=
8
,
my.height
=
4
,
my.res
=
120
,
tag
=
i
,
outfolder
=
"10_395/"
)
}
# plot AF density in H and L (full list)
AF_density_raw
<-
list
()
library
(
cowplot
)
tmpdf_1_raw
<-
subset.data.frame
(
x
=
Comparison_12_27
$
rawcall
$
unionannot
,
select
=
c
(
"AF_GFP_H"
),
subset
=
AF_GFP_H
>
0
)
tmpdf_2_raw
<-
subset.data.frame
(
x
=
Comparison_12_27
$
rawcall
$
unionannot
,
select
=
c
(
"AF_GFP_L"
),
subset
=
AF_GFP_L
>
0
)
tmpdf_3_raw
<-
subset.data.frame
(
x
=
Comparison_10_395
$
rawcall
$
unionannot
,
select
=
c
(
"AF_GFP_H"
),
subset
=
AF_GFP_H
>
0
)
tmpdf_4_raw
<-
subset.data.frame
(
x
=
Comparison_10_395
$
rawcall
$
unionannot
,
select
=
c
(
"AF_GFP_L"
),
subset
=
AF_GFP_L
>
0
)
AF_density_raw
[[
"12-27_miRNA_L"
]]
<-
ggplot
(
data
=
tmpdf_1_raw
,
mapping
=
aes
(
AF_GFP_H
))
+
ylim
(
0
,
600
)
+
geom_histogram
(
bins
=
50
,
fill
=
alpha
(
"#003366"
,
0.8
))
+
xlab
(
"miRNA_low"
)
+
theme
(
axis.text
=
element_text
(
size
=
14
),
legend.title
=
element_blank
())
+
ggtitle
(
"12-27_miRNA_L"
)
AF_density_raw
[[
"12-27_miRNA_H"
]]
<-
ggplot
(
data
=
tmpdf_2_raw
,
mapping
=
aes
(
AF_GFP_L
))
+
ylim
(
0
,
600
)
+
geom_histogram
(
bins
=
50
,
fill
=
alpha
(
"#fc2803"
,
0.8
))
+
xlab
(
"miRNA_high"
)
+
theme
(
axis.text
=
element_text
(
size
=
14
),
legend.title
=
element_blank
())
+
ggtitle
(
"12-27_miRNA_H"
)
AF_density_raw
[[
"10-395_miRNA_L"
]]
<-
ggplot
(
data
=
tmpdf_3_raw
,
mapping
=
aes
(
AF_GFP_H
))
+
ylim
(
0
,
600
)
+
geom_histogram
(
bins
=
50
,
fill
=
alpha
(
"#003366"
,
0.8
))
+
xlab
(
"miRNA_low"
)
+
theme
(
axis.text
=
element_text
(
size
=
14
),
legend.title
=
element_blank
())
+
ggtitle
(
"10-395_miRNA_L"
)
AF_density_raw
[[
"10-395_miRNA_H"
]]
<-
ggplot
(
data
=
tmpdf_4_raw
,
mapping
=
aes
(
AF_GFP_L
))
+
ylim
(
0
,
600
)
+
geom_histogram
(
bins
=
50
,
fill
=
alpha
(
"#fc2803"
,
0.8
))
+
xlab
(
"miRNA_high"
)
+
theme
(
axis.text
=
element_text
(
size
=
14
),
legend.title
=
element_blank
())
+
ggtitle
(
"10-395_miRNA_H"
)
png
(
filename
=
"AF_density_raw_variants.png"
,
width
=
8
,
height
=
6
,
units
=
"in"
,
res
=
300
)
plot_grid
(
AF_density_raw
[[
"12-27_miRNA_L"
]],
AF_density_raw
[[
"12-27_miRNA_H"
]],
AF_density_raw
[[
"10-395_miRNA_L"
]]
,
AF_density_raw
[[
"10-395_miRNA_H"
]],
nrow
=
2
)
dev.off
()
## PLOTTING PASS-ONLY
tmpdf_1_pass
<-
subset.data.frame
(
x
=
Comparison_12_27
$
passonly
$
unionannot
,
select
=
c
(
"AF_GFP_H"
),
subset
=
AF_GFP_H
>
0
)
tmpdf_2_pass
<-
subset.data.frame
(
x
=
Comparison_12_27
$
passonly
$
unionannot
,
select
=
c
(
"AF_GFP_L"
),
subset
=
AF_GFP_L
>
0
)
tmpdf_3_pass
<-
subset.data.frame
(
x
=
Comparison_10_395
$
passonly
$
unionannot
,
select
=
c
(
"AF_GFP_H"
),
subset
=
AF_GFP_H
>
0
)
tmpdf_4_pass
<-
subset.data.frame
(
x
=
Comparison_10_395
$
passonly
$
unionannot
,
select
=
c
(
"AF_GFP_L"
),
subset
=
AF_GFP_L
>
0
)
AF_density_pass
<-
list
()
AF_density_pass
[[
"12-27_miRNA_L"
]]
<-
ggplot
(
data
=
tmpdf_1_pass
,
mapping
=
aes
(
AF_GFP_H
))
+
ylim
(
0
,
300
)
+
geom_histogram
(
bins
=
50
,
fill
=
alpha
(
"#003366"
,
0.8
))
+
xlab
(
"miRNA_low"
)
+
theme
(
axis.text
=
element_text
(
size
=
14
),
legend.title
=
element_blank
())
+
ggtitle
(
"12-27_miRNA_L"
)
AF_density_pass
[[
"12-27_miRNA_H"
]]
<-
ggplot
(
data
=
tmpdf_2_pass
,
mapping
=
aes
(
AF_GFP_L
))
+
ylim
(
0
,
300
)
+
geom_histogram
(
bins
=
50
,
fill
=
alpha
(
"#fc2803"
,
0.8
))
+
xlab
(
"miRNA_high"
)
+
theme
(
axis.text
=
element_text
(
size
=
14
),
legend.title
=
element_blank
())
+
ggtitle
(
"12-27_miRNA_H"
)
AF_density_pass
[[
"10-395_miRNA_L"
]]
<-
ggplot
(
data
=
tmpdf_3_pass
,
mapping
=
aes
(
AF_GFP_H
))
+
ylim
(
0
,
300
)
+
geom_histogram
(
bins
=
50
,
fill
=
alpha
(
"#003366"
,
0.8
))
+
xlab
(
"miRNA_low"
)
+
theme
(
axis.text
=
element_text
(
size
=
14
),
legend.title
=
element_blank
())
+
ggtitle
(
"10-395_miRNA_L"
)
AF_density_pass
[[
"10-395_miRNA_H"
]]
<-
ggplot
(
data
=
tmpdf_4_pass
,
mapping
=
aes
(
AF_GFP_L
))
+
ylim
(
0
,
300
)
+
geom_histogram
(
bins
=
50
,
fill
=
alpha
(
"#fc2803"
,
0.8
))
+
xlab
(
"miRNA_high"
)
+
theme
(
axis.text
=
element_text
(
size
=
14
),
legend.title
=
element_blank
())
+
ggtitle
(
"10-395_miRNA_H"
)
png
(
filename
=
"AF_density_pass_variants.png"
,
width
=
8
,
height
=
6
,
units
=
"in"
,
res
=
300
)
plot_grid
(
AF_density_pass
[[
"12-27_miRNA_L"
]],
AF_density_pass
[[
"12-27_miRNA_H"
]],
AF_density_pass
[[
"10-395_miRNA_L"
]]
,
AF_density_pass
[[
"10-395_miRNA_H"
]],
nrow
=
2
)
dev.off
()
# Histograms with PASS or clustered_events
AF_density_mildfilt
<-
list
()
Mild_vars_12_27
<-
Comparison_12_27
$
rawcall
$
unionannot
# Recoding filter variable, PASS / no PASS
Mild_vars_12_27
$
FILTER_GFP_H_simple
<-
ifelse
(
test
=
Mild_vars_12_27
$
FILTER_GFP_H
==
"PASS"
,
"PASS"
,
"NOPASS"
)
Mild_vars_12_27
$
FILTER_GFP_L_simple
<-
ifelse
(
test
=
Mild_vars_12_27
$
FILTER_GFP_L
==
"PASS"
,
"PASS"
,
"NOPASS"
)
tmpdf_1_mild
<-
subset.data.frame
(
x
=
Mild_vars_12_27
,
select
=
c
(
"AF_GFP_H"
),
subset
=
AF_GFP_H
>
0
&
(
FILTER_GFP_H_simple
==
"PASS"
|
FILTER_GFP_L_simple
==
"PASS"
)
&
!
HGVS_P
==
""
)
tmpdf_2_mild
<-
subset.data.frame
(
x
=
Mild_vars_12_27
,
select
=
c
(
"AF_GFP_L"
),
subset
=
AF_GFP_L
>
0
&
(
FILTER_GFP_H_simple
==
"PASS"
|
FILTER_GFP_L_simple
==
"PASS"
)
&
!
HGVS_P
==
""
)
Mild_vars_10_395
<-
Comparison_10_395
$
rawcall
$
unionannot
Mild_vars_10_395
$
FILTER_GFP_H_simple
<-
ifelse
(
test
=
Mild_vars_10_395
$
FILTER_GFP_H
==
"PASS"
,
"PASS"
,
"NOPASS"
)
Mild_vars_10_395
$
FILTER_GFP_L_simple
<-
ifelse
(
test
=
Mild_vars_10_395
$
FILTER_GFP_L
==
"PASS"
,
"PASS"
,
"NOPASS"
)
tmpdf_3_mild
<-
subset.data.frame
(
x
=
Mild_vars_10_395
,
select
=
c
(
"AF_GFP_H"
),
subset
=
AF_GFP_H
>
0
&
(
FILTER_GFP_H_simple
==
"PASS"
|
FILTER_GFP_L_simple
==
"PASS"
)
&
!
HGVS_P
==
""
)
tmpdf_4_mild
<-
subset.data.frame
(
x
=
Mild_vars_10_395
,
select
=
c
(
"AF_GFP_L"
),
subset
=
AF_GFP_L
>
0
&
(
FILTER_GFP_H_simple
==
"PASS"
|
FILTER_GFP_L_simple
==
"PASS"
)
&
!
HGVS_P
==
""
)
AF_density_mildfilt
[[
"12-27_miRNA_L"
]]
<-
ggplot
(
data
=
tmpdf_1_mild
,
mapping
=
aes
(
AF_GFP_H
))
+
ylim
(
0
,
300
)
+
geom_histogram
(
bins
=
50
,
fill
=
alpha
(
"#003366"
,
0.8
))
+
xlab
(
"miRNA_low"
)
+
theme
(
axis.text
=
element_text
(
size
=
14
),
legend.title
=
element_blank
())
+
ggtitle
(
"12-27_miRNA_L"
)
AF_density_mildfilt
[[
"12-27_miRNA_H"
]]
<-
ggplot
(
data
=
tmpdf_2_mild
,
mapping
=
aes
(
AF_GFP_L
))
+
ylim
(
0
,
300
)
+
geom_histogram
(
bins
=
50
,
fill
=
alpha
(
"#fc2803"
,
0.8
))
+
xlab
(
"miRNA_high"
)
+
theme
(
axis.text
=
element_text
(
size
=
14
),
legend.title
=
element_blank
())
+
ggtitle
(
"12-27_miRNA_H"
)
AF_density_mildfilt
[[
"10-395_miRNA_L"
]]
<-
ggplot
(
data
=
tmpdf_3_mild
,
mapping
=
aes
(
AF_GFP_H
))
+
ylim
(
0
,
300
)
+
geom_histogram
(
bins
=
50
,
fill
=
alpha
(
"#003366"
,
0.8
))
+
xlab
(
"miRNA_low"
)
+
theme
(
axis.text
=
element_text
(
size
=
14
),
legend.title
=
element_blank
())
+
ggtitle
(
"10-395_miRNA_L"
)
AF_density_mildfilt
[[
"10-395_miRNA_H"
]]
<-
ggplot
(
data
=
tmpdf_4_mild
,
mapping
=
aes
(
AF_GFP_L
))
+
ylim
(
0
,
300
)
+
geom_histogram
(
bins
=
50
,
fill
=
alpha
(
"#fc2803"
,
0.8
))
+
xlab
(
"miRNA_high"
)
+
theme
(
axis.text
=
element_text
(
size
=
14
),
legend.title
=
element_blank
())
+
ggtitle
(
"10-395_miRNA_H"
)
png
(
filename
=
"AF_density_mildfilt_variants.png"
,
width
=
8
,
height
=
6
,
units
=
"in"
,
res
=
300
)
plot_grid
(
AF_density_mildfilt
[[
"12-27_miRNA_L"
]],
AF_density_mildfilt
[[
"12-27_miRNA_H"
]],
AF_density_mildfilt
[[
"10-395_miRNA_L"
]]
,
AF_density_mildfilt
[[
"10-395_miRNA_H"
]],
nrow
=
2
)
dev.off
()
# removing not exonic by using & !HGVS_P == ""
mild_12_27
<-
subset.data.frame
(
x
=
Mild_vars_12_27
,
subset
=
(
FILTER_GFP_H_simple
==
"PASS"
|
FILTER_GFP_L_simple
==
"PASS"
)
&
!
HGVS_P
==
""
)
mild_12_27
$
FILTER
<-
paste0
(
mild_12_27
$
FILTER_GFP_H_simple
,
"_"
,
mild_12_27
$
FILTER_GFP_L_simple
)
png
(
filename
=
"12_27_variants_PASS_H_or_L_scatterplot_mild.png"
,
width
=
12
,
height
=
9
,
units
=
"in"
,
res
=
900
)
ggplot
(
data
=
mild_12_27
,
mapping
=
aes
(
x
=
AF_GFP_H
,
y
=
AF_GFP_L
,
col
=
FILTER
))
+
geom_point
()
+
theme
(
legend.text
=
element_text
(
size
=
7
),
legend.title
=
element_blank
(),
legend.position
=
"top"
,
legend.key.height
=
unit
(
1
,
units
=
"mm"
),
legend.key.width
=
unit
(
1
,
units
=
"mm"
),
plot.title
=
element_text
(
hjust
=
0.5
),
plot.subtitle
=
element_text
(
hjust
=
0.5
),
legend.justification
=
0.5
)
+
ggtitle
(
"12-27 variants"
,
subtitle
=
"PASS in at least one the 2 populations"
)
dev.off
()
mild_10_395
<-
subset.data.frame
(
x
=
Mild_vars_10_395
,
subset
=
(
FILTER_GFP_H_simple
==
"PASS"
|
FILTER_GFP_L_simple
==
"PASS"
)
&
!
HGVS_P
==
""
)
mild_10_395
$
FILTER
<-
paste0
(
mild_10_395
$
FILTER_GFP_H_simple
,
"_"
,
mild_10_395
$
FILTER_GFP_L_simple
)
png
(
filename
=
"10_395_variants_PASS_H_or_L_scatterplot_mild.png"
,
width
=
12
,
height
=
9
,
units
=
"in"
,
res
=
900
)
ggplot
(
data
=
mild_10_395
,
mapping
=
aes
(
x
=
AF_GFP_H
,
y
=
AF_GFP_L
,
col
=
FILTER
))
+
geom_point
()
+
theme
(
legend.text
=
element_text
(
size
=
7
),
legend.title
=
element_blank
(),
legend.position
=
"top"
,
legend.key.height
=
unit
(
1
,
units
=
"mm"
),
legend.key.width
=
unit
(
1
,
units
=
"mm"
),
plot.title
=
element_text
(
hjust
=
0.5
),
plot.subtitle
=
element_text
(
hjust
=
0.5
),
legend.justification
=
0.5
)
+
ggtitle
(
"10_395 variants"
,
subtitle
=
"PASS in at least one the 2 populations"
)
dev.off
()
# Variants NA in miRNAlow 12-27 and PASS in miRNA-H and viceversa
library
(
openxlsx
)
write.xlsx
(
x
=
mild_12_27
[
is.na
(
mild_12_27
$
FILTER_GFP_H
)
&
mild_12_27
$
FILTER_GFP_L
==
"PASS"
,],
file
=
"mild_12_27_PASS_GFP_L_NA_GFP_H.xlsx"
,
asTable
=
T
)
write.xlsx
(
x
=
mild_12_27
[
is.na
(
mild_12_27
$
FILTER_GFP_L
)
&
mild_12_27
$
FILTER_GFP_H
==
"PASS"
,],
file
=
"mild_12_27_PASS_GFP_H_NA_GFP_L.xlsx"
,
asTable
=
T
)
write.xlsx
(
x
=
mild_10_395
[
is.na
(
mild_10_395
$
FILTER_GFP_H
)
&
mild_10_395
$
FILTER_GFP_L
==
"PASS"
,],
file
=
"mild_10_395_PASS_GFP_L_NA_GFP_H.xlsx"
,
asTable
=
T
)
write.xlsx
(
x
=
mild_10_395
[
is.na
(
mild_10_395
$
FILTER_GFP_L
)
&
mild_10_395
$
FILTER_GFP_H
==
"PASS"
,],
file
=
"mild_10_395_PASS_GFP_H_NA_GFP_L.xlsx"
,
asTable
=
T
)
# saving also only annotated (COSMIC or dbSNP)
mild_12_27_pass_GFP_L_NA_GFP_H
<-
subset.data.frame
(
x
=
mild_12_27
,
subset
=
!
ID
==
""
&
is.na
(
FILTER_GFP_H
)
&
FILTER_GFP_L
==
"PASS"
)
writeLines
(
text
=
mild_12_27_pass_GFP_L_NA_GFP_H
$
variantkey
,
con
=
"mild_12_27_pass_GFP_L_NA_GFP_H.txt"
)
mild_12_27_pass_GFP_H_NA_GFP_L
<-
subset.data.frame
(
x
=
mild_12_27
,
subset
=
!
ID
==
""
&
is.na
(
FILTER_GFP_L
)
&
FILTER_GFP_H
==
"PASS"
)
writeLines
(
text
=
mild_12_27_pass_GFP_H_NA_GFP_L
$
variantkey
,
con
=
"mild_12_27_pass_GFP_H_NA_GFP_L.txt"
)
write.xlsx
(
x
=
mild_12_27_pass_GFP_L_NA_GFP_H
,
file
=
"mild_12_27_PASS_GFP_L_NA_GFP_H_cosmic_dbsnp.xlsx"
,
asTable
=
T
)
write.xlsx
(
x
=
mild_12_27_pass_GFP_H_NA_GFP_L
,
file
=
"mild_12_27_PASS_GFP_H_NA_GFP_L_cosmic_dbsnp.xlsx"
,
asTable
=
T
)
mild_10_395_pass_GFP_L_NA_GFP_H
<-
subset.data.frame
(
x
=
mild_10_395
,
subset
=
!
ID
==
""
&
is.na
(
FILTER_GFP_H
)
&
FILTER_GFP_L
==
"PASS"
)
writeLines
(
text
=
mild_10_395_pass_GFP_L_NA_GFP_H
$
variantkey
,
con
=
"mild_10_395_pass_GFP_L_NA_GFP_H.txt"
)
mild_10_395_pass_GFP_H_NA_GFP_L
<-
subset.data.frame
(
x
=
mild_10_395
,
subset
=
!
ID
==
""
&
is.na
(
FILTER_GFP_L
)
&
FILTER_GFP_H
==
"PASS"
)
writeLines
(
text
=
mild_10_395_pass_GFP_H_NA_GFP_L
$
variantkey
,
con
=
"mild_10_395_pass_GFP_H_NA_GFP_L.txt"
)
write.xlsx
(
x
=
mild_10_395_pass_GFP_L_NA_GFP_H
,
file
=
"mild_10_395_PASS_GFP_L_NA_GFP_H_cosmic_dbsnp.xlsx"
,
asTable
=
T
)
write.xlsx
(
x
=
mild_10_395_pass_GFP_H_NA_GFP_L
,
file
=
"mild_10_395_PASS_GFP_H_NA_GFP_L_cosmic_dbsnp.xlsx"
,
asTable
=
T
)
# plotting CR vs DGN variants
# importing variants
dgn_vs_cr_vars
<-
read.xlsx
(
xlsxFile
=
"12_27_Somatic_DGN_only_variants.xlsx"
)
mykeys
<-
dgn_vs_cr_vars
$
variantkey
raw_vars_12_27_1
<-
read.table
(
file
=
"12_27_1_dbsnp_cc_cnc_dbnsfp_refseq_light_raw_filtered.fields.txt"
,
sep
=
"\t"
,
header
=
T
,
stringsAsFactors
=
F
)
raw_vars_12_27_2
<-
read.table
(
file
=
"12_27_2_dbsnp_cc_cnc_dbnsfp_refseq_light_raw_filtered.fields.txt"
,
sep
=
"\t"
,
header
=
T
,
stringsAsFactors
=
F
)
colnames
(
raw_vars_12_27_1
)
<-
c
(
"Chromosome"
,
"Position"
,
"Reference"
,
"Alternative"
,
"AF"
,
"DP"
,
"AD.R"
,
"AD.A"
,
"GT"
,
"ID"
,
"FILTER"
,
"HGVS_P"
,
"Gene"
,
"Biotype"
,
"Rank"
,
"Effect"
,
"Impact"
,
"Common"
,
"G5"
,
colnames
(
raw_vars_12_27_1
)[
20
:
35
])
raw_vars_12_27_1
$
sampleid
<-
"12_27_1"
raw_vars_12_27_1
$
variantkey
<-
paste
(
raw_vars_12_27_1
$
Chromosome
,
raw_vars_12_27_1
$
Position
,
raw_vars_12_27_1
$
Reference
,
raw_vars_12_27_1
$
Alternative
,
sep
=
"_"
)
colnames
(
raw_vars_12_27_2
)
<-
c
(
"Chromosome"
,
"Position"
,
"Reference"
,
"Alternative"
,
"AF"
,
"DP"
,
"AD.R"
,
"AD.A"
,
"GT"
,
"ID"
,
"FILTER"
,
"HGVS_P"
,
"Gene"
,
"Biotype"
,
"Rank"
,
"Effect"
,
"Impact"
,
"Common"
,
"G5"
,
colnames
(
raw_vars_12_27_2
)[
20
:
35
])
raw_vars_12_27_2
$
sampleid
<-
"12_27_2"
raw_vars_12_27_2
$
variantkey
<-
paste
(
raw_vars_12_27_2
$
Chromosome
,
raw_vars_12_27_2
$
Position
,
raw_vars_12_27_2
$
Reference
,
raw_vars_12_27_2
$
Alternative
,
sep
=
"_"
)
# subsetting dfs
df_1
<-
subset.data.frame
(
x
=
raw_vars_12_27_1
,
subset
=
variantkey
%in%
mykeys
,
select
=
c
(
"variantkey"
,
"AF"
,
"DP"
,
"Gene"
,
"Effect"
,
"FILTER"
))
colnames
(
df_1
)
<-
c
(
"variantkey"
,
"AF_GFP_H"
,
"DP_GFP_H"
,
"Gene"
,
"Effect"
,
"FILTER"
)
df_2
<-
subset.data.frame
(
x
=
raw_vars_12_27_2
,
subset
=
variantkey
%in%
mykeys
,
select
=
c
(
"variantkey"
,
"AF"
,
"DP"
,
"Gene"
,
"Effect"
,
"FILTER"
))
colnames
(
df_2
)
<-
c
(
"variantkey"
,
"AF_GFP_L"
,
"DP_GFP_L"
,
"Gene"
,
"Effect"
,
"FILTER"
)
union_df
<-
merge.data.frame
(
x
=
df_1
,
y
=
df_2
,
by
=
c
(
"variantkey"
,
"Gene"
,
"Effect"
),
all
=
T
)
union_df
$
FILTER.x
[
is.na
(
union_df
$
FILTER.x
)]
<-
"NOPASS"
union_df
$
FILTER.y
[
is.na
(
union_df
$
FILTER.y
)]
<-
"NOPASS"
union_df
$
AF_1
[
is.na
(
union_df
$
AF_GFP_H
)]
<-
0
union_df
$
AF_2
[
is.na
(
union_df
$
AF_GFP_L
)]
<-
0
union_df
$
DP_1
[
is.na
(
union_df
$
DP_GFP_H
)]
<-
0
union_df
$
DP_2
[
is.na
(
union_df
$
DP_GFP_L
)]
<-
0
union_df
$
FILTER
<-
ifelse
(
test
=
(
union_df
$
FILTER.x
==
""
|
union_df
$
FILTER.y
==
""
),
yes
=
paste0
(
union_df
$
FILTER.x
,
"_"
,
union_df
$
FILTER.y
),
no
=
union_df
$
FILTER.y
)
union_df
$
FILTER.x
<-
NULL
union_df
$
FILTER.y
<-
NULL
# subsetting only variants in exons
union_df_exonic
<-
union_df
[
union_df
$
Effect
%in%
c
(
"frameshift_variant"
,
"missense_variant"
,
"synonymous_variant"
,
"splice_region_variant&intron_variant"
),]
write.xlsx
(
x
=
union_df_exonic
,
file
=
"DGN_vs_CR_variants_exonic.xlsx"
)
library
(
ggrepel
)
png
(
filename
=
paste0
(
"AF_Comparison_12_27_1_vs_12_27_2_DGN_vs_CR_variants.png"
),
width
=
8
,
height
=
6
,
units
=
"in"
,
res
=
200
)
print
(
ggplot
(
data
=
union_df_exonic
,
mapping
=
aes
(
x
=
AF_GFP_H
,
y
=
AF_GFP_L
,
label
=
Gene
))
+
xlim
(
0
,
1
)
+
ylim
(
0
,
1
)
+
geom_point
(
mapping
=
aes
(
col
=
"black"
),
size
=
2
)
+
#geom_abline(intercept = 0, slope = 1) +
xlab
(
label
=
"GFP_HIGH"
)
+
ylab
(
label
=
"GFP_LOW"
)
+
ggtitle
(
label
=
"12_27 Somatic DGN only variants"
)
+
#geom_point(data=union_df[union_df$AF_GFP_H == 0, ], aes(x=AF_GFP_H, y=AF_GFP_L), colour="red",size = 2) +
geom_text_repel
(
size
=
3
,
show.legend
=
FALSE
,
segment.alpha
=
0.4
)
+
theme
(
plot.title
=
element_text
(
hjust
=
0.5
),
legend.title
=
element_blank
(),
legend.position
=
"none"
))
dev.off
()
WES/Filter_variants_v3.R
0 → 100644
View file @
38a09ebb
filter_variants_v3
<-
function
(
infile
,
sampleid
=
"Sample01"
,
dp
=
50
){
library
(
openxlsx
)
df
<-
read.table
(
file
=
infile
,
sep
=
"\t"
,
header
=
T
,
stringsAsFactors
=
F
)
write.xlsx
(
x
=
df
,
file
=
paste0
(
sampleid
,
"_full_variants_annotated.xlsx"
),
asTable
=
T
)
colnames
(
df
)
<-
c
(
"Chromosome"
,
"Position"
,
"Reference"
,
"Alternative"
,
"AF"
,
"DP"
,
"AD.R"
,
"AD.A"
,
"GT"
,
"ID"
,
"FILTER"
,
"HGVS_P"
,
"Gene"
,
"Biotype"
,
"Rank"
,
"Effect"
,
"Impact"
,
"Common"
,
"G5"
,
colnames
(
df
)[
20
:
35
])
df
$
Common
[
is.na
(
df
$
Common
)]
<-
0
df
<-
subset.data.frame
(
x
=
df
,
subset
=
DP
>
dp
&
Common
==
0
&
Biotype
==
"protein_coding"
&
!
Impact
%in%
c
(
"LOW"
,
"MODIFIER"
)
)
df
$
sampleid
<-
sampleid
df
$
variantkey
<-
paste
(
df
$
Chromosome
,
df
$
Position
,
df
$
Reference
,
df
$
Alternative
,
sep
=
"_"
)
write.xlsx
(
x
=
df
,
file
=
paste0
(
sampleid
,
"_rawvariants_dp_"
,
dp
,
"_coding_lowimpact_notcommon"
,
".xlsx"
),
asTable
=
T
)
return
(
df
)
}
WES/MethylAnalysis_bulk_v3.R
0 → 100644
View file @
38a09ebb
MethylAnalysis_bulk_v3
<-
function
(
methcall.folder
=
NULL
,
# input folder with .cov files (bismark)
outfolder
=
NULL
,
# output folder (if not exist it will be created)
proj.id
=
"myproject"
,
# project id to prefix in putput files
samplesheet
=
NULL
,
sheetnum
=
1
,
design.var
=
"Disease"
,
# variable to subset samplesheet
case.var
=
"Case"
,
# 1 in treatment vector
control.var
=
"Control"
,
# 0 in treatment vector
idcol
=
"SampleID"
,
mincoverage
=
10
,
lowperc
=
5
,
lowcount
=
10
,
assem
=
"hg38"
,
do.subset
=
T
,
chr.subset
=
"chr9"
,
start.subset
=
136668000
,
end.subset
=
136671000
,
pipeline.meth
=
"bismarkCoverage"
,
plot.covariates
=
c
(
"Condition"
,
"Batch"
,
"MRD"
,
"Torelapse"
,
"Chemorefractory"
,
"Sex"
,
"CytoA"
,
"Tissue"
),
idstoremove
=
NULL
,
delta.meth
=
10
,
plot.categorical.vars
=
c
(
"Condition"
,
"Batch"
,
"MRD"
,
"Torelapse"
,
"Chemorefractory"
,
"Sex"
,
"CytoA"
),
plot.continuous.vars
=
c
(
"miR-126"
,
"egfl7"
),
plot.height
=
9
,
plot.width
=
12
,
cellw
=
15
,
cellh
=
15
,
fontnumsize
=
5
,
fontsize
=
8
){
# load libraries
require
(
methylKit
)
require
(
ggplot2
)
require
(
reshape2
)
require
(
plyr
)
require
(
ggrepel
)
library
(
GenomicRanges
)
library
(
openxlsx
)
library
(
pheatmap
)
# check vars
if
(
is.null
(
methcall.folder
)){
stop
(
"Please set methcall.folder"
)
}
if
(
is.null
(
outfolder
)){
stop
(
"Please set outfolder"
)
}
if
(
is.null
(
samplesheet
)){
stop
(
"Please set samplesheet"
)
}
# Setup variables
infolder
=
paste0
(
methcall.folder
,
"/"
)
dir.create
(
infolder
,
showWarnings
=
F
)
outfolder
=
paste0
(
outfolder
,
"/"
)
dir.create
(
outfolder
,
showWarnings
=
F
)
qcfolder
<-
paste0
(
outfolder
,
"QC/"
)
dir.create
(
qcfolder
,
showWarnings
=
F
)
# reading sample sheet with metadata
ssheet
<-
read.xlsx
(
samplesheet
,
sheet
=
sheetnum
)
if
(
!
is.null
(
idstoremove
)){
ssheet
<-
subset.data.frame
(
x
=
ssheet
,
subset
=
!
ssheet
[[
idcol
]]
%in%
idstoremove
)
}
# defining design vector according to variable
ssheet
<-
subset.data.frame
(
x
=
ssheet
,
subset
=
ssheet
[[
design.var
]]
%in%
c
(
case.var
,
control.var
))
d.vec
<-
ssheet
[[
design.var
]]
d.vector
<-
ifelse
(
d.vec
==
case.var
,
1
,
0
)
# initialzing coverage .cov files list
covs
<-
list
()
sampleids
<-
as.list
(
ssheet
[[
idcol
]])
names
(
sampleids
)
<-
ssheet
[[
idcol
]]
for
(
i
in
ssheet
[[
idcol
]])
{
covs
[[
i
]]
<-
paste0
(
infolder
,
"/"
,
i
,
".bismark.cov"
)
}
saveRDS
(
object
=
covs
,
file
=
"covs_object.rds"
)
# creating object
myobj
<-
methRead
(
location
=
covs
,
sample.id
=
sampleids
,
assembly
=
assem
,
pipeline
=
pipeline.meth
,
treatment
=
d.vector
,
context
=
"CpG"
,
mincov
=
mincoverage
)
saveRDS
(
myobj
,
file
=
"Myobject.rds"
)
names
(
myobj
)
<-
ssheet
[[
idcol
]]
saveRDS
(
myobj
,
file
=
"Myobject_with_names.rds"
)
# subsetting if declared
if
(
isTRUE
(
do.subset
)){
my.win
=
GRanges
(
seqnames
=
chr.subset
,
ranges
=
IRanges
(
start
=
start.subset
,
end
=
end.subset
))
myobj
<-
selectByOverlap
(
myobj
,
my.win
)
}
saveRDS
(
myobj
,
file
=
"Myobject_with_names_aftersubset.rds"
)
# filtering on minimum coverage
myobj
<-
filterByCoverage
(
methylObj
=
myobj
,
lo.count
=
lowcount
,
lo.perc
=
lowperc
)
# Normalization
myobj
<-
normalizeCoverage
(
obj
=
myobj
)
# saveRDS(object = myobj, file = "Initial_object.rds")
# Calculate basic stats and PCs
metricsfolder
<-
paste0
(
qcfolder
,
"Metrics/"
)
dir.create
(
path
=
metricsfolder
,
showWarnings
=
F
)
for
(
id
in
ssheet
[[
idcol
]])
{
png
(
filename
=
paste0
(
metricsfolder
,
proj.id
,
"_CpG_pct_methylation_sample_"
,
id
,
".png"
),
width
=
9
,
height
=
6
,
units
=
"in"
,
res
=
96
)
print
(
getMethylationStats
(
myobj
[[
id
]],
plot
=
TRUE
,
both.strands
=
FALSE
))
dev.off
()
png
(
filename
=
paste0
(
metricsfolder
,
proj.id
,
"_Coverage_stats_sample_"
,
id
,
".png"
),
width
=
9
,
height
=
6
,
units
=
"in"
,
res
=
96
)
print
(
getCoverageStats
(
myobj
[[
id
]],
plot
=
TRUE
,
both.strands
=
FALSE
))
dev.off
()
}
# create meth obj
#meth <- unite(object = myobj, destrand=FALSE)
meth
<-
unite
(
object
=
myobj
,
destrand
=
FALSE
)
# for debugging
saveRDS
(
meth
,
"savemeth.tmp.rds"
)
# Perform correlation
sink
(
paste0
(
qcfolder
,
proj.id
,
"_Correlations.txt"
))
getCorrelation
(
meth
,
plot
=
FALSE
)
sink
()
if
(
length
(
ssheet
[[
idcol
]])
<
15
){
png
(
filename
=
paste0
(
qcfolder
,
proj.id
,
"_Correlations_pearson_pairwise.png"
),
width
=
9
,
height
=
6
,
units
=
"in"
,
res
=
96
)
print
(
getCorrelation
(
meth
,
plot
=
TRUE
))
dev.off
()
}
png
(
filename
=
paste0
(
qcfolder
,
proj.id
,
"_Clustering.png"
),
width
=
9
,
height
=
6
,
units
=
"in"
,
res
=
96
)
clusterSamples
(
meth
,
dist
=
"euclidean"
,
plot
=
TRUE
,
method
=
"ward.D2"
)
dev.off
()
# Re-plotting PCs (custom chart)
# compute PCs and store in object
pca_compt
<-
PCASamples
(
meth
,
obj.return
=
T
,
screeplot
=
F
)
# extract PCs components
pcafolder
<-
paste0
(
qcfolder
,
"PCA/"
)
dir.create
(
path
=
pcafolder
,
showWarnings
=
F
)
pca_pc1_2
<-
as.data.frame
(
x
=
pca_compt
$
x
[,
1
:
2
])
for
(
myvars
in
plot.covariates
){
pca_pc1_2
$
condition
<-
as.factor
(
ssheet
[[
myvars
]])
png
(
filename
=
paste0
(
pcafolder
,
proj.id
,
"_PCA_"
,
myvars
,
".png"
),
width
=
9
,
height
=
6
,
units
=
"in"
,
res
=
96
)
print
(
ggplot
(
data
=
pca_pc1_2
,
mapping
=
aes
(
x
=
PC1
,
y
=
PC2
,
col
=
condition
,
label
=
rownames
(
pca_pc1_2
)))
+
geom_point
(
size
=
3
)
+
geom_text_repel
(
size
=
3
)
+
ggtitle
(
label
=
"Principal component analysis"
,
subtitle
=
myvars
)
+
theme
(
plot.title
=
element_text
(
size
=
16
,
face
=
"bold"
,
hjust
=
0.5
))
+
theme
(
plot.subtitle
=
element_text
(
size
=
12
,
hjust
=
0.5
,
face
=
"italic"
,
color
=
"black"
))
+
theme
(
axis.title
=
element_text
(
size
=
12
,
hjust
=
0.5
,
face
=
"bold"
,
color
=
"black"
))
+
theme
(
legend.text
=
element_text
(
size
=
8
,
hjust
=
0.5
))
+
theme
(
legend.title
=
element_blank
())
+
theme
(
axis.text
=
element_text
(
size
=
12
,
hjust
=
0.5
,
color
=
"black"
)))
dev.off
()
}
# retrieve and store % of methylation
perc.meth
<-
percMethylation
(
meth
)
saveRDS
(
perc.meth
,
"pctmethly.rds"
)
base
::
rownames
(
perc.meth
)
<-
paste0
(
meth
$
chr
,
"_"
,
meth
$
start
)
# Perform diff methylation
myDiff
=
calculateDiffMeth
(
meth
)
write.table
(
myDiff
,
paste0
(
outfolder
,
proj.id
,
"_DiffMeth_single_CpG.txt"
),
row.names
=
F
)
difftest
<-
read.table
(
paste0
(
outfolder
,
proj.id
,
"_DiffMeth_single_CpG.txt"
),
header
=
T
)
difftest
$
comparison
<-
proj.id
difftest
$
qvalue_r
<-
as.character
(
cut
(
x
=
difftest
$
qvalue
,
breaks
=
c
(
-1
,
1e-100
,
1e-10
,
1e-02
,
1
),
labels
=
c
(
"***"
,
"**"
,
"*"
,
"ns"
)))
myindex
<-
abs
(
difftest
$
meth.diff
)
<
delta.meth
difftest
$
meth.diff
<-
abs
(
difftest
$
meth.diff
)
difftest
$
qvalue_r
[
myindex
]
<-
"ns"
write.table
(
difftest
,
paste0
(
outfolder
,
proj.id
,
"_DiffMeth_single_CpG.txt"
),
row.names
=
F
)
# Adding color list and annotations
library
(
RColorBrewer
)
color_list
<-
list
()
annrows
<-
NULL
anncols
<-
subset.data.frame
(
difftest
,
select
=
c
(
"qvalue_r"
,
"meth.diff"
))
base
::
rownames
(
anncols
)
<-
difftest
$
start
rows.annot.vars.cat
=
plot.categorical.vars
rows.annot.vars.con
=
plot.continuous.vars
# if(!is.null(rows.annot.vars.cat) | !is.null(rows.annot.vars.con)){
# rownames(ssheet) <- ssheet$SampleID
# annrows <- subset.data.frame(x = ssheet, select = c(rows.annot.vars.cat, rows.annot.vars.con))
# concolors <- RColorBrewer::brewer.pal(n = 9, name = "Set1")
# catcolors <- NULL
# for (varcon in 1:length(rows.annot.vars.con)) {
# color_list[[rows.annot.vars.con[varcon]]] <- colorRampPalette(c("lightgrey", concolors[varcon]))(10)
# }
# for (varcat in 1:length(rows.annot.vars.cat)) {
# ncolor <- 1
# for (val in unique(ssheet[[rows.annot.vars.cat[varcat]]])[order(unique(ssheet[[rows.annot.vars.cat[varcat]]]))]){
# if(length(unique(ssheet[[rows.annot.vars.cat[varcat]]])) > 9){
# catcolors <- colorRampPalette(RColorBrewer::brewer.pal(n = 9, name = "Set3"))(length(unique(ssheet[[rows.annot.vars.cat[varcat]]])))
# }
# else{
# catcolors <- RColorBrewer::brewer.pal(n = 9, name = "Set3")
# }
# color_list[[rows.annot.vars.cat[varcat]]][[val]] <- catcolors[ncolor]
# ncolor <- ncolor + 1
# }
# }
# }
color_list
[[
"qvalue_r"
]]
=
c
(
"*"
=
"#6497b1"
,
"**"
=
"#03396c"
,
"***"
=
"#011f4b"
,
ns
=
"#c4cacf"
)
color_list
[[
"meth.diff"
]]
=
colorRampPalette
(
brewer.pal
(
n
=
11
,
"Reds"
))(
100
)
color_list
[[
"miR-126"
]]
<-
brewer.pal
(
n
=
9
,
"PuRd"
)
# Heatmap
pctmeth_matrix
<-
t
(
perc.meth
)
base
::
colnames
(
pctmeth_matrix
)
<-
gsub
(
x
=
base
::
colnames
(
pctmeth_matrix
),
pattern
=
"chr[0-9]+_"
,
replacement
=
""
)
pheatmap
(
mat
=
pctmeth_matrix
,
main
=
gsub
(
x
=
proj.id
,
pattern
=
"_"
,
replacement
=
" "
),
filename
=
paste0
(
outfolder
,
proj.id
,
"_CpG_percent_methylation_matrix_pheatmap.pdf"
),
width
=
plot.width
,
height
=
plot.height
,
na_col
=
"black"
,
cluster_cols
=
FALSE
,
cluster_rows
=
TRUE
,
annotation_row
=
annrows
,
cellwidth
=
cellw
,
cellheight
=
cellh
,
display_numbers
=
T
,
fontsize
=
fontsize
,
fontsize_number
=
fontnumsize
,
number_format
=
"%.0f"
,
#border_color = "#CBBEB5",
annotation_col
=
anncols
,
#labels_row = lrow,
#labels_col = lcol,
#gaps_row = gaps.row,
gaps_col
=
c
(
8
),
annotation_colors
=
color_list
,
color
=
c
(
"#F5F5F5"
,
"#EEEEEE"
,
"#CCCCCC"
,
"#999999"
,
"#666666"
,
"#333333"
,
"#000000"
),
breaks
=
c
(
0
,
10
,
20
,
30
,
50
,
70
,
90
,
100
)
)
saveRDS
(
object
=
pctmeth_matrix
,
paste0
(
outfolder
,
proj.id
,
"_CpG_percent_methylation_matrix_pheatmap.rds"
))
saveRDS
(
object
=
list
(
anncol
=
anncols
,
annrow
=
annrows
,
anncolors
=
color_list
),
paste0
(
outfolder
,
proj.id
,
"_annotations_matrix_pheatmap.rds"
))
saveRDS
(
object
=
difftest
,
paste0
(
outfolder
,
proj.id
,
"_CpG_differential_methylation.rds"
))
saveRDS
(
object
=
perc.meth
,
paste0
(
outfolder
,
proj.id
,
"_CpG_percent_methylation.rds"
))
write.table
(
x
=
perc.meth
,
file
=
paste0
(
outfolder
,
proj.id
,
"_CpG_percent_methylation.txt"
))
write.table
(
x
=
difftest
,
file
=
paste0
(
outfolder
,
proj.id
,
"_CpG_differential_methylation.txt"
))
saveRDS
(
myobj
,
file
=
paste0
(
outfolder
,
proj.id
,
"_methylkit.rds"
))
saveRDS
(
myobj
,
file
=
paste0
(
outfolder
,
proj.id
,
"_methylkit_meth.rds"
))
}
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment