-
Notifications
You must be signed in to change notification settings - Fork 0
/
04_variantCalling_temporalDynamics_c.Rmd
290 lines (188 loc) · 11.5 KB
/
04_variantCalling_temporalDynamics_c.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
---
title: "Analysis of the Temporal Dynamics of Mutations Identified in two AD mice"
author: "Ricardo Ramiro"
date: "29/7/2020"
output:
html_document:
code_folding: show
---
```{r setup, include=FALSE}
## unless noted all analysis was run on the Buenos Aires desktop
knitr::opts_chunk$set(echo=TRUE,
message=FALSE,
error = TRUE,
warning=FALSE,
paged.print=FALSE,
fig.align = "center")
knitr::opts_hooks$set(eval = function(options) {
if (options$engine == "bash") {
options$eval <- FALSE
}
options
})
```
### Run breseq v0.34.1 to call mutations in B. theta populations
this was done on the bioinformatics VM in the biodata.pt server
```{bash}
# activate conda environment where breseq is installed
conda activate breseq0.34.1
# change permissions on script to run it
chmod u+x breseq_script_dynamics_final_prokka_withMQpara_v3ref.sh
# run script (this is set up to run multiple samples in parallel)
nohup bash breseq_script_dynamics_final_prokka_withMQpara_v3ref.sh &
```
# get output.gd files into a single folder
```{bash, eval=FALSE, echo=FALSE}
# go to dir
cd analysis/genomics/evolved_populations_breseq/temporalDynamics_prokkaREF_MQ20_BQ30_v3ref
# make new dir to put all gd files
mkdir breseq_gdOutput_temporalDynamics
# copy and rename gd files
for DIR in D*; do
cd "$DIR"/output
cp output.gd ../../breseq_gdOutput_temporalDynamics/$DIR.gd
cd ../..
done
cd ..
#copy data from day 84
cp D84_prokkaREF_MQ20_BQ30_v3ref/breseq_gdOutput_day84/D84_F2.gd temporalDynamics_prokkaREF_MQ20_BQ30_v3ref/breseq_gdOutput_temporalDynamics/
cp D84_prokkaREF_MQ20_BQ30_v3ref/breseq_gdOutput_day84/D84_F5.gd temporalDynamics_prokkaREF_MQ20_BQ30_v3ref/breseq_gdOutput_temporalDynamics/
```
# use gdtools compare to get mutation tables
```{bash}
# reference sequence
REF=analysis/genomics/TDA1000/TDA1000_hybridAssembly/results/assemblies/prokka_annotation/TDA1000_chromosome_plasmid_v3.gff3
# move to folder with gd files
cd temporalDynamics_prokkaREF_MQ20_BQ30_v3ref/breseq_gdOutput_temporalDynamics
# generate html file comparing all runs of the ancestral
gdtools COMPARE -r $REF --repeat-header 0 -o ../downstreamAnalysis/breseq_output_temporalDynamics.html `ls *.gd`
```
The above html files were then copied in to google sheets of the same name. Two tabs were created:
- raw: just copy paste
- edited. In this sheet I did the following:
- google sheets considers anything that starts as a plus sign as a formula and gives an error. This is important in column C (mutation). To change this behaviour I replaced, using REGEX,
"=\+" by "'+". The apostrophe allows the plus sign to be the first character
- when the same gene is mutated with both SNPs and deletions, breseq will add Δ in the frequency field for samples that got deletions. I replaced all these Δ by nothing
- replaced "\?" by noting
- I changed the format of the frequencies of the mutations. These are expressed as %, I changed to decimal
- some positions are written as ######:# (e.g. 314,132:1 or 314,132:2). These lead to problems when making position column as numeric. Thus, I replaced "\:[0-9]" by nothing
- change the name of the column "seq id" to "seq_id"
- in the gene column, I replaced all spaces, ← and → by nothing
# general data manipulation
```{r}
#load libraries
library(tidyverse)
library(readxl)
library(cowplot)
library(plotly)
library(RColorBrewer)
# load data
dynamics<-read_xlsx("analysis/genomics/evolved_populations_breseq/temporalDynamics_prokkaREF_MQ20_BQ30_v3ref/downstreamAnalysis/breseq_output_temporalDynamics.xlsx",sheet=2)
# change order of columns, such that metadata is all on the left of the data frame
dynamics<-dynamics[c(1:3,28:30,4:27)]
# transform data to long format
dynamics_long<-gather(dynamics,key=sample,value=frequency,D0_F2:D84_F5)
# separate the sample column to its component day and mouse codes
dynamics_long<-dynamics_long %>% separate(sample, c("day", "mouse"),sep="_",remove=F)
# remove D from day
dynamics_long$day<-str_remove(dynamics_long$day,"D")
# replace NAs by zeroes. Contrary to the day 84 data, this needs to be done here we want to track mutation dynamics
dynamics_long$frequency[is.na(dynamics_long$frequency)] <- 0
#transform position to numeric
dynamics_long$position<-as.numeric(dynamics_long$position)
# get the first letter from each mouse code, as this indicates the diet
dynamics_long$trt<-str_sub(dynamics_long$mouse,1,1)
# create a data frame matching the 1-letter codes to the codes used in the paper
trt_diet<-data.frame(trt=c("C","M","F"),diet=c("WD","SD","AD"))
# match the 1-letter diet codes to those to use in the paper
dynamics_long<-left_join(dynamics_long,trt_diet)
# create a new column for easy identification of mutations
dynamics_long$seqid_position_mutation_gene<-paste0(dynamics_long$seq_id,"_",dynamics_long$position,"_",dynamics_long$mutation,"_",dynamics_long$gene)
#create a new column specifying the diet per week
dynamics_long$diet_week<-ifelse(dynamics_long$day %in% c(0,13,27,41,55,69,84),"SD","WD")
# write the long format dataframe to a new file
# writexl::write_xlsx(dynamics_long,"analysis/genomics/evolved_populations_breseq/temporalDynamics_prokkaREF_MQ20_BQ30_v3ref/downstreamAnalysis/breseq_output_temporalDynamics_longFormat_c.xlsx")
```
## remove low confidence mutations from mutation list and add info about which mutations are parallel (as created in 03_variantCalling_day84_c.Rmd)
```{r}
#load data
dynamics_long<-read_xlsx("analysis/genomics/evolved_populations_breseq/temporalDynamics_prokkaREF_MQ20_BQ30_v3ref/downstreamAnalysis/breseq_output_temporalDynamics_longFormat_c.xlsx")
# load low confidence mutations (including ancestral when using mapping quality thresholds)
low_confidence_mutations_unique<-read_xlsx("analysis/genomics/evolved_populations_breseq/D84_prokkaREF_MQ20_BQ30_v3ref/downstreamAnalysis/low_confidence_mutations_unique_c.xlsx")
#create a column code for filtering
low_confidence_mutations_unique$seqid_position_mutation_gene<-paste(low_confidence_mutations_unique$seq_id,
low_confidence_mutations_unique$position,
low_confidence_mutations_unique$mutation,
low_confidence_mutations_unique$gene,
sep = "_")
#create vector with low quality mutation codes
low_confidence_mutations_unique<-low_confidence_mutations_unique$seqid_position_mutation_gene
# remove all mutations that match above code
dynamics_long<-dynamics_long %>% subset(!seqid_position_mutation_gene %in% low_confidence_mutations_unique)
##### add column describing d84 parallel genes #####
#load parallel mutation data
d84_long_parallel<-read_xlsx("analysis/genomics/evolved_populations_breseq/D84_prokkaREF_MQ20_BQ30_v3ref/downstreamAnalysis/breseq_output_day84_longFormat_parallelMutations_withAnnotations_c.xlsx")
# add column for parallel genes
parallel_genes<-unique(d84_long_parallel$gene)
dynamics_long$parallel<-ifelse(dynamics_long$gene %in% parallel_genes, "yes","no")
# write the long format dataframe to a new file
# writexl::write_xlsx(dynamics_long,"analysis/genomics/evolved_populations_breseq/temporalDynamics_prokkaREF_MQ20_BQ30_v3ref/downstreamAnalysis/breseq_out_temporalDyn_longFormat_woutLowConfMutations_c.xlsx")
```
## add annotations to table (e.g. BT gene names)
```{r}
# load annotation data
annotation<-readxl::read_excel("analysis/genomics/TDA1000/TDA1000_hybridAssembly/results/assemblies/prokka_annotation/annotation_table_short.xlsx")
#create new dataframe with annotations
dynamics_long_annot<-dynamics_long
# there were a few mutations that correspond to a deletion, the labels for these needed to be edited
dynamics_long_annot$gene<-str_replace_all(dynamics_long_annot$gene,"\\[", "")
dynamics_long_annot$gene<-str_replace_all(dynamics_long_annot$gene,"\\]", "")
dynamics_long_annot$gene<-str_replace_all(dynamics_long_annot$gene,"–", "/")
# create two new columns with gene names (for when mutations are in intergenic regions)
dynamics_long_annot<-dynamics_long_annot %>% separate(gene, c("gene_1", "gene_2"),sep="/",remove=F)
#merge data frame of parallel mutations with annotations (for gene 1)
dynamics_long_annot<-left_join(dynamics_long_annot,annotation,by=c("gene_1"="locus_tag_prokka"))
dynamics_long_annot<-dynamics_long_annot %>% rename_at(18, ~paste0(., "_1"))
#merge data frame of parallel mutations with annotations (for gene 2)
dynamics_long_annot<-left_join(dynamics_long_annot,annotation,by=c("gene_2"="locus_tag_prokka"))
dynamics_long_annot<-dynamics_long_annot %>% rename_at(19, ~paste0(., "_2"))
# merge locus that are intergenic mutations
dynamics_long_annot$locus_name<-ifelse(is.na(dynamics_long_annot$locus_tag_db_2),dynamics_long_annot$locus_tag_db_1,
paste0(dynamics_long_annot$locus_tag_db_1,"/",dynamics_long_annot$locus_tag_db_2))
# keep gene name for plasmid mutations
dynamics_long_annot$locus_name<-ifelse(dynamics_long_annot$seq_id=="plasmid",dynamics_long_annot$gene,
dynamics_long_annot$locus_name)
#change order of columns so that the BT locus_names are close to the gene names
dynamics_long_annot<-dynamics_long_annot[c(1:5,20,6:19)]
#transform day as numeric
dynamics_long_annot$day<-as.numeric(dynamics_long_annot$day)
# write the long format dataframe to a new file
# writexl::write_xlsx(dynamics_long_annot,"analysis/genomics/evolved_populations_breseq/temporalDynamics_prokkaREF_MQ20_BQ30_v3ref/downstreamAnalysis/breseq_out_temporalDyn_longFormat_woutLowConfMuts_withAnnots_c.xlsx")
```
## Plot mutation temporal dynamics, hihglighting strongly fluctuating mutations 2 genes.
Fig. 6A
```{r,fig.width=10,fig.height=6}
# load data
dynamics_long_annot<-readxl::read_xlsx("analysis/genomics/evolved_populations_breseq/temporalDynamics_prokkaREF_MQ20_BQ30_v3ref/downstreamAnalysis/breseq_out_temporalDyn_longFormat_woutLowConfMuts_withAnnots_c.xlsx")
#create a dataframe for the shading
shading_df <- data.frame(ymin = -Inf,
ymax = Inf,
xmin = seq(0,11,1),
xmax = c(seq(1,12,1)),
diet = rep(c("WD", "SD"),6))
#make the plot
p1<-ggplot()+
geom_line(data=dynamics_long_annot[!dynamics_long_annot$seqid_position_mutation_gene %in% c("chromosome_314605_G→T_TDA1000_00266","chromosome_314248_+C_TDA1000_00266","chromosome_4305370_G→A_TDA1000_03351"),],aes(x=day/7,y=frequency,group=seqid_position_mutation_gene),color="grey")+
geom_line(data=dynamics_long_annot[dynamics_long_annot$seqid_position_mutation_gene %in% c("chromosome_314605_G→T_TDA1000_00266","chromosome_314248_+C_TDA1000_00266","chromosome_4305370_G→A_TDA1000_03351"),],aes(x=day/7,y=frequency,color=gene,group=seqid_position_mutation_gene,label=locus_name,label2=annotation,label3=description),size=1)+
geom_rect(data = shading_df, aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax, fill = diet), alpha = 0.1)+
scale_y_continuous(breaks = seq(0,1,0.25))+
scale_x_continuous(breaks=seq(0,12,1),limits=c(0,12))+
scale_colour_manual(values=c("#084594","#CB181D"),labels=c("BT2689","BT0867"))+
scale_fill_manual(values=c(NA,"#d73027"))+
theme_cowplot()+
background_grid(major = "xy")+
facet_wrap(~mouse,ncol=2,labeller=labeller(mouse=c("F2"="AD1","F5"="AD4")))+
xlab("Week")+
ylab("Mutation frequency")
p1
```