-
Notifications
You must be signed in to change notification settings - Fork 2
/
.Rhistory
512 lines (512 loc) · 21.6 KB
/
.Rhistory
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
theme_bw() +
theme(axis.text=element_text(size=11),
title=element_text(size=14),
legend.text=element_text(size=12))
#plot highest and lowest cprel for each team
df = setDT(df)
df = df[order(df$cprel, decreasing = T)]
library(data.table)
## --------------
## plus minus plot
df$label = NA
df$label[1:5] = df$name[1:5]
df$label[666:670] = df$name[666:670]
df2 = df
df2$label = ifelse(abs(df2$pm) >= 28, df2$name, df2$label)
df2$pm_group = ifelse(df2$pm > 0, "+", ifelse(df2$pm < 0, "-", "0"))
ggplot(df2, aes(pm, cp, label=label)) +
geom_point(aes(color=pm_group),size=2) +
geom_label_repel(data=df2[df2$cp > 50,],
size=4, alpha=.85, force = 5,
segment.size = 0.2, nudge_y = 6,
show.legend = F) +
geom_label_repel(data=df2[df2$cp < 50,],
size=4, alpha=.85, force = 5,
segment.size = 0.2, nudge_y = -5,
show.legend = F) +
scale_y_continuous(limits=c(25,75)) +
scale_color_manual(values=c("+"="#ed4a2d", "-"="#2a6fde", "0"="gray75")) +
stat_cor(method="spearman", show.legend = F,alternative = "greater", size=5) +
labs(title="NHL skater CF% versus plus/minus",
subtitle="2019-2020 season\nn = 670 (at least 20 games)",
x="+/-",y="CF%",
color="+/- group") +
geom_hline(yintercept = 50, linetype="dashed", color="gray50", size=1) +
theme_bw() +
theme(axis.text=element_text(size=11),
title=element_text(size=14),
legend.text=element_text(size=12))
#plot highest and lowest cprel for each team
df = setDT(df)
df = df[order(df$cprel, decreasing = T)]
low = df[df[, .I[cprel == min(cprel)], by=team]$V1]
low = low[!duplicated(low$team),]
high = df[df[, .I[cprel == max(cprel)], by=team]$V1]
high = high[!duplicated(high$team),]
combined = data.frame(rbind(high,low),stringsAsFactors = F)
combined = combined[order(combined$team),]
combined$team = factor(combined$team, levels=rev(unique(combined$team)))
## plot
ggplot(combined, aes(y=cprel, x=team, label=name)) +
geom_bar(stat="identity",alpha=.3, fill="dodgerblue4") +
coord_flip() +
geom_text(fontface="bold") +
scale_y_continuous(limits=c(-23,15)) +
geom_hline(yintercept = 0, linetype="dashed", color="red", size=.75, alpha=0.85) +
labs(title="Highest and lowest CF% rel skater by NHL team",
subtitle="2019-2020 season",
y="CF% rel",x="Team") +
theme_bw() +
theme(axis.text=element_text(size=11),
title=element_text(size=14),
legend.text=element_text(size=12))
head(df$name,10)
head(df,3)
df = merge(stats, bstats, by="name")
## keep unique and clean
df = df[!duplicated(df$name),]
colnames(df)[1:9] = c("name","age","team","pos","gp","cf","ca","cp","cprel")
df$name = gsub("\\\\.*","",df$name)
## keep players with more than 20 games
df = df[df$gp >= 20,]
## order by cp
df = df[order(df$cp, decreasing=T),]
## points per 60
df$pp60 = df$points / df$toi * 60
## add labels for top skaters
df$label = NA
df$label[1:5] = df$name[1:5]
df$label[666:670] = df$name[666:670]
df$label = ifelse(df$pp60 >= 4, df$name, df$label)
df$pm_group = ifelse(df$pm > 50, ">50+", ifelse(df$pm < 50, "<50", "0"))
## add specific colors
df$point_color = "Rest"
df$point_color[1:5] = "Top5 CF%"
df$point_color[666:670] = "Bot5 CF%"
df$point_color = ifelse(df$pp60 >= 4, "Top5 PP60", df$point_color)
head(df$name,10)
df = df[order(df$pp60,decreasing = T),]
head(df,12)
head(df,15)
head(df)
head(bstats,3)
## combine subset of basic stats
bstats = read.csv("basic_stats_2020.csv", header=T, skip=1, stringsAsFactors=F)
head(bstats,3)
head(bstats,10)
df = df[order(df$cf,decreasing = T),]
head(df$name,20)
head(df$name,30)
head(df)
dim(df0)
dim(df)
df = df[order(df$cp, decreasing = T),]
head(df)
head(df$name,20)
library(devtools)
library(roxygen2)
setwd("~/Documents/workfromhome/RM2/")
setwd("~/Documents/workfromhome/RM2/")
document()
setwd("..")
install("RM2")
?RM2
data("ctcf_chr3_4")
muts = cbind(mutations_chr3_4, get_mut_trinuc_strand(mutations_chr3_4))
dim(muts)
head(muts)
?RM2
results = RM2(muts, ctcf_chr3_4, window_size=window_size)
results
detach("package:RM2", unload=TRUE)
rm(list = ls(all = TRUE))
setwd("~/Documents/workfromhome/RM2/")
document()
setwd("..")
install("RM2")
setwd("..")
install("RM2")
getwd()
setwd("workfromhome/")
install("RM2")
data("ctcf_chr3_4")
muts = cbind(mutations_chr3_4, get_mut_trinuc_strand(mutations_chr3_4))
window_size = 50
RM2(muts, ctcf_chr3_4, window_size=window_size)
dim(muts)
head(muts)
?RM2
?RM2
library(devtools)
library(roxygen2)
setwd("~/Documents/workfromhome/RM2/")
document()
setwd("..")
install("RM2")
data("ctcf_chr3_4")
muts = cbind(mutations_chr3_4, get_mut_trinuc_strand(mutations_chr3_4))
window_size = 50
?RM2
RM2(muts, ctcf_chr3_4, window_size=window_size)
RM2(maf = muts, sites = ctcf_chr3_4, window_size=window_size)
maf = muts
sites = ctcf_chr3_4
cofactor_column = NA
window_size = 100
n_min_mut = 100
n_bin = 10
maf = .sort_coords(maf)
sites = .sort_coords(sites)
maf =
.sort_coords = function(dfr) {
chrs = gsub("chr", "", dfr$chr)
chrs[chrs == "X"] = 23
chrs[chrs == "Y"] = 24
chrs[chrs == "M"] = 25
chrs = as.numeric(chrs)
new_order = order(chrs, (dfr$start + dfr$end) / 2)
dfr = dfr[new_order,, drop = FALSE ]
}
maf = .sort_coords(maf)
sites = .sort_coords(sites)
empty_res = data.frame(mut_type = NA, pp = NA, this_coef = NA, obs_mut = NA,
exp_mut = NA, exp_mut_lo = NA, exp_mut_hi = NA, fc = NA,
pp_cofac = NA, this_coef_cofac = NA,
pp_regmut = NA, coef_regmut = NA, n_sites_tested = NA,
stringsAsFactors = FALSE)
results
#' Generates bins and calls prepare_sites such that site coordinates, number of trinucleotide contexts and indels, and all possible quadnucleotide contexts are returned for each mutation rate bin and site + flank
#'
#' @param sites Data frame of sites
#' @param window_size Integer indicating the half-width of sites and flanking regions
#' @param maf Data frame of mutations prepared by get_mut_trinuc_strand
#' @param n_bin Integer (10) indicating the number of bins to which to split sites
#' @param n_min_mut Integer (100) indicating minimum number of mutations to proceed with analysis
#' @param global_mut_rate_window Integer (1e6) indicating the window for mutation rate binning
#'
#' @return List of length n_bin each containing 3 GRanges of coordinates (site and flanks) and 3 dataframes of trinucleotide counts and mutation contexts
.prepare_bins_of_sites = function(sites, window_size, maf, n_bin=NA, n_min_mut = 100, global_mut_rate_window = 1e6) {
# first count mutations in sites + flanks in total. Skip analysis if too low
gr_maf = GenomicRanges::GRanges(maf$chr, IRanges::IRanges(maf$start, maf$end))
sites_mid = floor((sites$start+sites$end)/2)
gr_sites = GenomicRanges::GRanges(sites$chr,
IRanges::IRanges(sites_mid - 3 * window_size - 1, sites_mid + 3 * window_size))
total_muts = sum(GenomicRanges::countOverlaps(gr_sites, gr_maf))
if (total_muts < n_min_mut) {
print(paste0("Too few mutations in sites+flanks (", total_muts, "), skipping."))
return(NULL)
}
# remove sites that together with windows exceed chromosomal coordinates
chr_ends = GenomeInfoDb::seqlengths(GenomeInfoDb::seqinfo(Hsapiens))[as.character(GenomeInfoDb::seqnames(gr_sites))]
chr_ends_index = which(end(gr_sites) >= chr_ends)
chr_starts_index = which(start(gr_sites) <= 1)
keep_site_index = setdiff(1:nrow(sites), c(chr_ends_index, chr_starts_index))
sites = sites[keep_site_index,, drop = FALSE ]
sites_mid = sites_mid[keep_site_index]
# prepare all sites together
if (is.na(n_bin)) {
sites_prepared = list("mutrate__1"=.prepare_sites(sites, window_size))
} else {
# prepare sites for every bin based on average mutation rate
dfr_1mb = data.frame(chr = sites$chr,
start = sites_mid - global_mut_rate_window / 2,
end = sites_mid + global_mut_rate_window / 2 - 1,
stringsAsFactors = FALSE)
# make sure windows don't exceed chr boundaries
chr_ends = GenomeInfoDb::seqlengths(GenomeInfoDb::seqinfo(Hsapiens))[dfr_1mb$chr]
chr_ends_index = which(dfr_1mb$end >= chr_ends)
dfr_1mb$end[chr_ends_index] = chr_ends[chr_ends_index] - 1
dfr_1mb$start[dfr_1mb$start < 1] = 1
# count mutations per bin
gr_1mb = GenomicRanges::GRanges(dfr_1mb$chr, IRanges::IRanges(dfr_1mb$start, dfr_1mb$end))
site_mb_mut_counts = GenomicRanges::countOverlaps(gr_1mb, gr_maf)
# split sites into equal bins based on 1MB mut rates
# return NULL; to be handled in higher-level functions
bin_index = try(ggplot2::cut_number(site_mb_mut_counts, n_bin), silent = T)
if (any(class(bin_index) == "try-error")) {
stop(paste0("Too few mutations to produce ", n_bin, " bins."))
}
site_bins = split(sites, bin_index)
bin_mean_mut_rate = c(by(site_mb_mut_counts, bin_index, mean, na.rm=T))
# prepare sites for every bin
sites_prepared = lapply(site_bins, .prepare_sites, window_size)
names(sites_prepared) = paste("mutrate__", bin_mean_mut_rate, sep="")
}
return(sites_prepared)
}
#' For each mutation rate bin and site + flanks, calculates site coordinates, number of trinucleotide contexts and indels, and all possible mutation contexts
#'
#' @param sites Data frame of sites
#' @param window_size Integer indicating the half-width of sites and flanking regions
#'
#' @return List containing 3 GRanges of coordinates (site and flanks) and 3 dataframes of trinucleotide counts and mutation contexts
.prepare_sites = function(sites, window_size) {
sites_mid = floor((sites$start+sites$end)/2)
gr_sites = GenomicRanges::GRanges(sites$chr, IRanges::IRanges(sites_mid - window_size - 1, sites_mid + window_size))
site_ids = paste(GenomicRanges::seqnames(gr_sites), GenomicRanges::start(gr_sites), GenomicRanges::end(gr_sites), sep="_")
# create unique site IDs
dupl_site_index = which(duplicated(site_ids))
site_ids[dupl_site_index] = paste(site_ids[dupl_site_index], 1:length(dupl_site_index), sep="_")
# compare each site with immediate background of same length
gr_sites_upstream = .get_up_flank(gr_sites)
gr_sites_downstream = .get_down_flank(gr_sites)
trinuc_sites = .get_nucs(gr_sites, site_ids)
trinuc_upstream = .get_nucs(gr_sites_upstream, site_ids)
trinuc_downstream = .get_nucs(gr_sites_downstream, site_ids)
list(gr_sites, gr_sites_upstream, gr_sites_downstream, trinuc_sites, trinuc_upstream, trinuc_downstream)
}
#' Combines sites and mutations for sites and flanks from each bin in prepared_sites
#'
#' @param maf Data frame of mutations prepared by get_mut_trinuc_strand. Contains all if not cofactors are included
#' @param prepared_sites List of coordinates and quadnucleotide contexts generated by prepare_bins_of_sites
#'
#' @return Data frame containing mutation counts for each quadnucleotide context + indel from each bin and sites + up/down flanks. Also includes trinucleotide position counts and average megabase mutation rate by bin
.maf_to_dfr = function(maf, prepared_sites) {
if (is.null(prepared_sites[[1]])) {
return(NULL)
}
gr_maf = GenomicRanges::GRanges(maf$chr, IRanges::IRanges(maf$start, maf$end))
GenomicRanges::mcols(gr_maf)[,"trinuc"] = maf$mut_trinuc
GenomicRanges::mcols(gr_maf)[,"ID"] = paste(maf$chr, maf$start, maf$end, maf$ref, maf$alt, maf$patient_id, sep=":")
full_dfr = NULL
for (i in 1:length(prepared_sites)) {
gr_sites = prepared_sites[[i]][[1]]
gr_sites_upstream = prepared_sites[[i]][[2]]
gr_sites_downstream = prepared_sites[[i]][[3]]
trinuc_sites = prepared_sites[[i]][[4]]
trinuc_upstream = prepared_sites[[i]][[5]]
trinuc_downstream = prepared_sites[[i]][[6]]
sites_dfr = .merge_sites_and_muts(gr_sites, gr_maf, trinuc_sites, is_site=TRUE)
upstream_dfr = .merge_sites_and_muts(gr_sites_upstream, gr_maf, trinuc_upstream, is_site=FALSE)
downstream_dfr = .merge_sites_and_muts(gr_sites_downstream, gr_maf, trinuc_downstream, is_site=FALSE)
dfr = rbind(upstream_dfr, sites_dfr, downstream_dfr)
dfr$mut_rate = as.numeric(gsub("mutrate__", "", names(prepared_sites)[i]))
dfr$mut_bin = i
full_dfr = rbind(full_dfr, dfr)
}
full_dfr
}
#' Combines quadnucleotide counts and site information
#'
#' @param gr_sites GenomicRanges object of site/flanks chromosomes and positions
#' @param gr_maf GenomicRanges object with mcols containing quadnucleotide context or indel
#' @param trinuc_sites Data frame of counts for each trinucleotide counts extended to each quadnucleotide
#' @param is_site Boolean indicating whether mutation is within site or flanking region
#'
#' @return Data frame containing mutation and trinucleotide counts for each quadnucleotide context + indel. Includes boolean indicator of is_site
.merge_sites_and_muts = function(gr_sites, gr_maf, trinuc_sites, is_site) {
muts_sites = .get_muts(gr_sites, gr_maf)
dfr = cbind(is_site=is_site,
merge(trinuc_sites, muts_sites, by.x="quadnuc", by.y="tag", all=T))
dfr[is.na(dfr$muts_count), "muts_count"] = 0
dfr
}
#' Counts number of mutations per quadnucleotide context
#'
#' @param gr_sites GenomicRanges object of site/flank chromosomes and positions
#' @param gr_maf GenomicRanges object with mcols containing quadnucleotide context or indel
#'
#' @return Data frame containing mutation count and each quadnucleotide context + indel
.get_muts = function(gr_sites, gr_maf) {
# remove duplicate mutation calls
gr_maf_in_sites = gr_maf[S4Vectors::subjectHits(GenomicRanges::findOverlaps(gr_sites, gr_maf))]
gr_maf_in_sites = gr_maf_in_sites[!duplicated(GenomicRanges::mcols(gr_maf_in_sites)[,"ID"])]
muts_sites = GenomicRanges::mcols(gr_maf_in_sites)[,"trinuc"]
muts_sites = data.frame(muts_count = c(table(muts_sites)), stringsAsFactors=F)
muts_sites$tag = rownames(muts_sites)
muts_sites
}
#' Counts the number of each trinucleotide within each site
#'
#' @param gr_sites GenomicRanges object of sites
#' @param side_ids Character vector of site ids
#'
#' @return Data frame containing each site id (character) and its trinucleotide counts (numeric)
.get_sequence_trinucleotides = function(gr_sites, site_ids) {
all_trinucs = .get_all_trinucs()
# extend by one nucleotide to get the neighbouring context for each nucleotide of the sequence
GenomicRanges::start(gr_sites) = GenomicRanges::start(gr_sites) - 1
GenomicRanges::end(gr_sites) = GenomicRanges::end(gr_sites) + 1
this_seq = strsplit(as.character(BSgenome::getSeq(BSgenome.Hsapiens.UCSC.hg19::Hsapiens, gr_sites)), '')
# extract trinucleotides from sequence
get_sq_trinucs = function(x) sapply(1:(length(x)-2), function(i) paste0(x[i:(i+2)], collapse=""))
tri_nucleotide = lapply(this_seq, get_sq_trinucs)
# convert to tall format, map A/G-centred trinucleotides to complementary strand
tri_nucleotide = do.call(rbind, lapply(1:length(gr_sites), function(i) cbind(ID=site_ids[i], trinuc=tri_nucleotide[[i]])))
where_reverse = substr(tri_nucleotide[, "trinuc"], 2, 2) %in% c("A", "G")
tri_nucleotide[where_reverse, "trinuc"] =
as.character(Biostrings::complement(Biostrings::DNAStringSet(tri_nucleotide[where_reverse, "trinuc"])))
# add up trinucleotides and convert into a wide format
tri_nucleotide = data.frame(tri_nucleotide[,1:2], count=1, stringsAsFactors=F)
trinuc_dfr = reshape2::dcast(ID~trinuc, data=tri_nucleotide, value.var="count", fun.aggregate=sum)
# add missing quad-nucleotide classes as zero columns
missing_trinucs = setdiff(all_trinucs, colnames(trinuc_dfr))
dfr_tail = matrix(0, nrow(trinuc_dfr), length(missing_trinucs))
colnames(dfr_tail) = missing_trinucs
trinuc_dfr = cbind(trinuc_dfr, dfr_tail)
# indel can occur at any given position, add up trinucleotide positions and make new column for indel
trinuc_dfr$indel = apply(trinuc_dfr[,all_trinucs], 1, sum)
trinuc_dfr[,c("ID", all_trinucs, "indel")]
}
#' Derive quadnucleotide contexts, strandedness and single-base substitution
#'
#' get_mut_trinuc_strand() takes a data frame of mutations and determines the quadnucleotide context (trinucleotide context + alternate allele) and strand.
#' @param maf Data frame of mutations with the following columns: chr, start, end, ref and alt
#' \describe{
#' \item{chr}{autosomal chromosomes as chr1 to chr22 and sex chromosomes as chrX and chrY}
#' \item{start}{the start position of the mutation in base 1 coordinates}
#' \item{end}{the end position of the mutation in base 1 coordinates}
#' \item{ref}{the reference allele as a string containing the bases A, T, C or G}
#' \item{alt}{the alternate allele as a string containing the bases A, T, C or G}
#' }
#'
#' @return Dataframe containing quadnucleotide context (character), the strand, classified as w for Watson and c for Crick (character), and the single-base substitution (character)
#' @export
get_mut_trinuc_strand = function(maf) {
gr_trinuc = GenomicRanges::GRanges(maf$chr, IRanges::IRanges(start=maf$start-1, end=maf$end+1), strand="*")
triples = as.character(BSgenome::getSeq(BSgenome.Hsapiens.UCSC.hg19::Hsapiens, gr_trinuc))
legal_nucl = c("A", "C", "G", "T")
complement_nucl = c("A", "G") ## these nucleotides will be replaced by complementary nucleotide
mut_strand = mut_class = rep("indel", length(triples))
mut_class[maf$ref %in% legal_nucl & maf$alt %in% legal_nucl] = "SNV"
mut_strand[mut_class=="SNV"] = "w"
mid_nucl = gsub("(.)(.)(.)", "\\2", triples)
which_to_complement = which(mut_class=="SNV" & mid_nucl %in% complement_nucl)
new_triples = triples
new_alt = maf$alt
new_triples[which_to_complement] = as.character(Biostrings::complement(Biostrings::DNAStringSet(new_triples[which_to_complement])))
new_alt[which_to_complement] = as.character(Biostrings::complement(Biostrings::DNAStringSet(new_alt[which_to_complement])))
mut_strand[which_to_complement] = "c"
mut_trinuc = paste0(new_triples,"_",new_alt)
mut_trinuc[mut_class=="indel"] = "indel"
ref_alt = paste0(gsub("^.(.).$", "\\1", new_triples), "_", new_alt)
ref_alt[mut_class=="indel"] = "indel"
data.frame(mut_trinuc, mut_strand, ref_alt, stringsAsFactors=F)
}
#' Determines upstream coordinates of flanking regions to sites
#'
#' @param gr_sites GenomicRanges object of sites
#'
#' @return GenomicRanges object containing the coordinates of upstream flanking regions
.get_up_flank = function(gr_sites) {
gr_sites_upstream = gr_sites
GenomicRanges::start(gr_sites_upstream) = GenomicRanges::start(gr_sites) - GenomicRanges::width(gr_sites)
GenomicRanges::end(gr_sites_upstream) = GenomicRanges::start(gr_sites) - 1
gr_sites_upstream
}
#' Determines downstream coordinates of flanking regions to sites
#'
#' @param gr_sites GenomicRanges object of sites
#'
#' @return GenomicRanges object containing the coordinates of downstream flanking regions
.get_down_flank = function(gr_sites) {
gr_sites_downstream = gr_sites
GenomicRanges::start(gr_sites_downstream) = GenomicRanges::end(gr_sites) + 1
GenomicRanges::end(gr_sites_downstream) = GenomicRanges::end(gr_sites) + GenomicRanges::width(gr_sites)
gr_sites_downstream
}
#' Determines trinucleotide, pyrimidine-centered trinucleotide contexts
#'
#' @return Character vector of all possible trinucleotide mutation contexts with pyrimidines in the middle position
.get_all_trinucs = function() {
apply(as.matrix(expand.grid(c("A", "C", "G", "T"), c("C", "T"), c("A", "C", "G", "T"))), 1, paste, collapse="")
}
#' Creates a data frame of quadnucleotide mutation contexts with counts of corresponding trinucleotides
#'
#' @param gr_sites GenomicRanges object of sites
#' @param site_ids Character vector of unique site ids
#'
#' @return Data frame of all possible quadnucleotide mutation contexts with the number of positions at which the trinucleotides occur
.get_nucs = function(gr_sites, site_ids) {
quadnuc = .get_quadnuc_map()
trinuc_counts = .get_sequence_trinucleotides(gr_sites, site_ids)
trinuc_counts1 = apply(trinuc_counts[,-1], 2, sum)
trinuc_counts1 = data.frame(trinuc=names(trinuc_counts1), posi_count=trinuc_counts1, stringsAsFactors=F)
trinuc_counts2 = merge(trinuc_counts1, quadnuc, by="trinuc")
trinuc_counts2
}
#' Determines all trinucleotide, pyrimidine-centered trinucleotide contexts and mutation/quadnucleotide contexts
#'
#' @return Data frame trinucleotides, alternate allele and possible quadnucleotide contexts
.get_quadnuc_map = function() {
quadnuc = S4Vectors::expand.grid(.get_all_trinucs(), c("A", "C", "G", "T"))
quadnuc$tag = paste(quadnuc[,1], quadnuc[,2], sep="_")
quadnuc = quadnuc[gsub("^.(.).$", "\\1", quadnuc[,1]) != quadnuc[,2],]
quadnuc = data.frame(as.matrix(quadnuc), stringsAsFactors=F)
quadnuc = rbind(quadnuc, c("indel", "indel", "indel"))
colnames(quadnuc) = c("trinuc", "alt", "quadnuc")
quadnuc
}
#' Selects median value with priority given to larger value
#'
#' @param x Numeric vector of p-values
#'
#' @return Position of median value
#' @export
which_median = function(x) {
mid_pos = ceiling(length(x)/2)
which(x == sort(x)[mid_pos])[1]
}
prepared_sites = try(.prepare_bins_of_sites(sites, window_size, maf,
n_bin = 10, n_min_mut = n_min_mut), silent = TRUE)
prepared_sites
setwd("~/Documents/workfromhome/RM2/")
document()
library(devtools)
library(roxygen2)
## make changes
## update document
setwd("~/Documents/workfromhome/RM2/")
document()
rm(list=ls())
?RM2
rm(list=ls())
?RM2
?RM2final
RM2
library(devtools)
library(roxygen2)
setwd("~/Documents/workfromhome/RM2/")
document()
library(devtools)
library(roxygen2)
## make changes
## update document
setwd("~/Documents/workfromhome/RM2/")
document()
setwd("..")
install("RM2")
## test functions
library(RM2)
data("ctcf_chr3_4")
muts = cbind(mutations_chr3_4, get_mut_trinuc_strand(mutations_chr3_4))
## basic
window_size = 50
results = RM2(maf = muts, sites = ctcf_chr3_4, window_size=window_size)
results
head(ctcf_chr3_4)
dim(ctcf_chr3_4)
mut_class_columns = c("mut_strand")
RM2(maf = muts, sites = ctcf_chr3_4,
window_size=window_size, mut_class_columns = mut_class_columns)
head(maf)
head(muts)
mut_class_columns = c("mut_strand", "ref_alt")
RM2(maf = muts, sites = ctcf_chr3_4,
window_size=window_size, mut_class_columns = mut_class_columns)
warnings()
results = RM2(maf = muts, sites = ctcf_chr3_4,
window_size=window_size, mut_class_columns = mut_class_columns)
supressWarnings(results)
suppressWarnings(results)
dfr = get_mutations_in_flanked_sites(muts, ctcf_chr3_4, window_size)
plot_mutations_in_flanked_sites(dfr, window_size)
?sample
muts$co_factor = sample(c(0,1), nrow(muts))
muts$co_factor = sample(c(0,1), nrow(muts), replace=T)
head(muts)
table(muts$co_factor)
?RM2
results = RM2(maf = muts, sites = ctcf_chr3_4, window_size=window_size,
cofactor_column = "co_factor")
results