-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #7 from Phuong-Le/develop
ENH: added function to recover original samples from trees, complete #3
- Loading branch information
Showing
20 changed files
with
1,209 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,3 +2,4 @@ | |
^\.Rproj\.user$ | ||
^LICENSE\.md$ | ||
^README\.Rmd$ | ||
^data-raw$ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
#' Single base substitution (SBS) mutation matrix | ||
#' | ||
#' @description | ||
#' an SBS matrix that has been mapped to the tree nodes (branches) | ||
#' from the small intestine of one patient. | ||
#' The tree is stored in `mut_tree` | ||
#' | ||
#' @format ## `SBS mutation matrix` | ||
#' A data frame with 33 rows and 96 columns, | ||
#' with columns representing the mutations | ||
#' and rows representing the tree nodes (branches) | ||
#' | ||
#' @source <https://github.com/YichenWang1/small_bowel/blob/main/data/mutation_matrices/sbs_mapped_to_branches.txt> | ||
"mut_by_branch" | ||
|
||
|
||
#' Single base substitution (SBS) mutation matrix | ||
#' | ||
#' @description | ||
#' an SBS matrix from the small intestine of one patient. | ||
#' The tree is stored in `mut_tree` | ||
#' | ||
#' @format ## `SBS mutation matrix` | ||
#' A data frame with 21 rows and 96 columns, | ||
#' with columns representing the mutations | ||
#' and rows representing the samples. | ||
#' This dataset is used to test | ||
#' whether `tree_to_sample` can reconstruct | ||
#' the original samples from | ||
#' information that has been split to branches/nodes and its tree structure. | ||
#' | ||
#' @source <https://github.com/YichenWang1/small_bowel/blob/main/data/mutation_matrices/sbs_small_bowel_persample.txt> | ||
"mut_by_sample" | ||
|
||
|
||
|
||
#' Mutation tree | ||
#' | ||
#' @description | ||
#' a phylogenetic tree describing how samples | ||
#' from one donor are related to each other. | ||
#' The "tips/leaves" of the tree are the sample names | ||
#' whereas the nodes (branches) are estimated by MPBoot. | ||
#' | ||
#' | ||
#' @format ## `Newick tree` | ||
#' A tree object read from a Newick file with `ape::read.tree()`. | ||
#' | ||
#' @source <https://github.com/YichenWang1/small_bowel/blob/main/data/phylogenetic_trees/PD28690_snp_tree_with_branch_length.tree> | ||
"mut_tree" | ||
|
||
|
||
|
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
#' Recover per-sample values from values that were split across tree branches
#'
#' @description
#' For every tip/leaf of the tree, the values (mutations/exposures) attached
#' to the tip itself and to every node on the path from that tip up to the
#' root (root included) are summed, giving one row per original sample.
#' Nodes that have no row in `branches` contribute zero.
#'
#' @param branches a dataframe of numeric values, with a column called 'node'
#'   that corresponds to the node IDs of the input tree
#' @param tree a phylogenetic tree of Newick format, where the tip/leaf labels
#'   are the same as the original samples
#' @param included_columns the columns in `branches` to be aggregated,
#'   default to all except the column 'node'
#' @return a dataframe that contains values for the original samples, these
#'   samples are the tips/leaves of the input tree. The first column,
#'   `sample_id`, holds the tip labels; the remaining columns are
#'   `included_columns`.
#' @export
#'
#' @examples
#' # the example will use `mut_by_branch` and `mut_tree` incorporated in this
#' # package, original source https://github.com/YichenWang1/small_bowel
#' library(mutationsR)
#' branches <- mut_by_branch
#' # in branches, the node ID is incorporated in its rowname
#' branches$node <- vapply(
#'   rownames(branches),
#'   function(x) strsplit(x, "_")[[1]][2],
#'   character(1)
#' )
#' tree_to_samples(branches, mut_tree)
tree_to_samples <- function(branches, tree, included_columns = NULL) {
  if (!"node" %in% colnames(branches)) {
    stop("`branches` must contain a column called 'node'.", call. = FALSE)
  }

  # convert tree to dataframe format (one row per node)
  tree <- ggtree::fortify(tree)

  # processing branches
  if (is.null(included_columns)) {
    included_columns <- colnames(branches)[colnames(branches) != "node"]
  }
  # as.character() first so that factor node IDs are parsed by their labels
  # rather than by their internal integer codes
  available_nodes <- as.numeric(as.character(branches$node))
  branches <- dplyr::select(branches, dplyr::all_of(included_columns))

  tip_rows <- which(tree$isTip)
  # One row per tip, filled by position in `tip_rows` (not by tree row index),
  # so the result has exactly one row per tip whatever the row order of `tree`.
  totals <- matrix(
    0,
    nrow = length(tip_rows),
    ncol = length(included_columns)
  )

  for (i in seq_along(tip_rows)) {
    # Collect every node on the path tip -> root. The loop stops once a node
    # is its own parent, which is how the root is detected.
    node <- tree$node[tip_rows[i]]
    path <- node
    parent <- tree$parent[match(node, tree$node)]
    while (parent != node) {
      path <- c(path, parent)
      node <- parent
      parent <- tree$parent[match(node, tree$node)]
    }

    # %in% never yields NA, so unparsable node IDs are simply ignored, and
    # several rows mapped to the same node are summed together.
    on_path <- available_nodes %in% path
    if (any(on_path)) {
      totals[i, ] <- colSums(branches[on_path, , drop = FALSE])
    }
  }

  samples <- as.data.frame(totals)
  colnames(samples) <- included_columns
  samples$sample_id <- tree$label[tip_rows]
  dplyr::select(samples, dplyr::all_of(c("sample_id", included_columns)))
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
(((((PD28690bp_SB1_A9:3037,PD28690bp_SB1_H8:2611)80:0,PD28690bp_SB1_G8:1732)100:15,(PD28690bt_SB2_F10:3600,PD28690bt_SB2_H10:3463)100:34)52:1,(((((PD28690bp_SB1_D8:3508,PD28690bp_SB1_G9:2596)5:0,((PD28690bt_SB2_C11:2889,PD28690bt_SB2_E11:2659)100:10,((PD28690bt_SB2_F11:806,PD28690bt_SB2_G11:397)100:2402,(PD28690bt_SB3_E5:14,PD28690bt_SB3_F5:13)100:3345)100:2)100:22)1:0,(PD28690bt_SB2_A11:3454,(PD28690bt_SB3_B5:446,PD28690bt_SB3_C5:191)100:1521)100:18)1:0,(PD28690bp_SB1_H9:4486,PD28690bt_SB2_G10:3052)45:1)0:0,PD28690bp_SB1_E9:3457)0:0)4:0,(PD28690bp_SB1_B8:4184,PD28690bp_SB1_B9:3575)100:9); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
## code to prepare `mut_by_branch` dataset goes here

# Read the per-branch SBS matrix. `check.names = FALSE` keeps the column
# names exactly as they appear in the file header.
mut_by_branch <- read.table(
  "./data-raw/sbs_mapped_to_branches.txt",
  header = TRUE,
  check.names = FALSE
)

# allowing just one donor
mut_by_branch <- mut_by_branch[grep("PD28690", rownames(mut_by_branch)), ]
|
||
# change column names | ||
# Convert one mutation label of the form "<center>,<flank1>-<flank2>"
# into "<flank1>[<center>]<flank2>".
#
# mut: a single character string.
# Returns a single character string.
to_cosmic_muts <- function(mut) {
  # Split once on the comma: fields[1] is the centre, fields[2] the flanks.
  fields <- strsplit(mut, ",")[[1]]
  flanks <- strsplit(fields[2], "-")[[1]]
  # paste0() has no `sep` argument, so none is passed.
  paste0(flanks[1], "[", fields[1], "]", flanks[2])
}
# Relabel every column with `to_cosmic_muts()`. vapply() guarantees an
# unnamed character vector with exactly one label per column.
colnames(mut_by_branch) <- vapply(
  colnames(mut_by_branch),
  to_cosmic_muts,
  character(1),
  USE.NAMES = FALSE
)

usethis::use_data(mut_by_branch, overwrite = TRUE)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
## code to prepare `mut_by_sample` dataset goes here

# Read the per-sample SBS matrix. `check.names = FALSE` keeps the column
# names exactly as they appear in the file header.
mut_by_sample <- read.table(
  "./data-raw/sbs_small_bowel_persample.txt",
  header = TRUE,
  check.names = FALSE
)

# allowing just one donor
mut_by_sample <- mut_by_sample[grep("PD28690", rownames(mut_by_sample)), ]
|
||
# change column names | ||
# Convert one mutation label of the form "<center>,<flank1>-<flank2>"
# into "<flank1>[<center>]<flank2>".
#
# mut: a single character string.
# Returns a single character string.
to_cosmic_muts <- function(mut) {
  # Split once on the comma: fields[1] is the centre, fields[2] the flanks.
  fields <- strsplit(mut, ",")[[1]]
  flanks <- strsplit(fields[2], "-")[[1]]
  # paste0() has no `sep` argument, so none is passed.
  paste0(flanks[1], "[", fields[1], "]", flanks[2])
}
# Relabel every column with `to_cosmic_muts()`. vapply() guarantees an
# unnamed character vector with exactly one label per column.
colnames(mut_by_sample) <- vapply(
  colnames(mut_by_sample),
  to_cosmic_muts,
  character(1),
  USE.NAMES = FALSE
)

# The dataset is stored twice: once as internal package data and once as
# exported data.
# NOTE(review): the internal call has no `overwrite = TRUE`, unlike the
# exported one - confirm whether re-running this script is expected to work.
usethis::use_data(mut_by_sample, internal = TRUE)
usethis::use_data(mut_by_sample, overwrite = TRUE)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
## code to prepare `mut_tree` dataset goes here

# Parse the Newick file for the donor's tree, then store the result as
# package data (replacing any previously saved copy).
mut_tree <- ape::read.tree(
  file = "./data-raw/PD28690_snp_tree_with_branch_length.tree"
)
usethis::use_data(mut_tree, overwrite = TRUE)
Oops, something went wrong.