Skip to content

Commit

Permalink
Merge pull request #7 from Phuong-Le/develop
Browse files Browse the repository at this point in the history
ENH: added function to recover original samples from trees, complete #3
  • Loading branch information
Phuong-Le authored Sep 6, 2023
2 parents 3710047 + 0183d5a commit 4bf111e
Show file tree
Hide file tree
Showing 20 changed files with 1,209 additions and 1 deletion.
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
^\.Rproj\.user$
^LICENSE\.md$
^README\.Rmd$
^data-raw$
7 changes: 6 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,16 @@ Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
Suggests:
ape,
seqinr,
testthat (>= 3.0.0)
Config/testthat/edition: 3
Imports:
Biostrings,
dplyr,
ggplot2,
ggpubr
ggpubr,
ggtree
Depends:
R (>= 2.10)
LazyData: true
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ export(mut_cols)
export(rv_context)
export(spectra_plot)
export(strand_symmetric)
export(tree_to_samples)
import(ggplot2)
52 changes: 52 additions & 0 deletions R/data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#' Single base substitution (SBS) mutation matrix, split by tree branch
#'
#' @description
#' An SBS matrix that has been mapped to the tree nodes (branches)
#' from the small intestine of one patient.
#' The corresponding tree is stored in `mut_tree`.
#'
#' @format ## `mut_by_branch`
#' A data frame with 33 rows and 96 columns,
#' with columns representing the mutations
#' and rows representing the tree nodes (branches).
#'
#' @source <https://github.com/YichenWang1/small_bowel/blob/main/data/mutation_matrices/sbs_mapped_to_branches.txt>
"mut_by_branch"


#' Single base substitution (SBS) mutation matrix, per sample
#'
#' @description
#' An SBS matrix from the small intestine of one patient.
#' The corresponding tree is stored in `mut_tree`.
#'
#' @format ## `mut_by_sample`
#' A data frame with 21 rows and 96 columns,
#' with columns representing the mutations
#' and rows representing the samples.
#' This dataset is used to test
#' whether `tree_to_samples()` can reconstruct
#' the original samples from
#' information that has been split into branches/nodes, given the tree structure.
#'
#' @source <https://github.com/YichenWang1/small_bowel/blob/main/data/mutation_matrices/sbs_small_bowel_persample.txt>
"mut_by_sample"



#' Mutation tree
#'
#' @description
#' A phylogenetic tree describing how samples
#' from one donor are related to each other.
#' The tips/leaves of the tree are the sample names,
#' whereas the nodes (branches) are estimated by MPBoot.
#'
#'
#' @format ## `mut_tree`
#' A tree read from a Newick file with `ape::read.tree()`
#' (an object of class `phylo`).
#'
#' @source <https://github.com/YichenWang1/small_bowel/blob/main/data/phylogenetic_trees/PD28690_snp_tree_with_branch_length.tree>
"mut_tree"



Binary file added R/sysdata.rda
Binary file not shown.
56 changes: 56 additions & 0 deletions R/tree_to_samples.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#' Recover per-sample values from values that were split across tree branches
#'
#' For every tip/leaf of `tree`, the rows of `branches` that belong to the tip
#' itself and to every ancestor node on the path from that tip to the root are
#' summed. Nodes that have no row in `branches` contribute zero.
#'
#' @param branches a dataframe of values, with a column called 'node' that corresponds to the input tree
#' @param tree a phylogenetic tree of Newick format, where the tip/leaf labels are the same as the original samples
#' @param included_columns the columns in `branches` to be aggregated, default to all except the column 'node'
#' @return a dataframe that contains values for the original samples, these samples are the tips/leaves of the input tree.
#'   The first column, `sample_id`, holds the tip labels; the remaining columns are `included_columns`.
#' @export
#'
#' @examples
#' # the example will use `mut_by_branch` and `mut_tree` incorporated in this package,
#' # original source https://github.com/YichenWang1/small_bowel
#' library(mutationsR)
#' branches = mut_by_branch
#' # in branches, the node ID is incorporated in its rowname
#' branches$node = vapply(
#'   rownames(branches),
#'   function(x) strsplit(x, '_')[[1]][2],
#'   character(1)
#' )
#' tree_to_samples(branches, mut_tree)
tree_to_samples = function(branches, tree, included_columns = NULL) {
  # without a 'node' column nothing can be matched to the tree and every
  # sample would silently come back as zeros
  if (!('node' %in% colnames(branches))) {
    stop("`branches` must contain a column called 'node'", call. = FALSE)
  }

  # convert tree to dataframe format
  tree = ggtree::fortify(tree)

  # processing branches
  if (is.null(included_columns)) {
    included_columns = colnames(branches)[colnames(branches) != 'node']
  }
  available_nodes = as.numeric(branches[['node']])
  branches = dplyr::select(branches, dplyr::all_of(included_columns))

  # rows of the fortified tree that are tips; output row i corresponds to
  # tip_rows[i], so the result does not depend on how the tree rows are ordered
  tip_rows = which(tree$isTip)
  samples = as.data.frame(matrix(
    0,
    nrow = length(tip_rows),
    ncol = length(included_columns)
  ))
  colnames(samples) = included_columns
  sample_names = character(length(tip_rows))

  for (i in seq_along(tip_rows)) {
    node = tree$node[tip_rows[i]]
    sample_names[i] = tree$label[tip_rows[i]]

    # collect the tip and all of its ancestors; the walk stops at the root,
    # which is the node listed as its own parent
    path = node
    parent = tree$parent[tree$node == node]
    while (parent != node) {
      path = c(path, parent)
      node = parent
      parent = tree$parent[tree$node == node]
    }

    # drop = FALSE keeps a data frame even when a single column is selected;
    # with no matching rows colSums() returns zeros
    on_path = available_nodes %in% path
    samples[i, ] = colSums(branches[on_path, , drop = FALSE])
  }

  samples$sample_id = sample_names
  dplyr::select(samples, dplyr::all_of(c('sample_id', included_columns)))
}
1 change: 1 addition & 0 deletions data-raw/PD28690_snp_tree_with_branch_length.tree
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
(((((PD28690bp_SB1_A9:3037,PD28690bp_SB1_H8:2611)80:0,PD28690bp_SB1_G8:1732)100:15,(PD28690bt_SB2_F10:3600,PD28690bt_SB2_H10:3463)100:34)52:1,(((((PD28690bp_SB1_D8:3508,PD28690bp_SB1_G9:2596)5:0,((PD28690bt_SB2_C11:2889,PD28690bt_SB2_E11:2659)100:10,((PD28690bt_SB2_F11:806,PD28690bt_SB2_G11:397)100:2402,(PD28690bt_SB3_E5:14,PD28690bt_SB3_F5:13)100:3345)100:2)100:22)1:0,(PD28690bt_SB2_A11:3454,(PD28690bt_SB3_B5:446,PD28690bt_SB3_C5:191)100:1521)100:18)1:0,(PD28690bp_SB1_H9:4486,PD28690bt_SB2_G10:3052)45:1)0:0,PD28690bp_SB1_E9:3457)0:0)4:0,(PD28690bp_SB1_B8:4184,PD28690bp_SB1_B9:3575)100:9);
18 changes: 18 additions & 0 deletions data-raw/mut_by_branch.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
## code to prepare `mut_by_branch` dataset goes here

# check.names = FALSE keeps the mutation labels in the header unmodified
mut_by_branch = read.table("./data-raw/sbs_mapped_to_branches.txt", header = TRUE, check.names = FALSE)

# allowing just one donor
mut_by_branch = mut_by_branch[grep("PD28690", rownames(mut_by_branch)),]

# change column names
# Convert a single mutation label of the form "<center>,<flank1>-<flank2>"
# (e.g. "C>A,A-T") into the bracket notation "<flank1>[<center>]<flank2>"
# (e.g. "A[C>A]T").
to_cosmic_muts = function(mut) {
  # split once on the comma: substitution first, flanking bases second
  parts = strsplit(mut, ",")[[1]]
  center = parts[1]
  flanks = strsplit(parts[2], "-")[[1]]
  # paste0() has no `sep` argument, so none is passed
  paste0(flanks[1], "[", center, "]", flanks[2])
}
# relabel every column; vapply() guarantees exactly one string per column
colnames(mut_by_branch) = vapply(
  colnames(mut_by_branch),
  to_cosmic_muts,
  character(1),
  USE.NAMES = FALSE
)

usethis::use_data(mut_by_branch, overwrite = TRUE)
20 changes: 20 additions & 0 deletions data-raw/mut_by_sample.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
## code to prepare `mut_by_sample` dataset goes here

# check.names = FALSE keeps the mutation labels in the header unmodified
mut_by_sample = read.table("./data-raw/sbs_small_bowel_persample.txt", header = TRUE, check.names = FALSE)

# allowing just one donor
mut_by_sample = mut_by_sample[grep("PD28690", rownames(mut_by_sample)),]

# change column names
# Convert a single mutation label of the form "<center>,<flank1>-<flank2>"
# (e.g. "C>A,A-T") into the bracket notation "<flank1>[<center>]<flank2>"
# (e.g. "A[C>A]T").
to_cosmic_muts = function(mut) {
  # split once on the comma: substitution first, flanking bases second
  parts = strsplit(mut, ",")[[1]]
  center = parts[1]
  flanks = strsplit(parts[2], "-")[[1]]
  # paste0() has no `sep` argument, so none is passed
  paste0(flanks[1], "[", center, "]", flanks[2])
}
# relabel every column; vapply() guarantees exactly one string per column
colnames(mut_by_sample) = vapply(
  colnames(mut_by_sample),
  to_cosmic_muts,
  character(1),
  USE.NAMES = FALSE
)

# the dataset is stored twice: internally (R/sysdata.rda) and as exported data;
# overwrite = TRUE on both calls keeps this script re-runnable
usethis::use_data(mut_by_sample, internal = TRUE, overwrite = TRUE)
usethis::use_data(mut_by_sample, overwrite = TRUE)
4 changes: 4 additions & 0 deletions data-raw/mut_tree.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
## code to prepare `mut_tree` dataset goes here

# parse the Newick file kept in data-raw/ and save the result as package data
mut_tree = ape::read.tree(file = "./data-raw/PD28690_snp_tree_with_branch_length.tree")
usethis::use_data(mut_tree, overwrite = TRUE)
Loading

0 comments on commit 4bf111e

Please sign in to comment.