Skip to content

Commit

Permalink
Merge pull request #304 from immunomind/dev
Browse files Browse the repository at this point in the history
Immunarch 0.8.0 release
  • Loading branch information
Alexander230 authored Oct 18, 2022
2 parents 7a70786 + 138e7fe commit bc90fed
Show file tree
Hide file tree
Showing 21 changed files with 500 additions and 713 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: immunarch
Type: Package
Title: Bioinformatics Analysis of T-Cell and B-Cell Immune Repertoires
Version: 0.7.0
Version: 0.8.0
Authors@R: c(
person("Vadim I.", "Nazarov", , "[email protected]", c("aut", "cre")),
person("Vasily O.", "Tsvetkov", , role = "aut"),
Expand Down Expand Up @@ -84,6 +84,6 @@ Suggests:
rmarkdown
VignetteBuilder: knitr
Encoding: UTF-8
RoxygenNote: 7.2.0
RoxygenNote: 7.2.1
LazyData: true
LazyDataCompression: xz
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,7 @@ importFrom(purrr,imap)
importFrom(purrr,map)
importFrom(purrr,map2)
importFrom(purrr,map2_chr)
importFrom(purrr,map2_df)
importFrom(purrr,map2_lgl)
importFrom(purrr,map_chr)
importFrom(purrr,map_df)
Expand Down Expand Up @@ -289,6 +290,7 @@ importFrom(tibble,tibble)
importFrom(tidyr,drop_na)
importFrom(tidyr,unite)
importFrom(tidyr,unnest)
importFrom(tidyselect,any_of)
importFrom(tidyselect,starts_with)
importFrom(utils,capture.output)
importFrom(utils,packageVersion)
Expand Down
191 changes: 60 additions & 131 deletions R/align_lineage.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,14 @@
#'
#' @usage
#'
#' repAlignLineage(.data,
#' .min_lineage_sequences, .prepare_threads, .align_threads, .verbose_output, .nofail)
#' repAlignLineage(.data, .min_lineage_sequences, .prepare_threads, .align_threads, .nofail)
#'
#' @param .data The data to be processed. Can be \link{data.frame}, \link{data.table}
#' or a list of these objects.
#'
#' @param .min_lineage_sequences If number of sequences in the same clonal lineage and the same
#' cluster (not including germline) is lower than this threshold, this group of sequences
#' will not be aligned and will not be used in next steps of BCR pipeline
#' (will be saved in output table only if .verbose_output parameter is set to TRUE).
#' will be filtered out from the dataframe; so only large enough lineages will be included.
#'
#' @param .prepare_threads Number of threads to prepare results table.
#' Please note that high number can cause heavy memory usage!
Expand All @@ -43,11 +41,6 @@
#' must contain 'Cluster' column, which is added by seqCluster() function, and 'Germline.sequence'
#' column, which is added by repGermline() function.
#'
#' @param .verbose_output If TRUE, all output dataframe columns will be included (see documentation about this
#' function return), and unaligned clusters will be included in the output. Setting this to TRUE significantly
#' increases memory usage. If FALSE, only aligned clusters and columns required for repClonalFamily() and
#' repSomaticHypermutation() calculation will be included in the output.
#'
#' @param .nofail Will return NA instead of stopping if Clustal W is not installed.
#' Used to avoid raising errors in examples on computers where Clustal W is not installed.
#'
Expand All @@ -57,21 +50,13 @@
#' The dataframe has these columns:
#' * Cluster: cluster name
#' * Germline: germline sequence
#' * V.germline.nt: germline V gene sequence
#' * J.germline.nt: germline J gene sequence
#' * CDR3.germline.length: length of CDR3 in germline
#' * Aligned (included if .verbose_output=TRUE): FALSE if this group of sequences was not aligned with lineage
#' (.min_lineage_sequences is below the threshold); TRUE if it was aligned
#' * Alignment: DNAbin object with alignment or DNAbin object with unaligned sequences (if Aligned=FALSE)
#' * V.length: shortest length of V gene part outside of CDR3 region in this
#' group of sequences; longer V genes (including germline) are trimmed to this length before alignment
#' * J.length: shortest length of J gene part outside of CDR3 region in this
#' group of sequences; longer J genes (including germline) are trimmed to this length before alignment
#' * Alignment: DNAbin object with alignment
#' * Sequences: nested dataframe containing all sequences for this combination
#' of cluster and germline; it has columns
#' Sequence, Clone.ID, Clones, CDR1.nt, CDR2.nt, CDR3.nt, FR1.nt, FR2.nt, FR3.nt, FR4.nt
#' and, if .verbose_output=TRUE, also V.end, J.start, CDR3.start, CDR3.end;
#' all values taken from the input dataframe
#' * Sequence, CDR1.nt, CDR2.nt, CDR3.nt, FR1.nt, FR2.nt, FR3.nt, FR4.nt, V.allele, J.allele,
#' V.aa, J.aa: all values taken from the input dataframe
#' * Clone.ID: taken from the input dataframe, or created (filled with row numbers) if missing
#' * Clones: taken from the input dataframe, or created (filled with '1' values) if missing
#'
#' @examples
#'
Expand All @@ -87,36 +72,45 @@ repAlignLineage <- function(.data,
.min_lineage_sequences = 3,
.prepare_threads = 2,
.align_threads = 4,
.verbose_output = FALSE,
.nofail = FALSE) {
if (!require_system_package("clustalw", error_message = paste0(
if (!require_system_package(c("clustalw", "clustalw2"), error_message = paste0(
"repAlignLineage requires Clustal W app to be installed!\n",
"Please download it from here: http://www.clustal.org/download/current/\n",
"or install it with your system package manager (such as apt or dnf)."
), .nofail)) {
return(get_empty_object_with_class("step_failure_ignored"))
}
if (.min_lineage_sequences < 2) {
warning(
".min_lineage_sequences is set to less than 2; ",
"results will not be valid to build trees with repClonalLineage()!"
)
}

doParallel::registerDoParallel(cores = .prepare_threads)
parallel_prepare <- .prepare_threads > 1
if (parallel_prepare) {
doParallel::registerDoParallel(cores = .prepare_threads)
}
.data %<>%
apply_to_sample_or_list(
align_single_df,
.min_lineage_sequences = .min_lineage_sequences,
.parallel_prepare = .prepare_threads > 1,
.align_threads = .align_threads,
.verbose_output = .verbose_output
.parallel_prepare = parallel_prepare,
.align_threads = .align_threads
)
doParallel::stopImplicitCluster()
if (parallel_prepare) {
doParallel::stopImplicitCluster()
}
return(.data)
}

align_single_df <- function(data,
.min_lineage_sequences,
.parallel_prepare,
.align_threads,
.verbose_output) {
.align_threads) {
for (required_column in c(
"Cluster", "Germline.sequence", "V.germline.nt", "J.germline.nt", "CDR3.germline.length"
"Cluster", "Germline.sequence", "V.allele", "J.allele",
"FR1.nt", "CDR1.nt", "FR2.nt", "CDR2.nt", "FR3.nt", "CDR3.nt", "FR4.nt", "V.aa", "J.aa"
)) {
if (!(required_column %in% colnames(data))) {
stop(
Expand All @@ -129,11 +123,11 @@ align_single_df <- function(data,
}

results <- data %>%
fill_missing_columns() %>%
plyr::dlply(
.variables = .(get("Cluster"), get("Germline.sequence")),
.fun = prepare_results_row,
.min_lineage_sequences = .min_lineage_sequences,
.verbose_output = .verbose_output,
.parallel = .parallel_prepare
) %>%
`[`(!is.na(.)) %>%
Expand All @@ -143,134 +137,69 @@ align_single_df <- function(data,
stop("There are no lineages containing at least ", .min_lineage_sequences, " sequences!")
}

# only required columns are passed to alignment function to reduce consumed memory
if (.verbose_output) {
alignments <- lapply(results, "[", c("Aligned", "Alignment"))
} else {
alignments <- lapply(results, "[", "Alignment")
}
alignments %<>% parallel::mclapply(
align_sequences,
.verbose_output = .verbose_output,
mc.preschedule = TRUE,
mc.cores = .align_threads
)
# only the Alignment column is passed to the alignment function to reduce consumed memory
alignments <- lapply(results, "[", "Alignment") %>%
par_or_normal_lapply(mc.preschedule = TRUE, mc.cores = .align_threads, function(df_row) {
df_row[["Alignment"]] %<>% ape::clustal()
})

return(convert_results_to_df(results, alignments))
}

# Make sure the Clone.ID and Clones columns exist in the input table.
#
# data: a data frame (or compatible table) to check.
#
# A missing Clone.ID column is filled with row numbers and a missing Clones
# column is filled with 1; columns that already exist are left untouched.
# Returns the table with both columns present.
fill_missing_columns <- function(data) {
  n_rows <- nrow(data)
  if (!("Clone.ID" %in% colnames(data))) {
    # seq_len() gives integer(0) for an empty table, whereas seq.int(0) would
    # give c(1, 0) and fail on assignment because of the length mismatch
    data[["Clone.ID"]] <- seq_len(n_rows)
  }
  if (!("Clones" %in% colnames(data))) {
    # build the column at full length so that 0-row tables are handled as well
    data[["Clones"]] <- rep(1L, n_rows)
  }
  return(data)
}

# this function accepts dataframe subset containing rows only for current lineage
# and returns named list containing 1 row for results dataframe
prepare_results_row <- function(lineage_subset, .min_lineage_sequences, .verbose_output) {
cluster_name <- lineage_subset[[1, "Cluster"]]
germline_seq <- lineage_subset[[1, "Germline.sequence"]]
germline_v <- lineage_subset[[1, "V.germline.nt"]]
germline_j <- lineage_subset[[1, "J.germline.nt"]]
germline_cdr3_len <- lineage_subset[[1, "CDR3.germline.length"]]
aligned <- nrow(lineage_subset) >= .min_lineage_sequences

if (!aligned & !.verbose_output) {
prepare_results_row <- function(lineage_subset, .min_lineage_sequences) {
if (nrow(lineage_subset) < .min_lineage_sequences) {
# NA rows will be filtered out
return(NA)
}

lineage_subset[["V.lengths"]] <- v_len_outside_cdr3(
lineage_subset[["V.end"]], lineage_subset[["CDR3.start"]]
)
lineage_subset[["J.lengths"]] <- j_len_outside_cdr3(
lineage_subset[["Sequence"]], lineage_subset[["J.start"]], lineage_subset[["CDR3.end"]]
)
cluster_name <- lineage_subset[[1, "Cluster"]]
germline_seq <- lineage_subset[[1, "Germline.sequence"]]

sequences_columns <- c(
"Sequence", "Clone.ID", "Clones",
"CDR1.nt", "CDR2.nt", "CDR3.nt", "FR1.nt", "FR2.nt", "FR3.nt", "FR4.nt"
"Sequence", "Clone.ID", "Clones", "V.allele", "J.allele",
"CDR1.nt", "CDR2.nt", "CDR3.nt", "FR1.nt", "FR2.nt", "FR3.nt", "FR4.nt", "V.aa", "J.aa"
)
if (.verbose_output) {
sequences_columns %<>% c("V.end", "J.start", "CDR3.start", "CDR3.end")
}

sequences <- lineage_subset[sequences_columns]
sequences[["Clone.ID"]] %<>% as.integer()
sequences[["Clones"]] %<>% as.integer()

germline_v_len <- str_length(germline_v)
germline_j_len <- str_length(germline_j)
v_min_len <- min(lineage_subset[["V.lengths"]], germline_v_len)
j_min_len <- min(lineage_subset[["J.lengths"]], germline_j_len)

germline_trimmed <- trim_seq(germline_seq, germline_v_len, v_min_len, germline_j_len, j_min_len)
clonotypes_trimmed <- trim_seq(
lineage_subset[["Sequence"]],
lineage_subset[["V.lengths"]],
v_min_len,
lineage_subset[["J.lengths"]],
j_min_len
)

clonotypes_names <- sapply(lineage_subset[["Clone.ID"]], function(id) {
paste0("ID_", id)
})
all_sequences_list <- c(list(germline_trimmed), as.list(clonotypes_trimmed))
all_sequences_list <- c(list(germline_seq), as.list(lineage_subset[["Sequence"]]))
names(all_sequences_list) <- c("Germline", clonotypes_names)
alignment <- convert_seq_list_to_dnabin(all_sequences_list)

if (.verbose_output) {
return(list(
Cluster = cluster_name,
Germline = germline_seq,
V.germline.nt = germline_v,
J.germline.nt = germline_j,
CDR3.germline.length = germline_cdr3_len,
Aligned = aligned,
Alignment = alignment,
V.length = v_min_len,
J.length = j_min_len,
Sequences = sequences
))
} else {
return(list(
Cluster = cluster_name,
Germline = germline_seq,
V.germline.nt = germline_v,
J.germline.nt = germline_j,
CDR3.germline.length = germline_cdr3_len,
Alignment = alignment,
V.length = v_min_len,
J.length = j_min_len,
Sequences = sequences
))
}
return(list(
Cluster = cluster_name,
Germline = germline_seq,
Alignment = alignment,
Sequences = sequences
))
}

# Trim the V/J tails of a sequence to the specified lengths v_min, j_min.
#
# seq:   sequence (or vector of sequences) to trim
# v_len: current length of the V tail at the start of seq
# v_min: length the V tail should be reduced to
# j_len: current length of the J tail at the end of seq
# j_min: length the J tail should be reduced to
#
# The substring starts at position v_len - v_min + 1 and its end is passed as
# the negative offset -(j_len - j_min + 1).
# NOTE(review): presumably str_sub is stringr::str_sub, where a negative end
# position counts back from the end of the string; confirm.
trim_seq <- function(seq, v_len, v_min, j_len, j_min) {
str_sub(seq, v_len - v_min + 1, -(j_len - j_min + 1))
}

convert_results_to_df <- function(nested_results_list, nested_alignments_list) {
alignments <- nested_alignments_list %>%
lapply(magrittr::extract2, "Alignment") %>%
tibble(Alignment = .)
convert_results_to_df <- function(nested_results_list, alignments_list) {
alignments <- tibble(Alignment = alignments_list)
sequences <- nested_results_list %>%
lapply(magrittr::extract2, "Sequences") %>%
tibble(Sequences = .)
df <- nested_results_list %>%
lapply(rlist::list.remove, c("Alignment", "Sequences")) %>%
purrr::map_dfr(~.) %>%
cbind(alignments, sequences)
# fix column types after dataframe rebuilding
for (column in c("CDR3.germline.length", "V.length", "J.length")) {
df[[column]] %<>% as.integer()
}
return(df)
}

# Run Clustal W (via ape::clustal) on the "Alignment" element of one results row.
#
# df_row:          list holding an "Alignment" element and, when .verbose_output
#                  is TRUE, an "Aligned" flag
# .verbose_output: if TRUE, the row's "Aligned" flag decides whether the
#                  alignment is run; if FALSE, the alignment is always run
#
# Returns the row; "Alignment" is replaced by the ape::clustal() result when
# the alignment was run, otherwise the row is returned unchanged.
align_sequences <- function(df_row, .verbose_output) {
  run_alignment <- if (.verbose_output) df_row[["Aligned"]] else TRUE
  if (!run_alignment) {
    return(df_row)
  }
  df_row[["Alignment"]] <- ape::clustal(df_row[["Alignment"]])
  df_row
}
1 change: 0 additions & 1 deletion R/clustering.R
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,6 @@ immunr_hclust <- function(.data, .k = 2, .k.max = nrow(.data) - 1, .method = "co
}

immunr_kmeans <- function(.data, .k = 2, .k.max = as.integer(sqrt(nrow(.data))) + 1, .method = c("silhouette", "gap_stat")) {
# res = list(kmeans = add_class(kmeans(as.dist(.data), .k), "immunr_kmeans"),
res <- list(
kmeans = add_class(kmeans(.data, .k), "immunr_kmeans"),
nbclust = add_class(fviz_nbclust(.data, kmeans, k.max = .k.max, .method[1]), "immunr_nbclust"),
Expand Down
Loading

0 comments on commit bc90fed

Please sign in to comment.