Merge pull request #304 from immunomind/dev

Immunarch 0.8.0 release
immunomind · Oct 18, 2022 · bc90fed · bc90fed
2 parents 7a70786 + 138e7fe
commit bc90fed
Show file tree

Hide file tree

Showing 21 changed files with 500 additions and 713 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: immunarch
 Type: Package
 Title: Bioinformatics Analysis of T-Cell and B-Cell Immune Repertoires
-Version: 0.7.0
+Version: 0.8.0
 Authors@R: c(
     person("Vadim I.", "Nazarov", , "[email protected]", c("aut", "cre")),
     person("Vasily O.", "Tsvetkov", , role = "aut"),
@@ -84,6 +84,6 @@ Suggests:
     rmarkdown
 VignetteBuilder: knitr
 Encoding: UTF-8
-RoxygenNote: 7.2.0
+RoxygenNote: 7.2.1
 LazyData: true
 LazyDataCompression: xz
diff --git a/NAMESPACE b/NAMESPACE
@@ -208,6 +208,7 @@ importFrom(purrr,imap)
 importFrom(purrr,map)
 importFrom(purrr,map2)
 importFrom(purrr,map2_chr)
+importFrom(purrr,map2_df)
 importFrom(purrr,map2_lgl)
 importFrom(purrr,map_chr)
 importFrom(purrr,map_df)
@@ -289,6 +290,7 @@ importFrom(tibble,tibble)
 importFrom(tidyr,drop_na)
 importFrom(tidyr,unite)
 importFrom(tidyr,unnest)
+importFrom(tidyselect,any_of)
 importFrom(tidyselect,starts_with)
 importFrom(utils,capture.output)
 importFrom(utils,packageVersion)

diff --git a/R/align_lineage.R b/R/align_lineage.R
@@ -23,16 +23,14 @@
 #'
 #' @usage
 #'
-#' repAlignLineage(.data,
-#' .min_lineage_sequences, .prepare_threads, .align_threads, .verbose_output, .nofail)
+#' repAlignLineage(.data, .min_lineage_sequences, .prepare_threads, .align_threads, .nofail)
 #'
 #' @param .data The data to be processed. Can be \link{data.frame}, \link{data.table}
 #' or a list of these objects.
 #'
 #' @param .min_lineage_sequences If the number of sequences in the same clonal lineage and the same
 #' cluster (not including germline) is lower than this threshold, this group of sequences
-#' will not be aligned and will not be used in next steps of BCR pipeline
-#' (will be saved in output table only if .verbose_output parameter is set to TRUE).
+#' will be filtered out from the dataframe; so only large enough lineages will be included.
 #'
 #' @param .prepare_threads Number of threads to prepare results table.
 #' Please note that a high number can cause heavy memory usage!
@@ -43,11 +41,6 @@
 #' must contain 'Cluster' column, which is added by seqCluster() function, and 'Germline.sequence'
 #' column, which is added by repGermline() function.
 #'
-#' @param .verbose_output If TRUE, all output dataframe columns will be included (see documentation about this
-#' function return), and unaligned clusters will be included in the output. Setting this to TRUE significantly
-#' increases memory usage. If FALSE, only aligned clusters and columns required for repClonalFamily() and
-#' repSomaticHypermutation() calculation will be included in the output.
-#'
 #' @param .nofail Will return NA instead of stopping if Clustal W is not installed.
 #' Used to avoid raising errors in examples on computers where Clustal W is not installed.
 #'
@@ -57,21 +50,13 @@
 #' The dataframe has these columns:
 #' * Cluster: cluster name
 #' * Germline: germline sequence
-#' * V.germline.nt: germline V gene sequence
-#' * J.germline.nt: germline J gene sequence
-#' * CDR3.germline.length: length of CDR3 in germline
-#' * Aligned (included if .verbose_output=TRUE): FALSE if this group of sequences was not aligned with lineage
-#'   (.min_lineage_sequences is below the threshold); TRUE if it was aligned
-#' * Alignment: DNAbin object with alignment or DNAbin object with unaligned sequences (if Aligned=FALSE)
-#' * V.length: shortest length of V gene part outside of CDR3 region in this
-#'   group of sequences; longer V genes (including germline) are trimmed to this length before alignment
-#' * J.length: shortest length of J gene part outside of CDR3 region in this
-#'   group of sequences; longer J genes (including germline) are trimmed to this length before alignment
+#' * Alignment: DNAbin object with alignment
 #' * Sequences: nested dataframe containing all sequences for this combination
 #'   of cluster and germline; it has columns
-#'   Sequence, Clone.ID, Clones, CDR1.nt, CDR2.nt, CDR3.nt, FR1.nt, FR2.nt, FR3.nt, FR4.nt
-#'   and, if .verbose_output=TRUE, also V.end, J.start, CDR3.start, CDR3.end;
-#'   all values taken from the input dataframe
+#'   * Sequence, CDR1.nt, CDR2.nt, CDR3.nt, FR1.nt, FR2.nt, FR3.nt, FR4.nt, V.allele, J.allele,
+#'     V.aa, J.aa: all values taken from the input dataframe
+#'   * Clone.ID: taken from the input dataframe, or created (filled with row numbers) if missing
+#'   * Clones: taken from the input dataframe, or created (filled with '1' values) if missing
 #'
 #' @examples
 #'
@@ -87,36 +72,45 @@ repAlignLineage <- function(.data,
                             .min_lineage_sequences = 3,
                             .prepare_threads = 2,
                             .align_threads = 4,
-                            .verbose_output = FALSE,
                             .nofail = FALSE) {
-  if (!require_system_package("clustalw", error_message = paste0(
+  if (!require_system_package(c("clustalw", "clustalw2"), error_message = paste0(
     "repAlignLineage requires Clustal W app to be installed!\n",
     "Please download it from here: http://www.clustal.org/download/current/\n",
     "or install it with your system package manager (such as apt or dnf)."
   ), .nofail)) {
     return(get_empty_object_with_class("step_failure_ignored"))
   }
+  if (.min_lineage_sequences < 2) {
+    warning(
+      ".min_lineage_sequences is set to less than 2; ",
+      "results will not be valid to build trees with repClonalLineage()!"
+    )
+  }
 
-  doParallel::registerDoParallel(cores = .prepare_threads)
+  parallel_prepare <- .prepare_threads > 1
+  if (parallel_prepare) {
+    doParallel::registerDoParallel(cores = .prepare_threads)
+  }
   .data %<>%
     apply_to_sample_or_list(
       align_single_df,
       .min_lineage_sequences = .min_lineage_sequences,
-      .parallel_prepare = .prepare_threads > 1,
-      .align_threads = .align_threads,
-      .verbose_output = .verbose_output
+      .parallel_prepare = parallel_prepare,
+      .align_threads = .align_threads
     )
-  doParallel::stopImplicitCluster()
+  if (parallel_prepare) {
+    doParallel::stopImplicitCluster()
+  }
   return(.data)
 }
 
 align_single_df <- function(data,
                             .min_lineage_sequences,
                             .parallel_prepare,
-                            .align_threads,
-                            .verbose_output) {
+                            .align_threads) {
   for (required_column in c(
-    "Cluster", "Germline.sequence", "V.germline.nt", "J.germline.nt", "CDR3.germline.length"
+    "Cluster", "Germline.sequence", "V.allele", "J.allele",
+    "FR1.nt", "CDR1.nt", "FR2.nt", "CDR2.nt", "FR3.nt", "CDR3.nt", "FR4.nt", "V.aa", "J.aa"
   )) {
     if (!(required_column %in% colnames(data))) {
       stop(
@@ -129,11 +123,11 @@ align_single_df <- function(data,
   }
 
   results <- data %>%
+    fill_missing_columns() %>%
     plyr::dlply(
       .variables = .(get("Cluster"), get("Germline.sequence")),
       .fun = prepare_results_row,
       .min_lineage_sequences = .min_lineage_sequences,
-      .verbose_output = .verbose_output,
       .parallel = .parallel_prepare
     ) %>%
     `[`(!is.na(.)) %>%
@@ -143,134 +137,69 @@ align_single_df <- function(data,
     stop("There are no lineages containing at least ", .min_lineage_sequences, " sequences!")
   }
 
-  # only required columns are passed to alignment function to reduce consumed memory
-  if (.verbose_output) {
-    alignments <- lapply(results, "[", c("Aligned", "Alignment"))
-  } else {
-    alignments <- lapply(results, "[", "Alignment")
-  }
-  alignments %<>% parallel::mclapply(
-    align_sequences,
-    .verbose_output = .verbose_output,
-    mc.preschedule = TRUE,
-    mc.cores = .align_threads
-  )
+  # only the Alignment column is passed to the alignment function to reduce consumed memory
+  alignments <- lapply(results, "[", "Alignment") %>%
+    par_or_normal_lapply(mc.preschedule = TRUE, mc.cores = .align_threads, function(df_row) {
+      df_row[["Alignment"]] %<>% ape::clustal()
+    })
 
   return(convert_results_to_df(results, alignments))
 }
 
+# fill Clone.ID and Clones columns if they are missing
+fill_missing_columns <- function(data) {
+  if (!("Clone.ID" %in% colnames(data))) {
+    data[["Clone.ID"]] <- seq.int(nrow(data))
+  }
+  if (!("Clones" %in% colnames(data))) {
+    data[["Clones"]] <- as.integer(1)
+  }
+  return(data)
+}
+
 # this function accepts a dataframe subset containing rows only for the current lineage
 # and returns a named list containing 1 row for the results dataframe
-prepare_results_row <- function(lineage_subset, .min_lineage_sequences, .verbose_output) {
-  cluster_name <- lineage_subset[[1, "Cluster"]]
-  germline_seq <- lineage_subset[[1, "Germline.sequence"]]
-  germline_v <- lineage_subset[[1, "V.germline.nt"]]
-  germline_j <- lineage_subset[[1, "J.germline.nt"]]
-  germline_cdr3_len <- lineage_subset[[1, "CDR3.germline.length"]]
-  aligned <- nrow(lineage_subset) >= .min_lineage_sequences
-
-  if (!aligned & !.verbose_output) {
+prepare_results_row <- function(lineage_subset, .min_lineage_sequences) {
+  if (nrow(lineage_subset) < .min_lineage_sequences) {
+    # NA rows will be filtered out
     return(NA)
   }
 
-  lineage_subset[["V.lengths"]] <- v_len_outside_cdr3(
-    lineage_subset[["V.end"]], lineage_subset[["CDR3.start"]]
-  )
-  lineage_subset[["J.lengths"]] <- j_len_outside_cdr3(
-    lineage_subset[["Sequence"]], lineage_subset[["J.start"]], lineage_subset[["CDR3.end"]]
-  )
+  cluster_name <- lineage_subset[[1, "Cluster"]]
+  germline_seq <- lineage_subset[[1, "Germline.sequence"]]
 
   sequences_columns <- c(
-    "Sequence", "Clone.ID", "Clones",
-    "CDR1.nt", "CDR2.nt", "CDR3.nt", "FR1.nt", "FR2.nt", "FR3.nt", "FR4.nt"
+    "Sequence", "Clone.ID", "Clones", "V.allele", "J.allele",
+    "CDR1.nt", "CDR2.nt", "CDR3.nt", "FR1.nt", "FR2.nt", "FR3.nt", "FR4.nt", "V.aa", "J.aa"
   )
-  if (.verbose_output) {
-    sequences_columns %<>% c("V.end", "J.start", "CDR3.start", "CDR3.end")
-  }
+
   sequences <- lineage_subset[sequences_columns]
   sequences[["Clone.ID"]] %<>% as.integer()
   sequences[["Clones"]] %<>% as.integer()
 
-  germline_v_len <- str_length(germline_v)
-  germline_j_len <- str_length(germline_j)
-  v_min_len <- min(lineage_subset[["V.lengths"]], germline_v_len)
-  j_min_len <- min(lineage_subset[["J.lengths"]], germline_j_len)
-
-  germline_trimmed <- trim_seq(germline_seq, germline_v_len, v_min_len, germline_j_len, j_min_len)
-  clonotypes_trimmed <- trim_seq(
-    lineage_subset[["Sequence"]],
-    lineage_subset[["V.lengths"]],
-    v_min_len,
-    lineage_subset[["J.lengths"]],
-    j_min_len
-  )
-
   clonotypes_names <- sapply(lineage_subset[["Clone.ID"]], function(id) {
     paste0("ID_", id)
   })
-  all_sequences_list <- c(list(germline_trimmed), as.list(clonotypes_trimmed))
+  all_sequences_list <- c(list(germline_seq), as.list(lineage_subset[["Sequence"]]))
   names(all_sequences_list) <- c("Germline", clonotypes_names)
   alignment <- convert_seq_list_to_dnabin(all_sequences_list)
 
-  if (.verbose_output) {
-    return(list(
-      Cluster = cluster_name,
-      Germline = germline_seq,
-      V.germline.nt = germline_v,
-      J.germline.nt = germline_j,
-      CDR3.germline.length = germline_cdr3_len,
-      Aligned = aligned,
-      Alignment = alignment,
-      V.length = v_min_len,
-      J.length = j_min_len,
-      Sequences = sequences
-    ))
-  } else {
-    return(list(
-      Cluster = cluster_name,
-      Germline = germline_seq,
-      V.germline.nt = germline_v,
-      J.germline.nt = germline_j,
-      CDR3.germline.length = germline_cdr3_len,
-      Alignment = alignment,
-      V.length = v_min_len,
-      J.length = j_min_len,
-      Sequences = sequences
-    ))
-  }
+  return(list(
+    Cluster = cluster_name,
+    Germline = germline_seq,
+    Alignment = alignment,
+    Sequences = sequences
+  ))
 }
 
-# trim V/J tails in sequence to the specified lenghts v_min, j_min
-trim_seq <- function(seq, v_len, v_min, j_len, j_min) {
-  str_sub(seq, v_len - v_min + 1, -(j_len - j_min + 1))
-}
-
-convert_results_to_df <- function(nested_results_list, nested_alignments_list) {
-  alignments <- nested_alignments_list %>%
-    lapply(magrittr::extract2, "Alignment") %>%
-    tibble(Alignment = .)
+convert_results_to_df <- function(nested_results_list, alignments_list) {
+  alignments <- tibble(Alignment = alignments_list)
   sequences <- nested_results_list %>%
     lapply(magrittr::extract2, "Sequences") %>%
     tibble(Sequences = .)
   df <- nested_results_list %>%
     lapply(rlist::list.remove, c("Alignment", "Sequences")) %>%
     purrr::map_dfr(~.) %>%
     cbind(alignments, sequences)
-  # fix column types after dataframe rebuilding
-  for (column in c("CDR3.germline.length", "V.length", "J.length")) {
-    df[[column]] %<>% as.integer()
-  }
   return(df)
 }
-
-align_sequences <- function(df_row, .verbose_output) {
-  if (.verbose_output) {
-    aligned <- df_row[["Aligned"]]
-  } else {
-    aligned <- TRUE
-  }
-  if (aligned) {
-    df_row[["Alignment"]] %<>% ape::clustal()
-  }
-  return(df_row)
-}
diff --git a/R/clustering.R b/R/clustering.R
@@ -73,7 +73,6 @@ immunr_hclust <- function(.data, .k = 2, .k.max = nrow(.data) - 1, .method = "co
 }
 
 immunr_kmeans <- function(.data, .k = 2, .k.max = as.integer(sqrt(nrow(.data))) + 1, .method = c("silhouette", "gap_stat")) {
-  # res = list(kmeans = add_class(kmeans(as.dist(.data), .k), "immunr_kmeans"),
   res <- list(
     kmeans = add_class(kmeans(.data, .k), "immunr_kmeans"),
     nbclust = add_class(fviz_nbclust(.data, kmeans, k.max = .k.max, .method[1]), "immunr_nbclust"),