From de119f7f92a29bd9f3c5558a30790ebcd0df598b Mon Sep 17 00:00:00 2001 From: Maria Date: Fri, 5 Aug 2022 14:11:04 +0300 Subject: [PATCH 01/46] Update _pkgdown.yml --- _pkgdown.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/_pkgdown.yml b/_pkgdown.yml index 29b16ceb..b288f7a6 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -135,6 +135,7 @@ reference: - has_concept("germline") - has_concept("align_lineage") - has_concept("phylip") + - has_concept("somatic_hypermutation") - title: Basic immune repertoire statistics - subtitle: Exploratory data analysis contents: From 8223d08506ba53eb4d5044f36aba6818065f4b36 Mon Sep 17 00:00:00 2001 From: Aleksandr Popov Date: Fri, 12 Aug 2022 12:57:58 +0200 Subject: [PATCH 02/46] version bump --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 59cccbcf..72dd01d1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: immunarch Type: Package Title: Bioinformatics Analysis of T-Cell and B-Cell Immune Repertoires -Version: 0.7.0 +Version: 0.8.0 Authors@R: c( person("Vadim I.", "Nazarov", , "support@immunomind.io", c("aut", "cre")), person("Vasily O.", "Tsvetkov", , role = "aut"), From 7e0aacfa4c819059102d9d085153102c5ca68cf6 Mon Sep 17 00:00:00 2001 From: Aleksandr Popov Date: Fri, 12 Aug 2022 12:58:10 +0200 Subject: [PATCH 03/46] site build fix --- _pkgdown.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/_pkgdown.yml b/_pkgdown.yml index 29b16ceb..b288f7a6 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -135,6 +135,7 @@ reference: - has_concept("germline") - has_concept("align_lineage") - has_concept("phylip") + - has_concept("somatic_hypermutation") - title: Basic immune repertoire statistics - subtitle: Exploratory data analysis contents: From cb3495b2b80a6873a16f880fe934467d3cac270f Mon Sep 17 00:00:00 2001 From: Maria Date: Fri, 12 Aug 2022 15:13:12 +0300 Subject: [PATCH 04/46] Site fix + new BCR visualisations --- vignettes/web_only/BCRpipeline.Rmd | 56 +++++++++++++++++++----------- vignettes/web_only/load_10x.Rmd | 6 ++-- vignettes/web_only/load_mixcr.Rmd | 4 +-- 3 files changed, 40 insertions(+), 26 deletions(-) diff --git a/vignettes/web_only/BCRpipeline.Rmd b/vignettes/web_only/BCRpipeline.Rmd index 5f2a8c8e..abc3febf 100644 --- a/vignettes/web_only/BCRpipeline.Rmd +++ b/vignettes/web_only/BCRpipeline.Rmd @@ -167,10 +167,12 @@ The function has several parameters: ```{r example 8, results = 'hide'} # take clusters that contain at least 1 sequence +bcr_data <- bcrdata$data align_dt <- bcr_data %>% - seqCluster(seqDist(bcr_data), .fixed_threshold = 3) %>% - repGermline(.threads = 1) %>% - repAlignLineage(.min_lineage_sequences = 2, .align_threads = 2) + seqCluster(seqDist(bcr_data, .col = 'CDR3.nt', .group_by_seqLength = TRUE), + .perc_similarity = 0.6) %>% + repGermline(.threads = 2) %>% + repAlignLineage(.min_lineage_sequences = 6, .align_threads = 2, .nofail = TRUE) ``` - `.prepare_threads` — the number of threads to prepare results table. A high number can cause memory overload! @@ -217,17 +219,27 @@ sudo apt-get install -y phylip repClonalFamily usage example: ```{r example 10, results = 'hide'} -library(ape) -bcr <- bcr_data %>% - seqCluster(seqDist(bcr_data), .fixed_threshold = 3) %>% - repGermline(.threads = 1) %>% - repAlignLineage(.min_lineage_sequences = 2) %>% - repClonalFamily(.threads = 2) - +bcr <- align_dt %>% + repClonalFamily(.threads = 2, .nofail = TRUE) #plot visualization of the first tree -plot(bcr$full_clones$Tree[[1]]) -tiplabels() -nodelabels() +vis(bcr[["full_clones"]][["TreeStats"]][[1]]) +``` +For each cluster tree is represented as table (The default number of clones for CommonAncestor, Germline, Presumable is 1): + +```{r example 10.1, results = 'hide'} +#example fot first tree +bcr[["full_clones"]][["TreeStats"]][[1]] +``` + +You could recolor leaves. For example, we recolor leaves where number of AA mutations is not 0: + +```{r example 10.3, results = 'hide'} +#take sequence where number of AA mutations is not 0 +f <- bcr[["full_clones"]][["TreeStats"]][[1]] +#rename this leaves +f[f$DistanceAA != 0, ]['Type'] = 'mutationAA' +#new tree +vis(f) ``` We have found 4 clusters: @@ -252,6 +264,13 @@ We have calculated a trunk length for each cluster: bcr$full_clones[ , c('Cluster', 'Trunk.Length') ] ``` +Also trunk length specified in "TreeStats" table in column "DistanceNT". + +```{r example 10.2, results = 'hide'} +#example fot first tree +bcr[["full_clones"]][["TreeStats"]][[1]][1, ] +``` + # Somatic hypermutation analysis The rate of somatic hypermutation allows us to estimate repertoire maturation and detect the type of mutation contributing to the emergence of high affinity antibodies. This makes somatic hypermutation analysis a valuable asset for B-cell repertoire analysis. @@ -261,12 +280,7 @@ In Immunarch, `repSomaticHypermutation()` function is designed for hypermutation ```{r example 14, , warning = FALSE} bcr_data <- bcrdata$data -shm_data <- bcr_data %>% - seqCluster(seqDist(bcr_data), .fixed_threshold = 3) %>% - repGermline(.threads = 2) %>% - repAlignLineage(.min_lineage_sequences = 2, .align_threads = 2, .nofail = TRUE) %>% - repClonalFamily(.threads = 2, .nofail = TRUE) %>% - repSomaticHypermutation(.threads = 2, .nofail = TRUE) +shm_data <- bcr %>% repSomaticHypermutation(.threads = 2, .nofail = TRUE) ``` The function repSomaticHypermutation() takes V and J germline sequences and V and J clonotype sequences as an input. Then the function aligns germline and clonotype sequences to detect and calculate occurring mutations. @@ -291,13 +305,13 @@ shm_data$full_clones$FR4.nt[1] Example: aligning germline and clonotype V sequences: ```{r example 16} -image(shm_data$full_clones$Germline.Alignment.V[[1]], grid = TRUE) +image(shm_data$full_clones$Germline.Alignment.V[[3]], grid = TRUE) ``` Example: aligning germline and clonotype J sequences: ```{r example 17} -image(shm_data$full_clones$Germline.Alignment.J[[1]], grid = TRUE) +image(shm_data$full_clones$Germline.Alignment.J[[3]], grid = TRUE) ``` The number of mutations for each clonotype sequence: diff --git a/vignettes/web_only/load_10x.Rmd b/vignettes/web_only/load_10x.Rmd index 14451681..f976dfd5 100644 --- a/vignettes/web_only/load_10x.Rmd +++ b/vignettes/web_only/load_10x.Rmd @@ -28,7 +28,7 @@ The 10x Genomics Chromium Single Cell Immune Profiling Solution enables simultan - 5' gene expression. - Cell surface proteins/antigen specificity (feature barcodes) at single-cell resolution for the same set of cells. -Their end-to-end pipeline also includes the Cell Ranger software, which include the following pipelines for Immune profiling analysis: +Their end-to-end pipeline also includes the Cell Ranger software, which include the following pipelines for Immune profiling analysis: `cellranger mkfastq` demultiplexes raw base call (BCL) files generated by Illumina sequencers into FASTQ files. It is a wrapper around Illumina's bcl2fastq, with additional useful features that are specific to 10x libraries and a simplified sample sheet format. @@ -38,7 +38,7 @@ Their end-to-end pipeline also includes the here to use Cellranger on your data. Note that Cellranger is currently only supported by Linux operating systems. The `cellranger count` and `cellranger vdj` methods will be particularly useful. +Follow the instructions here to use Cellranger on your data. Note that Cellranger is currently only supported by Linux operating systems. The `cellranger count` and `cellranger vdj` methods will be particularly useful. You can find sample data of full runs to download here. In this tutorial, we use this single cell mouse data. If you are using `immunarch` you can download only the .csv files. @@ -122,7 +122,7 @@ $meta 6 vdj_v1_mm_c57bl6_splenocytes_t_consensus_annotations_TRB TRB vdj_v1_mm_c57bl6_splenocytes_t_consensus_annotations ``` -Congrats! Now your data is ready for exploration. Follow the steps [here](https://immunarch.com/articles/v3_basic_analysis.html) to learn more about how to explore your dataset. +Congrats! Now your data is ready for exploration. Follow the steps [here](https://immunarch.com/articles/web_only/v3_basic_analysis.html) to learn more about how to explore your dataset. ## Note on barcodes diff --git a/vignettes/web_only/load_mixcr.Rmd b/vignettes/web_only/load_mixcr.Rmd index 41fa4549..14fcfdf7 100644 --- a/vignettes/web_only/load_mixcr.Rmd +++ b/vignettes/web_only/load_mixcr.Rmd @@ -30,7 +30,7 @@ Some of its features include: - Extracting repertoire data even from regular RNA-Seq - Successfully analysing full-length antibody data -Find more info about MiXCR at here +Find more info about MiXCR at here # Prepare MiXCR Data Follow these instructions here to install MiXCR and get started with processing data. **Important Note:** Currently supports MiXCR version 3 and above. @@ -251,4 +251,4 @@ $meta 3 analysis.clonotypes.IGH_3 F 3 A ``` -Congrats! Now your data is ready for exploration. Follow the steps [here](https://immunarch.com/articles/v3_basic_analysis.html) to learn more about how to explore your dataset. +Congrats! Now your data is ready for exploration. Follow the steps [here](https://immunarch.com/articles/web_only/v3_basic_analysis.html) to learn more about how to explore your dataset. From a2cec776231862a84e271b62c23cc55e5ec52586 Mon Sep 17 00:00:00 2001 From: Maria Date: Fri, 12 Aug 2022 16:13:52 +0300 Subject: [PATCH 05/46] new Fixes --- vignettes/web_only/BCRpipeline.Rmd | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vignettes/web_only/BCRpipeline.Rmd b/vignettes/web_only/BCRpipeline.Rmd index abc3febf..5db7e207 100644 --- a/vignettes/web_only/BCRpipeline.Rmd +++ b/vignettes/web_only/BCRpipeline.Rmd @@ -171,7 +171,7 @@ bcr_data <- bcrdata$data align_dt <- bcr_data %>% seqCluster(seqDist(bcr_data, .col = 'CDR3.nt', .group_by_seqLength = TRUE), .perc_similarity = 0.6) %>% - repGermline(.threads = 2) %>% + repGermline(.threads = 1) %>% repAlignLineage(.min_lineage_sequences = 6, .align_threads = 2, .nofail = TRUE) ``` @@ -231,12 +231,12 @@ For each cluster tree is represented as table (The default number of clones for bcr[["full_clones"]][["TreeStats"]][[1]] ``` -You could recolor leaves. For example, we recolor leaves where number of AA mutations is not 0: +You can recolor leaves. For example, we recolor leaves where number of AA mutations is not 0: ```{r example 10.3, results = 'hide'} #take sequence where number of AA mutations is not 0 f <- bcr[["full_clones"]][["TreeStats"]][[1]] -#rename this leaves +#rename these leaves f[f$DistanceAA != 0, ]['Type'] = 'mutationAA' #new tree vis(f) From ca2541d1d83f2348494c53d6256747fb98f82d0f Mon Sep 17 00:00:00 2001 From: Aleksandr Popov Date: Fri, 19 Aug 2022 12:44:09 +0200 Subject: [PATCH 06/46] hotfix for wrong check in fixVis() --- R/shiny.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/shiny.R b/R/shiny.R index 96c32536..f1279728 100644 --- a/R/shiny.R +++ b/R/shiny.R @@ -80,7 +80,7 @@ fixVis <- function(.plot = NA) { do.call(tabPanel, objs) } - if (is.na(.plot)) { + if (has_no_data(.plot)) { diamonds <- ggplot2::diamonds .plot <- qplot(x = carat, y = price, fill = cut, shape = cut, color = color, size = clarity, data = diamonds[sample.int(nrow(diamonds), 5000), ]) + theme_classic() } From 84e23f50a0251eb04b09457e33b55824b023e252 Mon Sep 17 00:00:00 2001 From: Aleksandr Popov Date: Fri, 19 Aug 2022 14:25:19 +0200 Subject: [PATCH 07/46] distances calculation fixes WIP --- R/phylip.R | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/R/phylip.R b/R/phylip.R index bdd7c13b..7d00f3ea 100644 --- a/R/phylip.R +++ b/R/phylip.R @@ -265,47 +265,48 @@ process_cluster <- function(cluster_row) { germline <- tree_stats[which(tree_stats["Type"] == "Germline"), ][1, "Sequence"] common_ancestor <- tree_stats[which(tree_stats["Type"] == "CommonAncestor"), ][1, "Sequence"] - # calculate distances of all sequences from germline - germline_v <- str_sub(germline, 1, v_trimmed_length) - germline_j <- str_sub(germline, -j_trimmed_length) - germline_nt_chars <- strsplit(paste0(germline_v, germline_j), "")[[1]] - - germline_v_aa <- bunch_translate(substring(germline_v, v_aa_frame_start), - .two.way = FALSE, .ignore.n = TRUE - ) - germline_j_aa <- bunch_translate(substring(germline_j, j_aa_frame_start), - .two.way = FALSE, .ignore.n = TRUE - ) - germline_aa_chars <- strsplit(paste0(germline_v_aa, germline_j_aa), "")[[1]] + # calculate distances of all sequences from their ancestors for (row in seq_len(nrow(tree_stats))) { - if (tree_stats[row, "Type"] != "Germline") { + if (!has_no_data(tree_stats[row, "Ancestor"])) { seq <- tree_stats[row, "Sequence"] + ancestor <- tree_stats[row, "Ancestor"] seq_v <- str_sub(seq, 1, v_trimmed_length) seq_j <- str_sub(seq, -j_trimmed_length) + ancestor_v <- str_sub(ancestor, 1, v_trimmed_length) + ancestor_j <- str_sub(ancestor, -j_trimmed_length) seq_nt_chars <- strsplit(paste0(seq_v, seq_j), "")[[1]] - if (length(germline_nt_chars) != length(seq_nt_chars)) { + ancestor_nt_chars <- strsplit(paste0(ancestor_v, ancestor_j), "")[[1]] + if (length(seq_nt_chars) != length(ancestor_nt_chars)) { warning( - "Germline and sequence lengths are different; ", + "Sequence and ancestor lengths are different; ", "something is wrong in repClonalFamily calculation!\n", - "germline_nt_chars = ", germline_nt_chars, "\nseq_nt_chars = ", seq_nt_chars + "seq_nt_chars = ", seq_nt_chars, "\nancestor_nt_chars = ", ancestor_nt_chars ) } - tree_stats[row, "DistanceNT"] <- sum(germline_nt_chars != seq_nt_chars) + tree_stats[row, "DistanceNT"] <- sum(seq_nt_chars != ancestor_nt_chars) + seq_v_aa <- bunch_translate(substring(seq_v, v_aa_frame_start), .two.way = FALSE, .ignore.n = TRUE ) seq_j_aa <- bunch_translate(substring(seq_j, j_aa_frame_start), .two.way = FALSE, .ignore.n = TRUE ) + ancestor_v_aa <- bunch_translate(substring(ancestor_v, v_aa_frame_start), + .two.way = FALSE, .ignore.n = TRUE + ) + ancestor_j_aa <- bunch_translate(substring(ancestor_j, j_aa_frame_start), + .two.way = FALSE, .ignore.n = TRUE + ) seq_aa_chars <- strsplit(paste0(seq_v_aa, seq_j_aa), "")[[1]] - if (length(germline_aa_chars) != length(seq_aa_chars)) { + ancestor_aa_chars <- strsplit(paste0(ancestor_v_aa, ancestor_j_aa), "")[[1]] + if (length(seq_aa_chars) != length(ancestor_aa_chars)) { warning( - "Germline and sequence lengths are different; ", + "Sequence and ancestor lengths are different; ", "something is wrong in repClonalFamily calculation!\n", - "germline_aa_chars = ", germline_aa_chars, "\nseq_aa_chars = ", seq_aa_chars + "seq_aa_chars = ", seq_aa_chars, "\ancestor_aa_chars = ", ancestor_aa_chars ) } - tree_stats[row, "DistanceAA"] <- sum(germline_aa_chars != seq_aa_chars) + tree_stats[row, "DistanceAA"] <- sum(seq_aa_chars != ancestor_aa_chars) } } From 77734d80c9419c52ed835512a8e8a00e25ac0b26 Mon Sep 17 00:00:00 2001 From: Aleksandr Popov Date: Mon, 22 Aug 2022 15:51:44 +0200 Subject: [PATCH 08/46] bugfixes --- R/phylip.R | 5 +++-- R/tools.R | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/R/phylip.R b/R/phylip.R index 7d00f3ea..a6c5fb28 100644 --- a/R/phylip.R +++ b/R/phylip.R @@ -269,7 +269,8 @@ process_cluster <- function(cluster_row) { for (row in seq_len(nrow(tree_stats))) { if (!has_no_data(tree_stats[row, "Ancestor"])) { seq <- tree_stats[row, "Sequence"] - ancestor <- tree_stats[row, "Ancestor"] + ancestor_name <- tree_stats[row, "Ancestor"] + ancestor <- tree_stats[which(tree_stats["Name"] == ancestor_name), ][1, "Sequence"] seq_v <- str_sub(seq, 1, v_trimmed_length) seq_j <- str_sub(seq, -j_trimmed_length) ancestor_v <- str_sub(ancestor, 1, v_trimmed_length) @@ -303,7 +304,7 @@ process_cluster <- function(cluster_row) { warning( "Sequence and ancestor lengths are different; ", "something is wrong in repClonalFamily calculation!\n", - "seq_aa_chars = ", seq_aa_chars, "\ancestor_aa_chars = ", ancestor_aa_chars + "seq_aa_chars = ", seq_aa_chars, "\nancestor_aa_chars = ", ancestor_aa_chars ) } tree_stats[row, "DistanceAA"] <- sum(seq_aa_chars != ancestor_aa_chars) diff --git a/R/tools.R b/R/tools.R index dfcda321..3408569c 100644 --- a/R/tools.R +++ b/R/tools.R @@ -490,7 +490,7 @@ as_numeric_or_fail <- function(.string) { } has_no_data <- function(.data) { - any(sapply(list(NA, NULL, NaN), identical, .data)) + any(sapply(list(NA, NULL, NaN), identical, .data)) | all(is.na(.data)) } # apply function to .data if it's a single sample or to each sample if .data is a list of samples From 66181cbe2776a07077c6de93803935347e24343f Mon Sep 17 00:00:00 2001 From: Aleksandr Popov Date: Thu, 25 Aug 2022 17:23:39 +0200 Subject: [PATCH 09/46] fix for AA frame starts for V genes in repClonalFamily() --- DESCRIPTION | 2 +- R/align_lineage.R | 27 +++++++++++++++++---- R/phylip.R | 54 +++++++++++++++++++++++++++++++++++------- man/repAlignLineage.Rd | 2 ++ 4 files changed, 72 insertions(+), 13 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 72dd01d1..71393fc4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -84,6 +84,6 @@ Suggests: rmarkdown VignetteBuilder: knitr Encoding: UTF-8 -RoxygenNote: 7.2.0 +RoxygenNote: 7.2.1 LazyData: true LazyDataCompression: xz diff --git a/R/align_lineage.R b/R/align_lineage.R index 68af4c55..64917ad4 100644 --- a/R/align_lineage.R +++ b/R/align_lineage.R @@ -72,6 +72,8 @@ #' Sequence, Clone.ID, Clones, CDR1.nt, CDR2.nt, CDR3.nt, FR1.nt, FR2.nt, FR3.nt, FR4.nt #' and, if .verbose_output=TRUE, also V.end, J.start, CDR3.start, CDR3.end; #' all values taken from the input dataframe +#' * AA.frame.starts: start positions for amino acid translation for germline and all sequences +#' after trimming (possible values: 1, 2 and 3) #' #' @examples #' @@ -96,6 +98,12 @@ repAlignLineage <- function(.data, ), .nofail)) { return(get_empty_object_with_class("step_failure_ignored")) } + if (.min_lineage_sequences < 2) { + warning( + ".min_lineage_sequences is set to less than 2; ", + "results will not be valid to build trees with repClonalLineage()!" + ) + } doParallel::registerDoParallel(cores = .prepare_threads) .data %<>% @@ -212,6 +220,12 @@ prepare_results_row <- function(lineage_subset, .min_lineage_sequences, .verbose names(all_sequences_list) <- c("Germline", clonotypes_names) alignment <- convert_seq_list_to_dnabin(all_sequences_list) + # calculate amino acid frame starts for all trimmed sequences including germline + germline_aa_start <- (germline_v_len - v_min_len) %% 3 + 1 + clonotypes_aa_starts <- (lineage_subset[["V.lengths"]] - v_min_len) %% 3 + 1 + all_sequences_aa_starts <- c(list(germline_aa_start), as.list(clonotypes_aa_starts)) + names(all_sequences_aa_starts) <- names(all_sequences_list) + if (.verbose_output) { return(list( Cluster = cluster_name, @@ -223,7 +237,8 @@ prepare_results_row <- function(lineage_subset, .min_lineage_sequences, .verbose Alignment = alignment, V.length = v_min_len, J.length = j_min_len, - Sequences = sequences + Sequences = sequences, + AA.frame.starts = all_sequences_aa_starts )) } else { return(list( @@ -235,7 +250,8 @@ prepare_results_row <- function(lineage_subset, .min_lineage_sequences, .verbose Alignment = alignment, V.length = v_min_len, J.length = j_min_len, - Sequences = sequences + Sequences = sequences, + AA.frame.starts = all_sequences_aa_starts )) } } @@ -252,10 +268,13 @@ convert_results_to_df <- function(nested_results_list, nested_alignments_list) { sequences <- nested_results_list %>% lapply(magrittr::extract2, "Sequences") %>% tibble(Sequences = .) + frame_starts <- nested_results_list %>% + lapply(magrittr::extract2, "AA.frame.starts") %>% + tibble(AA.frame.starts = .) df <- nested_results_list %>% - lapply(rlist::list.remove, c("Alignment", "Sequences")) %>% + lapply(rlist::list.remove, c("Alignment", "Sequences", "AA.frame.starts")) %>% purrr::map_dfr(~.) %>% - cbind(alignments, sequences) + cbind(alignments, sequences, frame_starts) # fix column types after dataframe rebuilding for (column in c("CDR3.germline.length", "V.length", "J.length")) { df[[column]] %<>% as.integer() diff --git a/R/phylip.R b/R/phylip.R index a6c5fb28..c2eb0065 100644 --- a/R/phylip.R +++ b/R/phylip.R @@ -89,7 +89,7 @@ repClonalFamily <- function(.data, .threads = parallel::detectCores(), .nofail = process_dataframe <- function(df, .threads, sample_name = NA) { required_columns <- c( "Cluster", "Germline", "V.germline.nt", "J.germline.nt", "CDR3.germline.length", - "V.length", "J.length", "Alignment", "Sequences" + "V.length", "J.length", "Alignment", "Sequences", "AA.frame.starts" ) for (column in required_columns) { if (!(column %in% colnames(df))) { @@ -129,18 +129,18 @@ process_dataframe <- function(df, .threads, sample_name = NA) { } process_cluster <- function(cluster_row) { - # alignment and sequences should be extracted from 1-element lists because of these columns format + # alignment, sequences and aa_frame_starts should be extracted from 1-element lists + # because of these columns format alignment <- cluster_row[["Alignment"]][[1]] sequences <- cluster_row[["Sequences"]][[1]] + v_aa_frame_starts <- cluster_row[["AA.frame.starts"]][[1]] cluster_name <- cluster_row[["Cluster"]] cdr3_germline_length <- cluster_row[["CDR3.germline.length"]] v_trimmed_length <- cluster_row[["V.length"]] j_trimmed_length <- cluster_row[["J.length"]] - # positions of starting nucleotide for translation of sequences to amino acids - v_full_length <- str_length(cluster_row[["V.germline.nt"]]) - v_aa_frame_start <- (v_full_length - v_trimmed_length) %% 3 + 1 - j_aa_frame_start <- (v_full_length + cdr3_germline_length) %% 3 + 1 + # positions of J gene starting nucleotide for translation of sequences to amino acids + j_aa_frame_start <- (str_length(cluster_row[["V.germline.nt"]]) + cdr3_germline_length) %% 3 + 1 temp_dir <- file.path(tempdir(check = TRUE), uuid::UUIDgenerate(use.time = FALSE)) dir.create(temp_dir) @@ -286,13 +286,18 @@ process_cluster <- function(cluster_row) { } tree_stats[row, "DistanceNT"] <- sum(seq_nt_chars != ancestor_nt_chars) - seq_v_aa <- bunch_translate(substring(seq_v, v_aa_frame_start), + seq_v_aa_frame_start <- get_v_aa_frame_start(v_aa_frame_starts, tree_stats[row, "Name"]) + seq_v_aa <- bunch_translate(substring(seq_v, seq_v_aa_frame_start), .two.way = FALSE, .ignore.n = TRUE ) seq_j_aa <- bunch_translate(substring(seq_j, j_aa_frame_start), .two.way = FALSE, .ignore.n = TRUE ) - ancestor_v_aa <- bunch_translate(substring(ancestor_v, v_aa_frame_start), + ancestor_v_aa_frame_start <- get_v_aa_frame_start( + v_aa_frame_starts, + tree_stats[row, "Ancestor"] + ) + ancestor_v_aa <- bunch_translate(substring(ancestor_v, ancestor_v_aa_frame_start), .two.way = FALSE, .ignore.n = TRUE ) ancestor_j_aa <- bunch_translate(substring(ancestor_j, j_aa_frame_start), @@ -308,6 +313,31 @@ process_cluster <- function(cluster_row) { ) } tree_stats[row, "DistanceAA"] <- sum(seq_aa_chars != ancestor_aa_chars) + + if (grepl("*", paste0(seq_v_aa, seq_j_aa), fixed = TRUE)) { + warning( + "Wrong translation from NT to AA found in repClonalFamily:\n", + "seq_name = ", tree_stats[row, "Name"], "\n", + "seq_v_nt = ", seq_v, "\n", + "seq_v_aa = ", seq_v_aa, "\n", + "seq_v_aa_frame_start = ", seq_v_aa_frame_start, "\n", + "seq_j_nt = ", seq_j, "\n", + "seq_j_aa = ", seq_j_aa, "\n", + "j_aa_frame_start = ", j_aa_frame_start, "\n" + ) + } + if (grepl("*", paste0(ancestor_v_aa, ancestor_j_aa), fixed = TRUE)) { + warning( + "Wrong translation from NT to AA found in repClonalFamily:\n", + "ancestor_name = ", tree_stats[row, "Ancestor"], "\n", + "ancestor_v_nt = ", ancestor_v, "\n", + "ancestor_v_aa = ", ancestor_v_aa, "\n", + "ancestor_v_aa_frame_start = ", ancestor_v_aa_frame_start, "\n", + "ancestor_j_nt = ", ancestor_j, "\n", + "ancestor_j_aa = ", ancestor_j_aa, "\n", + "j_aa_frame_start = ", j_aa_frame_start, "\n" + ) + } } } @@ -337,6 +367,14 @@ process_cluster <- function(cluster_row) { )) } +get_v_aa_frame_start <- function(v_aa_frame_starts, seq_name) { + if (seq_name %in% names(v_aa_frame_starts)) { + return(v_aa_frame_starts[[seq_name]]) + } else { + return(v_aa_frame_starts[["Germline"]]) + } +} + convert_nested_to_df <- function(nested_results_list) { tree <- nested_results_list %>% lapply(magrittr::extract2, "Tree") %>% diff --git a/man/repAlignLineage.Rd b/man/repAlignLineage.Rd index 2b39ba64..d56d1fb7 100644 --- a/man/repAlignLineage.Rd +++ b/man/repAlignLineage.Rd @@ -53,6 +53,8 @@ The dataframe has these columns: Sequence, Clone.ID, Clones, CDR1.nt, CDR2.nt, CDR3.nt, FR1.nt, FR2.nt, FR3.nt, FR4.nt and, if .verbose_output=TRUE, also V.end, J.start, CDR3.start, CDR3.end; all values taken from the input dataframe +* AA.frame.starts: start positions for amino acid translation for germline and all sequences + after trimming (possible values: 1, 2 and 3) } \description{ This function aligns all sequences (incliding germline) that belong to one clonal From 8933415314310eecd7172744f9ddbc28066be510 Mon Sep 17 00:00:00 2001 From: ivan-immunomind Date: Wed, 7 Sep 2022 16:02:34 +0300 Subject: [PATCH 10/46] Add grouping vars to use in joininig .data and clustering results --- R/seqCluster.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/seqCluster.R b/R/seqCluster.R index d3d43d1d..6590ecf8 100644 --- a/R/seqCluster.R +++ b/R/seqCluster.R @@ -45,6 +45,7 @@ seqCluster <- function(.data, .dist, .perc_similarity, .nt_similarity, .fixed_threshold = 10) { matching_col <- attr(.dist, "col") + grouping_cols <- attr(.dist, "group_by") if (length(.data) != length(.dist)) { stop(".data and .dist lengths do not match!") } @@ -111,6 +112,7 @@ seqCluster <- function(.data, .dist, .perc_similarity, .nt_similarity, .fixed_th map_df(~.x) res <- rbind(result_single, result_multi) colnames(res) <- c(matching_col, "Cluster") + res[grouping_cols]<-str_split(str_split(res[['Cluster']],pattern = '_',simplify=T)[,1],pattern = '/',simplify = T) return(res) } clusters <- map(.dist, ~ graph_clustering(.x, threshold_fun = .threshold_fun)) From 87a19951e6a6edc371fde0de03139f86fee8bb19 Mon Sep 17 00:00:00 2001 From: ivan-immunomind <83207157+ivan-immunomind@users.noreply.github.com> Date: Thu, 8 Sep 2022 12:48:03 +0300 Subject: [PATCH 11/46] Update R/seqCluster.R Co-authored-by: Aleksandr Popov --- R/seqCluster.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/seqCluster.R b/R/seqCluster.R index 6590ecf8..6b245e4f 100644 --- a/R/seqCluster.R +++ b/R/seqCluster.R @@ -112,7 +112,7 @@ seqCluster <- function(.data, .dist, .perc_similarity, .nt_similarity, .fixed_th map_df(~.x) res <- rbind(result_single, result_multi) colnames(res) <- c(matching_col, "Cluster") - res[grouping_cols]<-str_split(str_split(res[['Cluster']],pattern = '_',simplify=T)[,1],pattern = '/',simplify = T) + res[grouping_cols] <- str_split(str_split(res[["Cluster"]], pattern = "_", simplify = TRUE)[, 1], pattern = "/", simplify = TRUE) return(res) } clusters <- map(.dist, ~ graph_clustering(.x, threshold_fun = .threshold_fun)) From 630c3aa1c68b44eb385c72fadfe20c1f0fc2f8e9 Mon Sep 17 00:00:00 2001 From: ivan-immunomind Date: Fri, 9 Sep 2022 14:23:30 +0300 Subject: [PATCH 12/46] Fix case when split produced more columns than grouping variables --- R/seqCluster.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/seqCluster.R b/R/seqCluster.R index 6b245e4f..3b5b4791 100644 --- a/R/seqCluster.R +++ b/R/seqCluster.R @@ -112,7 +112,7 @@ seqCluster <- function(.data, .dist, .perc_similarity, .nt_similarity, .fixed_th map_df(~.x) res <- rbind(result_single, result_multi) colnames(res) <- c(matching_col, "Cluster") - res[grouping_cols] <- str_split(str_split(res[["Cluster"]], pattern = "_", simplify = TRUE)[, 1], pattern = "/", simplify = TRUE) + res[grouping_cols] <- str_split(str_split(res[["Cluster"]], pattern = "_", simplify = TRUE)[, 1], pattern = "/", simplify = TRUE)[,1:length(grouping_cols)] return(res) } clusters <- map(.dist, ~ graph_clustering(.x, threshold_fun = .threshold_fun)) From 411f1c56c5cfe636dab67d3fee4fc7a94994f4d6 Mon Sep 17 00:00:00 2001 From: Aleksandr Popov Date: Wed, 7 Sep 2022 14:57:59 +0200 Subject: [PATCH 13/46] repGermline tests WIP --- tests/testthat/test-germline.R | 40 ++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 tests/testthat/test-germline.R diff --git a/tests/testthat/test-germline.R b/tests/testthat/test-germline.R new file mode 100644 index 00000000..89f7c77c --- /dev/null +++ b/tests/testthat/test-germline.R @@ -0,0 +1,40 @@ +data(bcrdata) +test_bcr_data <- bcrdata$data %>% top(1000) +test_clusters <- seqCluster(test_bcr_data, seqDist(test_bcr_data), .fixed_threshold = 3) + +positive_test_cases <- list( + +) + +for (i in seq_along(positive_test_cases)) { + # Arrange + + # Act + + # Assert + +} + +negative_test_cases <- list( + "List of lists" = list( + args = list( + .data = bcrdata + ) + ), + "Missing column" = list( + args = list( + .data = subset(test_clusters[["full_clones"]], select = -c(FR1.nt)) + ) + ) +) + +for (i in seq_along(negative_test_cases)) { + # Arrange + test_name <- names(negative_test_cases)[i] + args <- negative_test_cases[[i]][["args"]] + + # Act, Assert + test_that(test_name, { + expect_error(suppressWarnings(do.call(repGermline, args))) + }) +} From 583c56b6e3a8de81db6fa8d6e26d12c960b4e4a8 Mon Sep 17 00:00:00 2001 From: Aleksandr Popov Date: Thu, 8 Sep 2022 18:37:16 +0200 Subject: [PATCH 14/46] Simple tests for germline --- tests/testthat/test-germline.R | 48 +++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/tests/testthat/test-germline.R b/tests/testthat/test-germline.R index 89f7c77c..aae7fe0d 100644 --- a/tests/testthat/test-germline.R +++ b/tests/testthat/test-germline.R @@ -1,18 +1,52 @@ data(bcrdata) test_bcr_data <- bcrdata$data %>% top(1000) -test_clusters <- seqCluster(test_bcr_data, seqDist(test_bcr_data), .fixed_threshold = 3) positive_test_cases <- list( - + "Not empty result" = list( + args = list( + .data = test_bcr_data, + .threads = 1 + ), + assert_function = function(result) { + expect_equal(immunarch:::has_no_data(result), FALSE) + expect_equal(nrow(result[["full_clones"]]) > 0, TRUE) + } + ), + "Multiple threads" = list( + args = list( + .data = test_bcr_data + ), + assert_function = function(result) { + expect_equal(immunarch:::has_no_data(result), FALSE) + expect_equal(nrow(result[["full_clones"]]) > 0, TRUE) + } + ), + "Dataframe only" = list( + args = list( + .data = test_bcr_data[["full_clones"]], + .threads = 1 + ), + assert_function = function(result) { + expect_equal(immunarch:::has_no_data(result), FALSE) + expect_equal(nrow(result) > 0, TRUE) + } + ) ) for (i in seq_along(positive_test_cases)) { # Arrange + test_name <- names(positive_test_cases)[i] + args <- positive_test_cases[[i]][["args"]] + assert_function <- positive_test_cases[[i]][["assert_function"]] # Act + result <- suppressWarnings(do.call(repGermline, args)) # Assert - + test_that( + test_name, + assert_function(result) + ) } negative_test_cases <- list( @@ -23,7 +57,8 @@ negative_test_cases <- list( ), "Missing column" = list( args = list( - .data = subset(test_clusters[["full_clones"]], select = -c(FR1.nt)) + .data = subset(test_bcr_data[["full_clones"]], select = -c(FR1.nt)), + .threads = 1 ) ) ) @@ -34,7 +69,8 @@ for (i in seq_along(negative_test_cases)) { args <- negative_test_cases[[i]][["args"]] # Act, Assert - test_that(test_name, { + test_that( + test_name, expect_error(suppressWarnings(do.call(repGermline, args))) - }) + ) } From aab99efdbcda618a98c4f56e0cedd9e89de0ac00 Mon Sep 17 00:00:00 2001 From: Aleksandr Popov Date: Mon, 12 Sep 2022 15:30:33 +0200 Subject: [PATCH 15/46] Update R/seqCluster.R --- R/seqCluster.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/seqCluster.R b/R/seqCluster.R index 3b5b4791..89fb6ac3 100644 --- a/R/seqCluster.R +++ b/R/seqCluster.R @@ -112,7 +112,7 @@ seqCluster <- function(.data, .dist, .perc_similarity, .nt_similarity, .fixed_th map_df(~.x) res <- rbind(result_single, result_multi) colnames(res) <- c(matching_col, "Cluster") - res[grouping_cols] <- str_split(str_split(res[["Cluster"]], pattern = "_", simplify = TRUE)[, 1], pattern = "/", simplify = TRUE)[,1:length(grouping_cols)] + res[grouping_cols] <- str_split(str_split(res[["Cluster"]], pattern = "_", simplify = TRUE)[, 1], pattern = "/", simplify = TRUE)[, 1:length(grouping_cols)] return(res) } clusters <- map(.dist, ~ graph_clustering(.x, threshold_fun = .threshold_fun)) From 8b1608007f7f4c62fb285b90ffba31f1259f4f66 Mon Sep 17 00:00:00 2001 From: Aleksandr Popov Date: Thu, 15 Sep 2022 13:30:29 +0200 Subject: [PATCH 16/46] Optional parallelization for repGermline() --- R/germline.R | 18 ++++++++++++------ R/tools.R | 9 +++++++++ 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/R/germline.R b/R/germline.R index aa335523..e56baf53 100644 --- a/R/germline.R +++ b/R/germline.R @@ -127,13 +127,17 @@ germline_single_df <- function(data, } calculate_germlines_parallel <- function(data, align_j_gene, threads, sample_name) { - cluster <- makeCluster(threads) - clusterExport(cluster, c("generate_germline_sequence", "align_and_find_j_start", "sample_name"), - envir = environment() - ) + if (threads == 1) { + cluster <- NA + } else { + cluster <- makeCluster(threads) + clusterExport(cluster, c("generate_germline_sequence", "align_and_find_j_start", "sample_name"), + envir = environment() + ) + } # rowwise parallel calculation of new columns that are added to data - data <- parApply(cluster, data, 1, function(row) { + data <- par_or_normal_apply(cluster, data, 1, function(row) { generate_germline_sequence( seq = row[["Sequence"]], v_ref = row[["V.ref.nt"]], @@ -156,7 +160,9 @@ calculate_germlines_parallel <- function(data, align_j_gene, threads, sample_nam germline_handle_warnings() %>% cbind(data, .) - stopCluster(cluster) + if (!has_no_data(cluster)) { + stopCluster(cluster) + } return(data) } diff --git a/R/tools.R b/R/tools.R index 3408569c..3ce1bd51 100644 --- a/R/tools.R +++ b/R/tools.R @@ -656,3 +656,12 @@ quiet <- function(procedure, capture_output = FALSE) { suppressWarnings() } } + +# call normal or parallel apply, depending on existence of parallelization cluster +par_or_normal_apply <- function(cluster, ...) { + if (has_no_data(cluster)) { + return(apply(...)) + } else { + return(parApply(cluster, ...)) + } +} From d6ff9fc5ea15f83b7f7c9545f1327a03922a972c Mon Sep 17 00:00:00 2001 From: Aleksandr Popov Date: Thu, 15 Sep 2022 13:54:46 +0200 Subject: [PATCH 17/46] Optional parallelization for repAlignLineage() --- R/align_lineage.R | 13 +++++++++---- R/tools.R | 9 +++++++++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/R/align_lineage.R b/R/align_lineage.R index 64917ad4..5fa1a334 100644 --- a/R/align_lineage.R +++ b/R/align_lineage.R @@ -105,16 +105,21 @@ repAlignLineage <- function(.data, ) } - doParallel::registerDoParallel(cores = .prepare_threads) + parallel_prepare <- .prepare_threads > 1 + if (parallel_prepare) { + doParallel::registerDoParallel(cores = .prepare_threads) + } .data %<>% apply_to_sample_or_list( align_single_df, .min_lineage_sequences = .min_lineage_sequences, - .parallel_prepare = .prepare_threads > 1, + .parallel_prepare = parallel_prepare, .align_threads = .align_threads, .verbose_output = .verbose_output ) - doParallel::stopImplicitCluster() + if (parallel_prepare) { + doParallel::stopImplicitCluster() + } return(.data) } @@ -157,7 +162,7 @@ align_single_df <- function(data, } else { alignments <- lapply(results, "[", "Alignment") } - alignments %<>% parallel::mclapply( + alignments %<>% par_or_normal_lapply( align_sequences, .verbose_output = .verbose_output, mc.preschedule = TRUE, diff --git a/R/tools.R b/R/tools.R index 3ce1bd51..edfba4b3 100644 --- a/R/tools.R +++ b/R/tools.R @@ -665,3 +665,12 @@ par_or_normal_apply <- function(cluster, ...) { return(parApply(cluster, ...)) } } + +# call normal or parallel lapply, depending on mc.cores +par_or_normal_lapply <- function(mc.preschedule, mc.cores, ...) { + if (mc.cores == 1) { + return(lapply(...)) + } else { + return(mclapply(..., mc.preschedule = mc.preschedule, mc.cores = mc.cores)) + } +} From 29b3da6c92612194c6ac24bdc8c0e6f3a3ae5f2e Mon Sep 17 00:00:00 2001 From: Aleksandr Popov Date: Thu, 15 Sep 2022 14:16:29 +0200 Subject: [PATCH 18/46] Optional parallelization for repClonalLineage and repSomaticHypermutation --- R/phylip.R | 2 +- R/somatic_hypermutation.R | 11 ++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/R/phylip.R b/R/phylip.R index c2eb0065..4c80fdc8 100644 --- a/R/phylip.R +++ b/R/phylip.R @@ -118,7 +118,7 @@ process_dataframe <- function(df, .threads, sample_name = NA) { df <- df[required_columns] clusters_list <- split(df, seq(nrow(df))) - results <- parallel::mclapply( + results <- par_or_normal_lapply( clusters_list, process_cluster, mc.preschedule = FALSE, diff --git a/R/somatic_hypermutation.R b/R/somatic_hypermutation.R index d8a2f91a..d7d7306b 100644 --- a/R/somatic_hypermutation.R +++ b/R/somatic_hypermutation.R @@ -64,13 +64,18 @@ repSomaticHypermutation <- function(.data, .threads = parallel::detectCores(), . return(get_empty_object_with_class("step_failure_ignored")) } - doParallel::registerDoParallel(cores = .threads) + parallel <- threads > 1 + if (parallel) { + doParallel::registerDoParallel(cores = .threads) + } results <- .data %>% apply_to_sample_or_list( shm_process_dataframe, - .parallel = .threads > 1, + .parallel = parallel, .validate = FALSE ) - doParallel::stopImplicitCluster() + if (parallel) { + doParallel::stopImplicitCluster() + } return(results) } From b43bc766bd6bc0a321291f49ca454b72f4f87e17 Mon Sep 17 00:00:00 2001 From: ivan-immunomind Date: Fri, 16 Sep 2022 02:55:50 +0300 Subject: [PATCH 19/46] remove uniting group values in seqDist, add it to seqCluster; it helps to conduct any gene names --- R/distance.R | 5 ++--- R/seqCluster.R | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/R/distance.R b/R/distance.R index 8fb316c9..b4a36547 100644 --- a/R/distance.R +++ b/R/distance.R @@ -126,9 +126,8 @@ seqDist <- function(.data, .col = "CDR3.nt", .method = "hamming", .group_by = c( if (!gr_by_is_na) { group_by_values <- map(res_data, ~ .x %>% group_keys() %>% - select_if(is.character) %>% - unite("values", sep = "/")) - result <- map2(result, group_by_values, ~ map2(.x, .y$values, function(x, y) set_attr(x, "group_values", y))) + select_if(is.character)) + result <- map2(result, group_by_values, ~ map2(.x, pmap(.y,c), function(x, y) set_attr(x, "group_values", y))) } } } diff --git a/R/seqCluster.R b/R/seqCluster.R index 3b5b4791..89fb6ac3 100644 --- a/R/seqCluster.R +++ b/R/seqCluster.R @@ -112,7 +112,7 @@ seqCluster <- function(.data, .dist, .perc_similarity, .nt_similarity, .fixed_th map_df(~.x) res <- rbind(result_single, result_multi) colnames(res) <- c(matching_col, "Cluster") - res[grouping_cols] <- str_split(str_split(res[["Cluster"]], pattern = "_", simplify = TRUE)[, 1], pattern = "/", simplify = TRUE)[,1:length(grouping_cols)] + res[grouping_cols] <- str_split(str_split(res[["Cluster"]], pattern = "_", simplify = TRUE)[, 1], pattern = "/", simplify = TRUE)[, 1:length(grouping_cols)] return(res) } clusters <- map(.dist, ~ graph_clustering(.x, threshold_fun = .threshold_fun)) From c12d0958755f7490ff8164d6858ff49291923e62 Mon Sep 17 00:00:00 2001 From: Aleksandr Popov Date: Fri, 16 Sep 2022 11:03:31 +0200 Subject: [PATCH 20/46] applied styler --- R/distance.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/distance.R b/R/distance.R index b4a36547..0bd024d0 100644 --- a/R/distance.R +++ b/R/distance.R @@ -127,7 +127,7 @@ seqDist <- function(.data, .col = "CDR3.nt", .method = "hamming", .group_by = c( group_by_values <- map(res_data, ~ .x %>% group_keys() %>% select_if(is.character)) - result <- map2(result, group_by_values, ~ map2(.x, pmap(.y,c), function(x, y) set_attr(x, "group_values", y))) + result <- map2(result, group_by_values, ~ map2(.x, pmap(.y, c), function(x, y) set_attr(x, "group_values", y))) } } } From 5e525cff5a651eea6e9116be4910040da2f1b7f2 Mon Sep 17 00:00:00 2001 From: Aleksandr Popov Date: Wed, 21 Sep 2022 23:01:14 +0200 Subject: [PATCH 21/46] Support for ClustalW executable names both "clustalw" and "clustalw2" --- R/align_lineage.R | 2 +- R/germline.R | 2 +- R/somatic_hypermutation.R | 2 +- R/tools.R | 9 +++++++-- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/R/align_lineage.R b/R/align_lineage.R index 5fa1a334..ac55ed89 100644 --- a/R/align_lineage.R +++ b/R/align_lineage.R @@ -91,7 +91,7 @@ repAlignLineage <- function(.data, .align_threads = 4, .verbose_output = FALSE, .nofail = FALSE) { - if (!require_system_package("clustalw", error_message = paste0( + if (!require_system_package(c("clustalw", "clustalw2"), error_message = paste0( "repAlignLineage requires Clustal W app to be installed!\n", "Please download it from here: http://www.clustal.org/download/current/\n", "or install it with your system package manager (such as apt or dnf)." diff --git a/R/germline.R b/R/germline.R index e56baf53..c833b268 100644 --- a/R/germline.R +++ b/R/germline.R @@ -72,7 +72,7 @@ repGermline <- function(.data, .min_nuc_outside_cdr3 = 5, .threads = parallel::detectCores()) { if (.align_j_gene) { - require_system_package("clustalw", error_message = paste0( + require_system_package(c("clustalw", "clustalw2"), error_message = paste0( "repGermline with .align_j_gene = TRUE requires Clustal W app to be installed!\n", "Please download it from here: http://www.clustal.org/download/current/\n", "or install it with your system package manager (such as apt or dnf)." diff --git a/R/somatic_hypermutation.R b/R/somatic_hypermutation.R index d7d7306b..cd016ef8 100644 --- a/R/somatic_hypermutation.R +++ b/R/somatic_hypermutation.R @@ -56,7 +56,7 @@ #' repSomaticHypermutation(.threads = 1, .nofail = TRUE) #' @export repSomaticHypermutation repSomaticHypermutation <- function(.data, .threads = parallel::detectCores(), .nofail = FALSE) { - if (!require_system_package("clustalw", error_message = paste0( + if (!require_system_package(c("clustalw", "clustalw2"), error_message = paste0( "repSomaticHypermutation requires Clustal W app to be installed!\n", "Please download it from here: http://www.clustal.org/download/current/\n", "or install it with your system package manager (such as apt or dnf)." diff --git a/R/tools.R b/R/tools.R index edfba4b3..8e3ed033 100644 --- a/R/tools.R +++ b/R/tools.R @@ -603,11 +603,16 @@ optional_sample <- function(prefix, sample_name, suffix) { } } -require_system_package <- function(package, error_message, .nofail = FALSE, .prev_failed = FALSE) { +# executable_names can contain 1 name or vector of multiple options how it can be named +require_system_package <- function(executable_names, + error_message, + .nofail = FALSE, + .prev_failed = FALSE) { if (.nofail & .prev_failed) { return(FALSE) } - if (Sys.which(package) == "") { + package_not_exist <- all(unlist(purrr::map(Sys.which(executable_names), identical, ""))) + if (package_not_exist) { if (.nofail) { cat(error_message) return(FALSE) From 1d56cf7c1bb23d92ed4826ccd5bc03d9f0439b80 Mon Sep 17 00:00:00 2001 From: Aleksandr Popov Date: Wed, 21 Sep 2022 23:01:50 +0200 Subject: [PATCH 22/46] removed wip.R --- R/wip.R | 186 -------------------------------------------------------- 1 file changed, 186 deletions(-) delete mode 100644 R/wip.R diff --git a/R/wip.R b/R/wip.R deleted file mode 100644 index 17a6cb31..00000000 --- a/R/wip.R +++ /dev/null @@ -1,186 +0,0 @@ - -########## -# File: aa_properties.R -########## - -# cdrProp <- function (.data, .prop = c("hydro", "polarity+turn"), .region = c("vhj", "h", "v", "j")) { -# proplist = lapply(.prop, function(.prop) { chooseprop(.prop) }) -# -# if (!has_class(.data, "list")) { -# .data = list(Data = .data) -# } -# -# data("aaproperties") -# -# res = .data[IMMCOL$cdr3aa] -# -# for (property in proplist){ -# new = .data %>% -# select(IMMCOL$cdr3aa) %>% -# mutate(hydro = aapropeval(IMMCOL$cdr3aa, property)) %>% -# collect() -# -# colnames(new) <- c("IMMCOL$cdr3aa", property) -# res <- dplyr::bind_cols(res, new[property]) -# } -# -# add_class(res, "immunr_cdr_prop") -# } - - -# chooseprop <- function(prop) { -# switch(prop, -# alpha = "alpha", -# beta = "beta", -# charge = "charge", -# core = "core", -# hydro = "hydropathy", -# ph = "pH", -# polar = "polarity", -# rim = "rim", -# surf = "surface", -# turn = "turn", -# vol = "volume", -# str = "strength", -# dis = "disorder", -# high = "high_contact", -# stop("Unknown property name")) -# } -# -# aapropeval <- function(seq, col){ -# aaproperty <- AA_PROP[,c("amino.acid", col)] -# seq <- strsplit(x = seq, split = "") -# aaseqpropvalue <- lapply(seq, function(seq) { -# sum(aaproperty[seq, ][[col]], na.rm = TRUE) / length(seq) }) -# return(aaseqpropvalue) -# } -# -# cdrPropAnalysis <- function (.data, .method = c("t.test")) { -# -# } - - -########## -# File: graph.R -########## - -# mutationNetwork <- function (.data, .method = c("hamm", "lev"), .err = 2) { -# require(igraph) -# UseMethod("mutationNetwork") -# } -# -# mutationNetwork.character <- function (.data, .method = c("hamm", "lev"), .err = 2) { -# add_class(res, "immunr_mutation_network") -# } -# -# mutationNetwork.immunr_shared_repertoire <- function (.data, .method = c("hamm", "lev"), .err = 2) { -# add_class(res, "immunr_mutation_network") -# } -# -# mutationNetwork.tbl <- function (.data, .col, .method = c("hamm", "lev"), .err = 2) { -# select_(.data, .dots = .col) -# add_class(res, "immunr_mutation_network") -# } -# -# mut.net = mutationNetwork - - -########## -# File: post_analysis.R -######### - -# overlap => distance matrix -# gene usage => N-dimensional vector of values -# diversity => vector of values - -# (both) vector of values => distance matrix -# N-dimensional vector of values => clustering -# N-dimensional vector of values => dimensionality reduction -# N-dimensional vector of values => statistical test -# N-dimensional vector of values => grouped statistical test -# vector of values => grouped statistical test - -# distance matrix => clustering -# distance matrix => dimensionality reduction -# distance matrix => vis - -# dimensionality reduction => clustering -# dimensionality reduction => vis - -# clustering => vis - -# statistical test => vis -# grouped statistical test => vis - -# distance matrix -# - cor -# - js -# - cor -# - cosine - -# clustering -# - hclust -# - dbscan -# - kmeans - -# dimensionality reduction -# - tsne -# - pca -# - mds - -# stat test / grouped stat test -# - kruskall -# - wilcox - -# postAnalysis <- function (.data) - -# immunr_clustering_preprocessing <- function (...) { -# check for the right input class -# preprocess data somehow -# } - - -########## -# File: stat_tests.R -######### - -#' #' Statistical analysis of groups -#' #' -#' immunr_permut <- function () { -#' stop(IMMUNR_ERROR_NOT_IMPL) -#' } -#' -#' # groups -#' immunr_mann_whitney <- function () { -#' stop(IMMUNR_ERROR_NOT_IMPL) -#' } -#' -#' immunr_kruskall <- function (.dunn = T) { -#' stop(IMMUNR_ERROR_NOT_IMPL) -#' } -#' -#' immunr_logreg <- function () { -#' stop(IMMUNR_ERROR_NOT_IMPL) -#' } - -########## -# File: metadata.R -######### - -# read_metadata <- function (.obj) { -# -# } -# -# -# write_metadata <- function (.obj) { -# -# } -# -# -# check_metadata <- function (.data, .meta) { -# .meta = collect(.meta) -# -# (length(.data) == length(unique(names(.data)))) & -# (length(.meta$Sample) == length(unique(.meta$Sample))) & -# (sum(!(names(.data) %in% .meta$Sample)) == 0) -# } From b5c1466aaa9116cf9e00b0ba580bd3d51be08412 Mon Sep 17 00:00:00 2001 From: Aleksandr Popov Date: Wed, 21 Sep 2022 23:01:14 +0200 Subject: [PATCH 23/46] Support for ClustalW executable names both "clustalw" and "clustalw2" --- R/align_lineage.R | 2 +- R/germline.R | 2 +- R/somatic_hypermutation.R | 2 +- R/tools.R | 9 +++++++-- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/R/align_lineage.R b/R/align_lineage.R index 68af4c55..854f7f2b 100644 --- a/R/align_lineage.R +++ b/R/align_lineage.R @@ -89,7 +89,7 @@ repAlignLineage <- function(.data, .align_threads = 4, .verbose_output = FALSE, .nofail = FALSE) { - if (!require_system_package("clustalw", error_message = paste0( + if (!require_system_package(c("clustalw", "clustalw2"), error_message = paste0( "repAlignLineage requires Clustal W app to be installed!\n", "Please download it from here: http://www.clustal.org/download/current/\n", "or install it with your system package manager (such as apt or dnf)." diff --git a/R/germline.R b/R/germline.R index aa335523..13244fe3 100644 --- a/R/germline.R +++ b/R/germline.R @@ -72,7 +72,7 @@ repGermline <- function(.data, .min_nuc_outside_cdr3 = 5, .threads = parallel::detectCores()) { if (.align_j_gene) { - require_system_package("clustalw", error_message = paste0( + require_system_package(c("clustalw", "clustalw2"), error_message = paste0( "repGermline with .align_j_gene = TRUE requires Clustal W app to be installed!\n", "Please download it from here: http://www.clustal.org/download/current/\n", "or install it with your system package manager (such as apt or dnf)." diff --git a/R/somatic_hypermutation.R b/R/somatic_hypermutation.R index d8a2f91a..6c840669 100644 --- a/R/somatic_hypermutation.R +++ b/R/somatic_hypermutation.R @@ -56,7 +56,7 @@ #' repSomaticHypermutation(.threads = 1, .nofail = TRUE) #' @export repSomaticHypermutation repSomaticHypermutation <- function(.data, .threads = parallel::detectCores(), .nofail = FALSE) { - if (!require_system_package("clustalw", error_message = paste0( + if (!require_system_package(c("clustalw", "clustalw2"), error_message = paste0( "repSomaticHypermutation requires Clustal W app to be installed!\n", "Please download it from here: http://www.clustal.org/download/current/\n", "or install it with your system package manager (such as apt or dnf)." diff --git a/R/tools.R b/R/tools.R index dfcda321..1617cc5b 100644 --- a/R/tools.R +++ b/R/tools.R @@ -603,11 +603,16 @@ optional_sample <- function(prefix, sample_name, suffix) { } } -require_system_package <- function(package, error_message, .nofail = FALSE, .prev_failed = FALSE) { +# executable_names can contain 1 name or vector of multiple options how it can be named +require_system_package <- function(executable_names, + error_message, + .nofail = FALSE, + .prev_failed = FALSE) { if (.nofail & .prev_failed) { return(FALSE) } - if (Sys.which(package) == "") { + package_not_exist <- all(unlist(purrr::map(Sys.which(executable_names), identical, ""))) + if (package_not_exist) { if (.nofail) { cat(error_message) return(FALSE) From c3f098e0be962e3361c75a2a2548ec38dce221a2 Mon Sep 17 00:00:00 2001 From: Aleksandr Popov Date: Wed, 21 Sep 2022 23:16:49 +0200 Subject: [PATCH 24/46] formatting and styling fixes --- R/clustering.R | 1 - R/seqCluster.R | 24 +++++-- R/wip.R | 186 ------------------------------------------------- 3 files changed, 18 insertions(+), 193 deletions(-) delete mode 100644 R/wip.R diff --git a/R/clustering.R b/R/clustering.R index 46eda6e1..497c71e7 100644 --- a/R/clustering.R +++ b/R/clustering.R @@ -73,7 +73,6 @@ immunr_hclust <- function(.data, .k = 2, .k.max = nrow(.data) - 1, .method = "co } immunr_kmeans <- function(.data, .k = 2, .k.max = as.integer(sqrt(nrow(.data))) + 1, .method = c("silhouette", "gap_stat")) { - # res = list(kmeans = add_class(kmeans(as.dist(.data), .k), "immunr_kmeans"), res <- list( kmeans = add_class(kmeans(.data, .k), "immunr_kmeans"), nbclust = add_class(fviz_nbclust(.data, kmeans, k.max = .k.max, .method[1]), "immunr_nbclust"), diff --git a/R/seqCluster.R b/R/seqCluster.R index 89fb6ac3..bba208d0 100644 --- a/R/seqCluster.R +++ b/R/seqCluster.R @@ -92,18 +92,26 @@ seqCluster <- function(.data, .dist, .perc_similarity, .nt_similarity, .fixed_th protocluster_names <- map(dist_list, ~ attr(.x, "group_values")) protocluster_names %<>% map2_chr(., seq_labels, ~ ifelse(is.null(.x), .y, .x)) # ^if no grouping variables in data, sequences are IDs for clusters - result_single <- data.frame(Sequence = unlist(seq_labels[singleseq_flag]), Cluster = paste0(protocluster_names[singleseq_flag], "_length_", seq_length[singleseq_flag])) + result_single <- data.frame( + Sequence = unlist(seq_labels[singleseq_flag]), + Cluster = paste0(protocluster_names[singleseq_flag], "_length_", seq_length[singleseq_flag]) + ) multiseq_dist <- dist_list[!singleseq_flag] - mat_dist <- map2(multiseq_dist, threshold[!singleseq_flag], ~ as.matrix(.x) %>% apply(., 1, function(x, t) { - ifelse(x > t, NA, x) - }, .y)) + mat_dist <- map2(multiseq_dist, threshold[!singleseq_flag], ~ as.matrix(.x) %>% + apply(., 1, function(x, t) { + ifelse(x > t, NA, x) + }, .y)) seq_clusters <- map(mat_dist, ~ melt(.x, na.rm = TRUE) %>% graph_from_data_frame() %>% clusters() %>% .$membership %>% melt()) result_multi <- seq_clusters %>% - map2(., seq_length[!singleseq_flag], ~ .x %>% mutate(length_value = map_chr(.y, ~ ifelse(all(.x == .x[1]), yes = .x[1], no = glue("range_{min(.x)}:{max(.x)}"))))) %>% + map2(., seq_length[!singleseq_flag], ~ .x %>% + mutate(length_value = map_chr(.y, ~ ifelse(all(.x == .x[1]), + yes = .x[1], + no = glue("range_{min(.x)}:{max(.x)}") + )))) %>% map2(., protocluster_names[!singleseq_flag], ~ rownames_to_column(.x, var = "Sequence") %>% group_by(value, length_value) %>% mutate(Cluster = paste0(.y, "_length_", length_value, "_cluster_", cur_group_id())) %>% @@ -112,7 +120,11 @@ seqCluster <- function(.data, .dist, .perc_similarity, .nt_similarity, .fixed_th map_df(~.x) res <- rbind(result_single, result_multi) colnames(res) <- c(matching_col, "Cluster") - res[grouping_cols] <- str_split(str_split(res[["Cluster"]], pattern = "_", simplify = TRUE)[, 1], pattern = "/", simplify = TRUE)[, 1:length(grouping_cols)] + res[grouping_cols] <- str_split(str_split(res[["Cluster"]], + pattern = "_", simplify = TRUE + )[, 1], + pattern = "/", simplify = TRUE + )[, seq_along(grouping_cols)] return(res) } clusters <- map(.dist, ~ graph_clustering(.x, threshold_fun = .threshold_fun)) diff --git a/R/wip.R b/R/wip.R deleted file mode 100644 index 17a6cb31..00000000 --- a/R/wip.R +++ /dev/null @@ -1,186 +0,0 @@ - -########## -# File: aa_properties.R -########## - -# cdrProp <- function (.data, .prop = c("hydro", "polarity+turn"), .region = c("vhj", "h", "v", "j")) { -# proplist = lapply(.prop, function(.prop) { chooseprop(.prop) }) -# -# if (!has_class(.data, "list")) { -# .data = list(Data = .data) -# } -# -# data("aaproperties") -# -# res = .data[IMMCOL$cdr3aa] -# -# for (property in proplist){ -# new = .data %>% -# select(IMMCOL$cdr3aa) %>% -# mutate(hydro = aapropeval(IMMCOL$cdr3aa, property)) %>% -# collect() -# -# colnames(new) <- c("IMMCOL$cdr3aa", property) -# res <- dplyr::bind_cols(res, new[property]) -# } -# -# add_class(res, "immunr_cdr_prop") -# } - - -# chooseprop <- function(prop) { -# switch(prop, -# alpha = "alpha", -# beta = "beta", -# charge = "charge", -# core = "core", -# hydro = "hydropathy", -# ph = "pH", -# polar = "polarity", -# rim = "rim", -# surf = "surface", -# turn = "turn", -# vol = "volume", -# str = "strength", -# dis = "disorder", -# high = "high_contact", -# stop("Unknown property name")) -# } -# -# aapropeval <- function(seq, col){ -# aaproperty <- AA_PROP[,c("amino.acid", col)] -# seq <- strsplit(x = seq, split = "") -# aaseqpropvalue <- lapply(seq, function(seq) { -# sum(aaproperty[seq, ][[col]], na.rm = TRUE) / length(seq) }) -# return(aaseqpropvalue) -# } -# -# cdrPropAnalysis <- function (.data, .method = c("t.test")) { -# -# } - - -########## -# File: graph.R -########## - -# mutationNetwork <- function (.data, .method = c("hamm", "lev"), .err = 2) { -# require(igraph) -# UseMethod("mutationNetwork") -# } -# -# mutationNetwork.character <- function (.data, .method = c("hamm", "lev"), .err = 2) { -# add_class(res, "immunr_mutation_network") -# } -# -# mutationNetwork.immunr_shared_repertoire <- function (.data, .method = c("hamm", "lev"), .err = 2) { -# add_class(res, "immunr_mutation_network") -# } -# -# mutationNetwork.tbl <- function (.data, .col, .method = c("hamm", "lev"), .err = 2) { -# select_(.data, .dots = .col) -# add_class(res, "immunr_mutation_network") -# } -# -# mut.net = mutationNetwork - - -########## -# File: post_analysis.R -######### - -# overlap => distance matrix -# gene usage => N-dimensional vector of values -# diversity => vector of values - -# (both) vector of values => distance matrix -# N-dimensional vector of values => clustering -# N-dimensional vector of values => dimensionality reduction -# N-dimensional vector of values => statistical test -# N-dimensional vector of values => grouped statistical test -# vector of values => grouped statistical test - -# distance matrix => clustering -# distance matrix => dimensionality reduction -# distance matrix => vis - -# dimensionality reduction => clustering -# dimensionality reduction => vis - -# clustering => vis - -# statistical test => vis -# grouped statistical test => vis - -# distance matrix -# - cor -# - js -# - cor -# - cosine - -# clustering -# - hclust -# - dbscan -# - kmeans - -# dimensionality reduction -# - tsne -# - pca -# - mds - -# stat test / grouped stat test -# - kruskall -# - wilcox - -# postAnalysis <- function (.data) - -# immunr_clustering_preprocessing <- function (...) { -# check for the right input class -# preprocess data somehow -# } - - -########## -# File: stat_tests.R -######### - -#' #' Statistical analysis of groups -#' #' -#' immunr_permut <- function () { -#' stop(IMMUNR_ERROR_NOT_IMPL) -#' } -#' -#' # groups -#' immunr_mann_whitney <- function () { -#' stop(IMMUNR_ERROR_NOT_IMPL) -#' } -#' -#' immunr_kruskall <- function (.dunn = T) { -#' stop(IMMUNR_ERROR_NOT_IMPL) -#' } -#' -#' immunr_logreg <- function () { -#' stop(IMMUNR_ERROR_NOT_IMPL) -#' } - -########## -# File: metadata.R -######### - -# read_metadata <- function (.obj) { -# -# } -# -# -# write_metadata <- function (.obj) { -# -# } -# -# -# check_metadata <- function (.data, .meta) { -# .meta = collect(.meta) -# -# (length(.data) == length(unique(names(.data)))) & -# (length(.meta$Sample) == length(unique(.meta$Sample))) & -# (sum(!(names(.data) %in% .meta$Sample)) == 0) -# } From 883bc7ead9eb32c865837fdbd8f5dac42b601ade Mon Sep 17 00:00:00 2001 From: Aleksandr Popov Date: Thu, 22 Sep 2022 23:40:35 +0200 Subject: [PATCH 25/46] Germline refactoring WIP: mandatory columns; find first allele column in reference --- R/germline.R | 104 +++++++++++++++++++++++++-------------------------- R/tools.R | 22 +++-------- 2 files changed, 56 insertions(+), 70 deletions(-) diff --git a/R/germline.R b/R/germline.R index c833b268..94aaf7c1 100644 --- a/R/germline.R +++ b/R/germline.R @@ -21,8 +21,7 @@ #' #' @usage #' -#' repGermline(.data, -#' .species, .align_j_gene, .min_nuc_outside_cdr3, .threads) +#' repGermline(.data, .species, .min_nuc_outside_cdr3, .threads) #' #' @param .data The data to be processed. Can be \link{data.frame}, \link{data.table} #' or a list of these objects. @@ -36,13 +35,6 @@ #' "OncorhynchusMykiss", "OrnithorhynchusAnatinus", "OryctolagusCuniculus", "RattusNorvegicus", #' "SusScrofa". #' -#' @param .align_j_gene MiXCR provides the number of J indels only for 1 allele of J gene -#' in the output file, and a germline can contain another allele. Therefore, calculation of -#' J gene start in reference based on numbers from input file can be sometimes incorrect. -#' As result, J gene in the germline will be trimmed in the start or will contain some -#' nucleotides from CDR3. Setting this parameter to TRUE enables precise alignment of J genes -#' to detect the correct starting point, but it significantly reduces performance. -#' #' @param .min_nuc_outside_cdr3 This parameter sets how many nucleotides should have V or J chain #' outside of CDR3 to be considered good for further alignment. #' @@ -51,12 +43,12 @@ #' @return #' #' Data with added columns: -#' * V.first.allele, J.first.allele (first alleles of V and J genes), -#' * V.ref.nt, J.ref.nt (V and J reference sequences), -#' * V.germline.nt, J.germline.nt (V and J germline sequences; they are references with -#' trimmed parts that are from CDR3), +#' * V.allele, J.allele (chosen alleles of V and J genes), +#' * V.germline.nt, J.germline.nt, V.germline.aa, J.germline.aa (V and J germline sequences; +#' they are references with trimmed parts that are from CDR3), +#' * V.aa, J.aa (V and J sequences from original clonotype, converted to amino acids) #' * CDR3.germline.length (length of CDR3 in the germline), -#' * Germline.sequence (combined germline sequence) +#' * Germline.sequence (combined germline nucleotide sequence) #' #' @examples #' @@ -64,21 +56,12 @@ #' #' bcrdata$data %>% #' top(5) %>% -#' repGermline(.threads = 1) +#' repGermline() #' @export repGermline repGermline <- function(.data, .species = "HomoSapiens", - .align_j_gene = FALSE, .min_nuc_outside_cdr3 = 5, - .threads = parallel::detectCores()) { - if (.align_j_gene) { - require_system_package(c("clustalw", "clustalw2"), error_message = paste0( - "repGermline with .align_j_gene = TRUE requires Clustal W app to be installed!\n", - "Please download it from here: http://www.clustal.org/download/current/\n", - "or install it with your system package manager (such as apt or dnf)." - )) - } - + .threads = 1) { # prepare reference sequences for all alleles genesegments_env <- new.env() data("genesegments", envir = genesegments_env) @@ -92,7 +75,6 @@ repGermline <- function(.data, .with_names = TRUE, reference = reference, species = .species, - align_j_gene = .align_j_gene, min_nuc_outside_cdr3 = .min_nuc_outside_cdr3, threads = .threads ) @@ -102,23 +84,14 @@ repGermline <- function(.data, germline_single_df <- function(data, reference, species, - align_j_gene, min_nuc_outside_cdr3, threads, sample_name = NA) { data %<>% - validate_genes_edges(sample_name) %>% - add_column_with_first_gene( - "V.name", - "V.first.allele", - .with_allele = TRUE - ) %>% + validate_mandatory_columns(sample_name) %>% + add_allele_column(reference["allele_id"], "V") %>% merge_reference_sequences(reference, "V", species, sample_name) %>% - add_column_with_first_gene( - "J.name", - "J.first.allele", - .with_allele = TRUE - ) %>% + add_allele_column(reference["allele_id"], "J") %>% merge_reference_sequences(reference, "J", species, sample_name) %>% validate_chains_length(min_nuc_outside_cdr3, sample_name) %>% calculate_germlines_parallel(align_j_gene, threads, sample_name) %>% @@ -247,9 +220,37 @@ generate_germline_sequence <- function(seq, v_ref, j_ref, cdr1_nt, cdr2_nt, } } +add_allele_column <- function(.data, .reference_allele_ids, .gene) { + raw_genes_colname <- paste0(.gene, ".name") + target_colname <- paste0(.gene, ".allele") + if (validate_columns(.data, raw_genes_colname, target_colname)) { + .data[[target_colname]] <- .data[[raw_genes_colname]] %>% sapply( + function(genes_string) { + first_allele <- genes_string %>% + # first allele is substring until first ',' or '(' in string taken from column with gene names + strsplit(",|\\(") %>% + unlist() %>% + magrittr::extract2(1) + # MiXCR uses *00 for unknown alleles; drop it to find first matching allele in reference + name_to_search <- str_replace(first_allele, fixed("*00"), "") + first_matching_allele <- .reference_allele_ids[which(startsWith( + .reference_allele_ids, + name_to_search + ))][1] + if (has_no_data(first_matching_allele)) { + return(first_allele) + } else { + return(first_matching_allele) + } + } + ) + } + return(.data) +} + merge_reference_sequences <- function(data, reference, chain_letter, species, sample_name) { chain_seq_colname <- paste0(chain_letter, ".ref.nt") - chain_allele_colname <- paste0(chain_letter, ".first.allele") + chain_allele_colname <- paste0(chain_letter, ".allele") colnames(reference) <- c(chain_seq_colname, chain_allele_colname) # check for alleles in data that don't exist in the reference @@ -281,7 +282,7 @@ merge_reference_sequences <- function(data, reference, chain_letter, species, sa return(data) } -validate_genes_edges <- function(data, sample_name) { +validate_mandatory_columns <- function(data, sample_name) { if (nrow(data) == 0) { stop( "Sample ", @@ -289,7 +290,9 @@ validate_genes_edges <- function(data, sample_name) { "dataframe is empty!" ) } - for (column in c("V.end", "J.start")) { + old_length <- nrow(data) + mandatory_columns <- c("FR1.nt", "CDR1.nt", "FR2.nt", "CDR2.nt", "FR3.nt", "CDR3.nt", "FR4.nt") + for (column in mandatory_columns) { if (!(column %in% colnames(data))) { stop( "Missing mandatory ", @@ -301,16 +304,14 @@ validate_genes_edges <- function(data, sample_name) { } if (all(is.na(data[, column]))) { stop( - "No data in mandatory ", - column, - " column", + "Dropped all rows when filtering out NAs from mandatory columns ", + paste(mandatory_columns, collapse = ", "), optional_sample(" in sample ", sample_name, ""), "!" ) } + data %<>% filter(!is.na(get(column))) } - old_length <- nrow(data) - data %<>% filter(!is.na(get("V.end")) & !is.na(get("J.start"))) dropped_num <- old_length - nrow(data) if (dropped_num > 0) { warning( @@ -318,14 +319,9 @@ validate_genes_edges <- function(data, sample_name) { " rows from ", old_length, optional_sample(" in sample ", sample_name, ""), - " were dropped because of missing values in mandatory columns V.end and J.start!" - ) - } - if (nrow(data) == 0) { - stop( - "Sample ", - optional_sample("", sample_name, " "), - "dataframe is empty after dropping missing values!" + " were dropped because of missing values in mandatory columns ", + paste(mandatory_columns, collapse = ", "), + "!" ) } return(data) diff --git a/R/tools.R b/R/tools.R index 8e3ed033..cfb55669 100644 --- a/R/tools.R +++ b/R/tools.R @@ -568,25 +568,15 @@ add_column_without_alleles <- function(.data, .original_colname, .target_colname return(.data) } -add_column_with_first_gene <- function(.data, .original_colname, .target_colname, .with_allele = FALSE) { +add_column_with_first_gene <- function(.data, .original_colname, .target_colname) { if (validate_columns(.data, .original_colname, .target_colname)) { .data[[.target_colname]] <- .data[[.original_colname]] %>% sapply( function(genes_string) { - if (.with_allele) { - genes_string %<>% - # first allele is substring until first ',' or '(' in string taken from column with gene names - strsplit(",|\\(") %>% - unlist() %>% - magrittr::extract2(1) %>% - # MiXCR uses *00 for unknown alleles; replace *00 to *01 to find them in reference - stringr::str_replace(stringr::fixed("*00"), "*01") - } else { - genes_string %<>% - # first gene is substring until first ',', '(' or '*' - strsplit(",|\\(|\\*") %>% - unlist() %>% - magrittr::extract2(1) - } + genes_string %<>% + # first gene is substring until first ',', '(' or '*' + strsplit(",|\\(|\\*") %>% + unlist() %>% + magrittr::extract2(1) return(genes_string) } ) From 5a2a75e25dc9387541a9a73a320dda6019c5a2b7 Mon Sep 17 00:00:00 2001 From: ivan-immunomind Date: Tue, 27 Sep 2022 20:55:51 +0300 Subject: [PATCH 26/46] wrong dimensions error --- R/distance.R | 5 ++--- R/seqCluster.R | 16 ++++++++++------ 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/R/distance.R b/R/distance.R index 8fb316c9..0bd024d0 100644 --- a/R/distance.R +++ b/R/distance.R @@ -126,9 +126,8 @@ seqDist <- function(.data, .col = "CDR3.nt", .method = "hamming", .group_by = c( if (!gr_by_is_na) { group_by_values <- map(res_data, ~ .x %>% group_keys() %>% - select_if(is.character) %>% - unite("values", sep = "/")) - result <- map2(result, group_by_values, ~ map2(.x, .y$values, function(x, y) set_attr(x, "group_values", y))) + select_if(is.character)) + result <- map2(result, group_by_values, ~ map2(.x, pmap(.y, c), function(x, y) set_attr(x, "group_values", y))) } } } diff --git a/R/seqCluster.R b/R/seqCluster.R index d3d43d1d..5fca6587 100644 --- a/R/seqCluster.R +++ b/R/seqCluster.R @@ -2,7 +2,7 @@ #' #' @concept seq_cluster #' -#' @importFrom purrr map map_lgl map_chr map2 map2_chr map_df map2_lgl +#' @importFrom purrr map map_lgl map_chr map2 map2_chr map_df map2_lgl pmap #' @importFrom magrittr %>% %<>% #' @importFrom reshape2 melt #' @importFrom dplyr group_by mutate ungroup select cur_group_id left_join @@ -88,10 +88,13 @@ seqCluster <- function(.data, .dist, .perc_similarity, .nt_similarity, .fixed_th singleseq_flag <- map_lgl(seq_labels, ~ length(.x) == 1) seq_length <- map(seq_labels, ~ nchar(.x)) threshold <- map(seq_length, ~ .x %>% threshold_fun()) - protocluster_names <- map(dist_list, ~ attr(.x, "group_values")) + group_values <- map_dfr(dist_list, ~ attr(.x, "group_values")) + protocluster_names <- group_values %>% + unite(col = grouping_cols, sep = "/") %>% + pull(grouping_cols) protocluster_names %<>% map2_chr(., seq_labels, ~ ifelse(is.null(.x), .y, .x)) # ^if no grouping variables in data, sequences are IDs for clusters - result_single <- data.frame(Sequence = unlist(seq_labels[singleseq_flag]), Cluster = paste0(protocluster_names[singleseq_flag], "_length_", seq_length[singleseq_flag])) + result_single <- data.frame(Sequence = unlist(seq_labels[singleseq_flag]), Cluster = paste0(protocluster_names[singleseq_flag], "_length_", seq_length[singleseq_flag])) %>% cbind(group_values[singleseq_flag, ]) multiseq_dist <- dist_list[!singleseq_flag] mat_dist <- map2(multiseq_dist, threshold[!singleseq_flag], ~ as.matrix(.x) %>% apply(., 1, function(x, t) { ifelse(x > t, NA, x) @@ -100,7 +103,8 @@ seqCluster <- function(.data, .dist, .perc_similarity, .nt_similarity, .fixed_th graph_from_data_frame() %>% clusters() %>% .$membership %>% - melt()) + melt() %>% + suppressWarnings()) result_multi <- seq_clusters %>% map2(., seq_length[!singleseq_flag], ~ .x %>% mutate(length_value = map_chr(.y, ~ ifelse(all(.x == .x[1]), yes = .x[1], no = glue("range_{min(.x)}:{max(.x)}"))))) %>% map2(., protocluster_names[!singleseq_flag], ~ rownames_to_column(.x, var = "Sequence") %>% @@ -108,9 +112,9 @@ seqCluster <- function(.data, .dist, .perc_similarity, .nt_similarity, .fixed_th mutate(Cluster = paste0(.y, "_length_", length_value, "_cluster_", cur_group_id())) %>% ungroup() %>% select(Sequence, Cluster)) %>% - map_df(~.x) + map2_df(., pmap(group_values, data.frame)[!singleseq_flag], ~ cbind(.x, .y)) res <- rbind(result_single, result_multi) - colnames(res) <- c(matching_col, "Cluster") + colnames(res) <- c(matching_col, "Cluster", grouping_cols) return(res) } clusters <- map(.dist, ~ graph_clustering(.x, threshold_fun = .threshold_fun)) From b8495a3ba01fecd3bfac12fefedd287fa69b0931 Mon Sep 17 00:00:00 2001 From: ivan-immunomind Date: Tue, 27 Sep 2022 21:54:35 +0300 Subject: [PATCH 27/46] fix for case with absent grouping cols --- R/seqCluster.R | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/R/seqCluster.R b/R/seqCluster.R index 5fca6587..629d5ffc 100644 --- a/R/seqCluster.R +++ b/R/seqCluster.R @@ -44,6 +44,7 @@ #' @export seqCluster seqCluster <- function(.data, .dist, .perc_similarity, .nt_similarity, .fixed_threshold = 10) { + grouping_cols <- attr(.dist, "group_by") matching_col <- attr(.dist, "col") if (length(.data) != length(.dist)) { stop(".data and .dist lengths do not match!") @@ -89,12 +90,16 @@ seqCluster <- function(.data, .dist, .perc_similarity, .nt_similarity, .fixed_th seq_length <- map(seq_labels, ~ nchar(.x)) threshold <- map(seq_length, ~ .x %>% threshold_fun()) group_values <- map_dfr(dist_list, ~ attr(.x, "group_values")) - protocluster_names <- group_values %>% - unite(col = grouping_cols, sep = "/") %>% - pull(grouping_cols) - protocluster_names %<>% map2_chr(., seq_labels, ~ ifelse(is.null(.x), .y, .x)) + if (all(is.na(grouping_cols))) { + protocluster_names <- map_chr(seq_labels, 1) + result_single <- data.frame(Sequence = unlist(seq_labels[singleseq_flag]), Cluster = paste0(protocluster_names[singleseq_flag], "_length_", seq_length[singleseq_flag], "_cluster_1")) + } else { + protocluster_names <- group_values %>% + unite(col = grouping_cols, sep = "/") %>% + pull(grouping_cols) + result_single <- data.frame(Sequence = unlist(seq_labels[singleseq_flag]), Cluster = paste0(protocluster_names[singleseq_flag], "_length_", seq_length[singleseq_flag])) %>% cbind(group_values[singleseq_flag, ]) + } # ^if no grouping variables in data, sequences are IDs for clusters - result_single <- data.frame(Sequence = unlist(seq_labels[singleseq_flag]), Cluster = paste0(protocluster_names[singleseq_flag], "_length_", seq_length[singleseq_flag])) %>% cbind(group_values[singleseq_flag, ]) multiseq_dist <- dist_list[!singleseq_flag] mat_dist <- map2(multiseq_dist, threshold[!singleseq_flag], ~ as.matrix(.x) %>% apply(., 1, function(x, t) { ifelse(x > t, NA, x) @@ -112,9 +117,12 @@ seqCluster <- function(.data, .dist, .perc_similarity, .nt_similarity, .fixed_th mutate(Cluster = paste0(.y, "_length_", length_value, "_cluster_", cur_group_id())) %>% ungroup() %>% select(Sequence, Cluster)) %>% - map2_df(., pmap(group_values, data.frame)[!singleseq_flag], ~ cbind(.x, .y)) + map_df(., ~.x) + if (!all(is.na(grouping_cols))) { + result_multi %<>% map2_df(., pmap(group_values, data.frame)[!singleseq_flag], ~ cbind(.x, .y)) + } res <- rbind(result_single, result_multi) - colnames(res) <- c(matching_col, "Cluster", grouping_cols) + colnames(res) <- na.omit(c(matching_col, "Cluster", grouping_cols)) return(res) } clusters <- map(.dist, ~ graph_clustering(.x, threshold_fun = .threshold_fun)) From fda635d005654812ee880d074e02c32a0df238b3 Mon Sep 17 00:00:00 2001 From: Aleksandr Popov Date: Tue, 27 Sep 2022 21:12:48 +0200 Subject: [PATCH 28/46] code formatting (split long lines) --- R/seqCluster.R | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/R/seqCluster.R b/R/seqCluster.R index 629d5ffc..678f2f15 100644 --- a/R/seqCluster.R +++ b/R/seqCluster.R @@ -92,18 +92,34 @@ seqCluster <- function(.data, .dist, .perc_similarity, .nt_similarity, .fixed_th group_values <- map_dfr(dist_list, ~ attr(.x, "group_values")) if (all(is.na(grouping_cols))) { protocluster_names <- map_chr(seq_labels, 1) - result_single <- data.frame(Sequence = unlist(seq_labels[singleseq_flag]), Cluster = paste0(protocluster_names[singleseq_flag], "_length_", seq_length[singleseq_flag], "_cluster_1")) + result_single <- data.frame( + Sequence = unlist(seq_labels[singleseq_flag]), + Cluster = paste0( + protocluster_names[singleseq_flag], + "_length_", + seq_length[singleseq_flag], + "_cluster_1" + ) + ) } else { protocluster_names <- group_values %>% unite(col = grouping_cols, sep = "/") %>% pull(grouping_cols) - result_single <- data.frame(Sequence = unlist(seq_labels[singleseq_flag]), Cluster = paste0(protocluster_names[singleseq_flag], "_length_", seq_length[singleseq_flag])) %>% cbind(group_values[singleseq_flag, ]) + result_single <- data.frame( + Sequence = unlist(seq_labels[singleseq_flag]), + Cluster = paste0( + protocluster_names[singleseq_flag], + "_length_", + seq_length[singleseq_flag] + ) + ) %>% cbind(group_values[singleseq_flag, ]) } # ^if no grouping variables in data, sequences are IDs for clusters multiseq_dist <- dist_list[!singleseq_flag] - mat_dist <- map2(multiseq_dist, threshold[!singleseq_flag], ~ as.matrix(.x) %>% apply(., 1, function(x, t) { - ifelse(x > t, NA, x) - }, .y)) + mat_dist <- map2(multiseq_dist, threshold[!singleseq_flag], ~ as.matrix(.x) %>% + apply(., 1, function(x, t) { + ifelse(x > t, NA, x) + }, .y)) seq_clusters <- map(mat_dist, ~ melt(.x, na.rm = TRUE) %>% graph_from_data_frame() %>% clusters() %>% @@ -111,7 +127,13 @@ seqCluster <- function(.data, .dist, .perc_similarity, .nt_similarity, .fixed_th melt() %>% suppressWarnings()) result_multi <- seq_clusters %>% - map2(., seq_length[!singleseq_flag], ~ .x %>% mutate(length_value = map_chr(.y, ~ ifelse(all(.x == .x[1]), yes = .x[1], no = glue("range_{min(.x)}:{max(.x)}"))))) %>% + map2(., seq_length[!singleseq_flag], ~ .x %>% + mutate( + length_value = map_chr(.y, ~ ifelse(all(.x == .x[1]), + yes = .x[1], + no = glue("range_{min(.x)}:{max(.x)}") + )) + )) %>% map2(., protocluster_names[!singleseq_flag], ~ rownames_to_column(.x, var = "Sequence") %>% group_by(value, length_value) %>% mutate(Cluster = paste0(.y, "_length_", length_value, "_cluster_", cur_group_id())) %>% From 64d83e467781b471f053664f00b96d0f519456d6 Mon Sep 17 00:00:00 2001 From: Aleksandr Popov Date: Wed, 28 Sep 2022 00:05:57 +0200 Subject: [PATCH 29/46] repGermline update --- R/germline.R | 181 ++++++++++++++++++--------------------------------- R/tools.R | 13 ---- 2 files changed, 65 insertions(+), 129 deletions(-) diff --git a/R/germline.R b/R/germline.R index 94aaf7c1..8b1f0141 100644 --- a/R/germline.R +++ b/R/germline.R @@ -43,10 +43,9 @@ #' @return #' #' Data with added columns: +#' * Sequence (FR1+CDR1+FR2+CDR2+FR3+CDR3+FR4 in nucleotides; column will be replaced if exists) #' * V.allele, J.allele (chosen alleles of V and J genes), -#' * V.germline.nt, J.germline.nt, V.germline.aa, J.germline.aa (V and J germline sequences; -#' they are references with trimmed parts that are from CDR3), -#' * V.aa, J.aa (V and J sequences from original clonotype, converted to amino acids) +#' * V.aa, J.aa (V and J sequences from original clonotype, outside CDR3, converted to amino acids) #' * CDR3.germline.length (length of CDR3 in the germline), #' * Germline.sequence (combined germline nucleotide sequence) #' @@ -94,12 +93,12 @@ germline_single_df <- function(data, add_allele_column(reference["allele_id"], "J") %>% merge_reference_sequences(reference, "J", species, sample_name) %>% validate_chains_length(min_nuc_outside_cdr3, sample_name) %>% - calculate_germlines_parallel(align_j_gene, threads, sample_name) %>% + calculate_germlines_parallel(threads, sample_name) %>% filter(!is.na(get("Germline.sequence"))) return(data) } -calculate_germlines_parallel <- function(data, align_j_gene, threads, sample_name) { +calculate_germlines_parallel <- function(data, threads, sample_name) { if (threads == 1) { cluster <- NA } else { @@ -111,27 +110,11 @@ calculate_germlines_parallel <- function(data, align_j_gene, threads, sample_nam # rowwise parallel calculation of new columns that are added to data data <- par_or_normal_apply(cluster, data, 1, function(row) { - generate_germline_sequence( - seq = row[["Sequence"]], - v_ref = row[["V.ref.nt"]], - j_ref = row[["J.ref.nt"]], - cdr1_nt = row[["CDR1.nt"]], - cdr2_nt = row[["CDR2.nt"]], - fr1_nt = row[["FR1.nt"]], - fr2_nt = row[["FR2.nt"]], - fr3_nt = row[["FR3.nt"]], - fr4_nt = row[["FR4.nt"]], - cdr3_start = as.integer(row[["CDR3.start"]]), - cdr3_end = as.integer(row[["CDR3.end"]]), - j_start = as.integer(row[["J.start"]]), - j3_del = as.integer(row[["J3.Deletions"]]), - align_j_gene = align_j_gene, - sample_name = sample_name - ) + calculate_new_columns(row, sample_name) }) %>% map_dfr(~.) %>% germline_handle_warnings() %>% - cbind(data, .) + merge_germline_results(data) if (!has_no_data(cluster)) { stopCluster(cluster) @@ -139,81 +122,85 @@ calculate_germlines_parallel <- function(data, align_j_gene, threads, sample_nam return(data) } -generate_germline_sequence <- function(seq, v_ref, j_ref, cdr1_nt, cdr2_nt, - fr1_nt, fr2_nt, fr3_nt, fr4_nt, - cdr3_start, cdr3_end, j_start, j3_del, - align_j_gene, sample_name) { +calculate_new_columns <- function(row, sample_name) { + v_ref <- row[["V.ref.nt"]] + j_ref <- row[["J.ref.nt"]] + cdr1_nt <- row[["CDR1.nt"]] + cdr2_nt <- row[["CDR2.nt"]] + cdr3_nt <- row[["CDR3.nt"]] + fr1_nt <- row[["FR1.nt"]] + fr2_nt <- row[["FR2.nt"]] + fr3_nt <- row[["FR3.nt"]] + fr4_nt <- row[["FR4.nt"]] + if (any(is.na(c( - seq, v_ref, j_ref, cdr1_nt, cdr2_nt, fr1_nt, fr2_nt, fr3_nt, fr4_nt, - cdr3_start, cdr3_end, j_start, j3_del - ))) || - (seq == "")) { + v_ref, j_ref, cdr1_nt, cdr2_nt, fr1_nt, fr2_nt, fr3_nt, cdr3_nt, fr4_nt + )))) { # warnings cannot be displayed from parApply; save them and display after finish warn <- paste0( "Some of mandatory fields in a row ", optional_sample("from sample ", sample_name, " "), - "contain unexpected NA or empty strings! Found values:\n", - "Sequence = ", - seq, - ",\nV.ref.nt = ", + "contain unexpected NAs! Found values:\n", + "V.ref.nt = ", v_ref, - ",\nJ.ref.nt = ", + "\nJ.ref.nt = ", j_ref, - ",\nCDR1.nt = ", + "\nCDR1.nt = ", cdr1_nt, - ",\nCDR2.nt = ", + "\nCDR2.nt = ", cdr2_nt, - ",\nFR1.nt = ", + "\nCDR3.nt = ", + cdr3_nt, + "\nFR1.nt = ", fr1_nt, - ",\nFR2.nt = ", + "\nFR2.nt = ", fr2_nt, - ",\nFR3.nt = ", + "\nFR3.nt = ", fr3_nt, - ",\nFR4.nt = ", + "\nFR4.nt = ", fr4_nt, - ",\nCDR3.start = ", - cdr3_start, - ", CDR3.end = ", - cdr3_end, - ", J.start = ", - j_start, - ", J3.Deletions = ", - j3_del, - ".\nThe row will be dropped!" + "\nThe row will be dropped!" ) return(list( - V.germline.nt = NA, - J.germline.nt = NA, - CDR3.germline.length = NA, + Sequence = NA, + V.aa = NA, + J.aa = NA, + CDR3.length = NA, Germline.sequence = NA, Warning = warn )) } else { - v_end <- str_length(cdr1_nt) + str_length(cdr2_nt) + - str_length(fr1_nt) + str_length(fr2_nt) + str_length(fr3_nt) - cdr3_length <- cdr3_end - cdr3_start + seq <- paste0(fr1_nt, cdr1_nt, fr2_nt, cdr2_nt, fr3_nt, cdr3_nt, fr4_nt) %>% toupper() + cdr1_aa <- ifelse(has_no_data(row[["CDR1.aa"]]), bunch_translate(cdr1_nt), row[["CDR1.aa"]]) + cdr2_aa <- ifelse(has_no_data(row[["CDR2.aa"]]), bunch_translate(cdr2_nt), row[["CDR2.aa"]]) + fr1_aa <- ifelse(has_no_data(row[["FR1.aa"]]), bunch_translate(fr1_nt), row[["FR1.aa"]]) + fr2_aa <- ifelse(has_no_data(row[["FR2.aa"]]), bunch_translate(fr2_nt), row[["FR2.aa"]]) + fr3_aa <- ifelse(has_no_data(row[["FR3.aa"]]), bunch_translate(fr3_nt), row[["FR3.aa"]]) + fr4_aa <- ifelse(has_no_data(row[["FR4.aa"]]), bunch_translate(fr4_nt), row[["FR4.aa"]]) + v_aa <- paste0(fr1_aa, cdr1_aa, fr2_aa, cdr2_aa, fr3_aa) + j_aa <- fr4_aa # trim intersection of V and CDR3 from reference V gene - v_part <- str_sub(v_ref, 1, v_end) + v_length <- str_length(cdr1_nt) + str_length(cdr2_nt) + + str_length(fr1_nt) + str_length(fr2_nt) + str_length(fr3_nt) + v_part <- str_sub(v_ref, 1, v_length) + cdr3_length <- str_length(cdr3_nt) cdr3_part <- paste(rep("n", cdr3_length), collapse = "") # trim intersection of J and CDR3 from reference J gene - if (align_j_gene) { - calculated_j_start <- align_and_find_j_start(j_ref, fr4_nt) - } else { - calculated_j_start <- max(0, cdr3_end - j_start - j3_del + 1) - } - j_part <- str_sub(j_ref, calculated_j_start) + j_length <- str_length(fr4_nt) + j_part <- str_sub(j_ref, str_length(j_ref) - j_length + 1, str_length(j_ref)) germline <- paste0(v_part, cdr3_part, j_part) %>% toupper() # return values for new calculated columns return(list( - V.germline.nt = v_part, - J.germline.nt = j_part, - CDR3.germline.length = cdr3_length, + Sequence = seq, + V.aa = v_aa, + J.aa = j_aa, + CDR3.length = cdr3_length, Germline.sequence = germline, Warning = NA )) @@ -226,11 +213,8 @@ add_allele_column <- function(.data, .reference_allele_ids, .gene) { if (validate_columns(.data, raw_genes_colname, target_colname)) { .data[[target_colname]] <- .data[[raw_genes_colname]] %>% sapply( function(genes_string) { - first_allele <- genes_string %>% - # first allele is substring until first ',' or '(' in string taken from column with gene names - strsplit(",|\\(") %>% - unlist() %>% - magrittr::extract2(1) + # first allele is substring until first ',' or '(' in string taken from column with gene names + first_allele <- strsplit(genes_string, ",|\\(")[[1]][1] # MiXCR uses *00 for unknown alleles; drop it to find first matching allele in reference name_to_search <- str_replace(first_allele, fixed("*00"), "") first_matching_allele <- .reference_allele_ids[which(startsWith( @@ -330,21 +314,14 @@ validate_mandatory_columns <- function(data, sample_name) { validate_chains_length <- function(data, min_nuc_outside_cdr3, sample_name) { old_length_v <- nrow(data) data %<>% filter( - v_len_outside_cdr3( - get("V.end"), - get("CDR3.start") - ) >= min_nuc_outside_cdr3 + str_length(get("FR1.nt")) + str_length(get("CDR1.nt")) + str_length(get("FR2.nt")) + + str_length(get("CDR2.nt")) + str_length(get("FR3.nt")) + >= min_nuc_outside_cdr3 ) dropped_v <- old_length_v - nrow(data) old_length_j <- nrow(data) if (nrow(data) > 0) { - data %<>% filter( - j_len_outside_cdr3( - get("Sequence"), - get("J.start"), - get("CDR3.end") - ) >= min_nuc_outside_cdr3 - ) + data %<>% filter(str_length(get("FR4.nt")) >= min_nuc_outside_cdr3) } dropped_j <- old_length_j - nrow(data) @@ -375,39 +352,11 @@ validate_chains_length <- function(data, min_nuc_outside_cdr3, sample_name) { return(data) } -# align reference J gene and FR4 segment from clonotype to find start of J gene outside of CDR3 -align_and_find_j_start <- function(j_ref, fr4_seq, max_len_diff = 10) { - # max_len_diff is needed to prevent alignment of sequences that are very different in length; - # we are only interested in the left side of alignment - j_len <- str_length(j_ref) - fr4_len <- str_length(fr4_seq) - min_len <- min(j_len, fr4_len) - max_len <- max(j_len, fr4_len) - trim_len <- min(min_len + max_len_diff, max_len) - j_trimmed <- str_sub(j_ref, 1, trim_len) - fr4_trimmed <- str_sub(fr4_seq, 1, trim_len) - - # alignment will contain vector of 2 aligned strings where deletions are "-" characters - alignment <- list(J = j_trimmed, FR4 = fr4_trimmed) %>% - lapply( - function(sequence) { - sequence %>% - str_extract_all(boundary("character")) %>% - unlist() - } - ) %>% - ape::as.DNAbin() %>% - ape::clustal() %>% - as.character() %>% - apply(1, paste, collapse = "") %>% - lapply(toupper) - - num_deletions <- str_length(stringr::str_extract(alignment[["FR4"]], pattern = "^(-+)")) - if (is.na(num_deletions) | (str_length(alignment[["J"]]) <= num_deletions)) { - return(1) - } else { - return(num_deletions + 1) - } +merge_germline_results <- function(new_columns, data) { + data %<>% + subset(select = -c("Sequence", "V.ref.nt", "J.ref.nt")) %>% + cbind(new_columns) + return(data) } germline_handle_warnings <- function(df) { diff --git a/R/tools.R b/R/tools.R index cfb55669..0088a19f 100644 --- a/R/tools.R +++ b/R/tools.R @@ -251,7 +251,6 @@ translate_bunch <- bunch_translate check_group_names <- function(.meta, .by) { - names_to_check <- c() if (is.null(.by)) { names_to_check <- .by } else { @@ -304,7 +303,6 @@ process_metadata_arguments <- function(.data, .by, .meta = NA, .data.sample.col if (!is.na(.by)[1]) { if (!is.na(.meta)[1]) { data_groups <- group_from_metadata(.by, .meta) - # group_name = .by group_name <- paste0(.by, collapse = "; ") is_grouped <- TRUE data_group_names <- .meta[[.meta.sample.col]] @@ -330,7 +328,6 @@ process_metadata_arguments <- function(.data, .by, .meta = NA, .data.sample.col } names(data_groups) <- data_group_names group_vec <- data_groups[.data[[.data.sample.col]]] - # group_column = stringr::str_sort(data_groups[.data[[.data.sample.col]]], numeric = TRUE) group_vec_sorted <- stringr::str_sort(group_vec, numeric = TRUE) group_column <- factor(group_vec, levels = unique(group_vec_sorted)) @@ -613,16 +610,6 @@ require_system_package <- function(executable_names, return(TRUE) } -# calculate V gene part length outside of CDR3; works with vectors acquired from dataframe columns -v_len_outside_cdr3 <- function(v_end, cdr3_start) { - pmin(v_end, as.numeric(cdr3_start)) -} - -# calculate J gene part length outside of CDR3; works with vectors acquired from dataframe columns -j_len_outside_cdr3 <- function(seq, j_start, cdr3_end) { - stringr::str_length(seq) - pmax(j_start, as.numeric(cdr3_end)) -} - convert_seq_list_to_dnabin <- function(seq_list) { dnabin <- seq_list %>% lapply( From abc8945e35889ab5ff50ddd61811ee9462a25e7b Mon Sep 17 00:00:00 2001 From: Aleksandr Popov Date: Wed, 28 Sep 2022 22:33:13 +0200 Subject: [PATCH 30/46] repGermline fixes --- R/germline.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/R/germline.R b/R/germline.R index 8b1f0141..42639378 100644 --- a/R/germline.R +++ b/R/germline.R @@ -46,7 +46,7 @@ #' * Sequence (FR1+CDR1+FR2+CDR2+FR3+CDR3+FR4 in nucleotides; column will be replaced if exists) #' * V.allele, J.allele (chosen alleles of V and J genes), #' * V.aa, J.aa (V and J sequences from original clonotype, outside CDR3, converted to amino acids) -#' * CDR3.germline.length (length of CDR3 in the germline), +#' * CDR3.length (length of CDR3), #' * Germline.sequence (combined germline nucleotide sequence) #' #' @examples @@ -88,9 +88,9 @@ germline_single_df <- function(data, sample_name = NA) { data %<>% validate_mandatory_columns(sample_name) %>% - add_allele_column(reference["allele_id"], "V") %>% + add_allele_column(reference[["allele_id"]], "V") %>% merge_reference_sequences(reference, "V", species, sample_name) %>% - add_allele_column(reference["allele_id"], "J") %>% + add_allele_column(reference[["allele_id"]], "J") %>% merge_reference_sequences(reference, "J", species, sample_name) %>% validate_chains_length(min_nuc_outside_cdr3, sample_name) %>% calculate_germlines_parallel(threads, sample_name) %>% @@ -222,7 +222,7 @@ add_allele_column <- function(.data, .reference_allele_ids, .gene) { name_to_search ))][1] if (has_no_data(first_matching_allele)) { - return(first_allele) + return(name_to_search) } else { return(first_matching_allele) } @@ -243,7 +243,7 @@ merge_reference_sequences <- function(data, reference, chain_letter, species, sa missing_alleles <- alleles_in_data[!(alleles_in_data %in% alleles_in_ref)] if (length(missing_alleles) > 0) { warning( - "Alleles ", + "Genes or alleles ", paste(missing_alleles, collapse = ", "), " ", optional_sample("from sample ", sample_name, " "), @@ -354,7 +354,7 @@ validate_chains_length <- function(data, min_nuc_outside_cdr3, sample_name) { merge_germline_results <- function(new_columns, data) { data %<>% - subset(select = -c("Sequence", "V.ref.nt", "J.ref.nt")) %>% + subset(select = -c(get("Sequence"), get("V.ref.nt"), get("J.ref.nt"))) %>% cbind(new_columns) return(data) } From 608c49b18c7e6495b1bd70edc7e476f9020e6409 Mon Sep 17 00:00:00 2001 From: ivan-immunomind Date: Sun, 2 Oct 2022 23:32:09 +0300 Subject: [PATCH 31/46] remove renaming columns in seqDist --- R/distance.R | 17 +---------------- R/seqCluster.R | 19 +++++++++++++------ 2 files changed, 14 insertions(+), 22 deletions(-) diff --git a/R/distance.R b/R/distance.R index 0bd024d0..ac2122c0 100644 --- a/R/distance.R +++ b/R/distance.R @@ -68,24 +68,9 @@ #' seqDist(immdata$data[1:2], .method = f, .group_by_seqLength = FALSE) #' @export seqDist -seqDist <- function(.data, .col = "CDR3.nt", .method = "hamming", .group_by = c("V.first", "J.first"), .group_by_seqLength = TRUE, ...) { +seqDist <- function(.data, .col = "CDR3.nt", .method = "hamming", .group_by = c("V.name", "J.name"), .group_by_seqLength = TRUE, ...) { .validate_repertoires_data(.data) gr_by_is_na <- all(is.na(.group_by)) - # prepare columns with 1st V and J genes if they are used, but not yet calculated - if ("V.first" %in% .group_by) { - .data %<>% apply_to_sample_or_list( - add_column_with_first_gene, - .original_colname = "V.name", - .target_colname = "V.first" - ) - } - if ("J.first" %in% .group_by) { - .data %<>% apply_to_sample_or_list( - add_column_with_first_gene, - .original_colname = "J.name", - .target_colname = "J.first" - ) - } # Since seqDist works with any columns of string type, classic .col values are not suported if (.col %in% c("aa", "nt", "v", "j", "aa+v")) stop("Please, provide full column name") first_sample <- .data[[1]] diff --git a/R/seqCluster.R b/R/seqCluster.R index 629d5ffc..aa3c39b5 100644 --- a/R/seqCluster.R +++ b/R/seqCluster.R @@ -2,7 +2,7 @@ #' #' @concept seq_cluster #' -#' @importFrom purrr map map_lgl map_chr map2 map2_chr map_df map2_lgl pmap +#' @importFrom purrr map map_lgl map_chr map2 map2_chr map_df map2_lgl pmap map2_df #' @importFrom magrittr %>% %<>% #' @importFrom reshape2 melt #' @importFrom dplyr group_by mutate ungroup select cur_group_id left_join @@ -116,13 +116,20 @@ seqCluster <- function(.data, .dist, .perc_similarity, .nt_similarity, .fixed_th group_by(value, length_value) %>% mutate(Cluster = paste0(.y, "_length_", length_value, "_cluster_", cur_group_id())) %>% ungroup() %>% - select(Sequence, Cluster)) %>% - map_df(., ~.x) + select(Sequence, Cluster)) if (!all(is.na(grouping_cols))) { result_multi %<>% map2_df(., pmap(group_values, data.frame)[!singleseq_flag], ~ cbind(.x, .y)) + res <- rbind(result_single, result_multi) + res[grouping_cols] <- str_split(str_split(res[["Cluster"]], + pattern = "_", simplify = TRUE + )[, 1], + pattern = "/", simplify = TRUE + )[, seq_along(grouping_cols)] + } else{ + result_multi %<>% map_df(., ~.x) + res <- rbind(result_single, result_multi) + colnames(res) <- c(matching_col, "Cluster") } - res <- rbind(result_single, result_multi) - colnames(res) <- na.omit(c(matching_col, "Cluster", grouping_cols)) return(res) } clusters <- map(.dist, ~ graph_clustering(.x, threshold_fun = .threshold_fun)) @@ -130,6 +137,6 @@ seqCluster <- function(.data, .dist, .perc_similarity, .nt_similarity, .fixed_th warning("Number of sequence provided in .data and .dist are not matching!") } # supress messages because join spams about joining by matching_col is done - result_data <- map2(.data, clusters, ~ left_join(.x, .y) %>% suppressMessages()) + result_data <- map2(.data, clusters, ~ left_join(.x, .y, by = grouping_cols) %>% suppressMessages()) return(result_data) } From fa63da3fdf523fbd1d5d18bde270c251e4bb0b3c Mon Sep 17 00:00:00 2001 From: ivan-immunomind Date: Sun, 2 Oct 2022 23:39:46 +0300 Subject: [PATCH 32/46] styler fixes --- R/seqCluster.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/seqCluster.R b/R/seqCluster.R index 9001e04c..55593fbf 100644 --- a/R/seqCluster.R +++ b/R/seqCluster.R @@ -144,11 +144,11 @@ seqCluster <- function(.data, .dist, .perc_similarity, .nt_similarity, .fixed_th result_multi %<>% map2_df(., pmap(group_values, data.frame)[!singleseq_flag], ~ cbind(.x, .y)) res <- rbind(result_single, result_multi) res[grouping_cols] <- str_split(str_split(res[["Cluster"]], - pattern = "_", simplify = TRUE + pattern = "_", simplify = TRUE )[, 1], pattern = "/", simplify = TRUE )[, seq_along(grouping_cols)] - } else{ + } else { result_multi %<>% map_df(., ~.x) res <- rbind(result_single, result_multi) colnames(res) <- c(matching_col, "Cluster") From 37cead5b0860257ca9d911266bbeeef699db9c14 Mon Sep 17 00:00:00 2001 From: Aleksandr Popov Date: Mon, 3 Oct 2022 14:25:46 +0200 Subject: [PATCH 33/46] Don't throw error if Sequence column is missing --- R/germline.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/germline.R b/R/germline.R index 42639378..d9c8878c 100644 --- a/R/germline.R +++ b/R/germline.R @@ -354,7 +354,7 @@ validate_chains_length <- function(data, min_nuc_outside_cdr3, sample_name) { merge_germline_results <- function(new_columns, data) { data %<>% - subset(select = -c(get("Sequence"), get("V.ref.nt"), get("J.ref.nt"))) %>% + dplyr::select(-any_of(c("Sequence", "V.ref.nt", "J.ref.nt"))) %>% cbind(new_columns) return(data) } From 5c8afd092b44488d724eaa64ad2b75c30663bcd8 Mon Sep 17 00:00:00 2001 From: Aleksandr Popov Date: Mon, 3 Oct 2022 14:53:29 +0200 Subject: [PATCH 34/46] repGermline: don't save CDR3.length column (because CDR3.nt is saved) --- R/germline.R | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/R/germline.R b/R/germline.R index d9c8878c..ca2ec8ac 100644 --- a/R/germline.R +++ b/R/germline.R @@ -43,10 +43,9 @@ #' @return #' #' Data with added columns: -#' * Sequence (FR1+CDR1+FR2+CDR2+FR3+CDR3+FR4 in nucleotides; column will be replaced if exists) +#' * Sequence (FR1+CDR1+FR2+CDR2+FR3+CDR3+FR4 in nucleotides; the column will be replaced if exists) #' * V.allele, J.allele (chosen alleles of V and J genes), #' * V.aa, J.aa (V and J sequences from original clonotype, outside CDR3, converted to amino acids) -#' * CDR3.length (length of CDR3), #' * Germline.sequence (combined germline nucleotide sequence) #' #' @examples @@ -165,7 +164,6 @@ calculate_new_columns <- function(row, sample_name) { Sequence = NA, V.aa = NA, J.aa = NA, - CDR3.length = NA, Germline.sequence = NA, Warning = warn )) @@ -185,8 +183,7 @@ calculate_new_columns <- function(row, sample_name) { str_length(fr1_nt) + str_length(fr2_nt) + str_length(fr3_nt) v_part <- str_sub(v_ref, 1, v_length) - cdr3_length <- str_length(cdr3_nt) - cdr3_part <- paste(rep("n", cdr3_length), collapse = "") + cdr3_part <- paste(rep("n", str_length(cdr3_nt)), collapse = "") # trim intersection of J and CDR3 from reference J gene j_length <- str_length(fr4_nt) @@ -200,7 +197,6 @@ calculate_new_columns <- function(row, sample_name) { Sequence = seq, V.aa = v_aa, J.aa = j_aa, - CDR3.length = cdr3_length, Germline.sequence = germline, Warning = NA )) From 423e6d492e832cdc18b884ba32da207098937a14 Mon Sep 17 00:00:00 2001 From: Aleksandr Popov Date: Mon, 3 Oct 2022 14:54:00 +0200 Subject: [PATCH 35/46] Remove unnecessary columns: updated documentation for align lineage --- R/align_lineage.R | 31 +++++++------------------------ 1 file changed, 7 insertions(+), 24 deletions(-) diff --git a/R/align_lineage.R b/R/align_lineage.R index ac55ed89..f795f5c9 100644 --- a/R/align_lineage.R +++ b/R/align_lineage.R @@ -23,16 +23,14 @@ #' #' @usage #' -#' repAlignLineage(.data, -#' .min_lineage_sequences, .prepare_threads, .align_threads, .verbose_output, .nofail) +#' repAlignLineage(.data, .min_lineage_sequences, .prepare_threads, .align_threads, .nofail) #' #' @param .data The data to be processed. Can be \link{data.frame}, \link{data.table} #' or a list of these objects. #' #' @param .min_lineage_sequences If number of sequences in the same clonal lineage and the same #' cluster (not including germline) is lower than this threshold, this group of sequences -#' will not be aligned and will not be used in next steps of BCR pipeline -#' (will be saved in output table only if .verbose_output parameter is set to TRUE). +#' will be filtered out from the dataframe; so only large enough lineages will be included. #' #' @param .prepare_threads Number of threads to prepare results table. #' Please note that high number can cause heavy memory usage! @@ -43,11 +41,6 @@ #' must contain 'Cluster' column, which is added by seqCluster() function, and 'Germline.sequence' #' column, which is added by repGermline() function. #' -#' @param .verbose_output If TRUE, all output dataframe columns will be included (see documentation about this -#' function return), and unaligned clusters will be included in the output. Setting this to TRUE significantly -#' increases memory usage. If FALSE, only aligned clusters and columns required for repClonalFamily() and -#' repSomaticHypermutation() calculation will be included in the output. -#' #' @param .nofail Will return NA instead of stopping if Clustal W is not installed. #' Used to avoid raising errors in examples on computers where Clustal W is not installed. #' @@ -57,23 +50,13 @@ #' The dataframe has these columns: #' * Cluster: cluster name #' * Germline: germline sequence -#' * V.germline.nt: germline V gene sequence -#' * J.germline.nt: germline J gene sequence -#' * CDR3.germline.length: length of CDR3 in germline -#' * Aligned (included if .verbose_output=TRUE): FALSE if this group of sequences was not aligned with lineage -#' (.min_lineage_sequences is below the threshold); TRUE if it was aligned -#' * Alignment: DNAbin object with alignment or DNAbin object with unaligned sequences (if Aligned=FALSE) -#' * V.length: shortest length of V gene part outside of CDR3 region in this -#' group of sequences; longer V genes (including germline) are trimmed to this length before alignment -#' * J.length: shortest length of J gene part outside of CDR3 region in this -#' group of sequences; longer J genes (including germline) are trimmed to this length before alignment +#' * Alignment: DNAbin object with alignment #' * Sequences: nested dataframe containing all sequences for this combination #' of cluster and germline; it has columns -#' Sequence, Clone.ID, Clones, CDR1.nt, CDR2.nt, CDR3.nt, FR1.nt, FR2.nt, FR3.nt, FR4.nt -#' and, if .verbose_output=TRUE, also V.end, J.start, CDR3.start, CDR3.end; -#' all values taken from the input dataframe -#' * AA.frame.starts: start positions for amino acid translation for germline and all sequences -#' after trimming (possible values: 1, 2 and 3) +#' * Sequence, CDR1.nt, CDR2.nt, CDR3.nt, FR1.nt, FR2.nt, FR3.nt, FR4.nt, V.allele, J.allele, +#' V.aa, J.aa: all values taken from the input dataframe +#' * Clone.ID: taken from the input dataframe, or created (filled with row numbers) if missing +#' * Clones: taken from the input dataframe, or created (filled with '1' values) if missing #' #' @examples #' From b1e65df3e6d2b8dae645bba94955e510856e7c92 Mon Sep 17 00:00:00 2001 From: ivan-immunomind Date: Mon, 3 Oct 2022 19:17:34 +0300 Subject: [PATCH 36/46] add sequence column renaming --- R/seqCluster.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/seqCluster.R b/R/seqCluster.R index 55593fbf..9e7f2bc6 100644 --- a/R/seqCluster.R +++ b/R/seqCluster.R @@ -153,6 +153,7 @@ seqCluster <- function(.data, .dist, .perc_similarity, .nt_similarity, .fixed_th res <- rbind(result_single, result_multi) colnames(res) <- c(matching_col, "Cluster") } + colnames(res)[1] <- matching_col return(res) } clusters <- map(.dist, ~ graph_clustering(.x, threshold_fun = .threshold_fun)) @@ -160,6 +161,6 @@ seqCluster <- function(.data, .dist, .perc_similarity, .nt_similarity, .fixed_th warning("Number of sequence provided in .data and .dist are not matching!") } # supress messages because join spams about joining by matching_col is done - result_data <- map2(.data, clusters, ~ left_join(.x, .y, by = grouping_cols) %>% suppressMessages()) + result_data <- map2(.data, clusters, ~ left_join(.x, .y) %>% suppressMessages()) return(result_data) } From 2a8374d724dbba7482cd35082852b014f5d9c592 Mon Sep 17 00:00:00 2001 From: ivan-immunomind Date: Mon, 3 Oct 2022 20:19:23 +0300 Subject: [PATCH 37/46] fixing typo --- R/seqCluster.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/seqCluster.R b/R/seqCluster.R index 9e7f2bc6..e8bf0940 100644 --- a/R/seqCluster.R +++ b/R/seqCluster.R @@ -46,7 +46,6 @@ seqCluster <- function(.data, .dist, .perc_similarity, .nt_similarity, .fixed_threshold = 10) { grouping_cols <- attr(.dist, "group_by") matching_col <- attr(.dist, "col") - grouping_cols <- attr(.dist, "group_by") if (length(.data) != length(.dist)) { stop(".data and .dist lengths do not match!") } From bc0347a883e76231ebe10cbbe8c784a8d1694198 Mon Sep 17 00:00:00 2001 From: Aleksandr Popov Date: Tue, 4 Oct 2022 13:59:11 +0200 Subject: [PATCH 38/46] Make smarter add_column_with_first_gene() and validate_columns(): support for empty .target_colname, support for list of samples in add_column --- R/tools.R | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/R/tools.R b/R/tools.R index 0088a19f..9de2b418 100644 --- a/R/tools.R +++ b/R/tools.R @@ -524,7 +524,7 @@ apply_to_sample_or_list <- function(.data, .function, .with_names = FALSE, .vali } # return TRUE if target column doesn't exist, otherwise FALSE; stop if original column doesn't exist -validate_columns <- function(.data, .original_colname, .target_colname) { +validate_columns <- function(.data, .original_colname, .target_colname = NA) { if (!(.original_colname %in% colnames(.data))) { stop( "Trying to get data from missing column ", @@ -533,8 +533,13 @@ validate_columns <- function(.data, .original_colname, .target_colname) { colnames(.data) ) } - # FALSE return value means that the column was previously added and no need to add it again - return(!(.target_colname %in% colnames(.data))) + if (is.na(.target_colname)) { + # skip target check if target column is not specified + return(TRUE) + } else { + # FALSE return value means that the column was previously added and no need to add it again + return(!(.target_colname %in% colnames(.data))) + } } # get genes from original column, remove alleles and write to target column @@ -565,19 +570,19 @@ add_column_without_alleles <- function(.data, .original_colname, .target_colname return(.data) } -add_column_with_first_gene <- function(.data, .original_colname, .target_colname) { - if (validate_columns(.data, .original_colname, .target_colname)) { - .data[[.target_colname]] <- .data[[.original_colname]] %>% sapply( - function(genes_string) { - genes_string %<>% - # first gene is substring until first ',', '(' or '*' - strsplit(",|\\(|\\*") %>% - unlist() %>% - magrittr::extract2(1) - return(genes_string) +add_column_with_first_gene <- function(.data, .original_colname, .target_colname = NA) { + .data %<>% apply_to_sample_or_list(.validate = FALSE, .function = function(df) { + if (validate_columns(df, .original_colname, .target_colname)) { + if (is.na(.target_colname)) { + .target_colname <- .original_colname } - ) - } + df[[.target_colname]] <- df[[.original_colname]] %>% sapply(function(genes_string) { + # first gene is substring until first ',', '(' or '*' + unlist(strsplit(genes_string, ",|\\(|\\*"))[1] + }) + } + return(df) + }) return(.data) } From bb9708f17c94b2397b47bc857e14949900235f8f Mon Sep 17 00:00:00 2001 From: Aleksandr Popov Date: Tue, 4 Oct 2022 14:05:44 +0200 Subject: [PATCH 39/46] Make smarter add_column_with_first_gene() and validate_columns(): support for empty .target_colname, support for list of samples in add_column --- R/tools.R | 45 ++++++++++++++++++++------------------------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/R/tools.R b/R/tools.R index 1617cc5b..a539f6ad 100644 --- a/R/tools.R +++ b/R/tools.R @@ -527,7 +527,7 @@ apply_to_sample_or_list <- function(.data, .function, .with_names = FALSE, .vali } # return TRUE if target column doesn't exist, otherwise FALSE; stop if original column doesn't exist -validate_columns <- function(.data, .original_colname, .target_colname) { +validate_columns <- function(.data, .original_colname, .target_colname = NA) { if (!(.original_colname %in% colnames(.data))) { stop( "Trying to get data from missing column ", @@ -536,8 +536,13 @@ validate_columns <- function(.data, .original_colname, .target_colname) { colnames(.data) ) } - # FALSE return value means that the column was previously added and no need to add it again - return(!(.target_colname %in% colnames(.data))) + if (is.na(.target_colname)) { + # skip target check if target column is not specified + return(TRUE) + } else { + # FALSE return value means that the column was previously added and no need to add it again + return(!(.target_colname %in% colnames(.data))) + } } # get genes from original column, remove alleles and write to target column @@ -568,29 +573,19 @@ add_column_without_alleles <- function(.data, .original_colname, .target_colname return(.data) } -add_column_with_first_gene <- function(.data, .original_colname, .target_colname, .with_allele = FALSE) { - if (validate_columns(.data, .original_colname, .target_colname)) { - .data[[.target_colname]] <- .data[[.original_colname]] %>% sapply( - function(genes_string) { - if (.with_allele) { - genes_string %<>% - # first allele is substring until first ',' or '(' in string taken from column with gene names - strsplit(",|\\(") %>% - unlist() %>% - magrittr::extract2(1) %>% - # MiXCR uses *00 for unknown alleles; replace *00 to *01 to find them in reference - stringr::str_replace(stringr::fixed("*00"), "*01") - } else { - genes_string %<>% - # first gene is substring until first ',', '(' or '*' - strsplit(",|\\(|\\*") %>% - unlist() %>% - magrittr::extract2(1) - } - return(genes_string) +add_column_with_first_gene <- function(.data, .original_colname, .target_colname = NA) { + .data %<>% apply_to_sample_or_list(.validate = FALSE, .function = function(df) { + if (validate_columns(df, .original_colname, .target_colname)) { + if (is.na(.target_colname)) { + .target_colname <- .original_colname } - ) - } + df[[.target_colname]] <- df[[.original_colname]] %>% sapply(function(genes_string) { + # first gene is substring until first ',', '(' or '*' + unlist(strsplit(genes_string, ",|\\(|\\*"))[1] + }) + } + return(df) + }) return(.data) } From ff6481a3a5861201a6c8b1d506948ae43388c1ea Mon Sep 17 00:00:00 2001 From: Aleksandr Popov Date: Wed, 5 Oct 2022 12:21:01 +0200 Subject: [PATCH 40/46] comment for add_column_with_first_gene --- R/tools.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/tools.R b/R/tools.R index 9de2b418..5bf9e8d3 100644 --- a/R/tools.R +++ b/R/tools.R @@ -570,6 +570,7 @@ add_column_without_alleles <- function(.data, .original_colname, .target_colname return(.data) } +# if .target_colname is not set, it will overwrite the original column add_column_with_first_gene <- function(.data, .original_colname, .target_colname = NA) { .data %<>% apply_to_sample_or_list(.validate = FALSE, .function = function(df) { if (validate_columns(df, .original_colname, .target_colname)) { From bc3afcc760271400cc21fc17dc24051d37402e5d Mon Sep 17 00:00:00 2001 From: Aleksandr Popov Date: Wed, 5 Oct 2022 12:25:14 +0200 Subject: [PATCH 41/46] seqDist() and seqCluster() support for optional gene names trimming --- R/distance.R | 10 +++++++++- R/seqCluster.R | 13 +++++++++++-- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/R/distance.R b/R/distance.R index ac2122c0..3fd2948c 100644 --- a/R/distance.R +++ b/R/distance.R @@ -28,6 +28,8 @@ #' #' @param .group_by_seqLength If TRUE - adds grouping by sequence length of .col argument #' +#' @param .trim_genes If TRUE - use only general gene values (e.g. "IGHV1-18") of .group_by columns for clustering; if FALSE - can cause very small clusters in case of high resolution genotyping +#' #' @param ... Extra arguments for user-defined function. #' #' The default value is \code{'hamming'} for Hamming distance which counts the number of character substitutions that turns b into a. @@ -68,9 +70,14 @@ #' seqDist(immdata$data[1:2], .method = f, .group_by_seqLength = FALSE) #' @export seqDist -seqDist <- function(.data, .col = "CDR3.nt", .method = "hamming", .group_by = c("V.name", "J.name"), .group_by_seqLength = TRUE, ...) { +seqDist <- function(.data, .col = "CDR3.nt", .method = "hamming", .group_by = c("V.name", "J.name"), .group_by_seqLength = TRUE, trim_genes = TRUE, ...) { .validate_repertoires_data(.data) gr_by_is_na <- all(is.na(.group_by)) + if (trim_genes) { + for (colname in .group_by) { + .data <- add_column_with_first_gene(.data, colname) + } + } # Since seqDist works with any columns of string type, classic .col values are not suported if (.col %in% c("aa", "nt", "v", "j", "aa+v")) stop("Please, provide full column name") first_sample <- .data[[1]] @@ -119,5 +126,6 @@ seqDist <- function(.data, .col = "CDR3.nt", .method = "hamming", .group_by = c( attributes(result)[["col"]] <- .col attributes(result)[["group_by"]] <- .group_by attributes(result)[["group_by_length"]] <- .group_by_seqLength + attributes(result)[["trimmed"]] <- trim_genes return(result) } diff --git a/R/seqCluster.R b/R/seqCluster.R index e8bf0940..be27ad7f 100644 --- a/R/seqCluster.R +++ b/R/seqCluster.R @@ -46,13 +46,14 @@ seqCluster <- function(.data, .dist, .perc_similarity, .nt_similarity, .fixed_threshold = 10) { grouping_cols <- attr(.dist, "group_by") matching_col <- attr(.dist, "col") + trimmed <- attr(.dist, "trimmed") if (length(.data) != length(.dist)) { stop(".data and .dist lengths do not match!") } if (all(!(names(.data) %in% names(.dist)))) { stop(".data and .dist names do not match!") } else { - .dist <- .dist[order(match(names(.dist), names(.data)))] + .dist <- .dist[order(match(names(.dist), names(.data)))] # This one cause removing of all attributes except names! } if (!(matching_col %in% colnames(.data[[1]]))) { stop("There is no ", matching_col, " in .data!") @@ -160,6 +161,14 @@ seqCluster <- function(.data, .dist, .perc_similarity, .nt_similarity, .fixed_th warning("Number of sequence provided in .data and .dist are not matching!") } # supress messages because join spams about joining by matching_col is done - result_data <- map2(.data, clusters, ~ left_join(.x, .y) %>% suppressMessages()) + temp_data <- .data + if (trimmed) { + for (colname in grouping_cols) { + temp_data <- add_column_with_first_gene(temp_data, colname) + } + } + joined_data <- map2(temp_data, clusters, ~ left_join(.x, .y) %>% suppressMessages()) + clusters_cols <- map(joined_data, "Cluster") + result_data <- map2(.data, clusters_cols, ~ cbind(.x, "Cluster" = .y)) return(result_data) } From 7af2dfc079b73029b540e0f67f46da1b1fa739fd Mon Sep 17 00:00:00 2001 From: Aleksandr Popov Date: Wed, 5 Oct 2022 16:35:50 +0200 Subject: [PATCH 42/46] repAlignLineage() updated to new BCR pipeline data format --- R/align_lineage.R | 158 ++++++++++++---------------------------------- R/germline.R | 3 +- 2 files changed, 41 insertions(+), 120 deletions(-) diff --git a/R/align_lineage.R b/R/align_lineage.R index f795f5c9..3d89590c 100644 --- a/R/align_lineage.R +++ b/R/align_lineage.R @@ -72,7 +72,6 @@ repAlignLineage <- function(.data, .min_lineage_sequences = 3, .prepare_threads = 2, .align_threads = 4, - .verbose_output = FALSE, .nofail = FALSE) { if (!require_system_package(c("clustalw", "clustalw2"), error_message = paste0( "repAlignLineage requires Clustal W app to be installed!\n", @@ -97,8 +96,7 @@ repAlignLineage <- function(.data, align_single_df, .min_lineage_sequences = .min_lineage_sequences, .parallel_prepare = parallel_prepare, - .align_threads = .align_threads, - .verbose_output = .verbose_output + .align_threads = .align_threads ) if (parallel_prepare) { doParallel::stopImplicitCluster() @@ -109,10 +107,10 @@ repAlignLineage <- function(.data, align_single_df <- function(data, .min_lineage_sequences, .parallel_prepare, - .align_threads, - .verbose_output) { + .align_threads) { for (required_column in c( - "Cluster", "Germline.sequence", "V.germline.nt", "J.germline.nt", "CDR3.germline.length" + "Cluster", "Germline.sequence", "V.allele", "J.allele", + "FR1.nt", "CDR1.nt", "FR2.nt", "CDR2.nt", "FR3.nt", "CDR3.nt", "FR4.nt", "V.aa", "J.aa" )) { if (!(required_column %in% colnames(data))) { stop( @@ -125,11 +123,11 @@ align_single_df <- function(data, } results <- data %>% + fill_missing_columns() %>% plyr::dlply( .variables = .(get("Cluster"), get("Germline.sequence")), .fun = prepare_results_row, .min_lineage_sequences = .min_lineage_sequences, - .verbose_output = .verbose_output, .parallel = .parallel_prepare ) %>% `[`(!is.na(.)) %>% @@ -139,145 +137,69 @@ align_single_df <- function(data, stop("There are no lineages containing at least ", .min_lineage_sequences, " sequences!") } - # only required columns are passed to alignment function to reduce consumed memory - if (.verbose_output) { - alignments <- lapply(results, "[", c("Aligned", "Alignment")) - } else { - alignments <- lapply(results, "[", "Alignment") - } - alignments %<>% par_or_normal_lapply( - align_sequences, - .verbose_output = .verbose_output, - mc.preschedule = TRUE, - mc.cores = .align_threads - ) + # only Alignment column are passed to alignment function to reduce consumed memory + alignments <- lapply(results, "[", "Alignment") %>% + par_or_normal_lapply(mc.preschedule = TRUE, mc.cores = .align_threads, function(df_row) { + df_row[["Alignment"]] %<>% ape::clustal() + }) return(convert_results_to_df(results, alignments)) } +# fill Clone.ID and Clones columns if they are missing +fill_missing_columns <- function(data) { + if (!("Clone.ID" %in% colnames(data))) { + data[["Clone.ID"]] <- seq.int(nrow(data)) + } + if (!("Clones" %in% colnames(data))) { + data[["Clones"]] <- as.integer(1) + } + return(data) +} + # this function accepts dataframe subset containing rows only for current lineage # and returns named list containing 1 row for results dataframe -prepare_results_row <- function(lineage_subset, .min_lineage_sequences, .verbose_output) { - cluster_name <- lineage_subset[[1, "Cluster"]] - germline_seq <- lineage_subset[[1, "Germline.sequence"]] - germline_v <- lineage_subset[[1, "V.germline.nt"]] - germline_j <- lineage_subset[[1, "J.germline.nt"]] - germline_cdr3_len <- lineage_subset[[1, "CDR3.germline.length"]] - aligned <- nrow(lineage_subset) >= .min_lineage_sequences - - if (!aligned & !.verbose_output) { +prepare_results_row <- function(lineage_subset, .min_lineage_sequences) { + if (nrow(lineage_subset) < .min_lineage_sequences) { + # NA rows will be filtered out return(NA) } - lineage_subset[["V.lengths"]] <- v_len_outside_cdr3( - lineage_subset[["V.end"]], lineage_subset[["CDR3.start"]] - ) - lineage_subset[["J.lengths"]] <- j_len_outside_cdr3( - lineage_subset[["Sequence"]], lineage_subset[["J.start"]], lineage_subset[["CDR3.end"]] - ) + cluster_name <- lineage_subset[[1, "Cluster"]] + germline_seq <- lineage_subset[[1, "Germline.sequence"]] sequences_columns <- c( - "Sequence", "Clone.ID", "Clones", - "CDR1.nt", "CDR2.nt", "CDR3.nt", "FR1.nt", "FR2.nt", "FR3.nt", "FR4.nt" + "Sequence", "Clone.ID", "Clones", "V.allele", "J.allele", + "CDR1.nt", "CDR2.nt", "CDR3.nt", "FR1.nt", "FR2.nt", "FR3.nt", "FR4.nt", "V.aa", "J.aa" ) - if (.verbose_output) { - sequences_columns %<>% c("V.end", "J.start", "CDR3.start", "CDR3.end") - } + sequences <- lineage_subset[sequences_columns] sequences[["Clone.ID"]] %<>% as.integer() sequences[["Clones"]] %<>% as.integer() - germline_v_len <- str_length(germline_v) - germline_j_len <- str_length(germline_j) - v_min_len <- min(lineage_subset[["V.lengths"]], germline_v_len) - j_min_len <- min(lineage_subset[["J.lengths"]], germline_j_len) - - germline_trimmed <- trim_seq(germline_seq, germline_v_len, v_min_len, germline_j_len, j_min_len) - clonotypes_trimmed <- trim_seq( - lineage_subset[["Sequence"]], - lineage_subset[["V.lengths"]], - v_min_len, - lineage_subset[["J.lengths"]], - j_min_len - ) - clonotypes_names <- sapply(lineage_subset[["Clone.ID"]], function(id) { paste0("ID_", id) }) - all_sequences_list <- c(list(germline_trimmed), as.list(clonotypes_trimmed)) + all_sequences_list <- c(list(germline_seq), as.list(lineage_subset[["Sequence"]])) names(all_sequences_list) <- c("Germline", clonotypes_names) alignment <- convert_seq_list_to_dnabin(all_sequences_list) - # calculate amino acid frame starts for all trimmed sequences including germline - germline_aa_start <- (germline_v_len - v_min_len) %% 3 + 1 - clonotypes_aa_starts <- (lineage_subset[["V.lengths"]] - v_min_len) %% 3 + 1 - all_sequences_aa_starts <- c(list(germline_aa_start), as.list(clonotypes_aa_starts)) - names(all_sequences_aa_starts) <- names(all_sequences_list) - - if (.verbose_output) { - return(list( - Cluster = cluster_name, - Germline = germline_seq, - V.germline.nt = germline_v, - J.germline.nt = germline_j, - CDR3.germline.length = germline_cdr3_len, - Aligned = aligned, - Alignment = alignment, - V.length = v_min_len, - J.length = j_min_len, - Sequences = sequences, - AA.frame.starts = all_sequences_aa_starts - )) - } else { - return(list( - Cluster = cluster_name, - Germline = germline_seq, - V.germline.nt = germline_v, - J.germline.nt = germline_j, - CDR3.germline.length = germline_cdr3_len, - Alignment = alignment, - V.length = v_min_len, - J.length = j_min_len, - Sequences = sequences, - AA.frame.starts = all_sequences_aa_starts - )) - } + return(list( + Cluster = cluster_name, + Germline = germline_seq, + Alignment = alignment, + Sequences = sequences + )) } -# trim V/J tails in sequence to the specified lenghts v_min, j_min -trim_seq <- function(seq, v_len, v_min, j_len, j_min) { - str_sub(seq, v_len - v_min + 1, -(j_len - j_min + 1)) -} - -convert_results_to_df <- function(nested_results_list, nested_alignments_list) { - alignments <- nested_alignments_list %>% - lapply(magrittr::extract2, "Alignment") %>% - tibble(Alignment = .) +convert_results_to_df <- function(nested_results_list, alignments_list) { + alignments <- tibble(Alignment = alignments_list) sequences <- nested_results_list %>% lapply(magrittr::extract2, "Sequences") %>% tibble(Sequences = .) - frame_starts <- nested_results_list %>% - lapply(magrittr::extract2, "AA.frame.starts") %>% - tibble(AA.frame.starts = .) df <- nested_results_list %>% - lapply(rlist::list.remove, c("Alignment", "Sequences", "AA.frame.starts")) %>% + lapply(rlist::list.remove, c("Alignment", "Sequences")) %>% purrr::map_dfr(~.) %>% - cbind(alignments, sequences, frame_starts) - # fix column types after dataframe rebuilding - for (column in c("CDR3.germline.length", "V.length", "J.length")) { - df[[column]] %<>% as.integer() - } + cbind(alignments, sequences) return(df) } - -align_sequences <- function(df_row, .verbose_output) { - if (.verbose_output) { - aligned <- df_row[["Aligned"]] - } else { - aligned <- TRUE - } - if (aligned) { - df_row[["Alignment"]] %<>% ape::clustal() - } - return(df_row) -} diff --git a/R/germline.R b/R/germline.R index ca2ec8ac..91036768 100644 --- a/R/germline.R +++ b/R/germline.R @@ -179,8 +179,7 @@ calculate_new_columns <- function(row, sample_name) { j_aa <- fr4_aa # trim intersection of V and CDR3 from reference V gene - v_length <- str_length(cdr1_nt) + str_length(cdr2_nt) + - str_length(fr1_nt) + str_length(fr2_nt) + str_length(fr3_nt) + v_length <- str_length(paste0(fr1_nt, cdr1_nt, fr2_nt, cdr2_nt, fr3_nt)) v_part <- str_sub(v_ref, 1, v_length) cdr3_part <- paste(rep("n", str_length(cdr3_nt)), collapse = "") From d3b487897712c2d05f216f59c715cbd2fbf4ba3a Mon Sep 17 00:00:00 2001 From: Aleksandr Popov Date: Thu, 6 Oct 2022 19:42:51 +0200 Subject: [PATCH 43/46] repClonalFamily() updated to the new BCR pipeline format --- R/phylip.R | 110 +++++++++++------------------------------------------ 1 file changed, 23 insertions(+), 87 deletions(-) diff --git a/R/phylip.R b/R/phylip.R index 4c80fdc8..7f63a0ac 100644 --- a/R/phylip.R +++ b/R/phylip.R @@ -23,8 +23,7 @@ #' #' repClonalFamily(.data, .threads, .nofail) #' -#' @param .data The data to be processed. Can be output of repAlignLineage() with normal -#' or verbose output; variants with one sample and list of samples are both supported. +#' @param .data The data to be processed, output of repAlignLineage() function. #' #' @param .threads Number of threads to use. #' @@ -36,16 +35,10 @@ #' Dataframe or list of dataframes (if input is a list with multiple samples). #' The dataframe has these columns: #' * Cluster: cluster name -#' * Germline.Input: germline sequence, like it was in the input; not trimmed and not aligned -#' * V.germline.nt: input germline V gene sequence -#' * J.germline.nt: input germline J gene sequence -#' * CDR3.germline.length: length of CDR3 in input germline -#' * V.length: length of V gene after trimming on repAlignLineage() step -#' * J.length: length of j gene after trimming on repAlignLineage() step +#' * Germline.Input: germline sequence, like it was in the input; not aligned #' * Germline.Output: germline sequence, parsed from PHYLIP dnapars function output; #' it contains difference of germline from the common ancestor; "." characters mean -#' matching letters. It's usually shorter than Germline.Input, because germline and -#' clonotype sequences were trimmed to the same length before alignment. +#' matching letters #' * Common.Ancestor: common ancestor sequence, parsed from PHYLIP dnapars function output #' * Trunk.Length: mean trunk length, representing the distance between the most recent #' common ancestor and germline sequence as a measure of the maturity of a lineage @@ -87,10 +80,7 @@ repClonalFamily <- function(.data, .threads = parallel::detectCores(), .nofail = } process_dataframe <- function(df, .threads, sample_name = NA) { - required_columns <- c( - "Cluster", "Germline", "V.germline.nt", "J.germline.nt", "CDR3.germline.length", - "V.length", "J.length", "Alignment", "Sequences", "AA.frame.starts" - ) + required_columns <- c("Cluster", "Germline", "Alignment", "Sequences") for (column in required_columns) { if (!(column %in% colnames(df))) { stop( @@ -103,10 +93,6 @@ process_dataframe <- function(df, .threads, sample_name = NA) { } } - if ("Aligned" %in% colnames(df)) { - df %<>% filter(get("Aligned")) - } - if (nrow(df) == 0) { stop( "repClonalFamily: input dataframe ", @@ -133,14 +119,7 @@ process_cluster <- function(cluster_row) { # because of these columns format alignment <- cluster_row[["Alignment"]][[1]] sequences <- cluster_row[["Sequences"]][[1]] - v_aa_frame_starts <- cluster_row[["AA.frame.starts"]][[1]] cluster_name <- cluster_row[["Cluster"]] - cdr3_germline_length <- cluster_row[["CDR3.germline.length"]] - v_trimmed_length <- cluster_row[["V.length"]] - j_trimmed_length <- cluster_row[["J.length"]] - - # positions of J gene starting nucleotide for translation of sequences to amino acids - j_aa_frame_start <- (str_length(cluster_row[["V.germline.nt"]]) + cdr3_germline_length) %% 3 + 1 temp_dir <- file.path(tempdir(check = TRUE), uuid::UUIDgenerate(use.time = FALSE)) dir.create(temp_dir) @@ -266,15 +245,23 @@ process_cluster <- function(cluster_row) { common_ancestor <- tree_stats[which(tree_stats["Type"] == "CommonAncestor"), ][1, "Sequence"] # calculate distances of all sequences from their ancestors + v_nt_length <- str_length(paste0( + sequences[[1, "FR1.nt"]], sequences[[1, "CDR1.nt"]], sequences[[1, "FR2.nt"]], + sequences[[1, "CDR2.nt"]], sequences[[1, "FR3.nt"]] + )) + j_nt_length <- str_length(sequences[[1, "FR4.nt"]]) + v_aa_length <- str_length(sequences[[1, "V.aa"]]) + j_aa_length <- str_length(sequences[[1, "J.aa"]]) + for (row in seq_len(nrow(tree_stats))) { if (!has_no_data(tree_stats[row, "Ancestor"])) { seq <- tree_stats[row, "Sequence"] ancestor_name <- tree_stats[row, "Ancestor"] ancestor <- tree_stats[which(tree_stats["Name"] == ancestor_name), ][1, "Sequence"] - seq_v <- str_sub(seq, 1, v_trimmed_length) - seq_j <- str_sub(seq, -j_trimmed_length) - ancestor_v <- str_sub(ancestor, 1, v_trimmed_length) - ancestor_j <- str_sub(ancestor, -j_trimmed_length) + seq_v <- str_sub(seq, 1, v_nt_length) + seq_j <- str_sub(seq, -j_nt_length) + ancestor_v <- str_sub(ancestor, 1, v_nt_length) + ancestor_j <- str_sub(ancestor, -j_nt_length) seq_nt_chars <- strsplit(paste0(seq_v, seq_j), "")[[1]] ancestor_nt_chars <- strsplit(paste0(ancestor_v, ancestor_j), "")[[1]] if (length(seq_nt_chars) != length(ancestor_nt_chars)) { @@ -286,23 +273,12 @@ process_cluster <- function(cluster_row) { } tree_stats[row, "DistanceNT"] <- sum(seq_nt_chars != ancestor_nt_chars) - seq_v_aa_frame_start <- get_v_aa_frame_start(v_aa_frame_starts, tree_stats[row, "Name"]) - seq_v_aa <- bunch_translate(substring(seq_v, seq_v_aa_frame_start), - .two.way = FALSE, .ignore.n = TRUE - ) - seq_j_aa <- bunch_translate(substring(seq_j, j_aa_frame_start), - .two.way = FALSE, .ignore.n = TRUE - ) - ancestor_v_aa_frame_start <- get_v_aa_frame_start( - v_aa_frame_starts, - tree_stats[row, "Ancestor"] - ) - ancestor_v_aa <- bunch_translate(substring(ancestor_v, ancestor_v_aa_frame_start), - .two.way = FALSE, .ignore.n = TRUE - ) - ancestor_j_aa <- bunch_translate(substring(ancestor_j, j_aa_frame_start), - .two.way = FALSE, .ignore.n = TRUE - ) + seq_aa <- bunch_translate(seq, .two.way = FALSE, .ignore.n = TRUE) + ancestor_aa <- bunch_translate(ancestor, .two.way = FALSE, .ignore.n = TRUE) + seq_v_aa <- str_sub(seq_aa, 1, v_aa_length) + seq_j_aa <- str_sub(seq_aa, -j_aa_length) + ancestor_v_aa <- str_sub(ancestor_aa, 1, v_aa_length) + ancestor_j_aa <- str_sub(ancestor_aa, -j_aa_length) seq_aa_chars <- strsplit(paste0(seq_v_aa, seq_j_aa), "")[[1]] ancestor_aa_chars <- strsplit(paste0(ancestor_v_aa, ancestor_j_aa), "")[[1]] if (length(seq_aa_chars) != length(ancestor_aa_chars)) { @@ -313,31 +289,6 @@ process_cluster <- function(cluster_row) { ) } tree_stats[row, "DistanceAA"] <- sum(seq_aa_chars != ancestor_aa_chars) - - if (grepl("*", paste0(seq_v_aa, seq_j_aa), fixed = TRUE)) { - warning( - "Wrong translation from NT to AA found in repClonalFamily:\n", - "seq_name = ", tree_stats[row, "Name"], "\n", - "seq_v_nt = ", seq_v, "\n", - "seq_v_aa = ", seq_v_aa, "\n", - "seq_v_aa_frame_start = ", seq_v_aa_frame_start, "\n", - "seq_j_nt = ", seq_j, "\n", - "seq_j_aa = ", seq_j_aa, "\n", - "j_aa_frame_start = ", j_aa_frame_start, "\n" - ) - } - if (grepl("*", paste0(ancestor_v_aa, ancestor_j_aa), fixed = TRUE)) { - warning( - "Wrong translation from NT to AA found in repClonalFamily:\n", - "ancestor_name = ", tree_stats[row, "Ancestor"], "\n", - "ancestor_v_nt = ", ancestor_v, "\n", - "ancestor_v_aa = ", ancestor_v_aa, "\n", - "ancestor_v_aa_frame_start = ", ancestor_v_aa_frame_start, "\n", - "ancestor_j_nt = ", ancestor_j, "\n", - "ancestor_j_aa = ", ancestor_j_aa, "\n", - "j_aa_frame_start = ", j_aa_frame_start, "\n" - ) - } } } @@ -353,11 +304,6 @@ process_cluster <- function(cluster_row) { return(list( Cluster = cluster_name, Germline.Input = cluster_row[["Germline"]], - V.germline.nt = cluster_row[["V.germline.nt"]], - J.germline.nt = cluster_row[["J.germline.nt"]], - CDR3.germline.length = cdr3_germline_length, - V.length = v_trimmed_length, - J.length = j_trimmed_length, Germline.Output = germline, Common.Ancestor = common_ancestor, Trunk.Length = trunk_length, @@ -367,14 +313,6 @@ process_cluster <- function(cluster_row) { )) } -get_v_aa_frame_start <- function(v_aa_frame_starts, seq_name) { - if (seq_name %in% names(v_aa_frame_starts)) { - return(v_aa_frame_starts[[seq_name]]) - } else { - return(v_aa_frame_starts[["Germline"]]) - } -} - convert_nested_to_df <- function(nested_results_list) { tree <- nested_results_list %>% lapply(magrittr::extract2, "Tree") %>% @@ -390,9 +328,7 @@ convert_nested_to_df <- function(nested_results_list) { purrr::map_dfr(~.) %>% cbind(tree, tree_stats, sequences) # fix column types after dataframe rebuilding - for (column in c("CDR3.germline.length", "V.length", "J.length", "Trunk.Length")) { - df[[column]] %<>% as.integer() - } + df[["Trunk.Length"]] %<>% as.integer() df %<>% add_class("clonal_family_df") return(df) } From 43941c105890b8bdf5e1f866bbea9b1994e8e2df Mon Sep 17 00:00:00 2001 From: Aleksandr Popov Date: Thu, 6 Oct 2022 20:04:11 +0200 Subject: [PATCH 44/46] repSomaticHypermutation() updated to the new BCR pipeline data format --- R/somatic_hypermutation.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/R/somatic_hypermutation.R b/R/somatic_hypermutation.R index cd016ef8..8d6bed7b 100644 --- a/R/somatic_hypermutation.R +++ b/R/somatic_hypermutation.R @@ -64,7 +64,7 @@ repSomaticHypermutation <- function(.data, .threads = parallel::detectCores(), . return(get_empty_object_with_class("step_failure_ignored")) } - parallel <- threads > 1 + parallel <- .threads > 1 if (parallel) { doParallel::registerDoParallel(cores = .threads) } @@ -89,8 +89,7 @@ shm_process_dataframe <- function(df, .parallel) { ) # fix column types after dataframe rebuilding for (column in c( - "Clone.ID", "Clones", "CDR3.germline.length", "V.length", "J.length", "Trunk.Length", - "Substitutions", "Insertions", "Deletions", "Mutations" + "Clone.ID", "Clones", "Trunk.Length", "Substitutions", "Insertions", "Deletions", "Mutations" )) { df[[column]] %<>% as.integer() } @@ -98,12 +97,13 @@ shm_process_dataframe <- function(df, .parallel) { } shm_process_clonotype_row <- function(row) { - v_gene_germline <- row[["V.germline.nt"]] - j_gene_germline <- row[["J.germline.nt"]] v_gene_clonotype <- paste0( row[["FR1.nt"]], row[["CDR1.nt"]], row[["FR2.nt"]], row[["CDR2.nt"]], row[["FR3.nt"]] ) j_gene_clonotype <- row[["FR4.nt"]] + germline <- row[["Germline.Input"]] + v_gene_germline <- str_sub(germline, 1, str_length(v_gene_clonotype)) + j_gene_germline <- str_sub(germline, -str_length(j_gene_clonotype)) alignments <- list( V = list(v_gene_germline, v_gene_clonotype), From ab689eda8f1f31bdd0f5ed030e8f1bedf8d91e59 Mon Sep 17 00:00:00 2001 From: Aleksandr Popov Date: Thu, 6 Oct 2022 20:06:27 +0200 Subject: [PATCH 45/46] documentation rebuild --- NAMESPACE | 1 + man/repAlignLineage.Rd | 31 +++++++------------------------ man/repClonalFamily.Rd | 13 +++---------- man/repGermline.Rd | 22 ++++++---------------- man/seqDist.Rd | 2 ++ 5 files changed, 19 insertions(+), 50 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 930ad526..7c85f01c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -208,6 +208,7 @@ importFrom(purrr,imap) importFrom(purrr,map) importFrom(purrr,map2) importFrom(purrr,map2_chr) +importFrom(purrr,map2_df) importFrom(purrr,map2_lgl) importFrom(purrr,map_chr) importFrom(purrr,map_df) diff --git a/man/repAlignLineage.Rd b/man/repAlignLineage.Rd index d56d1fb7..577ab94a 100644 --- a/man/repAlignLineage.Rd +++ b/man/repAlignLineage.Rd @@ -4,8 +4,7 @@ \alias{repAlignLineage} \title{Aligns all sequences incliding germline within each clonal lineage within each cluster} \usage{ -repAlignLineage(.data, -.min_lineage_sequences, .prepare_threads, .align_threads, .verbose_output, .nofail) +repAlignLineage(.data, .min_lineage_sequences, .prepare_threads, .align_threads, .nofail) } \arguments{ \item{.data}{The data to be processed. Can be \link{data.frame}, \link{data.table} @@ -13,8 +12,7 @@ or a list of these objects.} \item{.min_lineage_sequences}{If number of sequences in the same clonal lineage and the same cluster (not including germline) is lower than this threshold, this group of sequences -will not be aligned and will not be used in next steps of BCR pipeline -(will be saved in output table only if .verbose_output parameter is set to TRUE).} +will be filtered out from the dataframe; so only large enough lineages will be included.} \item{.prepare_threads}{Number of threads to prepare results table. Please note that high number can cause heavy memory usage!} @@ -25,11 +23,6 @@ It must have columns in the immunarch compatible format \link{immunarch_data_for must contain 'Cluster' column, which is added by seqCluster() function, and 'Germline.sequence' column, which is added by repGermline() function.} -\item{.verbose_output}{If TRUE, all output dataframe columns will be included (see documentation about this -function return), and unaligned clusters will be included in the output. Setting this to TRUE significantly -increases memory usage. If FALSE, only aligned clusters and columns required for repClonalFamily() and -repSomaticHypermutation() calculation will be included in the output.} - \item{.nofail}{Will return NA instead of stopping if Clustal W is not installed. Used to avoid raising errors in examples on computers where Clustal W is not installed.} } @@ -38,23 +31,13 @@ Dataframe or list of dataframes (if input is a list with multiple samples). The dataframe has these columns: * Cluster: cluster name * Germline: germline sequence -* V.germline.nt: germline V gene sequence -* J.germline.nt: germline J gene sequence -* CDR3.germline.length: length of CDR3 in germline -* Aligned (included if .verbose_output=TRUE): FALSE if this group of sequences was not aligned with lineage - (.min_lineage_sequences is below the threshold); TRUE if it was aligned -* Alignment: DNAbin object with alignment or DNAbin object with unaligned sequences (if Aligned=FALSE) -* V.length: shortest length of V gene part outside of CDR3 region in this - group of sequences; longer V genes (including germline) are trimmed to this length before alignment -* J.length: shortest length of J gene part outside of CDR3 region in this - group of sequences; longer J genes (including germline) are trimmed to this length before alignment +* Alignment: DNAbin object with alignment * Sequences: nested dataframe containing all sequences for this combination of cluster and germline; it has columns - Sequence, Clone.ID, Clones, CDR1.nt, CDR2.nt, CDR3.nt, FR1.nt, FR2.nt, FR3.nt, FR4.nt - and, if .verbose_output=TRUE, also V.end, J.start, CDR3.start, CDR3.end; - all values taken from the input dataframe -* AA.frame.starts: start positions for amino acid translation for germline and all sequences - after trimming (possible values: 1, 2 and 3) + * Sequence, CDR1.nt, CDR2.nt, CDR3.nt, FR1.nt, FR2.nt, FR3.nt, FR4.nt, V.allele, J.allele, + V.aa, J.aa: all values taken from the input dataframe + * Clone.ID: taken from the input dataframe, or created (filled with row numbers) if missing + * Clones: taken from the input dataframe, or created (filled with '1' values) if missing } \description{ This function aligns all sequences (incliding germline) that belong to one clonal diff --git a/man/repClonalFamily.Rd b/man/repClonalFamily.Rd index 3e769db3..5bf5099e 100644 --- a/man/repClonalFamily.Rd +++ b/man/repClonalFamily.Rd @@ -7,8 +7,7 @@ repClonalFamily(.data, .threads, .nofail) } \arguments{ -\item{.data}{The data to be processed. Can be output of repAlignLineage() with normal -or verbose output; variants with one sample and list of samples are both supported.} +\item{.data}{The data to be processed, output of repAlignLineage() function.} \item{.threads}{Number of threads to use.} @@ -19,16 +18,10 @@ Used to avoid raising errors in examples on computers where PHYLIP is not instal Dataframe or list of dataframes (if input is a list with multiple samples). The dataframe has these columns: * Cluster: cluster name -* Germline.Input: germline sequence, like it was in the input; not trimmed and not aligned -* V.germline.nt: input germline V gene sequence -* J.germline.nt: input germline J gene sequence -* CDR3.germline.length: length of CDR3 in input germline -* V.length: length of V gene after trimming on repAlignLineage() step -* J.length: length of j gene after trimming on repAlignLineage() step +* Germline.Input: germline sequence, like it was in the input; not aligned * Germline.Output: germline sequence, parsed from PHYLIP dnapars function output; it contains difference of germline from the common ancestor; "." characters mean - matching letters. It's usually shorter than Germline.Input, because germline and - clonotype sequences were trimmed to the same length before alignment. + matching letters * Common.Ancestor: common ancestor sequence, parsed from PHYLIP dnapars function output * Trunk.Length: mean trunk length, representing the distance between the most recent common ancestor and germline sequence as a measure of the maturity of a lineage diff --git a/man/repGermline.Rd b/man/repGermline.Rd index 244bf5c7..c3acad2e 100644 --- a/man/repGermline.Rd +++ b/man/repGermline.Rd @@ -4,8 +4,7 @@ \alias{repGermline} \title{Creates germlines for clonal lineages} \usage{ -repGermline(.data, -.species, .align_j_gene, .min_nuc_outside_cdr3, .threads) +repGermline(.data, .species, .min_nuc_outside_cdr3, .threads) } \arguments{ \item{.data}{The data to be processed. Can be \link{data.frame}, \link{data.table} @@ -20,13 +19,6 @@ It must have columns in the immunarch compatible format \link{immunarch_data_for "OncorhynchusMykiss", "OrnithorhynchusAnatinus", "OryctolagusCuniculus", "RattusNorvegicus", "SusScrofa".} -\item{.align_j_gene}{MiXCR provides the number of J indels only for 1 allele of J gene -in the output file, and a germline can contain another allele. Therefore, calculation of -J gene start in reference based on numbers from input file can be sometimes incorrect. -As result, J gene in the germline will be trimmed in the start or will contain some -nucleotides from CDR3. Setting this parameter to TRUE enables precise alignment of J genes -to detect the correct starting point, but it significantly reduces performance.} - \item{.min_nuc_outside_cdr3}{This parameter sets how many nucleotides should have V or J chain outside of CDR3 to be considered good for further alignment.} @@ -34,12 +26,10 @@ outside of CDR3 to be considered good for further alignment.} } \value{ Data with added columns: -* V.first.allele, J.first.allele (first alleles of V and J genes), -* V.ref.nt, J.ref.nt (V and J reference sequences), -* V.germline.nt, J.germline.nt (V and J germline sequences; they are references with - trimmed parts that are from CDR3), -* CDR3.germline.length (length of CDR3 in the germline), -* Germline.sequence (combined germline sequence) +* Sequence (FR1+CDR1+FR2+CDR2+FR3+CDR3+FR4 in nucleotides; the column will be replaced if exists) +* V.allele, J.allele (chosen alleles of V and J genes), +* V.aa, J.aa (V and J sequences from original clonotype, outside CDR3, converted to amino acids) +* Germline.sequence (combined germline nucleotide sequence) } \description{ This function creates germlines for clonal lineages. B cell clonal lineage @@ -56,6 +46,6 @@ data(bcrdata) bcrdata$data \%>\% top(5) \%>\% - repGermline(.threads = 1) + repGermline() } \concept{germline} diff --git a/man/seqDist.Rd b/man/seqDist.Rd index d8923737..a4dd8f9c 100644 --- a/man/seqDist.Rd +++ b/man/seqDist.Rd @@ -33,6 +33,8 @@ Other possible values are: \code{'lcs'} for longest common substring is defined as the longest string can be obtained by pairing characters from a and b while keeping the order of characters intact. In case of user-defined function, it should take x and y parameters as input and return \link{dist} object.} + +\item{.trim_genes}{If TRUE - use only general gene values (e.g. "IGHV1-18") of .group_by columns for clustering; if FALSE - can cause very small clusters in case of high resolution genotyping} } \value{ Named list of list with \link{dist} objects for given repertoires for each combination of .group_by variable(s) and/or sequence length of .col. From f1f8801d101efdfccf85a6bba17f0d9a92ace474 Mon Sep 17 00:00:00 2001 From: Aleksandr Popov Date: Tue, 11 Oct 2022 17:12:24 +0200 Subject: [PATCH 46/46] CRAN check fixes --- NAMESPACE | 1 + R/distance.R | 13 +++++++++---- R/germline.R | 1 + man/seqDist.Rd | 6 +++--- 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 7c85f01c..1db20f3b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -290,6 +290,7 @@ importFrom(tibble,tibble) importFrom(tidyr,drop_na) importFrom(tidyr,unite) importFrom(tidyr,unnest) +importFrom(tidyselect,any_of) importFrom(tidyselect,starts_with) importFrom(utils,capture.output) importFrom(utils,packageVersion) diff --git a/R/distance.R b/R/distance.R index 3fd2948c..45eddac5 100644 --- a/R/distance.R +++ b/R/distance.R @@ -13,7 +13,7 @@ #' @usage #' #' seqDist(.data, .col = 'CDR3.nt', .method = 'hamming', -#' .group_by = c("V.first", "J.first"), .group_by_seqLength = TRUE, ...) +#' .group_by = c("V.name", "J.name"), .group_by_seqLength = TRUE, .trim_genes = TRUE, ...) #' #' @param .data The data to be processed. Can be \link{data.frame}, #' \link{data.table}, or a list of these objects. @@ -70,10 +70,15 @@ #' seqDist(immdata$data[1:2], .method = f, .group_by_seqLength = FALSE) #' @export seqDist -seqDist <- function(.data, .col = "CDR3.nt", .method = "hamming", .group_by = c("V.name", "J.name"), .group_by_seqLength = TRUE, trim_genes = TRUE, ...) { +seqDist <- function(.data, + .col = "CDR3.nt", + .method = "hamming", + .group_by = c("V.name", "J.name"), + .group_by_seqLength = TRUE, + .trim_genes = TRUE, ...) { .validate_repertoires_data(.data) gr_by_is_na <- all(is.na(.group_by)) - if (trim_genes) { + if (.trim_genes) { for (colname in .group_by) { .data <- add_column_with_first_gene(.data, colname) } @@ -126,6 +131,6 @@ seqDist <- function(.data, .col = "CDR3.nt", .method = "hamming", .group_by = c( attributes(result)[["col"]] <- .col attributes(result)[["group_by"]] <- .group_by attributes(result)[["group_by_length"]] <- .group_by_seqLength - attributes(result)[["trimmed"]] <- trim_genes + attributes(result)[["trimmed"]] <- .trim_genes return(result) } diff --git a/R/germline.R b/R/germline.R index 91036768..8d21a700 100644 --- a/R/germline.R +++ b/R/germline.R @@ -8,6 +8,7 @@ #' @importFrom purrr imap map_dfr #' @importFrom magrittr %>% %<>% extract2 #' @importFrom dplyr filter rowwise +#' @importFrom tidyselect any_of #' @importFrom parallel parApply detectCores makeCluster clusterExport stopCluster #' @importFrom ape as.DNAbin clustal #' diff --git a/man/seqDist.Rd b/man/seqDist.Rd index a4dd8f9c..6ae7762c 100644 --- a/man/seqDist.Rd +++ b/man/seqDist.Rd @@ -5,7 +5,7 @@ \title{Function for computing distance for sequences} \usage{ seqDist(.data, .col = 'CDR3.nt', .method = 'hamming', - .group_by = c("V.first", "J.first"), .group_by_seqLength = TRUE, ...) + .group_by = c("V.name", "J.name"), .group_by_seqLength = TRUE, .trim_genes = TRUE, ...) } \arguments{ \item{.data}{The data to be processed. Can be \link{data.frame}, @@ -21,6 +21,8 @@ Every object must have columns in the immunarch compatible format \link{immunarc \item{.group_by_seqLength}{If TRUE - adds grouping by sequence length of .col argument} +\item{.trim_genes}{If TRUE - use only general gene values (e.g. "IGHV1-18") of .group_by columns for clustering; if FALSE - can cause very small clusters in case of high resolution genotyping} + \item{...}{Extra arguments for user-defined function. The default value is \code{'hamming'} for Hamming distance which counts the number of character substitutions that turns b into a. @@ -33,8 +35,6 @@ Other possible values are: \code{'lcs'} for longest common substring is defined as the longest string can be obtained by pairing characters from a and b while keeping the order of characters intact. In case of user-defined function, it should take x and y parameters as input and return \link{dist} object.} - -\item{.trim_genes}{If TRUE - use only general gene values (e.g. "IGHV1-18") of .group_by columns for clustering; if FALSE - can cause very small clusters in case of high resolution genotyping} } \value{ Named list of list with \link{dist} objects for given repertoires for each combination of .group_by variable(s) and/or sequence length of .col.