From b67e2e4131158cd9011dccc80611d90250a99ff5 Mon Sep 17 00:00:00 2001 From: almeidasilvaf Date: Thu, 25 Jul 2024 10:45:44 +0200 Subject: [PATCH] Modified pairs2kaks() to use indices2kaks() and make it faster --- .github/workflows/check-bioc.yml | 8 ++--- DESCRIPTION | 5 ++- NAMESPACE | 4 +-- R/ka_ks_analyses.R | 60 +++++++++++++++++--------------- man/pairs2kaks.Rd | 11 ++---- 5 files changed, 41 insertions(+), 47 deletions(-) diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml index 0d244fd..e8cf08a 100644 --- a/.github/workflows/check-bioc.yml +++ b/.github/workflows/check-bioc.yml @@ -100,16 +100,16 @@ jobs: uses: actions/cache@v2 with: path: ${{ env.R_LIBS_USER }} - key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.3-${{ hashFiles('.github/depends.Rds') }} - restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.3- + key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.4-${{ hashFiles('.github/depends.Rds') }} + restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.4- - name: Cache R packages on Linux if: "!contains(github.event.head_commit.message, '/nocache') && runner.os == 'Linux' " uses: actions/cache@v2 with: path: /home/runner/work/_temp/Library - key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.3-${{ hashFiles('.github/depends.Rds') }} - restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.3- + key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.4-${{ hashFiles('.github/depends.Rds') }} + restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.4- - name: Install Linux system dependencies if: runner.os == 'Linux' diff --git a/DESCRIPTION b/DESCRIPTION index 50b4df8..f4f8f28 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -47,7 +47,7 @@ biocViews: Encoding: UTF-8 LazyData: false Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.2 Imports: syntenet, GenomicRanges, @@ -59,8 +59,7 @@ Imports: stats, utils, AnnotationDbi, - GenomicFeatures, - BiocParallel + GenomicFeatures Depends: R (>= 4.2.0) Suggests: diff --git a/NAMESPACE b/NAMESPACE index f1f8743..12c299b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -17,12 +17,10 @@ export(plot_ks_peaks) export(plot_rates_by_species) export(split_pairs_by_peak) importFrom(AnnotationDbi,select) -importFrom(BiocParallel,SerialParam) -importFrom(BiocParallel,bplapply) importFrom(Biostrings,width) importFrom(GenomicFeatures,intronsByTranscript) importFrom(GenomicRanges,GRangesList) -importFrom(MSA2dist,dnastring2kaks) +importFrom(MSA2dist,indices2kaks) importFrom(ggplot2,aes) importFrom(ggplot2,after_stat) importFrom(ggplot2,element_blank) diff --git a/R/ka_ks_analyses.R b/R/ka_ks_analyses.R index f6b53eb..0bd76bd 100644 --- a/R/ka_ks_analyses.R +++ b/R/ka_ks_analyses.R @@ -10,14 +10,13 @@ #' Possible values are "Li", "NG86", "NG", "LWL", "LPB", "MLWL", "MLPB", "GY", #' "YN", "MYN", "MS", "MA", "GNG", "GLWL", "GLPB", "GMLWL", "GMLPB", "GYN", #' and "GMYN". Default: "MYN". -#' @param bp_param BiocParallel back-end to be used. -#' Default: `BiocParallel::SerialParam()`. +#' @param threads Numeric indicating the number of threads to use. Default: 1. #' #' @return A list of data frames containing gene pairs and their Ka, Ks, #' and Ka/Ks values. -#' @importFrom MSA2dist dnastring2kaks +#' +#' @importFrom MSA2dist indices2kaks #' @importFrom Biostrings width -#' @importFrom BiocParallel SerialParam bplapply #' @export #' @rdname pairs2kaks #' @examples @@ -42,10 +41,8 @@ #' cds <- list(Scerevisiae = cds_scerevisiae) #' #' kaks <- pairs2kaks(gene_pairs_list, cds) -pairs2kaks <- function( - gene_pairs_list, cds, model = "MYN", - bp_param = BiocParallel::SerialParam() -) { +#' +pairs2kaks <- function(gene_pairs_list, cds, model = "MYN", threads = 1) { kaks_list <- lapply(seq_along(gene_pairs_list), function(x) { @@ -70,32 +67,38 @@ pairs2kaks <- function( fcds <- fcds[-remove] } - # Calculate Ka, Ks, and Ka/Ks for each gene pair - seq_list <- lapply(seq_len(nrow(pairs)), function(y) { - return(fcds[as.character(pairs[y, c(1, 2)])]) - }) - kaks <- BiocParallel::bplapply(seq_list, function(z, model) { - - rates <- MSA2dist::dnastring2kaks( - z, model = model, isMSA = FALSE, verbose = FALSE - ) - rates <- data.frame( - dup1 = rates$seq1, - dup2 = rates$seq2, - Ka = as.numeric(ifelse(rates$Ka == "NA", NA, rates$Ka)), - Ks = as.numeric(ifelse(rates$Ks == "NA", NA, rates$Ks)), - Ka_Ks = as.numeric(ifelse(rates[["Ka/Ks"]] == "NA", NA, rates[["Ka/Ks"]])) + # Create a list of indices - vectors of length 2 + idx_df <- data.frame(gene = names(fcds), idx = seq_along(fcds)) + pairs_idx <- lapply(seq_len(nrow(pairs)), function(p) { + idx <- c( + idx_df$idx[idx_df$gene == pairs[p, 1]], + idx_df$idx[idx_df$gene == pairs[p, 2]] ) - return(rates) - }, BPPARAM = bp_param, model = model) - kaks <- Reduce(rbind, kaks) + return(idx) + }) + # Calculate rates + rates <- MSA2dist::indices2kaks( + cds = fcds, indices = pairs_idx, + model = model, + threads = threads, + isMSA = FALSE, + verbose = FALSE + ) + + rates <- data.frame( + dup1 = rates$seq1, + dup2 = rates$seq2, + Ka = as.numeric(ifelse(rates$Ka == "NA", NA, rates$Ka)), + Ks = as.numeric(ifelse(rates$Ks == "NA", NA, rates$Ks)), + Ka_Ks = as.numeric(ifelse(rates[["Ka/Ks"]] == "NA", NA, rates[["Ka/Ks"]])) + ) if("type" %in% names(pairs)) { - kaks$type <- pairs$type + rates$type <- pairs$type } - return(kaks) + return(rates) }) names(kaks_list) <- names(gene_pairs_list) @@ -103,7 +106,6 @@ pairs2kaks <- function( } - #' Find peaks in a Ks distribution with Gaussian Mixture Models #' #' @param ks A numeric vector of Ks values. diff --git a/man/pairs2kaks.Rd b/man/pairs2kaks.Rd index 0e03b02..d9e66ef 100644 --- a/man/pairs2kaks.Rd +++ b/man/pairs2kaks.Rd @@ -4,12 +4,7 @@ \alias{pairs2kaks} \title{Calculate Ka, Ks, and Ka/Ks from duplicate gene pairs} \usage{ -pairs2kaks( - gene_pairs_list, - cds, - model = "MYN", - bp_param = BiocParallel::SerialParam() -) +pairs2kaks(gene_pairs_list, cds, model = "MYN", threads = 1) } \arguments{ \item{gene_pairs_list}{List of data frames containing duplicated gene pairs @@ -23,8 +18,7 @@ Possible values are "Li", "NG86", "NG", "LWL", "LPB", "MLWL", "MLPB", "GY", "YN", "MYN", "MS", "MA", "GNG", "GLWL", "GLPB", "GMLWL", "GMLPB", "GYN", and "GMYN". Default: "MYN".} -\item{bp_param}{BiocParallel back-end to be used. -Default: \code{BiocParallel::SerialParam()}.} +\item{threads}{Numeric indicating the number of threads to use. Default: 1.} } \value{ A list of data frames containing gene pairs and their Ka, Ks, @@ -55,4 +49,5 @@ gene_pairs_list <- list( cds <- list(Scerevisiae = cds_scerevisiae) kaks <- pairs2kaks(gene_pairs_list, cds) + }