From b67e2e4131158cd9011dccc80611d90250a99ff5 Mon Sep 17 00:00:00 2001
From: almeidasilvaf <fabricio_almeidasilva@hotmail.com>
Date: Thu, 25 Jul 2024 10:45:44 +0200
Subject: [PATCH] Modified pairs2kaks() to use indices2kaks() and make it
 faster

---
 .github/workflows/check-bioc.yml |  8 ++---
 DESCRIPTION                      |  5 ++-
 NAMESPACE                        |  4 +--
 R/ka_ks_analyses.R               | 60 +++++++++++++++++---------------
 man/pairs2kaks.Rd                | 11 ++----
 5 files changed, 41 insertions(+), 47 deletions(-)

diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml
index 0d244fd..e8cf08a 100644
--- a/.github/workflows/check-bioc.yml
+++ b/.github/workflows/check-bioc.yml
@@ -100,16 +100,16 @@ jobs:
         uses: actions/cache@v2
         with:
           path: ${{ env.R_LIBS_USER }}
-          key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.3-${{ hashFiles('.github/depends.Rds') }}
-          restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.3-
+          key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.4-${{ hashFiles('.github/depends.Rds') }}
+          restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.4-
 
       - name: Cache R packages on Linux
         if: "!contains(github.event.head_commit.message, '/nocache') && runner.os == 'Linux' "
         uses: actions/cache@v2
         with:
           path: /home/runner/work/_temp/Library
-          key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.3-${{ hashFiles('.github/depends.Rds') }}
-          restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.3-
+          key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.4-${{ hashFiles('.github/depends.Rds') }}
+          restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.4-
 
       - name: Install Linux system dependencies
         if: runner.os == 'Linux'
diff --git a/DESCRIPTION b/DESCRIPTION
index 50b4df8..f4f8f28 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -47,7 +47,7 @@ biocViews:
 Encoding: UTF-8
 LazyData: false
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
 Imports:
     syntenet,
     GenomicRanges,
@@ -59,8 +59,7 @@ Imports:
     stats,
     utils,
     AnnotationDbi,
-    GenomicFeatures,
-    BiocParallel
+    GenomicFeatures
 Depends: 
     R (>= 4.2.0)
 Suggests: 
diff --git a/NAMESPACE b/NAMESPACE
index f1f8743..12c299b 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -17,12 +17,10 @@ export(plot_ks_peaks)
 export(plot_rates_by_species)
 export(split_pairs_by_peak)
 importFrom(AnnotationDbi,select)
-importFrom(BiocParallel,SerialParam)
-importFrom(BiocParallel,bplapply)
 importFrom(Biostrings,width)
 importFrom(GenomicFeatures,intronsByTranscript)
 importFrom(GenomicRanges,GRangesList)
-importFrom(MSA2dist,dnastring2kaks)
+importFrom(MSA2dist,indices2kaks)
 importFrom(ggplot2,aes)
 importFrom(ggplot2,after_stat)
 importFrom(ggplot2,element_blank)
diff --git a/R/ka_ks_analyses.R b/R/ka_ks_analyses.R
index f6b53eb..0bd76bd 100644
--- a/R/ka_ks_analyses.R
+++ b/R/ka_ks_analyses.R
@@ -10,14 +10,13 @@
 #' Possible values are "Li", "NG86", "NG", "LWL", "LPB", "MLWL", "MLPB", "GY", 
 #' "YN", "MYN", "MS", "MA", "GNG", "GLWL", "GLPB", "GMLWL", "GMLPB", "GYN", 
 #' and "GMYN". Default: "MYN".
-#' @param bp_param BiocParallel back-end to be used. 
-#' Default: `BiocParallel::SerialParam()`.
+#' @param threads Numeric indicating the number of threads to use. Default: 1.
 #' 
 #' @return A list of data frames containing gene pairs and their Ka, Ks,
 #' and Ka/Ks values.
-#' @importFrom MSA2dist dnastring2kaks
+#' 
+#' @importFrom MSA2dist indices2kaks
 #' @importFrom Biostrings width
-#' @importFrom BiocParallel SerialParam bplapply
 #' @export
 #' @rdname pairs2kaks
 #' @examples 
@@ -42,10 +41,8 @@
 #' cds <- list(Scerevisiae = cds_scerevisiae)
 #' 
 #' kaks <- pairs2kaks(gene_pairs_list, cds)
-pairs2kaks <- function(
-        gene_pairs_list, cds, model = "MYN", 
-        bp_param = BiocParallel::SerialParam()
-) {
+#' 
+pairs2kaks <- function(gene_pairs_list, cds, model = "MYN", threads = 1) {
     
     kaks_list <- lapply(seq_along(gene_pairs_list), function(x) {
         
@@ -70,32 +67,38 @@ pairs2kaks <- function(
             fcds <- fcds[-remove]
         }
         
-        # Calculate Ka, Ks, and Ka/Ks for each gene pair
-        seq_list <- lapply(seq_len(nrow(pairs)), function(y) {
-            return(fcds[as.character(pairs[y, c(1, 2)])])
-        })
-        kaks <- BiocParallel::bplapply(seq_list, function(z, model) {
-            
-            rates <- MSA2dist::dnastring2kaks(
-                z, model = model, isMSA = FALSE, verbose = FALSE
-            )
-            rates <- data.frame(
-                dup1 = rates$seq1,
-                dup2 = rates$seq2,
-                Ka = as.numeric(ifelse(rates$Ka == "NA", NA, rates$Ka)),
-                Ks = as.numeric(ifelse(rates$Ks == "NA", NA, rates$Ks)),
-                Ka_Ks = as.numeric(ifelse(rates[["Ka/Ks"]] == "NA", NA, rates[["Ka/Ks"]]))
+        # Create a list of indices - vectors of length 2
+        idx_df <- data.frame(gene = names(fcds), idx = seq_along(fcds))
+        pairs_idx <- lapply(seq_len(nrow(pairs)), function(p) {
+            idx <- c(
+                idx_df$idx[idx_df$gene == pairs[p, 1]],
+                idx_df$idx[idx_df$gene == pairs[p, 2]]
             )
             
-            return(rates)
-        }, BPPARAM = bp_param, model = model)
-        kaks <- Reduce(rbind, kaks)
+            return(idx)
+        })
         
+        # Calculate rates
+        rates <- MSA2dist::indices2kaks(
+            cds = fcds, indices = pairs_idx,
+            model = model, 
+            threads = threads,
+            isMSA = FALSE, 
+            verbose = FALSE
+        )
+        
+        rates <- data.frame(
+            dup1 = rates$seq1,
+            dup2 = rates$seq2,
+            Ka = as.numeric(ifelse(rates$Ka == "NA", NA, rates$Ka)),
+            Ks = as.numeric(ifelse(rates$Ks == "NA", NA, rates$Ks)),
+            Ka_Ks = as.numeric(ifelse(rates[["Ka/Ks"]] == "NA", NA, rates[["Ka/Ks"]]))
+        )
         if("type" %in% names(pairs)) {
-            kaks$type <- pairs$type
+            rates$type <- pairs$type
         }
         
-        return(kaks)
+        return(rates)
     })
     names(kaks_list) <- names(gene_pairs_list)
     
@@ -103,7 +106,6 @@ pairs2kaks <- function(
 }
 
 
-
 #' Find peaks in a Ks distribution with Gaussian Mixture Models
 #'
 #' @param ks A numeric vector of Ks values.
diff --git a/man/pairs2kaks.Rd b/man/pairs2kaks.Rd
index 0e03b02..d9e66ef 100644
--- a/man/pairs2kaks.Rd
+++ b/man/pairs2kaks.Rd
@@ -4,12 +4,7 @@
 \alias{pairs2kaks}
 \title{Calculate Ka, Ks, and Ka/Ks from duplicate gene pairs}
 \usage{
-pairs2kaks(
-  gene_pairs_list,
-  cds,
-  model = "MYN",
-  bp_param = BiocParallel::SerialParam()
-)
+pairs2kaks(gene_pairs_list, cds, model = "MYN", threads = 1)
 }
 \arguments{
 \item{gene_pairs_list}{List of data frames containing duplicated gene pairs
@@ -23,8 +18,7 @@ Possible values are "Li", "NG86", "NG", "LWL", "LPB", "MLWL", "MLPB", "GY",
 "YN", "MYN", "MS", "MA", "GNG", "GLWL", "GLPB", "GMLWL", "GMLPB", "GYN",
 and "GMYN". Default: "MYN".}
 
-\item{bp_param}{BiocParallel back-end to be used.
-Default: \code{BiocParallel::SerialParam()}.}
+\item{threads}{Numeric indicating the number of threads to use. Default: 1.}
 }
 \value{
 A list of data frames containing gene pairs and their Ka, Ks,
@@ -55,4 +49,5 @@ gene_pairs_list <- list(
 cds <- list(Scerevisiae = cds_scerevisiae)
 
 kaks <- pairs2kaks(gene_pairs_list, cds)
+
 }