Bug fix: fixed classify_genes() for empty levels and species ID remov…

…al in get_transposed_classes()
almeidasilvaf · Jan 24, 2024 · 616b925 · 616b925
1 parent 1d12ee7
commit 616b925
Show file tree

Hide file tree

Showing 9 changed files with 85 additions and 65 deletions.
diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml
@@ -101,16 +101,16 @@ jobs:
         uses: actions/cache@v2
         with:
           path: ${{ env.R_LIBS_USER }}
-          key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_15-r-4.2-${{ hashFiles('.github/depends.Rds') }}
-          restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_15-r-4.2-
+          key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_18-r-4.3-${{ hashFiles('.github/depends.Rds') }}
+          restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_18-r-4.3-
 
       - name: Cache R packages on Linux
         if: "!contains(github.event.head_commit.message, '/nocache') && runner.os == 'Linux' "
         uses: actions/cache@v2
         with:
           path: /home/runner/work/_temp/Library
-          key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_15-r-4.2-${{ hashFiles('.github/depends.Rds') }}
-          restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_15-r-4.2-
+          key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_18-r-4.3-${{ hashFiles('.github/depends.Rds') }}
+          restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_18-r-4.3-
 
       - name: Install Linux system dependencies
         if: runner.os == 'Linux'
@@ -294,7 +294,7 @@ jobs:
         if: failure()
         uses: actions/upload-artifact@v2
         with:
-          name: ${{ runner.os }}-biocversion-RELEASE_3_15-r-4.2-results
+          name: ${{ runner.os }}-biocversion-RELEASE_3_18-r-4.3-results
           path: check
 
         ## Note that DOCKER_PASSWORD is really a token for your dockerhub

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -47,7 +47,7 @@ biocViews:
 Encoding: UTF-8
 LazyData: false
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.0
 Imports:
     syntenet,
     GenomicRanges,
@@ -59,7 +59,8 @@ Imports:
     stats,
     utils,
     AnnotationDbi,
-    GenomicFeatures
+    GenomicFeatures,
+    BiocParallel
 Depends: 
     R (>= 4.2.0)
 Suggests: 

diff --git a/NAMESPACE b/NAMESPACE
@@ -13,6 +13,8 @@ export(pairs2kaks)
 export(plot_ks_peaks)
 export(split_pairs_by_peak)
 importFrom(AnnotationDbi,select)
+importFrom(BiocParallel,SerialParam)
+importFrom(BiocParallel,bplapply)
 importFrom(Biostrings,width)
 importFrom(GenomicFeatures,intronsByTranscript)
 importFrom(GenomicRanges,GRangesList)

diff --git a/R/duplicate_classification.R b/R/duplicate_classification.R
@@ -182,23 +182,24 @@ classify_gene_pairs <- function(
 #' class_genes <- classify_genes(gene_pairs_list)
 classify_genes <- function(gene_pairs_list = NULL) {
 
-    # Classify genes into unique modes used factor levels as priority order
     class_genes <- lapply(gene_pairs_list, function(x) {
-
+        
         pairs_by_type <- split(x, x$type)
-        gene_type <- Reduce(rbind, lapply(pairs_by_type, function(x) {
-            genes <- unique(c(x$dup1, x$dup2))
-            genes_df <- data.frame(gene = genes, type = x$type[1])
-            genes_df <- genes_df[!duplicated(genes_df$gene), ]
+        gene_type <- Reduce(rbind, lapply(pairs_by_type, function(y) {
+
+            genes_df <- NULL
+            if(nrow(y) > 0) {
+                genes <- unique(c(y$dup1, y$dup2))
+                genes_df <- data.frame(gene = genes, type = y$type[1])
+                genes_df <- genes_df[!duplicated(genes_df$gene), ]
+            }
+
             return(genes_df)
         }))
-
-        # For genes assigned to multiple classes, keep the first (level order)        
         ref <- levels(x$type)
         gene_type <- gene_type[order(match(gene_type$type, ref)), ]
         gene_type <- gene_type[!duplicated(gene_type$gene), ]
     })
-
     return(class_genes)
 }
 
@@ -208,5 +209,3 @@ classify_genes <- function(gene_pairs_list = NULL) {
 
 
 
-
-
diff --git a/R/ka_ks_analyses.R b/R/ka_ks_analyses.R
@@ -10,13 +10,14 @@
 #' Possible values are "Li", "NG86", "NG", "LWL", "LPB", "MLWL", "MLPB", "GY", 
 #' "YN", "MYN", "MS", "MA", "GNG", "GLWL", "GLPB", "GMLWL", "GMLPB", "GYN", 
 #' and "GMYN". Default: "MYN".
-#' @param threads Numeric scalar indicating the number of threads to use.
-#' Default: 1.
+#' @param BiocParallel back-end to be used. 
+#' Default: `BiocParallel::SerialParam()`.
 #' 
 #' @return A list of data frames containing gene pairs and their Ka, Ks,
 #' and Ka/Ks values.
 #' @importFrom MSA2dist dnastring2kaks
 #' @importFrom Biostrings width
+#' @importFrom BiocParallel SerialParam bplapply
 #' @export
 #' @rdname pairs2kaks
 #' @examples 
@@ -40,51 +41,68 @@
 #' cds <- list(Scerevisiae = cds_scerevisiae)
 #' 
 #' kaks <- pairs2kaks(gene_pairs_list, cds)
-pairs2kaks <- function(gene_pairs_list, cds, model = "MYN", threads = 1) {
+pairs2kaks <- function(
+        gene_pairs_list, cds, model = "MYN", 
+        bp_param = BiocParallel::SerialParam()
+) {
 
     kaks_list <- lapply(seq_along(gene_pairs_list), function(x) {
+
+        # Get pairs for species x
         species <- names(gene_pairs_list)[x]
         pairs <- gene_pairs_list[[x]]
         names(pairs)[c(1, 2)] <- c("dup1", "dup2")
+        pairs$dup1 <- gsub("^[a-zA-Z]{2,5}_", "", pairs$dup1)
+        pairs$dup2 <- gsub("^[a-zA-Z]{2,5}_", "", pairs$dup2)
 
-        # Remove species ID from gene IDs in gene pairs
-        pairs$dup1 <- gsub("[a-zA-Z]{2,5}_", "", pairs$dup1)
-        pairs$dup2 <- gsub("[a-zA-Z]{2,5}_", "", pairs$dup2)
+        # Remove CDS that are not multiple of 3
+        fcds <- cds[[species]]
+        m3 <- Biostrings::width(fcds) %% 3
+        remove <- which(m3 != 0)
+        if(length(remove) != 0) {
+            message(
+                "For species ", species, ", the lengths of ", length(remove), 
+                " CDS are not multiples of 3. Removing them..."
+            )
+            pairs <- pairs[!pairs$dup1 %in% names(fcds)[remove], ]
+            pairs <- pairs[!pairs$dup2 %in% names(fcds)[remove], ]
+            fcds <- fcds[-remove]
+        }
 
-        kaks <- Reduce(rbind, lapply(seq_len(nrow(pairs)), function(y) {
-            genes <- as.character(pairs[y, c(1, 2)])
-            cds_genes <- cds[[species]][genes]
+        # Calculate Ka, Ks, and Ka/Ks for each gene pair
+        seq_list <- lapply(seq_len(nrow(pairs)), function(y) {
+            return(fcds[as.character(pairs[y, c(1, 2)])])
+        })
+        kaks <- BiocParallel::bplapply(seq_list, function(z, model) {
+
+            rates <- MSA2dist::dnastring2kaks(
+                z, model = model, isMSA = FALSE, verbose = FALSE
+            )
+            rates <- data.frame(
+                dup1 = rates$seq1,
+                dup2 = rates$seq2,
+                Ka = as.numeric(ifelse(rates$Ka == "NA", NA, rates$Ka)),
+                Ks = as.numeric(ifelse(rates$Ks == "NA", NA, rates$Ks)),
+                Ka_Ks = as.numeric(ifelse(rates[["Ka/Ks"]] == "NA", NA, rates[["Ka/Ks"]]))
+            )
 
-            multiple_3 <- Biostrings::width(cds_genes) %% 3
-            if(sum(multiple_3) > 0) {
-                vals <- NULL
-                pgenes <- paste0(genes, collapse = ", ")
-                message("CDS length is not a multiple of 3 for pair ", pgenes)
-            } else {
-                vals <- MSA2dist::dnastring2kaks(
-                    cds_genes, model = "MYN", isMSA = FALSE, threads = threads
-                )
-                vals <- as.data.frame(vals)
-                vals <- vals[, c("seq1", "seq2", "Ka", "Ks", "Ka/Ks")]
-                names(vals) <- c("dup1", "dup2", "Ka", "Ks", "Ka_Ks")
-                rownames(vals) <- NULL
-                if("type" %in% names(pairs)) { vals$type <- pairs[y, "type"] }
-            }
-            return(vals)
-        }))
-        kaks$Ka <- gsub("NA", NA, kaks$Ka)
-        kaks$Ka <- as.numeric(kaks$Ka)
-        kaks$Ks <- gsub("NA", NA, kaks$Ks)
-        kaks$Ks <- as.numeric(kaks$Ks)
-        kaks$Ka_Ks <- gsub("NA", NA, kaks$Ka_Ks)
-        kaks$Ka_Ks <- as.numeric(kaks$Ka_Ks)
+            return(rates)
+        }, BPPARAM = bp_param, model = model)
+        kaks <- Reduce(rbind, kaks)
+
+        if("type" %in% names(pairs)) {
+            kaks$type <- pairs$type
+        }
+
         return(kaks)
     })
     names(kaks_list) <- names(gene_pairs_list)
+
     return(kaks_list)
 }
 
 
+
 #' Find peaks in a Ks distribution with Gaussian Mixture Models
 #'
 #' @param ks A numeric vector of Ks values.

diff --git a/R/utils_duplicate_classification.R b/R/utils_duplicate_classification.R
@@ -340,8 +340,8 @@ get_transposed_classes <- function(pairs, intron_counts) {
     if(nrow(tpairs) > 0) {
 
         id <- unique(gsub("_.*", "", tpairs$dup1))
-        tpairs$dup1 <- gsub("[a-zA-Z]{2,5}_", "", tpairs$dup1)
-        tpairs$dup2 <- gsub("[a-zA-Z]{2,5}_", "", tpairs$dup2)
+        tpairs$dup1 <- gsub("^[a-zA-Z]{2,5}_", "", tpairs$dup1)
+        tpairs$dup2 <- gsub("^[a-zA-Z]{2,5}_", "", tpairs$dup2)
 
         # Combine `tpairs` and `intron_counts`
         pairs_ic <- merge(
@@ -373,4 +373,3 @@ get_transposed_classes <- function(pairs, intron_counts) {
     return(final_pairs)
 }
 
-
diff --git a/inst/_pkgdown.yml b/inst/_pkgdown.yml
@@ -1,20 +1,16 @@
 template:
   bootstrap: 5
   bslib:
+    preset: "bootstrap"
     font_scale: 1.0
     base_font:
       google: "Atkinson Hyperlegible"           # font for better accessibility
     code_font:
       google: "Source Code Pro"
     primary: "#18416a"                          # navbar hover and sidebar: dark blue
-    navbar-light-bg: "#0092ac"                  # navbar bg: Bioc blue
-    navbar-bg: "#0092ac"                        # navbar bg: Bioc blue
-    navbar-light-brand-color: "#f9fafb"         # pkg name: white
-    text-muted: "#e9ecef"                       # version: light gray
-    navbar-light-color: "#f8f9fa"               # navbar pages text color: white
-    navbar-light-hover-color: "#87b13f"         # navbar text on hover: Bioc green
+    navbar-light-brand-color: "#18416a"         # pkg name: white
+    navbar-light-hover-color: "#18416a"         # navbar text on hover: Bioc green
     pkgdown-nav-height: 78px
 
 navbar:
-  bg: "#0092ac"
-  type: light
+  type: light
diff --git a/man/pairs2kaks.Rd b/man/pairs2kaks.Rd
diff --git a/vignettes/doubletrouble_vignette.Rmd b/vignettes/doubletrouble_vignette.Rmd
@@ -173,7 +173,7 @@ they identify and their required input, are summarized in the table below:
 | extended | SD, TD, PD, TRD, DD         | `blast_list`, `annotation`, `blast_inter`          |
 | full     | SD, TD, PD, rTRD, dTRD, DD  | `blast_list`, `annotation`, `blast_inter`, `intron_counts` |
 
-Table: SD, segmental duplication. SSD, small-scale duplication.
+**Legend:** SD, segmental duplication. SSD, small-scale duplication.
 TD, tandem duplication. PD, proximal duplication. 
 TRD, transposon-derived duplication. rTRD, retrotransposon-derived duplication.
 dTRD, DNA transposon-derived duplication. DD, dispersed duplication.