New feature: TRD duplicates can now be classified into rTRD and dTRD

almeidasilvaf · Nov 8, 2023 · 837030d · 837030d
1 parent dc00d6b
commit 837030d
Show file tree

Hide file tree

Showing 28 changed files with 1,332 additions and 821 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -17,14 +17,16 @@ Authors@R:
     )
 Description: doubletrouble aims to identify duplicated genes from
     whole-genome protein sequences and classify them based on their modes
-    of duplication. The duplication modes are: 
-    i. whole-genome duplication (WGD);
+    of duplication. The duplication modes are i. segmental duplication (SD);
     ii. tandem duplication (TD);
     iii. proximal duplication (PD);
     iv. transposed duplication (TRD) and;
     v. dispersed duplication (DD).
+    Transposon-derived duplicates (TRD) can be further subdivided into
+    rTRD (retrotransposon-derived duplication) and 
+    dTRD (DNA transposon-derived duplication).
     If users want a simpler classification scheme, duplicates can also be
-    classified into WGD- and SSD-derived (small-scale duplication) gene pairs.
+    classified into SD- and SSD-derived (small-scale duplication) gene pairs.
     Besides classifying gene pairs, users can also classify genes, so that
     each gene is assigned a unique mode of duplication. 
     Users can also calculate substitution rates per substitution site (i.e., Ka
@@ -40,7 +42,8 @@ biocViews:
     ComparativeGenomics,
     FunctionalGenomics,
     Phylogenetics,
-    Network
+    Network,
+    Classification
 Encoding: UTF-8
 LazyData: false
 Roxygen: list(markdown = TRUE)
@@ -52,8 +55,11 @@ Imports:
     mclust,
     MSA2dist (>= 1.1.5),
     ggplot2,
+    rlang,
     stats,
-    utils
+    utils,
+    AnnotationDbi,
+    GenomicFeatures
 Depends: 
     R (>= 4.2.0)
 Suggests: 

diff --git a/NAMESPACE b/NAMESPACE
@@ -2,18 +2,22 @@
 
 export(classify_gene_pairs)
 export(classify_genes)
-export(classify_ssd_pairs)
 export(find_ks_peaks)
 export(get_anchors_list)
+export(get_intron_counts)
+export(get_segmental)
+export(get_tandem_proximal)
 export(get_transposed)
-export(get_wgd_pairs)
+export(get_transposed_classes)
 export(pairs2kaks)
 export(plot_ks_peaks)
 export(split_pairs_by_peak)
+importFrom(AnnotationDbi,select)
 importFrom(Biostrings,width)
+importFrom(GenomicFeatures,intronsByTranscript)
 importFrom(GenomicRanges,GRangesList)
 importFrom(MSA2dist,dnastring2kaks)
-importFrom(ggplot2,aes_)
+importFrom(ggplot2,aes)
 importFrom(ggplot2,geom_histogram)
 importFrom(ggplot2,geom_vline)
 importFrom(ggplot2,ggplot)
@@ -22,6 +26,7 @@ importFrom(ggplot2,labs)
 importFrom(ggplot2,stat_function)
 importFrom(ggplot2,theme_bw)
 importFrom(mclust,densityMclust)
+importFrom(rlang,.data)
 importFrom(stats,dnorm)
 importFrom(syntenet,interspecies_synteny)
 importFrom(syntenet,intraspecies_synteny)

diff --git a/R/data.R b/R/data.R
@@ -15,8 +15,8 @@
 
 #' Genome annotation of the yeast species S. cerevisiae and C. glabrata
 #'
-#' Data obtained from Ensembl Fungi. Only annotation data for primary
-#' transcripts were included.
+#' Data obtained from Ensembl Fungi. Only annotation data protein-coding
+#' genes (with associated mRNA, exons, CDS, etc) are included.
 #' 
 #' @name yeast_annot
 #' @format A CompressedGRangesList containing 
@@ -105,3 +105,4 @@
 #' @usage data(gmax_ks)
 "gmax_ks"
 
+
diff --git a/R/duplicate_classification.R b/R/duplicate_classification.R
@@ -8,8 +8,21 @@
 #' for intraspecies comparisons.
 #' Each list element corresponds to the BLAST output for a given species,
 #' and names of list elements must match the names of list elements in
-#' `annotation`. BLASTp, DIAMOND or simular programs must be run on processed
-#' sequence data as returned by \code{process_input()}.
+#' \strong{annotation}. BLASTp, DIAMOND or simular programs must be run 
+#' on processed sequence data as returned by \code{process_input()}.
+#' @param scheme Character indicating which classification scheme to use.
+#' One of "binary", "standard", "extended", or "full". See details below
+#' for information on what each scheme means. Default: "standard".
+#' @param blast_inter (Only valid if \code{scheme == "extended" or "full"}).
+#' A list of data frames containing BLAST tabular output 
+#' for the comparison between target species and outgroups. 
+#' Names of list elements must match the names of 
+#' list elements in `annotation`. BLASTp, DIAMOND or simular programs must 
+#' be run on processed sequence data as returned by \code{process_input()}.
+#' @param intron_counts (Only valid if \code{scheme == "full"}). 
+#' A list of 2-column data frames with the number of
+#' introns per gene as returned by \code{get_intron_counts()}. Names
+#' of list elements must match names of \strong{annotation}.
 #' @param evalue Numeric scalar indicating the E-value threshold. 
 #' Default: 1e-10.
 #' @param anchors Numeric indicating the minimum required number of genes
@@ -18,102 +31,124 @@
 #' @param max_gaps Numeric indicating the number of upstream and downstream
 #' genes to search for anchors, as in \code{syntenet::infer_syntenet}. 
 #' Default: 25.
-#' @param binary Logical indicating whether to perform a binary classification
-#' (i.e., whole-genome and small-scale duplication) or not. If FALSE,
-#' small-scale duplications are subdivided into tandem, proximal, and dispersed
-#' duplications. Default: FALSE.
 #' @param proximal_max Numeric scalar with the maximum distance (in number
 #' of genes) between two genes to consider them as proximal duplicates.
 #' Default: 10.
-#' @param blast_inter A list of data frames containing BLAST tabular output 
-#' for the comparison between target species and outgroups. 
-#' Names of list elements must match the names of 
-#' list elements in `annotation`. BLASTp, DIAMOND or simular programs must 
-#' be run on processed sequence data as returned by \code{process_input()}.
+#' @param collinearity_dir Character indicating the path to the directory
+#' where .collinearity files will be stored. If NULL, files will
+#' be stored in a subdirectory of \code{tempdir()}. Default: NULL.
 #'  
 #' @return A list of 3-column data frames of duplicated gene pairs 
 #' (columns 1 and 2), and their modes of duplication (column 3).
+#' 
+#' @details
+#' The classification schemes increase in complexity (number of classes)
+#' in the order 'binary', 'standard', 'extended', and 'full'.
+#' 
+#' For classification scheme "binary", duplicates are classified into
+#' one of 'SD' (segmental duplications) or 'SSD' (small-scale duplications).
+#' 
+#' For classification scheme "standard" (default), duplicates are
+#' classified into 'SD' (segmental duplication), 'TD' (tandem duplication),
+#' 'PD' (proximal duplication), and 'DD' (dispersed duplication).
+#' 
+#' For classification scheme "extended", duplicates are classified into
+#' 'SD' (segmental duplication), 'TD' (tandem duplication), 
+#' 'PD' (proximal duplication), 'TRD' (transposon-derived duplication), 
+#' and 'DD' (dispersed duplication).
+#' 
+#' Finally, for classification scheme "full", duplicates are classified into
+#' 'SD' (segmental duplication), 'TD' (tandem duplication), 
+#' 'PD' (proximal duplication), 'rTRD' (retrotransposon-derived duplication), 
+#' 'dTRD' (DNA transposon-derived duplication), and 
+#' 'DD' (dispersed duplication).
+#' 
 #' @export
-#' @importFrom GenomicRanges GRangesList
 #' @rdname classify_gene_pairs
 #' @examples 
+#' # Load example data
 #' data(diamond_intra)
 #' data(diamond_inter)
 #' data(yeast_annot)
 #' data(yeast_seq)
-#' blast_list <- diamond_intra
-#' blast_inter <- diamond_inter
 #' 
-#' pdata <- syntenet::process_input(yeast_seq, yeast_annot)
-#' annot <- pdata$annotation["Scerevisiae"]
+#' # Get processed annotation data
+#' annotation <- syntenet::process_input(yeast_seq, yeast_annot)$annotation
 #' 
-#' # Binary classification scheme
-#' dup_binary <- classify_gene_pairs(blast_list, annot, binary = TRUE)
-#' table(dup_binary$Scerevisiae$type)
+#' # Get list of intron counts
+#' txdb_list <- lapply(yeast_annot, GenomicFeatures::makeTxDbFromGRanges)
+#' intron_counts <- lapply(txdb_list, get_intron_counts)
 #' 
-#' # Expanded classification scheme
-#' dup_exp <- classify_gene_pairs(blast_list, annot)
-#' table(dup_exp$Scerevisiae$type)
-#' 
-#' # Full classification scheme
-#' annotation <- pdata$annotation
-#' dup_full <- classify_gene_pairs(
-#'     blast_list, annotation, blast_inter = blast_inter
+#' # Classify duplicates - full scheme
+#' dup_class <- classify_gene_pairs(
+#'     annotation = annotation, 
+#'     blast_list = diamond_intra, 
+#'     scheme = "full",
+#'     blast_inter = diamond_inter, 
+#'     intron_counts = intron_counts
 #' )
-#' table(dup_full$Scerevisiae$type)
-classify_gene_pairs <- function(blast_list = NULL, annotation = NULL,
-                                evalue = 1e-10, anchors = 5, max_gaps = 25,
-                                binary = FALSE, proximal_max = 10,
-                                blast_inter = NULL) {
+#' 
+#' # Check number of gene pairs per class
+#' table(dup_class$Scerevisiae$type)
+#' 
+classify_gene_pairs <- function(
+        annotation = NULL, blast_list = NULL, scheme = "standard",
+        blast_inter = NULL, intron_counts,
+        evalue = 1e-10, anchors = 5, max_gaps = 25, proximal_max = 10,
+        collinearity_dir = NULL
+) {
 
     anchorp <- get_anchors_list(
-        blast_list, annotation, evalue, anchors, max_gaps
+        blast_list, annotation, evalue, anchors, max_gaps, collinearity_dir
     )
-    if(binary & !is.null(blast_inter)) {
-        message("Parameter 'binary' was set to TRUE. Ignoring 'blast_inter'...")
-    }
 
     # Get duplicate pairs and filter duplicate entries
     pairs <- lapply(blast_list, function(x) {
         fpair <- x[x$evalue <= evalue, c(1, 2)]
         fpair <- fpair[fpair[, 1] != fpair[, 2], ]
-        fpair <- fpair[!duplicated(t(apply(fpair, 1, sort))),]
+        fpair <- fpair[!duplicated(t(apply(fpair, 1, sort))), ]
         names(fpair) <- c("dup1", "dup2")
         return(fpair)
     })
-
-    dups <- lapply(seq_along(anchorp), function(x) {
-        # Find WGD-derived gene pairs
+
+
+    dup_list <- lapply(seq_along(anchorp), function(x) {
+        # 1) Get segmental duplicates
         sp <- names(anchorp)[x]
         p <- pairs[[grep(paste0(sp, "$"), names(pairs))]]
-        dups <- get_wgd_pairs(anchorp[[x]], p)
-
-        if(!binary) {
-            ssd <- dups[dups$type == "SSD", ]
-            wgd <- dups[dups$type == "WGD", ]
-
-            # Classify SSD-derived gene pairs
-            annot <- annotation[[sp]]
+
+        dups <- get_segmental(anchorp[[x]], p)
+        if(scheme == "binary") {
+            dups$type <- gsub("DD", "SSD", dups$type)
+            dups$type <- factor(dups$type, levels = c("SD", "SSD"))
+        } else {
+            # 2) Get tandem and proximal duplicates
+            dups <- get_tandem_proximal(
+                dups, annotation_granges = annotation[[sp]], 
+                proximal_max = proximal_max
+            )
 
-            if(is.null(blast_inter)) { # expanded scheme
-                ssd_classes <- classify_ssd_pairs(
-                    ssd, annot, proximal_max = proximal_max
-                )
-            } else { # full scheme
-                pattern <- paste0(sp, "_")
-                binter <- blast_inter[startsWith(names(blast_inter), pattern)]
-                ssd_classes <- classify_ssd_pairs(
-                    ssd, annot, annotation, proximal_max = proximal_max,
-                    blast_inter = binter
+            if(scheme %in% c("extended", "full")) {
+                # 3) Get transposed duplicates
+                binter <- blast_inter[startsWith(names(blast_inter), paste0(sp, "_"))]
+                dups <- get_transposed(
+                    dups, binter, annotation, evalue = evalue,
+                    anchors = anchors, max_gaps = max_gaps,
+                    collinearity_dir = collinearity_dir
                 )
+
+                if(scheme == "full") {
+                    # 4) Get TRD classes (rTRD and dTRD)
+                    dups <- get_transposed_classes(dups, intron_counts[[sp]])
+                }
             }
-            dups <- rbind(wgd, ssd_classes)
         }
-        rownames(dups) <- NULL
+
         return(dups)
     })
-    names(dups) <- names(anchorp)
-    return(dups)
+    names(dup_list) <- names(anchorp)
+
+    return(dup_list)
 }
 
 
@@ -124,21 +159,30 @@ classify_gene_pairs <- function(blast_list = NULL, annotation = NULL,
 #' 
 #' @return A list of 2-column data frames with variables \strong{gene} 
 #' and \strong{type} representing gene ID and duplication type, respectively.
+#' 
+#' @details
+#' If a gene is present in pairs with different duplication modes, the gene
+#' is classified into a unique mode of duplication following the order
+#' of priority indicated in the levels of the factor \strong{type}.
+#' 
+#' For scheme "binary", the order is SD > SSD.
+#' For scheme "standard", the order is SD > TD > PD > DD.
+#' For scheme "extended", the order is SD > TD > PD > TRD > DD.
+#' For scheme "full", the order is SD > TD > PD > rTRD > dTRD > DD.
 #'
 #' @rdname classify_genes
 #' @export
+#' @importFrom GenomicRanges GRangesList
 #' @examples
-#' data(diamond_intra)
-#' data(yeast_annot)
-#' data(yeast_seq)
+#' data(scerevisiae_kaks)
+#' 
+#' cols <- c("dup1", "dup2", "type")
+#' gene_pairs_list <- list(Scerevisiae = scerevisiae_kaks[, cols])
 #' 
-#' pdata <- syntenet::process_input(yeast_seq, yeast_annot)
-#' annot <- pdata$annotation["Scerevisiae"]
-#' duplicates <- classify_gene_pairs(diamond_intra, annot)
-#' class_genes <- classify_genes(duplicates)
+#' class_genes <- classify_genes(gene_pairs_list)
 classify_genes <- function(gene_pairs_list = NULL) {
 
-    # Classify genes following the order of priority: WGD > TD > PD > TRD > DD
+    # Classify genes into unique modes used factor levels as priority order
     class_genes <- lapply(gene_pairs_list, function(x) {
 
         pairs_by_type <- split(x, x$type)
@@ -148,17 +192,13 @@ classify_genes <- function(gene_pairs_list = NULL) {
             genes_df <- genes_df[!duplicated(genes_df$gene), ]
             return(genes_df)
         }))
-
-        # Reorder 'type' variable based on
-        ref <- c("WGD", "TD", "PD", "DD")
-        if(length(unique(gene_type$type)) == 2) {
-            ref <- c("WGD", "SSD")
-        } else if(length(unique(gene_type$type)) == 5) {
-            ref <- c("WGD", "TD", "PD", "TRD", "DD")
-        }
+
+        # For genes assigned to multiple classes, keep the first (level order)        
+        ref <- levels(x$type)
         gene_type <- gene_type[order(match(gene_type$type, ref)), ]
         gene_type <- gene_type[!duplicated(gene_type$gene), ]
     })
+
     return(class_genes)
 }