Skip to content

Commit

Permalink
Bug fix: fixed classify_genes() for empty levels and species ID remov…
Browse files Browse the repository at this point in the history
…al in get_transposed_classes()
  • Loading branch information
almeidasilvaf committed Jan 24, 2024
1 parent 1d12ee7 commit 616b925
Show file tree
Hide file tree
Showing 9 changed files with 85 additions and 65 deletions.
10 changes: 5 additions & 5 deletions .github/workflows/check-bioc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -101,16 +101,16 @@ jobs:
uses: actions/cache@v2
with:
path: ${{ env.R_LIBS_USER }}
key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_15-r-4.2-${{ hashFiles('.github/depends.Rds') }}
restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_15-r-4.2-
key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_18-r-4.3-${{ hashFiles('.github/depends.Rds') }}
restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_18-r-4.3-

- name: Cache R packages on Linux
if: "!contains(github.event.head_commit.message, '/nocache') && runner.os == 'Linux' "
uses: actions/cache@v2
with:
path: /home/runner/work/_temp/Library
key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_15-r-4.2-${{ hashFiles('.github/depends.Rds') }}
restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_15-r-4.2-
key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_18-r-4.3-${{ hashFiles('.github/depends.Rds') }}
restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE_3_18-r-4.3-

- name: Install Linux system dependencies
if: runner.os == 'Linux'
Expand Down Expand Up @@ -294,7 +294,7 @@ jobs:
if: failure()
uses: actions/upload-artifact@v2
with:
name: ${{ runner.os }}-biocversion-RELEASE_3_15-r-4.2-results
name: ${{ runner.os }}-biocversion-RELEASE_3_18-r-4.3-results
path: check

## Note that DOCKER_PASSWORD is really a token for your dockerhub
Expand Down
5 changes: 3 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ biocViews:
Encoding: UTF-8
LazyData: false
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
RoxygenNote: 7.3.0
Imports:
syntenet,
GenomicRanges,
Expand All @@ -59,7 +59,8 @@ Imports:
stats,
utils,
AnnotationDbi,
GenomicFeatures
GenomicFeatures,
BiocParallel
Depends:
R (>= 4.2.0)
Suggests:
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ export(pairs2kaks)
export(plot_ks_peaks)
export(split_pairs_by_peak)
importFrom(AnnotationDbi,select)
importFrom(BiocParallel,SerialParam)
importFrom(BiocParallel,bplapply)
importFrom(Biostrings,width)
importFrom(GenomicFeatures,intronsByTranscript)
importFrom(GenomicRanges,GRangesList)
Expand Down
21 changes: 10 additions & 11 deletions R/duplicate_classification.R
Original file line number Diff line number Diff line change
Expand Up @@ -182,23 +182,24 @@ classify_gene_pairs <- function(
#' class_genes <- classify_genes(gene_pairs_list)
classify_genes <- function(gene_pairs_list = NULL) {

# Classify genes into unique modes used factor levels as priority order
class_genes <- lapply(gene_pairs_list, function(x) {

pairs_by_type <- split(x, x$type)
gene_type <- Reduce(rbind, lapply(pairs_by_type, function(x) {
genes <- unique(c(x$dup1, x$dup2))
genes_df <- data.frame(gene = genes, type = x$type[1])
genes_df <- genes_df[!duplicated(genes_df$gene), ]
gene_type <- Reduce(rbind, lapply(pairs_by_type, function(y) {

genes_df <- NULL
if(nrow(y) > 0) {
genes <- unique(c(y$dup1, y$dup2))
genes_df <- data.frame(gene = genes, type = y$type[1])
genes_df <- genes_df[!duplicated(genes_df$gene), ]
}

return(genes_df)
}))

# For genes assigned to multiple classes, keep the first (level order)
ref <- levels(x$type)
gene_type <- gene_type[order(match(gene_type$type, ref)), ]
gene_type <- gene_type[!duplicated(gene_type$gene), ]
})

return(class_genes)
}

Expand All @@ -208,5 +209,3 @@ classify_genes <- function(gene_pairs_list = NULL) {





82 changes: 50 additions & 32 deletions R/ka_ks_analyses.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,14 @@
#' Possible values are "Li", "NG86", "NG", "LWL", "LPB", "MLWL", "MLPB", "GY",
#' "YN", "MYN", "MS", "MA", "GNG", "GLWL", "GLPB", "GMLWL", "GMLPB", "GYN",
#' and "GMYN". Default: "MYN".
#' @param threads Numeric scalar indicating the number of threads to use.
#' Default: 1.
#' @param BiocParallel back-end to be used.
#' Default: `BiocParallel::SerialParam()`.
#'
#' @return A list of data frames containing gene pairs and their Ka, Ks,
#' and Ka/Ks values.
#' @importFrom MSA2dist dnastring2kaks
#' @importFrom Biostrings width
#' @importFrom BiocParallel SerialParam bplapply
#' @export
#' @rdname pairs2kaks
#' @examples
Expand All @@ -40,51 +41,68 @@
#' cds <- list(Scerevisiae = cds_scerevisiae)
#'
#' kaks <- pairs2kaks(gene_pairs_list, cds)
pairs2kaks <- function(gene_pairs_list, cds, model = "MYN", threads = 1) {
pairs2kaks <- function(
gene_pairs_list, cds, model = "MYN",
bp_param = BiocParallel::SerialParam()
) {

kaks_list <- lapply(seq_along(gene_pairs_list), function(x) {

# Get pairs for species x
species <- names(gene_pairs_list)[x]
pairs <- gene_pairs_list[[x]]
names(pairs)[c(1, 2)] <- c("dup1", "dup2")
pairs$dup1 <- gsub("^[a-zA-Z]{2,5}_", "", pairs$dup1)
pairs$dup2 <- gsub("^[a-zA-Z]{2,5}_", "", pairs$dup2)

# Remove species ID from gene IDs in gene pairs
pairs$dup1 <- gsub("[a-zA-Z]{2,5}_", "", pairs$dup1)
pairs$dup2 <- gsub("[a-zA-Z]{2,5}_", "", pairs$dup2)
# Remove CDS that are not multiple of 3
fcds <- cds[[species]]
m3 <- Biostrings::width(fcds) %% 3
remove <- which(m3 != 0)
if(length(remove) != 0) {
message(
"For species ", species, ", the lengths of ", length(remove),
" CDS are not multiples of 3. Removing them..."
)
pairs <- pairs[!pairs$dup1 %in% names(fcds)[remove], ]
pairs <- pairs[!pairs$dup2 %in% names(fcds)[remove], ]
fcds <- fcds[-remove]
}

kaks <- Reduce(rbind, lapply(seq_len(nrow(pairs)), function(y) {
genes <- as.character(pairs[y, c(1, 2)])
cds_genes <- cds[[species]][genes]
# Calculate Ka, Ks, and Ka/Ks for each gene pair
seq_list <- lapply(seq_len(nrow(pairs)), function(y) {
return(fcds[as.character(pairs[y, c(1, 2)])])
})
kaks <- BiocParallel::bplapply(seq_list, function(z, model) {

rates <- MSA2dist::dnastring2kaks(
z, model = model, isMSA = FALSE, verbose = FALSE
)
rates <- data.frame(
dup1 = rates$seq1,
dup2 = rates$seq2,
Ka = as.numeric(ifelse(rates$Ka == "NA", NA, rates$Ka)),
Ks = as.numeric(ifelse(rates$Ks == "NA", NA, rates$Ks)),
Ka_Ks = as.numeric(ifelse(rates[["Ka/Ks"]] == "NA", NA, rates[["Ka/Ks"]]))
)

multiple_3 <- Biostrings::width(cds_genes) %% 3
if(sum(multiple_3) > 0) {
vals <- NULL
pgenes <- paste0(genes, collapse = ", ")
message("CDS length is not a multiple of 3 for pair ", pgenes)
} else {
vals <- MSA2dist::dnastring2kaks(
cds_genes, model = "MYN", isMSA = FALSE, threads = threads
)
vals <- as.data.frame(vals)
vals <- vals[, c("seq1", "seq2", "Ka", "Ks", "Ka/Ks")]
names(vals) <- c("dup1", "dup2", "Ka", "Ks", "Ka_Ks")
rownames(vals) <- NULL
if("type" %in% names(pairs)) { vals$type <- pairs[y, "type"] }
}
return(vals)
}))
kaks$Ka <- gsub("NA", NA, kaks$Ka)
kaks$Ka <- as.numeric(kaks$Ka)
kaks$Ks <- gsub("NA", NA, kaks$Ks)
kaks$Ks <- as.numeric(kaks$Ks)
kaks$Ka_Ks <- gsub("NA", NA, kaks$Ka_Ks)
kaks$Ka_Ks <- as.numeric(kaks$Ka_Ks)
return(rates)
}, BPPARAM = bp_param, model = model)
kaks <- Reduce(rbind, kaks)

if("type" %in% names(pairs)) {
kaks$type <- pairs$type
}

return(kaks)
})
names(kaks_list) <- names(gene_pairs_list)

return(kaks_list)
}



#' Find peaks in a Ks distribution with Gaussian Mixture Models
#'
#' @param ks A numeric vector of Ks values.
Expand Down
5 changes: 2 additions & 3 deletions R/utils_duplicate_classification.R
Original file line number Diff line number Diff line change
Expand Up @@ -340,8 +340,8 @@ get_transposed_classes <- function(pairs, intron_counts) {
if(nrow(tpairs) > 0) {

id <- unique(gsub("_.*", "", tpairs$dup1))
tpairs$dup1 <- gsub("[a-zA-Z]{2,5}_", "", tpairs$dup1)
tpairs$dup2 <- gsub("[a-zA-Z]{2,5}_", "", tpairs$dup2)
tpairs$dup1 <- gsub("^[a-zA-Z]{2,5}_", "", tpairs$dup1)
tpairs$dup2 <- gsub("^[a-zA-Z]{2,5}_", "", tpairs$dup2)

# Combine `tpairs` and `intron_counts`
pairs_ic <- merge(
Expand Down Expand Up @@ -373,4 +373,3 @@ get_transposed_classes <- function(pairs, intron_counts) {
return(final_pairs)
}


12 changes: 4 additions & 8 deletions inst/_pkgdown.yml
Original file line number Diff line number Diff line change
@@ -1,20 +1,16 @@
template:
bootstrap: 5
bslib:
preset: "bootstrap"
font_scale: 1.0
base_font:
google: "Atkinson Hyperlegible" # font for better accessibility
code_font:
google: "Source Code Pro"
primary: "#18416a" # navbar hover and sidebar: dark blue
navbar-light-bg: "#0092ac" # navbar bg: Bioc blue
navbar-bg: "#0092ac" # navbar bg: Bioc blue
navbar-light-brand-color: "#f9fafb" # pkg name: white
text-muted: "#e9ecef" # version: light gray
navbar-light-color: "#f8f9fa" # navbar pages text color: white
navbar-light-hover-color: "#87b13f" # navbar text on hover: Bioc green
navbar-light-brand-color: "#18416a" # pkg name: white
navbar-light-hover-color: "#18416a" # navbar text on hover: Bioc green
pkgdown-nav-height: 78px

navbar:
bg: "#0092ac"
type: light
type: light
11 changes: 8 additions & 3 deletions man/pairs2kaks.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion vignettes/doubletrouble_vignette.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ they identify and their required input, are summarized in the table below:
| extended | SD, TD, PD, TRD, DD | `blast_list`, `annotation`, `blast_inter` |
| full | SD, TD, PD, rTRD, dTRD, DD | `blast_list`, `annotation`, `blast_inter`, `intron_counts` |

Table: SD, segmental duplication. SSD, small-scale duplication.
**Legend:** SD, segmental duplication. SSD, small-scale duplication.
TD, tandem duplication. PD, proximal duplication.
TRD, transposon-derived duplication. rTRD, retrotransposon-derived duplication.
dTRD, DNA transposon-derived duplication. DD, dispersed duplication.
Expand Down

0 comments on commit 616b925

Please sign in to comment.