From 217ad60b2de0c240a9cebcfad6f3dab99e17c272 Mon Sep 17 00:00:00 2001 From: Aidan Lakshman Date: Thu, 17 Oct 2024 09:01:36 -0400 Subject: [PATCH 1/5] Create test-coverage.yaml --- .github/workflows/test-coverage.yaml | 1 + 1 file changed, 1 insertion(+) create mode 100644 .github/workflows/test-coverage.yaml diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/.github/workflows/test-coverage.yaml @@ -0,0 +1 @@ + From 765a40baff7adb7ca870b5f2a0403b4487e1b831 Mon Sep 17 00:00:00 2001 From: Aidan Lakshman Date: Thu, 17 Oct 2024 09:47:50 -0400 Subject: [PATCH 2/5] feat: Unit Testing and Automation (#114) Major changes: - new workflow file to run tests - numerous unit tests Minor changes: - warning in translate.R:16 has been elevated to an error because AA_ALPHABET is now enforced. error messages for letter.R are standardized, some additional error checking is added. - mask was scheduled to be deprecated in Biostrings 2.9. This function now throws a warning when it is used and will be removed in a subsequent release. man/XStringSetList-class.Rd previously said that "DNAStringSetList and AAStringSetList are the only constructors", but the current version of Biostrings includes BStringSetList and RNAStringSetList constructors. This has been updated. - Some tests were included inline in replaceAt.R wrapped in if(FALSE){...} brackets. These have been moved to the unit tests, and performance/timing tests have been removed. - replaceAmbiguities now checks to ensure input is one of DNAString, RNAString, or the XStringSet/XStringSetList analogs. This previously caused some odd behavior with input of type AAString. - an erroneous extra (useless) argument to make_AA_COLORED_LETTERS has been removed. - a warning in XStringViews-class.R that was supposed to be suppressed in Bioc 2.12 has been suppressed. - tests in inst/unitTests have been ported to testthat when possible. Accordingly, run_unitTests.R has been removed. - Some files have had trailing whitespace removed. --- .github/workflows/test-coverage.yaml | 226 ++++++++++++++ .gitignore | 13 + DESCRIPTION | 2 +- R/XStringViews-class.R | 16 +- R/chartr.R | 3 + R/coloring.R | 2 +- R/dinucleotideFrequencyTest.R | 6 +- R/letter.R | 17 +- R/maskMotif.R | 2 + R/matchPDict.R | 2 +- R/replaceAt.R | 169 ----------- R/translate.R | 3 +- man/XStringSetList-class.Rd | 9 +- man/chartr.Rd | 3 +- tests/run_unitTests.R | 2 - tests/testthat.R | 4 + tests/testthat/test-MaskedXString.R | 115 ++++++++ tests/testthat/test-OtherMasking.R | 58 ++++ tests/testthat/test-PDict.R | 205 +++++++++++++ tests/testthat/test-QualityScaledXStringSet.R | 94 ++++++ tests/testthat/test-XString-class.R | 244 ++++++++++++++++ tests/testthat/test-XStringQuality.R | 77 +++++ tests/testthat/test-XStringSet-class.R | 275 ++++++++++++++++++ tests/testthat/test-XStringSet-io.R | 172 +++++++++++ tests/testthat/test-XStringSetList-class.R | 198 +++++++++++++ tests/testthat/test-XStringSetList.R | 36 +++ tests/testthat/test-XStringViews.R | 49 ++++ tests/testthat/test-chartr.R | 42 +++ .../testthat/test-dinucleotideFrequencyTest.R | 32 ++ tests/testthat/test-findPalindromes.R | 175 +++++++++++ tests/testthat/test-letter.R | 46 +++ tests/testthat/test-letterFrequency.R | 265 +++++++++++++++++ tests/testthat/test-matchLRPatterns.R | 47 +++ tests/testthat/test-matchPDict.R | 224 ++++++++++++++ tests/testthat/test-matchPWM.R | 24 ++ tests/testthat/test-matchPattern.R | 120 ++++++++ tests/testthat/test-miscXStringMethods.R | 111 +++++++ tests/testthat/test-miscellaneous.R | 46 +++ tests/testthat/test-replaceAt.R | 192 ++++++++++++ tests/testthat/test-translate.R | 125 ++++++++ tests/testthat/test-trimLRPatterns.R | 62 ++++ 41 files changed, 3311 insertions(+), 202 deletions(-) create mode 100644 .gitignore delete mode 100644 tests/run_unitTests.R create mode 100644 tests/testthat.R create mode 100644 tests/testthat/test-MaskedXString.R create mode 100644 tests/testthat/test-OtherMasking.R create mode 100644 tests/testthat/test-PDict.R create mode 100644 tests/testthat/test-QualityScaledXStringSet.R create mode 100644 tests/testthat/test-XString-class.R create mode 100644 tests/testthat/test-XStringQuality.R create mode 100644 tests/testthat/test-XStringSet-class.R create mode 100644 tests/testthat/test-XStringSet-io.R create mode 100644 tests/testthat/test-XStringSetList-class.R create mode 100644 tests/testthat/test-XStringSetList.R create mode 100644 tests/testthat/test-XStringViews.R create mode 100644 tests/testthat/test-chartr.R create mode 100644 tests/testthat/test-dinucleotideFrequencyTest.R create mode 100644 tests/testthat/test-findPalindromes.R create mode 100644 tests/testthat/test-letter.R create mode 100644 tests/testthat/test-letterFrequency.R create mode 100644 tests/testthat/test-matchLRPatterns.R create mode 100644 tests/testthat/test-matchPDict.R create mode 100644 tests/testthat/test-matchPWM.R create mode 100644 tests/testthat/test-matchPattern.R create mode 100644 tests/testthat/test-miscXStringMethods.R create mode 100644 tests/testthat/test-miscellaneous.R create mode 100644 tests/testthat/test-replaceAt.R create mode 100644 tests/testthat/test-translate.R create mode 100644 tests/testthat/test-trimLRPatterns.R diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index 8b137891..c51f050f 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -1 +1,227 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help +on: + pull_request: + branches: [main, master, devel, devel-staging] +name: test-coverage-nocodecov + +permissions: read-all + +jobs: + test-coverage: + runs-on: ubuntu-latest + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + + steps: + - name: Checkout current ref + uses: actions/checkout@v4 + with: + path: ./new-state + + - name: Checkout Biostrings devel ref + id: devel-checkout + uses: actions/checkout@v4 + with: + repository: Bioconductor/Biostrings + path: ./original-state + + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + + - name: setup dependencies for old state + uses: r-lib/actions/setup-r-dependencies@v2 + with: + working-directory: ./original-state + extra-packages: any::covr + needs: coverage + + - name: setup dependencies for new state + uses: r-lib/actions/setup-r-dependencies@v2 + with: + working-directory: ./new-state + extra-packages: any::covr + needs: coverage + + - name: Test coverage on base branch + run: | + dirpath <- file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package") + dir.create(dirpath) + + ## first check unit tests + cat("Checking test results...\n") + res <- testthat::test_local("./new-state", stop_on_failure=FALSE, reporter="check") + res <- as.data.frame(res) + test_report <- c(sum(res$failed), sum(res$warning), sum(res$skipped), sum(res$passed)) + shouldStop <- test_report[1] > 0 + shouldPrint <- sum(test_report[1:2]) > 0 + test_report <- paste(c("FAIL", "WARN", "SKIP", "PASS"), test_report, collapse=' | ') + test_report <- paste('Unit Tests: [', test_report, ']') + + ## build the output message + out_msg <- '# Testing Report\n\n' + out_msg <- paste0(out_msg, "## Test Results:\n\n") + out_msg <- paste0(out_msg, "Please note that test coverage is **not** an ", + "end-all-be-all measure of robustness. Having tests that ", + "correctly cover intended use cases and potential errors ", + "is significantly more important than maximizing coverage.\n\n") + out_msg <- paste0(out_msg, "```\n", test_report, '\n```\n\n') + + ## if any tests failed or threw warnings, report them + if(shouldPrint){ + p_toprint <- which(res$failed + res$warning > 0) + ptp <- res[p_toprint,] + failed_tests <- ptp[,c("file", "test", "warning", "failed")] + failed_tests <- apply(failed_tests, 1L, paste, collapse=" | ") + failed_tests <- paste("|", failed_tests, "| ") + failed_tests <- paste(failed_tests, collapse='\n') + md_tab <- paste0("| Test File :page_facing_up: | Test Name :id: | Warnings :warning: | Failures :x: | \n", + "| :----- | :----- | :-----: | :-----: | \n", + failed_tests, "\n\n") + out_msg <- paste0(out_msg, "### Warning/Failing Tests:\n\n", md_tab) + } + if(shouldStop){ + cat(out_msg, file='./test_status.md') + stop("Some tests failed! Skipping coverage report.") + } + + ## if no tests failed, check coverage of old vs. new + library(covr) + # exclude lines with no content + options(covr.exclude_pattern=c("^[ \t{}()]+$")) + # get results on old state + files_to_ignore <- list("R/AMINO_ACID_CODE.R", "R/GENETIC_CODE.R", + "R/zzz.R", "R/IUPAC_CODE_MAP.R", + "R/getSeq.R") + cov <- covr::package_coverage( + path = "./original-state", + quiet = FALSE, + clean = FALSE, + install_path = file.path(dirpath, "old-state"), + function_exclusions = "^\\.", + line_exclusions=files_to_ignore + ) + head_res <- covr::coverage_to_list(cov) + # get results on new state + cov <- covr::package_coverage( + path = "./new-state", + quiet = FALSE, + clean = FALSE, + install_path = file.path(dirpath, "new-state"), + function_exclusions = "^\\.", # excludes functions starting with . + line_exclusions=files_to_ignore + ) + new_res <- covr::coverage_to_list(cov) + + ## compare difference in coverage + f_old <- head_res$filecoverage + f_new <- new_res$filecoverage + cat("Old Coverage:\n") + print(f_old) + cat("***************\n") + cat("New Coverage:\n") + print(f_new) + cat("***************\n") + all_files <- union(names(f_old), names(f_new)) + file_changes <- rep(0, length(all_files)) + names(file_changes) <- all_files + file_changes[names(f_new)] <- file_changes[names(f_new)] + f_new + final_cov <- file_changes + file_changes[names(f_old)] <- file_changes[names(f_old)] - f_old + total_change <- new_res$totalcoverage - head_res$totalcoverage + + out_msg <- paste0(out_msg, "## Negatively Impacted Files\n\n") + ## build warning message + n <- names(file_changes) + pos_neg <- which(file_changes < 0) + if(length(pos_neg) > 0){ + pos_neg <- pos_neg[order(file_changes[pos_neg], decreasing=FALSE)] + warn_changes <- sprintf("%+.01f%%", file_changes) + header <- "| File name | Coverage | Change |\n | :----- | :-----: | :-----: |\n" + warn_tab <- paste0('| ', n[pos_neg], ' | ', sprintf("%0.02f%%", final_cov[pos_neg]), ' | ', + unname(warn_changes[pos_neg]), ' |', collapse='\n') + warn_tab <- paste0(header, warn_tab) + out_msg <- paste0(out_msg, "The following files have lost coverage:\n", warn_tab, '\n') + } else { + out_msg <- paste0(out_msg, "No negatively impacted files. Nice job!\n\n") + } + + ## build extended diff table + p_Rfiles <- grepl("^R/", n) + n <- vapply(strsplit(n, '/'), .subset, character(1L), 2L) + all_diffs <- data.frame(filename=n, + coverage=sprintf("%.02f%%", final_cov), + change=sprintf("%+.01f%%", file_changes)) + max_nchar <- max(nchar(all_diffs$filename)) + all_diffs$filename <- sprintf(paste0("%", max_nchar, "s"), all_diffs$filename) + all_diffs$coverage <- sprintf("%7s", all_diffs$coverage) + all_diffs$change <- sprintf("%7s", all_diffs$change) + all_diffs$mark_char <- 1L + all_diffs$mark_char[file_changes > 0] <- 2L + all_diffs$mark_char[file_changes < 0] <- 3L + all_diffs$mark_char <- c(" ", "+", "-")[all_diffs$mark_char] + + all_rows <- apply(all_diffs[c(4,1:3)], 1L, paste, collapse=' ') + w <- nchar(all_rows[1L]) + + title0 <- "Total Coverage" + n_padding <- (w - nchar(title0) - 4) / 2 + title0 <- paste0("@@", paste(rep(' ', floor(n_padding)), collapse=''), + title0, paste(rep(' ', ceiling(n_padding)), collapse=''), "@@") + row0 <- paste(ifelse(total_change < 0, "-", ifelse(total_change>0, "+", " ")), + sprintf(paste0("%", max_nchar, "s"), "Total Coverage"), + sprintf("%6.02f%%", new_res$totalcoverage), + sprintf("%+6.01f%%", total_change), collapse=' ') + + title1 <- "R/... Files" + n_padding <- (w - nchar(title1) - 4) / 2 + title1 <- paste0("@@", paste(rep(' ', floor(n_padding)), collapse=''), + title1, paste(rep(' ', ceiling(n_padding)), collapse=''), "@@") + + title2 <- "src/... Files" + n_padding <- (w - nchar(title2) - 4) / 2 + title2 <- paste0("@@", paste(rep(' ', floor(n_padding)), collapse=''), + title2, paste(rep(' ', ceiling(n_padding)), collapse=''), "@@") + + spacer <- paste(rep('=', w), collapse='') + entries1 <- paste(all_rows[p_Rfiles], collapse='\n') + entries2 <- paste(all_rows[!p_Rfiles], collapse='\n') + diff_table <- paste(title0, spacer, '\n', row0, '\n', spacer, + title1, spacer, entries1, spacer, + title2, spacer, entries2, spacer, + collapse='\n', sep='\n') + diff_table <- paste0("
\nAdditional Details and Impacted Files:\n\n", + "```diff\n", diff_table, '\n\n```\n\n
') + out_msg <- paste0(out_msg, diff_table, '\n') + cat(out_msg, file='./test_status.md') + shell: Rscript {0} + + ## This is a better option than step summary, but requires + ## the "pull-request: write" permission, which I'd rather + ## not allow on a public repository + # - name: Print comment to PR + # uses: thollander/actions-comment-pull-request@v2 + # with: + # GITHUB_TOKEN: ${{ env.GITHUB_PAT }} + # filePath: ./test_status.md + # comment_tag: unit-test-results + + - name: Print results to summary + if: always() + run: cat ./test_status.md >> $GITHUB_STEP_SUMMARY + + - name: Upload package on failure + if: failure() + uses: actions/upload-artifact@v4 + with: + name: coverage-test-failures + path: ${{ runner.temp }}/package + + - name: Upload status on success + if: always() + uses: actions/upload-artifact@v4 + with: + name: coverage-test-results + path: ./test_status.md \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..f59cef4f --- /dev/null +++ b/.gitignore @@ -0,0 +1,13 @@ +*.o +src/*.o +src/*.so +*.icloud +.Rproj* +.DS_Store +*/.DS_Store +.RData +.Rbuildignore +.Rhistory +*/*.gcov +*/*.gcda +*/*.gcno diff --git a/DESCRIPTION b/DESCRIPTION index 240379e0..6146a526 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -44,7 +44,7 @@ Suggests: graphics, pwalign, BSgenome (>= 1.13.14), drosophila2probe, hgu95av2probe, hgu133aprobe, GenomicFeatures (>= 1.3.14), hgu95av2cdf, affy (>= 1.41.3), affydata (>= 1.11.5), - RUnit, BiocStyle, knitr + RUnit, BiocStyle, knitr, testthat (>= 3.0.0), covr VignetteBuilder: knitr Collate: utils.R IUPAC_CODE_MAP.R diff --git a/R/XStringViews-class.R b/R/XStringViews-class.R index 052b5c6a..b6a2cca3 100644 --- a/R/XStringViews-class.R +++ b/R/XStringViews-class.R @@ -478,14 +478,14 @@ setMethod("as.matrix", "XStringViews", function(x, use.names=TRUE) { ## TODO: Supress this warning in BioC 2.12. - msg <- c("as.matrix() on an XStringViews object 'x' has changed ", - "behavior: now the\n views in 'x' must be of equal width ", - "and each view is converted into a row of\n", - " single characters. To achieve the old behavior, ", - "do 'as.matrix(ranges(x))'.\n To supress this warning, do ", - "'suppressWarnings(as.matrix(x))'.\n This warning will be ", - "removed in BioC 2.12.") - warning(msg) + # msg <- c("as.matrix() on an XStringViews object 'x' has changed ", + # "behavior: now the\n views in 'x' must be of equal width ", + # "and each view is converted into a row of\n", + # " single characters. To achieve the old behavior, ", + # "do 'as.matrix(ranges(x))'.\n To supress this warning, do ", + # "'suppressWarnings(as.matrix(x))'.\n This warning will be ", + # "removed in BioC 2.12.") + # warning(msg) y <- fromXStringViewsToStringSet(x, out.of.limits="error", use.names=use.names) as.matrix(y) diff --git a/R/chartr.R b/R/chartr.R index 253bf779..a98c6943 100644 --- a/R/chartr.R +++ b/R/chartr.R @@ -49,6 +49,9 @@ setMethod("chartr", c(old="ANY", new="ANY", x="MaskedXString"), ### A simple wrapper to chartr(). replaceAmbiguities <- function(x, new="N") { + if(!(inherits(x, c("XString", "XStringSet", "XStringViews"))) || + !(seqtype(x) %in% c("DNA", "RNA"))) + stop("replaceAmbiguities is only supported for DNA and RNA") if (!(isSingleString(new) && nchar(new) == 1L)) stop("'new' must be a single letter") old <- paste(setdiff(names(IUPAC_CODE_MAP), DNA_BASES), collapse="") diff --git a/R/coloring.R b/R/coloring.R index c33eaac7..05406fed 100644 --- a/R/coloring.R +++ b/R/coloring.R @@ -63,7 +63,7 @@ make_DNA_AND_RNA_COLORED_LETTERS <- function() ### Colors groupins by ### https://www.jalview.org/help/html/colourSchemes/zappo.html ### Called in .onLoad() to initialize AA_COLORED_LETTERS. -make_AA_COLORED_LETTERS <- function(x){ +make_AA_COLORED_LETTERS <- function(){ whiter <- make_style(rgb(1, 1, 1)) dark_grey_bg <- make_style(rgb(0.5,0.5,0.5), bg=TRUE) diff --git a/R/dinucleotideFrequencyTest.R b/R/dinucleotideFrequencyTest.R index 483e1895..1a4ed9d6 100644 --- a/R/dinucleotideFrequencyTest.R +++ b/R/dinucleotideFrequencyTest.R @@ -113,12 +113,12 @@ g.test <- function(x, y = NULL, correct="none", else { # x is not a matrix, so we do Goodness of Fit METHOD <- "Log likelihood ratio (G-test) goodness of fit test" - if (length(x) == 1) + if (length(x) == 1) stop("x must at least have 2 elements") - if (length(x) != length(p)) + if (length(x) != length(p)) stop("x and p must have the same number of elements") E <- n * p - + if (correct=="yates"){ # Do Yates' correction if(length(x)!=2) stop("Yates' correction requires 2 data values") diff --git a/R/letter.R b/R/letter.R index 339bfce7..5dd1b387 100644 --- a/R/letter.R +++ b/R/letter.R @@ -12,8 +12,8 @@ setGeneric("letter", signature="x", setMethod("letter", "character", function(x, i) { - if (!is.numeric(i) || any(is.na(i))) - stop("'i' must be an NA-free numeric vector") + if (!is.numeric(i) || anyNA(i)) + stop("subscript 'i' must be an NA-free numeric vector") if (length(x) == 0) return(character(0)) if (length(i) == 0) @@ -42,10 +42,13 @@ setMethod("letter", "character", setMethod("letter", "XString", function(x, i) { - if (!is.numeric(i)) - stop("subscript 'i' must be an integer vector") + if (!is.numeric(i) || anyNA(i)) + stop("subscript 'i' must be an NA-free numeric vector") if (!is.integer(i)) i <- as.integer(i) + imax <- length(x) + if (!all(i >= 1) || !all(i <= imax)) + stop("subscript out of bounds") extract_character_from_XString_by_positions(x, i, collapse=TRUE) } ) @@ -54,12 +57,10 @@ setMethod("letter", "XString", setMethod("letter", "XStringViews", function(x, i) { - if (!is.numeric(i)) - stop("subscript 'i' must be an integer vector") + if (!is.numeric(i) || anyNA(i)) + stop("subscript 'i' must be an NA-free numeric vector") if (!is.integer(i)) i <- as.integer(i) - if (anyNA(i)) - stop("subscript 'i' cannot contain NAs") if (length(x) == 0) return(character(0)) imax <- min(width(x)) diff --git a/R/maskMotif.R b/R/maskMotif.R index 0fda68a0..5f6692b8 100644 --- a/R/maskMotif.R +++ b/R/maskMotif.R @@ -53,6 +53,8 @@ setMethod("maskMotif", signature(x="XString", motif="ANY"), ### Deprecate in Biostrings 2.9! mask <- function(x, start=NA, end=NA, pattern) { + warning("`mask()` is deprecated and will be removed in a future release.\n", + "Please use `Mask()` or `maskMotif()` instead.") if (!is(x, "XString")) x <- XString(NULL, x) if (missing(pattern)) { diff --git a/R/matchPDict.R b/R/matchPDict.R index aa4a90ab..c7324abe 100644 --- a/R/matchPDict.R +++ b/R/matchPDict.R @@ -647,7 +647,7 @@ setMethod("matchPDict", "XStringViews", ### Dispatch on 'subject' (see signature of generic). setMethod("matchPDict", "MaskedXString", - function(pdict, subject, + function(pdict, subject, max.mismatch=0, min.mismatch=0, with.indels=FALSE, fixed=TRUE, algorithm="auto", verbose=FALSE) matchPDict(pdict, toXStringViewsOrXString(subject), diff --git a/R/replaceAt.R b/R/replaceAt.R index b1cca293..3462e50e 100644 --- a/R/replaceAt.R +++ b/R/replaceAt.R @@ -321,75 +321,6 @@ setMethod("extractAt", "XStringSet", } ) - -if (FALSE) { # <<<--- begin testing extractAt() --->>> - -library(Biostrings) - -### From an XStringSet object -### ========================= - -### Only compare the classes, the shapes (i.e. lengths + elementNROWS + -### names), the inner names, and the sequence contents. Doesn't look at -### the metadata or the metadata columns (outer or inner). -identical_XStringSetList <- function(target, current) -{ - ok1 <- identical(class(target), class(current)) - ok2 <- identical(elementNROWS(target), elementNROWS(current)) - unlisted_target <- unlist(target, use.names=FALSE) - unlisted_current <- unlist(current, use.names=FALSE) - ok3 <- identical(names(unlisted_target), names(unlisted_current)) - ok4 <- all(unlisted_target == unlisted_current) - ok1 && ok2 && ok3 && ok4 -} - -x <- BStringSet(c(seq1="ABCD", seq2="abcdefghijk", seq3="XYZ")) -at <- IRangesList(IRanges(c(2, 1), c(3, 0)), - IRanges(c(7, 2, 12, 7), c(6, 5, 11, 8)), - IRanges(2, 2)) -### Set inner names on 'at'. -unlisted_at <- unlist(at) -names(unlisted_at) <- paste0("rg", sprintf("%02d", seq_along(unlisted_at))) -at <- relist(unlisted_at, at) - -res1a <- extractAt(x, at) -res1b <- BStringSetList(mapply(extractAt, x, at)) -stopifnot(identical_XStringSetList(res1a, res1b)) - -res2a <- extractAt(x, at[3]) -res2b <- BStringSetList(mapply(extractAt, x, at[3])) -stopifnot(identical_XStringSetList(res2a, res2b)) -res2c <- extractAt(x, at[[3]]) -stopifnot(identical_XStringSetList(res2a, res2c)) - -### Testing performance with 2 millions small extractions at random -### locations in Fly upstream sequences: -dm3_upstream_filepath <- system.file("extdata", - "dm3_upstream2000.fa.gz", - package="Biostrings") -dm3_upstream <- readDNAStringSet(dm3_upstream_filepath) -dm3_upstream <- dm3_upstream[width(dm3_upstream) == 2000L] - -set.seed(33) -NE <- 2000000L # total number of extractions -sample_size <- NE * 1.1 -some_ranges <- IRanges(sample(2001L, sample_size, replace=TRUE), - width=sample(0:75, sample_size, replace=TRUE)) -some_ranges <- head(some_ranges[end(some_ranges) <= 2000L], n=NE) -split_factor <- factor(sample(length(dm3_upstream), NE, replace=TRUE), - levels=seq_along(dm3_upstream)) -at <- unname(split(some_ranges, split_factor)) - -### Takes < 1s on my machine. -system.time(res3a <- extractAt(dm3_upstream, at)) - -### Equivalent but about 250x slower than the above on my machine. -system.time(res3b <- DNAStringSetList(mapply(extractAt, dm3_upstream, at))) -stopifnot(identical_XStringSetList(res3a, res3b)) - -} # <<<--- end testing extractAt() --->>> - - ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ### replaceAt() ### @@ -423,103 +354,3 @@ setMethod("replaceAt", "XStringSet", ans } ) - - -if (FALSE) { # <<<--- begin testing replaceAt() --->>> - -library(Biostrings) - -### On an XString object -### ==================== - -### Testing performance with half-million small substitutions at random -### locations in Celegans chrI: -library(BSgenome.Celegans.UCSC.ce2) -genome <- BSgenome.Celegans.UCSC.ce2 -chrI <- genome$chrI - -### A very amateurish random disjoint ranges generator. Not for serious use! -randomDisjointRanges <- function(min_start, max_end, min_width, max_width, n) -{ - set.seed(33) - offset <- min_start - 1L - n1 <- n * (1.1 + max_width/(max_end-offset+1L)) # very approximative - - some_starts <- sample(max_end-offset+1L, n1, replace=TRUE) + offset - some_widths <- sample(min_width:max_width, n1, replace=TRUE) - some_ranges <- IRanges(some_starts, width=some_widths) - some_ranges <- some_ranges[end(some_ranges) <= max_end] - ans <- disjoin(some_ranges) - if (min_width == 0L) { - is_empty <- width(some_ranges) == 0L - some_empty_ranges <- some_ranges[is_empty] - ans <- sample(c(ans, some_empty_ranges)) - } - if (length(ans) < n) - stop("internal error, sorry") - head(ans, n=n) -} - -at4 <- randomDisjointRanges(1L, nchar(chrI), 0L, 20L, 500000L) -### Takes < 1s on my machine (64-bit Ubuntu). -system.time(current <- replaceAt(chrI, at4, Views(chrI, at4))) -stopifnot(current == chrI) - -### Testing performance with half-million single-letter insertions at random -### locations in Celegans chrI: -at5 <- randomDisjointRanges(1L, nchar(chrI), 0L, 0L, 500000L) -### Takes < 1s on my machine (64-bit Ubuntu). -system.time(current <- replaceAt(chrI, at5, value="-")) -m <- matchPattern("-", current) -stopifnot(identical(sort(start(at5)), start(m) - seq_along(at5) + 1L)) - -system.time(current2 <- replaceAt(chrI, start(at5), value="-")) -stopifnot(identical(current, current2)) - -matchPattern("---", current) - -### On an XStringSet object -### ======================= - -### Only compare the classes, lengths, names, and sequence contents. -### Doesn't look at the metadata or the metadata columns. -identical_XStringSet <- function(target, current) -{ - ok1 <- identical(class(target), class(current)) - ok2 <- identical(names(target), names(current)) - ok3 <- all(target == current) - ok1 && ok2 && ok3 -} - -x <- BStringSet(c(seq1="ABCD", seq2="abcdefghijk", seq3="XYZ")) -at <- IRangesList(IRanges(c(2, 1), c(3, 0)), - IRanges(c(7, 2, 12, 7), c(6, 5, 11, 8)), - IRanges(2, 2)) -### Set inner names on 'at'. -unlisted_at <- unlist(at) -names(unlisted_at) <- paste0("rg", sprintf("%02d", seq_along(unlisted_at))) -at <- relist(unlisted_at, at) - -current <- replaceAt(x, at, value=extractAt(x, at)) # no-op -stopifnot(identical_XStringSet(x, current)) - -res1a <- replaceAt(x, at) # deletions -res1b <- mendoapply(replaceAt, x, at) -stopifnot(identical_XStringSet(res1a, res1b)) - -### Testing performance with half-million single-letter insertions at random -### locations in Fly upstream sequences: -set.seed(33) -split_factor <- factor(sample(length(dm3_upstream), 500000L, replace=TRUE), - levels=seq_along(dm3_upstream)) -at5 <- unname(split(sample(2001L, 500000L, replace=TRUE), - split_factor)) -### Takes < 1s on my machine. -system.time(res5a <- replaceAt(dm3_upstream, at5, value="-")) -### Equivalent but about 1400x slower than the above on my machine. -system.time(res5b <- mendoapply(replaceAt, - dm3_upstream, as(at5, "List"), as("-", "List"))) -stopifnot(identical_XStringSet(res5a, res5b)) - -} # <<<--- end testing replaceAt() --->>> - diff --git a/R/translate.R b/R/translate.R index 1a16f981..e061fb6e 100644 --- a/R/translate.R +++ b/R/translate.R @@ -13,8 +13,9 @@ if (!all(nchar(genetic.code) == 1L)) stop("'genetic.code' must contain 1-letter strings") ## Just a warning for now. Might become an error in the future. + ## 07/26/2024: updated to error since AAString() enforces alphabet if (!all(genetic.code %in% AA_ALPHABET)) - warning("some codons in 'genetic.code' are mapped to letters ", + stop("some codons in 'genetic.code' are mapped to letters ", "not in the Amino Acid\n alphabet (AA_ALPHABET)") alt_init_codons <- attr(genetic.code, "alt_init_codons", exact=TRUE) if (is.null(alt_init_codons) diff --git a/man/XStringSetList-class.Rd b/man/XStringSetList-class.Rd index f8cc185f..b2ae458b 100644 --- a/man/XStringSetList-class.Rd +++ b/man/XStringSetList-class.Rd @@ -60,10 +60,7 @@ for storing a list of \link{BStringSet}, \link{DNAStringSet}, \link{RNAStringSet} and \link{AAStringSet} objects, respectively. These four containers are direct subclasses of XStringSetList - with no additional slots. - - Currently \code{DNAStringSetList()} and \code{AAStringSetList()} are - the only XStringSetList constructors. The XStringSetList class itself + with no additional slots. The XStringSetList class itself is virtual and has no constructor. } @@ -77,7 +74,7 @@ AAStringSetList(..., use.names=TRUE) \arguments{ \item{\dots}{ - Character vector(s) (with no NAs), or \link{XStringSet} object(s), or + Character vector(s) (with no NAs), or \link{XStringSet} object(s), or \link{XStringViews} object(s) to be concatenated into a \link{XStringSetList}. } @@ -110,8 +107,6 @@ AAStringSetList(..., use.names=TRUE) ## ------------------------------------------------------------------------ ## A. THE XStringSetList CONSTRUCTORS ## ------------------------------------------------------------------------ -## Currently DNAStringSetList() and AAStringSetList() are the only -## constructors. Others will be developed when the use case arises. dna1 <- c("AAA", "AC", "", "T", "GGATA") dna2 <- c("G", "TT", "C") diff --git a/man/chartr.Rd b/man/chartr.Rd index 35a5c91f..a83171a6 100644 --- a/man/chartr.Rd +++ b/man/chartr.Rd @@ -46,7 +46,8 @@ replaceAmbiguities(x, new="N") specifications. \code{replaceAmbiguities()} is a simple wrapper around \code{chartr()} - that replaces all IUPAC ambiguities with N. + that replaces all IUPAC ambiguities with N for objects containing DNA + or RNA sequence data. } \value{ diff --git a/tests/run_unitTests.R b/tests/run_unitTests.R deleted file mode 100644 index f1b20090..00000000 --- a/tests/run_unitTests.R +++ /dev/null @@ -1,2 +0,0 @@ -require("Biostrings") || stop("unable to load Biostrings package") -Biostrings:::.test() diff --git a/tests/testthat.R b/tests/testthat.R new file mode 100644 index 00000000..b22abee7 --- /dev/null +++ b/tests/testthat.R @@ -0,0 +1,4 @@ +library(testthat) +library(Biostrings) + +test_check("Biostrings") diff --git a/tests/testthat/test-MaskedXString.R b/tests/testthat/test-MaskedXString.R new file mode 100644 index 00000000..5a76d303 --- /dev/null +++ b/tests/testthat/test-MaskedXString.R @@ -0,0 +1,115 @@ +## Testing for MaskedXString-class.R, maskMotif.R, injectHardMask.R +## +## MaskedXString-class.R exports: +## - MaskedXString class (B, AA, DNA, RNA) +## - unmasked +## - masks, masks<- +## - length +## - maskedratio +## - maskedwidth +## - nchar +## - seqtype +## - *Views +## - as.character +## - subseq +## - toString +## - coercion between masked types +## - coercion between unmasked types (B <-> MaskedB) +## - coercion to MaskCollection, NormalIRanges, XStringViews + +dna_c <- sample(DNA_BASES, 25L, replace=TRUE) +rna_c <- sample(RNA_BASES, 25L, replace=TRUE) +aaa_c <- sample(AA_STANDARD, 25L, replace=TRUE) +bbb_c <- sample(LETTERS, 25L, replace=TRUE) + +dna <- DNAString(paste(dna_c, collapse='')) +rna <- RNAString(paste(rna_c, collapse='')) +aaa <- AAString(paste(aaa_c, collapse='')) +bbb <- BString(paste(bbb_c, collapse='')) + +mask_starts <- c(1,5,10) +mask_ends <- c(3,7,12) +m1 <- Mask(25L, start=mask_starts, end=mask_ends) +m2 <- Mask(25L, start=c(15,20), end=c(18,22)) +total_mask_width <- sum(mask_ends - mask_starts + 1L) + +.check_masked_ascharacter <- function(maskedv, origv, maskstarts, maskends){ + c1 <- strsplit(as.character(maskedv), '')[[1]] + c2 <- strsplit(as.character(origv), '')[[1]] + if(sum(c1 == "#") != total_mask_width) return(FALSE) + if(!all(c1[c1!='#'] == c2[c1!='#'])) return(FALSE) + TRUE +} + +test_that("masking coerces between XString and MaskedXString", { + l <- list(dna, rna, aaa, bbb) + for(i in seq_along(l)){ + tmp <- l[[i]] + masks(tmp) <- m1 + ## should convert to MaskedXString + mseqclass <- paste0("Masked", seqtype(l[[i]]), "String") + expect_s4_class(tmp, mseqclass) + expect_s4_class(as(l[[i]], mseqclass), mseqclass) + + ## seqtype should stay the same + expect_equal(seqtype(tmp), seqtype(l[[i]])) + + ## length should stay the same + expect_equal(length(tmp), length(l[[i]])) + + ## maskedwidth should report 9 + expect_equal(maskedwidth(tmp), total_mask_width) + expect_equal(maskedratio(tmp), total_mask_width/nchar(l[[i]])) + expect_equal(nchar(tmp), nchar(l[[i]]) - total_mask_width) + + ## make sure that as.character preserves masking + expect_true(.check_masked_ascharacter(tmp, l[[i]])) + expect_equal(as.character(subseq(tmp, 4, 10)), substr(as.character(tmp), 4, 10)) + expect_equal(toString(tmp), as.character(tmp)) + + ## should convert back to XString + expect_s4_class(as(tmp, paste0(seqtype(tmp), "String")), class(l[[i]])) + + ## equality should work here because they're from the same pool (?) + masks(tmp) <- m2 + expect_equal(masks(tmp), m2) + expect_equal(unmasked(tmp), l[[i]]) + } +}) + +test_that("coercion between masked types works correctly", { + rnam <- rna + dnam <- dna + aaam <- aaa + masks(dnam) <- masks(rnam) <- masks(aaam) <- m2 + + # test translation + udna <- strsplit(as.character(dnam), '')[[1]] + udna <- DNAString(paste(udna[udna!='#'], collapse='')) + expect_equal(as.character(translate(dnam)), as.character(translate(udna))) + + # DNA <-> RNA conversion + expect_equal(as.character(unmasked(as(dnam, "MaskedRNAString"))), as.character(as(dna, "RNAString"))) + expect_equal(as.character(unmasked(as(rnam, "MaskedRNAString"))), as.character(as(rna, "RNAString"))) + + expect_error(as(dnam, "MaskedAAString"), "incompatible sequence types") + expect_error(as(rnam, "MaskedAAString"), "incompatible sequence types") + + # roundtrip X -> B -> X conversion + expect_equal(as(as(dnam, "MaskedBString"), "MaskedDNAString"), dnam) + expect_equal(as(as(rnam, "MaskedBString"), "MaskedRNAString"), rnam) + expect_equal(as(as(aaam, "MaskedBString"), "MaskedAAString"), aaam) + + masks(dnam) <- m1 + # MaskCollection conversion + expect_true(all(as(dnam, "MaskCollection") == m1)) + + # NormalIRanges conversion converts to the masked ranges + expect_equal(start(as(dnam, "NormalIRanges")), mask_starts) + expect_equal(end(as(dnam, "NormalIRanges")), mask_ends) + + # Views conversion converts to unmasked ranges + exp_end <- c(mask_starts-1L, length(dnam)) + exp_end <- unique(exp_end[exp_end>=1]) + expect_equal(end(as(dnam, "Views")), exp_end) +}) \ No newline at end of file diff --git a/tests/testthat/test-OtherMasking.R b/tests/testthat/test-OtherMasking.R new file mode 100644 index 00000000..a68ae2aa --- /dev/null +++ b/tests/testthat/test-OtherMasking.R @@ -0,0 +1,58 @@ +## Additional tests for the following files: +## - maskMotif.R (contains `maskMotif`, `mask`) +## - injectHardMask.R (contains `injectHardMask`) + +test_that("maskMotif works for XStrings", { + s1 <- BString("aabbaacc") + s2 <- maskMotif(s1, "bb") + expect_equal(as.character(s2), "aa##aacc") + s3 <- maskMotif(s2, "baac") + expect_equal(as.character(s3), "aa##aacc") + + ## order of operations matters, masking should skip masked positions + s4 <- maskMotif(s1, "baac") + expect_equal(as.character(s4), "aab####c") + expect_equal(as.character(gaps(s4)), "###baac#") + expect_equal(as.character(as(s4, "Views")), c("aab", "c")) + + ## mask DNAString by multiple motifs + ## example from ?maskMotif.R, just using DNAString objects + x <- DNAString("ACACAACTAGATAGNACTNNGAGAGACGC") + x1 <- maskMotif(x, DNAString("N")) + x2 <- maskMotif(x1, DNAString("AC")) + x3 <- maskMotif(x2, DNAString("GA"), min.block.width=5) + expect_equal(width(as(x3, "Views")), c(1,7,1,5,2)) + exp_opt <- c(A=5L, C=5L, N=3L, `#`=16L) + expect_true(all(table(strsplit(as.character(gaps(x3)), '')[[1]]) == as.table(exp_opt[order(names(exp_opt))]))) +}) + +test_that("injectHardMask works for all supported inputs", { + set.seed(50L) + ds <- sample(DNA_BASES, 50L, replace=TRUE) + + d1 <- DNAString(paste(ds, collapse='')) + d2 <- d1 + + mask_starts <- c(4,10,28) + mask_ends <- c(7,17,38) + m1 <- Mask(length(ds), start=mask_starts, end=mask_ends) + masks(d2) <- m1 + + all_masked_indices <- unlist(lapply(seq_along(mask_starts), \(i) seq(mask_starts[i], mask_ends[i]))) + exp_opt <- ds + exp_opt[all_masked_indices] <- "+" + expect_equal(as.character(injectHardMask(d2)), paste(exp_opt, collapse='')) + + exp_opt[all_masked_indices] <- "M" + expect_equal(as.character(injectHardMask(d2, "M")), paste(exp_opt, collapse='')) + + v <- Views(d1, start=mask_starts, end=mask_ends) + ## injecting on a Views does the opposite + all_vmasked <- seq_len(length(ds))[-all_masked_indices] + exp_vopt <- ds + exp_vopt[all_vmasked] <- "+" + expect_equal(as.character(injectHardMask(v)), paste(exp_vopt, collapse='')) + + exp_vopt[all_vmasked] <- "M" + expect_equal(as.character(injectHardMask(v, "M")), paste(exp_vopt, collapse='')) +}) \ No newline at end of file diff --git a/tests/testthat/test-PDict.R b/tests/testthat/test-PDict.R new file mode 100644 index 00000000..a1db9fdb --- /dev/null +++ b/tests/testthat/test-PDict.R @@ -0,0 +1,205 @@ +## PDict objects have the following accessors: +## - length +## - width +## - names +## - head +## - tb +## - tb.width +## - tail +## - (duplicated) +## - (patternFrequency) +## The `[[` operator is also defined + +test_that("PDicts can be initialized happy path", { + codons <- mkAllStrings(DNA_BASES, 3) + namedcodons <- codons + names(namedcodons) <- codons + + for(algo in c("ACtree2", "Twobit")){ + expect_s4_class(PDict(codons, algorithm=algo), "TB_PDict") + + p <- PDict(codons, algorithm=algo) + expect_output(show(p), "TB_PDict object of length 64 and width 3") + + ## checking threeparts internals + expect_s4_class(p@threeparts@pptb, algo) + expect_equal(lengths(p@threeparts@tail), rep(0L, 64L)) + expect_equal(lengths(p@threeparts@head), rep(0L, 64L)) + + ## accessors + expect_equal(length(p), 64L) + expect_equal(width(p), rep(3L, 64L)) + + expect_null(names(p)) + expect_equal(names(PDict(namedcodons, algorithm=algo)), codons) + + expect_equal(duplicated(p), rep(FALSE, 64L)) + expect_equal(patternFrequency(p), rep(1L, 64L)) + + expect_equal(duplicated(PDict(c("A","A"), algorithm=algo)), c(FALSE, TRUE)) + expect_equal(patternFrequency(PDict(c("A","A"), algorithm=algo)), c(2L, 2L)) + + + ## tb + expect_identical(tb(p), p@threeparts@pptb@tb) + expect_s4_class(tb(p), "DNAStringSet") + expect_equal(as.character(tb(p)), codons) + expect_equal(tb.width(p), 3L) + + expect_s4_class(p[[1L]], "DNAString") + expect_equal(as.character(p[[1L]]), "AAA") + + ## initializing with ambiguity codes + strs <- paste0(DNA_ALPHABET, "AT", DNA_ALPHABET, "A") + l <- length(DNA_ALPHABET) + expect_s4_class(PDict(strs, tb.start=2L, tb.width=2L, algorithm=algo), "TB_PDict") + p <- PDict(strs, tb.start=2L, tb.width=2L, algorithm=algo) + expect_equal(as.character(tb(p)), rep("AT", l)) + expect_equal(tb.width(p), 2L) + expect_equal(lengths(head(p)), rep(1L, l)) + expect_identical(as.character(head(p)), DNA_ALPHABET) + expect_equal(lengths(tail(p)), rep(2L, l)) + expect_identical(as.character(tail(p)), paste0(DNA_ALPHABET, "A")) + expect_equal(width(p@threeparts), rep(5L, l)) + + ## variable width tail + strs[seq(1L,length(strs),2L)] <- paste0(strs[seq(1L,length(strs),2L)], "G") + p <- PDict(strs, tb.start=2L, tb.width=2L, algorithm=algo) + expect_equal(lengths(head(p@threeparts)), rep(1L, l)) + expect_identical(as.character(head(p@threeparts)), DNA_ALPHABET) + expect_equal(lengths(tail(p@threeparts)), rep(c(3L,2L), length.out=l)) + + ## allowing small number of mismatches + pats <- apply(expand.grid(mkAllStrings(c("A","T"), 3L), mkAllStrings(c("G","C"), 3L)), 1, paste0, collapse='') + expect_s4_class(PDict(pats, max.mismatch=1L, algorithm=algo), "MTB_PDict") + p <- PDict(pats, max.mismatch=1L, algorithm=algo) + expect_output(show(p), "MTB_PDict object of length 64 and width 6") + expect_error(p@threeparts, 'no slot of name "threeparts"') + expect_equal(length(p@threeparts_list), 2L) + p1 <- p@threeparts_list[[1]] + p2 <- p@threeparts_list[[2]] + expect_equal(lengths(p1@head), rep(0L, 64L)) + expect_equal(lengths(p1@tail), rep(3L, 64L)) + expect_equal(as.character(p1@tail), substr(pats,4,6)) + expect_equal(lengths(p2@head), rep(3L, 64L)) + expect_equal(lengths(p2@tail), rep(0L, 64L)) + expect_equal(as.character(p2@head), substr(pats,1,3)) + + expect_equal(length(as.list(p)), 2L) + } +}) + +test_that("PDict sad path errors correctly", { + expect_error(PDict(DNA_ALPHABET), "non base DNA letter found in Trusted Band") + expect_error(PDict(AA_ALPHABET), "not in lookup table") + expect_error(PDict(1:3), "unable to find an inherited method") + expect_error(PDict(character(0L)), "must contain at least one pattern") + p <- PDict(DNA_BASES) + expect_error({names(p) <- DNA_BASES}, "attempt to modify the names of a TB_PDict instance") + expect_error(PDict(c("AA", "AAA")), "Trusted Band has a different length") + + codons <- mkAllStrings(DNA_BASES, 3) + expect_error(PDict(codons, max.mismatch=1), "supported only if the width of dictionary 'x' is >= 6") + + longstrings <- mkAllStrings(c("A","T"), 6) + expect_error(PDict(longstrings, max.mismatch=7), "must be <= 1 given the width of dictionary 'x'") + expect_error(PDict(paste0(rep("A", 100), collapse=''), max.mismatch=100), "must be <= 32 given the width of dictionary 'x'") + + expect_error(PDict(codons, max.mismatch=NULL), "must be a single integer or 'NA'") + expect_error(PDict(codons, max.mismatch='a'), "must be a single integer or 'NA'") + expect_error(PDict(codons, max.mismatch=c(1,2)), "must be a single integer or 'NA'") + + expect_error(PDict("A", algorithm='erroralgo'), "is not a defined class") + + names(longstrings) <- rep('', length(longstrings)) + expect_error(PDict(longstrings), "'x' has invalid names") + names(longstrings) <- rep('test', length(longstrings)) + expect_error(PDict(longstrings), "'x' has duplicated names") + + expect_error(PDict("AAAA", tb.start='a'), "must be a single integer or 'NA'") + expect_error(PDict("AAAA", tb.end='a'), "must be a single integer or 'NA'") + expect_error(PDict("AAAA", tb.width='a'), "must be a single integer or 'NA'") + + expect_warning(PDict("AAAA", algorithm='ACtree'), "support for ACtree preprocessing algo has been dropped") +}) + +### Old Tests + +## Helper functions ported from old tests +randomDNASequences <- function(n, w) +{ + alphabet <- DNA_BASES + w <- rep(w, length=n) + sequences <- sapply(seq(1, n, length=n), + function(x) { + s <- sample(alphabet, w[x], replace=TRUE) + s <- paste(s, collapse="") + return(s) + }) + return(Biostrings::DNAStringSet(sequences)) +} + +msubseq <- function(x, ir) +{ + ## differs from subseq in the sense that several subsequences + ## from the same sequence are extracted + ## x: XString + ## ir: IRanges + res <- vector("character", length = length(ir)) + for (i in seq(along=res)) { + res[i] <- as.character(subseq(x, start=ir@start[i], width=width(ir)[i])) + ## forced cast: chain of tools for DNAString seems interupted for + ## some use cases (or I missed something) + } + res <- DNAStringSet(res) + return(res) +} + +test_that("PDict works with constant width initialization", { + set.seed(1) + l <- 150 + target <- randomDNASequences(1, l)[[1]] + W <- 20 + L <- 6 + ir <- successiveIRanges(rep(W, L), gapwidth = 1) + short_sequences <- msubseq(target, ir) + # shuffle the sequences (they are not in consecutive order) + o <- sample(seq(along=short_sequences)) + + dna_short <- DNAStringSet(short_sequences[o]) + pdict <- PDict(dna_short) + expect_equal(L, length(pdict)) + expect_equal(rep(W, L), width(pdict)) + expect_equal(NULL, head(pdict)) + expect_equal(W, tb.width(pdict)) + expect_equal(NULL, tail(pdict)) +}) + +test_that("PDict works for variable width lookup", { + set.seed(1) + l <- 150 + target <- randomDNASequences(1, l)[[1]] + W <- 20 + L <- 6 + n_cut <- sample(0:5, L, replace=TRUE) + ir <- successiveIRanges(rep(W, L) - n_cut, gapwidth = 1 + n_cut[-length(n_cut)]) + short_sequences <- msubseq(target, ir) + # shuffle the sequences (they are not in consecutive order) + o <- sample(seq(along=short_sequences)) + + dna_var_short <- DNAStringSet(short_sequences[o]) + + ## Previous comment: shouldn't 1:min(width) be the default? + pdict <- PDict(dna_var_short, + tb.start=1, + tb.width=min(width(short_sequences)) + ) + expect_equal(L, length(pdict)) + expect_equal((rep(W, L) - n_cut)[o], width(pdict)) + expect_equal(NULL, head(pdict)) + shortest_seq_width <- min(width(dna_var_short)) + expect_equal(shortest_seq_width, + tb.width(pdict)) # mostly a sanity check + expect_equal(substring(short_sequences, shortest_seq_width+1)[o], + as.character(tail(pdict))) +}) \ No newline at end of file diff --git a/tests/testthat/test-QualityScaledXStringSet.R b/tests/testthat/test-QualityScaledXStringSet.R new file mode 100644 index 00000000..2631061a --- /dev/null +++ b/tests/testthat/test-QualityScaledXStringSet.R @@ -0,0 +1,94 @@ +## QualityScaledXStringSet.R exports the following: +## - QualityScaledXStringSet class +## - readQualityScaledXStringSet +## - writeQualityScaledXStringSet +## - quality +## - all accessor methods defined for XStringSet objects +## +## here, X is a seqtype in c("DNA", "RNA", "AA", "B") +## the only accessors with different definitions are: +## - windows +## - narrow +## - reverse +## - reverseComplement +## - show + + +test_that("QualityScaledXStringSet I/O works properly", { + set.seed(4L) + n_samp <- 40L + n_seq <- 3L + qs <- list(phred=PhredQuality, + solexa=SolexaQuality, + illumina=IlluminaQuality) + + alphs <- list(DNA=DNA_BASES, + RNA=RNA_BASES, + AA=AA_STANDARD, + B=LETTERS) + + for(quality_type in c('phred', 'solexa', 'illumina')){ + q <- do.call(c, lapply(seq_len(n_seq), \(x) qs[[quality_type]](sample(40L, n_samp, replace=TRUE)))) + for(stype in c("DNA", "RNA", "AA", "B")){ + seqs <- vapply(seq_len(n_seq), \(i) { + paste(sample(alphs[[stype]], n_samp, replace=TRUE), collapse='') + }, character(1L)) + seqs <- as(seqs, paste0(stype, "StringSet")) + names(seqs) <- letters[seq_len(n_seq)] + + qss <- get(paste0("QualityScaled", stype, "StringSet"))(seqs, q) + + expect_s4_class(qss, paste0("QualityScaled", stype, "StringSet")) + expect_s4_class(quality(qss), class(qs[[quality_type]](0L))) + + tf <- tempfile() + if(file.exists(tf)) file.remove(tf) + writeQualityScaledXStringSet(qss, tf) + expect_equal(fastq.geometry(tf), c(n_seq, n_samp)) + + if(stype == "DNA"){ + ## only readQualityScaledDNAStringSet is currently defined + qss2 <- readQualityScaledDNAStringSet(tf, quality.scoring=quality_type) + + expect_equal(as.character(qss), as.character(qss2)) + expect_equal(as.character(quality(qss)), as.character(quality(qss2))) + } + if(file.exists(tf)) file.remove(tf) + } + } +}) + +test_that("QualityScaledXStringSet properly implements reverse, reverseComplement", { + .revString <- function(s) paste(strsplit(s, '')[[1]][seq(nchar(s), 1L)], collapse='') + set.seed(10L) + n_samp <- 40L + n_seq <- 3L + qs <- list(phred=PhredQuality, + solexa=SolexaQuality, + illumina=IlluminaQuality) + + alphs <- list(DNA=DNA_BASES, + RNA=RNA_BASES, + AA=AA_STANDARD, + B=LETTERS) + + qualities <- lapply(qs, \(q){ + do.call(c, lapply(seq_len(n_seq), \(i){ + q(sample(40L, n_samp, replace=TRUE)) + })) + }) + + for(stype in c("DNA", "RNA", "AA", "B")){ + seqs <- vapply(seq_len(n_seq), \(x) paste(sample(alphs[[stype]], n_samp, replace=TRUE), collapse=''), character(1L)) + seqs <- get(paste0(stype, "StringSet"))(seqs) + for(q in names(qs)){ + qual <- qualities[[q]] + qss <- get(paste0("QualityScaled", stype, "StringSet"))(seqs, qual) + exp_output <- vapply(as.character(qual), .revString, character(1L), USE.NAMES=FALSE) + expect_equal(as.character(quality(reverse(qss))), exp_output) + if(stype %in% c("DNA", "RNA")){ + expect_equal(as.character(quality(reverseComplement(qss))), exp_output) + } + } + } +}) \ No newline at end of file diff --git a/tests/testthat/test-XString-class.R b/tests/testthat/test-XString-class.R new file mode 100644 index 00000000..83f3dabf --- /dev/null +++ b/tests/testthat/test-XString-class.R @@ -0,0 +1,244 @@ +dnastr <- paste(DNA_ALPHABET, collapse='') +rnastr <- paste(RNA_ALPHABET, collapse='') +aastr <- paste(AA_ALPHABET, collapse='') +bstr <- rawToChar(as.raw(32:126)) + +test_allstrings_for_error <- function(input, exp_error, ignore=''){ + if(!"DNA"%in%ignore) expect_error(DNAString(input), exp_error) + if(!"RNA"%in%ignore) expect_error(RNAString(input), exp_error) + if(!"AA"%in%ignore) expect_error(AAString(input), exp_error) + if(!"B"%in%ignore) expect_error(BString(input), exp_error) +} + +test_that("seqtype correctly infers types", { + expect_equal(seqtype(BString("ABC")), "B") + expect_equal(seqtype(RNAString("AUG")), "RNA") + expect_equal(seqtype(DNAString("ATG")), "DNA") + expect_equal(seqtype(AAString("ARN")), "AA") +}) + + +test_that("character, vector conversion works properly", { + expect_equal(as.character(DNAString(dnastr)), dnastr) + expect_equal(as.character(RNAString(rnastr)), rnastr) + expect_equal(as.character(AAString(aastr)), aastr) + expect_equal(as.character(BString(bstr)), bstr) +}) + +test_that("encode/decode tables work correctly", { + expect_equal(DNAString(tolower(dnastr)), DNAString(dnastr)) + expect_equal(RNAString(tolower(rnastr)), RNAString(rnastr)) + expect_equal(AAString(tolower(aastr)), AAString(aastr)) + + expect_equal(DNAString(as.factor(dnastr)), DNAString(dnastr)) + expect_equal(RNAString(as.factor(rnastr)), RNAString(rnastr)) + expect_equal(AAString(as.factor(aastr)), AAString(aastr)) + expect_equal(BString(as.factor(bstr)), BString(bstr)) + + test_allstrings_for_error(bstr, "not in lookup table", ignore="B") +}) + +test_that("'letter' correctly extracts elements", { + expect_equal(letter(DNAString("ATGCATGC"), c(1,3,6)), "AGT") + expect_equal(letter(RNAString("AUGCAUGC"), c(1,3,6)), "AGU") + expect_equal( letter(AAString("ARNDARND"), c(1,3,6)), "ANR") + expect_equal( letter(BString("ABCDEFGH"), c(1,3,6)), "ACF") + expect_equal(letter(bstr, seq(nchar(bstr), 1)), rawToChar(rev(charToRaw(bstr)))) + + ## TODO: Better error messages + expect_error(letter(DNAString(""), 10), "out of bounds") + expect_error(letter(RNAString(""), 10), "out of bounds") + expect_error(letter(AAString(""), 10), "out of bounds") + expect_error(letter(BString(""), 10), "out of bounds") + expect_error(letter("", 10), "out of bounds") +}) + +test_that("constructors handle invalid and non-char input correctly", { + # note that this only displays for character input + # meaning DNAString(1) returns a non-informative error + test_allstrings_for_error(NA_character_, "input must be a single non-NA string") + test_allstrings_for_error(as.factor(1:4), "input must be a single non-NA string") + test_allstrings_for_error(c("A", "A"), "input must be a single non-NA string") +}) + +test_that("conversion between XStrings works properly", { + # DNA <-> RNA + expect_equal(RNAString(DNAString(dnastr)), RNAString(rnastr)) + expect_equal(DNAString(RNAString(rnastr)), DNAString(dnastr)) + + # X -> B + expect_equal(BString(DNAString(dnastr)), BString(dnastr)) + expect_equal(BString(RNAString(rnastr)), BString(rnastr)) + expect_equal(BString(AAString(aastr)), BString(aastr)) + + # valid B -> X + expect_equal(DNAString(BString(dnastr)), DNAString(dnastr)) + expect_equal(RNAString(BString(rnastr)), RNAString(rnastr)) + expect_equal(AAString(BString(aastr)), AAString(aastr)) + + # invalid DNA,RNA <-> AA + expect_error(AAString(DNAString("ATGC")), "incompatible sequence types") + expect_error(AAString(RNAString("AUGC")), "incompatible sequence types") + expect_error(DNAString(AAString("ARND")), "incompatible sequence types") + expect_error(RNAString(AAString("ARND")), "incompatible sequence types") + + # invalid B -> X + bbad <- BString(";") + test_allstrings_for_error(bbad, 'not in lookup table', ignore='B') +}) + +test_that("as.vector methods work correctly", { + ## TODO: sometimes returns factor, sometimes character + ## note the BString case as an example + dnafac <- factor(seq_len(nchar(dnastr))) + attr(dnafac, 'levels') <- strsplit(dnastr, '')[[1]] + + rnafac <- factor(seq_len(nchar(rnastr))) + attr(rnafac, 'levels') <- strsplit(rnastr, '')[[1]] + + aafac <- factor(seq_len(nchar(aastr))) + attr(aafac, 'levels') <- strsplit(aastr, '')[[1]] + + bfac <- factor(seq_len(nchar(bstr))) + attr(bfac, 'levels') <- strsplit(bstr, '')[[1]] + + expect_equal(as.vector(DNAString(dnastr)), dnafac) + expect_equal(as.vector(RNAString(rnastr)), rnafac) + expect_equal(as.vector(AAString(aastr)), aafac) + #expect_equal(as.vector(BString(bstr)), bfac) +}) + +test_that("equality methods work as advertised", { + expect_true(DNAString(dnastr) == DNAString(dnastr)) + expect_true(RNAString(rnastr) == RNAString(rnastr)) + expect_true(AAString(aastr) == AAString(aastr)) + expect_true(BString(bstr) == BString(bstr)) + + expect_false(DNAString(dnastr) == DNAString(substr(dnastr, 1, 7))) + expect_false(RNAString(rnastr) == RNAString(substr(rnastr, 1, 7))) + expect_false(AAString(aastr) == AAString(substr(aastr, 1, 7))) + expect_false(BString(bstr) == BString(substr(bstr, 1, 7))) + + expect_true(DNAString(dnastr) != DNAString(substr(dnastr, 1, 7))) + expect_true(RNAString(rnastr) != RNAString(substr(rnastr, 1, 7))) + expect_true(AAString(aastr) != AAString(substr(aastr, 1, 7))) + expect_true(BString(bstr) != BString(substr(bstr, 1, 7))) + + + ## DNA <-> RNA comparison + expect_true(DNAString(dnastr) == RNAString(rnastr)) + + ## other B comparisons + expect_true(AAString(aastr) == BString(aastr)) + expect_true(AAString(aastr) != BString(bstr)) + expect_true(bstr == BString(bstr)) + + ## invalid comparisons + expect_error(DNAString(dnastr) == AAString(aastr), 'comparison between a "DNAString" instance and a "AAString" instance is not supported') + expect_error(DNAString(dnastr) == BString(bstr), 'comparison between a "DNAString" instance and a "BString" instance is not supported') + expect_error(RNAString(rnastr) == AAString(aastr), 'comparison between a "RNAString" instance and a "AAString" instance is not supported') + expect_error(RNAString(rnastr) == BString(bstr), 'comparison between a "RNAString" instance and a "BString" instance is not supported') +}) + +test_that("output works correctly", { + s <- paste(rep("A", 100L), collapse='') + expect_output(show(DNAString(s)), "100-letter DNAString object\\nseq: .+\\.\\.\\..+$", width=80) + expect_output(show(RNAString(s)), "100-letter RNAString object\\nseq: .+\\.\\.\\..+$", width=80) + expect_output(show(AAString(s)), "100-letter AAString object\\nseq: .+\\.\\.\\..+$", width=80) + expect_output(show(BString(s)), "100-letter BString object\\nseq: A{36}\\.\\.\\.A{36}$", width=80) + + # width of sequence is 5 less than that of 'width' + expect_output(show(DNAString(s)), "100-letter DNAString object\\nseq: .+\\.\\.\\..+$", width=10) + expect_output(show(RNAString(s)), "100-letter RNAString object\\nseq: .+\\.\\.\\..+$", width=10) + expect_output(show(AAString(s)), "100-letter AAString object\\nseq: .+\\.\\.\\..+$", width=10) + expect_output(show(BString(s)), "100-letter BString object\\nseq: AA\\.\\.\\.AA$", width=10) +}) + +test_that("substr, substring methods work correctly", { + d <- DNAString(dnastr) + r <- RNAString(rnastr) + a <- AAString(aastr) + b <- BString(bstr) + + expect_equal(as.character(substr(d, 1, 10)), substr(dnastr, 1, 10)) + expect_equal(as.character(substr(r, 1, 10)), substr(rnastr, 1, 10)) + expect_equal(as.character(substr(a, 1, 10)), substr(aastr, 1, 10)) + expect_equal(as.character(substr(b, 1, 10)), substr(bstr, 1, 10)) + + expect_equal(as.character(substring(d, 5, 10)), substring(dnastr, 5, 10)) + expect_equal(as.character(substring(r, 5, 10)), substring(rnastr, 5, 10)) + expect_equal(as.character(substring(a, 5, 10)), substring(aastr, 5, 10)) + expect_equal(as.character(substring(b, 5, 10)), substring(bstr, 5, 10)) + + expect_error(substring(d, 10, 5), "Invalid sequence coordinates") + expect_error(substring(r, 10, 5), "Invalid sequence coordinates") + expect_error(substring(a, 10, 5), "Invalid sequence coordinates") + expect_error(substring(b, 10, 5), "Invalid sequence coordinates") + + # `[` dispatch + expect_equal(as.character(d[1:10]), substr(dnastr, 1, 10)) + expect_equal(as.character(d[-1]), substr(dnastr, 2, nchar(dnastr))) +}) + +test_that("reverse, complement, reverseComplement work correctly", { + ## reverse tests + .revString <- function(s) paste(strsplit(s, '')[[1]][seq(nchar(s), 1L)], collapse='') + dna <- DNAString(dnastr) + rna <- RNAString(rnastr) + aaa <- AAString(aastr) + bbb <- BString(bstr) + d_comp <- "TGCAKYWSRMBDHVN-+." + r_comp <- "UGCAKYWSRMBDHVN-+." + + ## example Views on strings + d_v <- Views(dna, start=rep(1L,3L), end=rep(nchar(dnastr),3L)) + mdna <- dna + mrna <- rna + m1 <- Mask(nchar(dnastr), start=c(2,7), end=c(3,10)) + masks(mdna) <- masks(mrna) <- m1 + md_comp <- strsplit(d_comp, '')[[1]] + mr_comp <- strsplit(r_comp, '')[[1]] + md_comp[c(2:3,7:10)] <- mr_comp[c(2:3,7:10)] <- "#" + md_comp <- paste(md_comp, collapse='') + mr_comp <- paste(mr_comp, collapse='') + + + ## reverse method + expect_equal(reverse(dnastr), .revString(dnastr)) + expect_equal(as.character(reverse(dna)), .revString(dnastr)) + expect_equal(as.character(reverse(rna)), .revString(rnastr)) + expect_equal(as.character(reverse(aaa)), .revString(aastr)) + expect_equal(as.character(reverse(bbb)), .revString(bstr)) + expect_equal(as.character(reverse(mdna)), .revString(as.character(mdna))) + expect_equal(as.character(reverse(mrna)), .revString(as.character(mrna))) + + ## complement method + expect_equal(as.character(complement(dna)), d_comp) + expect_equal(as.character(complement(rna)), r_comp) + expect_error(complement(AAString()), "unable to find an inherited method") + expect_error(complement(BString()), "unable to find an inherited method") + expect_equal(as.character(complement(d_v)), rep(d_comp, 3L)) + expect_equal(as.character(complement(mdna)), md_comp) + expect_equal(as.character(complement(mrna)), mr_comp) + + ## reverseComplement method + expect_equal(as.character(reverseComplement(dna)), .revString(d_comp)) + expect_equal(as.character(reverseComplement(rna)), .revString(r_comp)) + expect_error(reverseComplement(AAString()), "unable to find an inherited method") + expect_error(reverseComplement(BString()), "unable to find an inherited method") + expect_equal(as.character(reverseComplement(d_v)), rep(.revString(d_comp), 3L)) + expect_equal(as.character(reverseComplement(mdna)), .revString(md_comp)) + expect_equal(as.character(reverseComplement(mrna)), .revString(mr_comp)) +}) + +## Porting RUnit tests +test_that("alphabet finds the correct values", { + expect_equal(alphabet(DNAString(dnastr)), strsplit(dnastr, '')[[1]]) + expect_equal(alphabet(RNAString(rnastr)), strsplit(rnastr, '')[[1]]) + expect_equal(alphabet(AAString(aastr)), strsplit(aastr, '')[[1]]) + expect_equal(alphabet(BString(bstr)), NULL) + + expect_equal(alphabet(DNAString(), baseOnly=TRUE), DNA_BASES) + expect_equal(alphabet(RNAString(), baseOnly=TRUE), RNA_BASES) + expect_error(alphabet(DNAString(), baseOnly=1), "'baseOnly' must be TRUE or FALSE") +}) diff --git a/tests/testthat/test-XStringQuality.R b/tests/testthat/test-XStringQuality.R new file mode 100644 index 00000000..bb13af1c --- /dev/null +++ b/tests/testthat/test-XStringQuality.R @@ -0,0 +1,77 @@ +## XStringQuality-class.R exports the following: +## - PhredQuality +## - SolexaQuality +## - IlluminaQuality +## - offset +## - alphabet +## - encoding +## - coercion methods + +## Phred: 0-99 quality as (ASCII)-33 (! = 0, note that this can't support >=95) +## Solexa: -5-99 quality as (ASCII)-64 (; = 0, no valid characters above 63) +## Illumina: 0-99 quality as (ASCII)-64 (@ = 0, no valid characters above 63) + +test_that("Quality constructors work correctly", { + expect_s4_class(PhredQuality(seq(0,99)), "PhredQuality") + expect_s4_class(SolexaQuality(seq(-5,99)), "SolexaQuality") + expect_s4_class(IlluminaQuality(0:99), "IlluminaQuality") + p <- PhredQuality(seq(0,99)) + s <- SolexaQuality(seq(-5,99)) + i <- IlluminaQuality(0:99) + + ## note that PhredQuality cannot represent scores greater than 93 + ## (and it auto-rounds all these values down) + ## all other qualities can support the values in their range + complete_ascii <- strsplit(rawToChar(as.raw(33:126)), '')[[1]] + + ## check alphabets + expect_equal(alphabet(p), complete_ascii[seq_len(94)]) + expect_equal(alphabet(s)[seq_len(68)], complete_ascii[seq(27,94)]) + expect_equal(alphabet(i)[seq_len(63)], complete_ascii[seq(32,94)]) + + ## check encodings + mapping <- seq(0,131) + names(mapping) <- complete_ascii + expect_equal(encoding(p), mapping[seq_len(94)]) + expect_equal(encoding(s), mapping[seq(27,131)] - 31L) + expect_equal(encoding(i), mapping[seq(32,131)] - 31L) + + ## check probability conversion + prob_conversions <- c("numeric", "NumericList") + + ## probability definitions taken from the man pages, should match + phred_scores <- c(0:93, rep(93,6L)) + solexa_scores <- seq(-5L, 99L) + illumina_scores <- seq(0L, 99L) + + phred_probs <- 10^(-0.1 * phred_scores) + solexa_probs <- 10^(-0.1*solexa_scores) / (1 + 10^(-0.1*solexa_scores)) + illumina_probs <- 10^(-0.1 * illumina_scores) + expect_equal(as.numeric(p), phred_probs) + expect_equal(as.numeric(s), solexa_probs) + expect_equal(as.numeric(i), illumina_probs) + expect_equal(as.numeric(as(p, "NumericList")[[1]]), phred_probs) + expect_equal(as.numeric(as(s, "NumericList")[[1]]), solexa_probs) + expect_equal(as.numeric(as(i, "NumericList")[[1]]), illumina_probs) + + ## score conversions + score_conversions <- c("integer", "matrix", "IntegerList") + expect_equal(as.integer(p), phred_scores) + expect_equal(as.integer(s), solexa_scores) + expect_equal(as.integer(i), illumina_scores) + expect_equal(as(p,'matrix')[1,], phred_scores) + expect_equal(as(s,'matrix')[1,], solexa_scores) + expect_equal(as(i,'matrix')[1,], illumina_scores) + expect_equal(as.integer(as(p,'IntegerList')[[1]]), phred_scores) + expect_equal(as.integer(as(s,'IntegerList')[[1]]), solexa_scores) + expect_equal(as.integer(as(i,'IntegerList')[[1]]), illumina_scores) + + ## BString, character conversions + b <- BString(paste(LETTERS, collapse='')) + expect_equal(as.character(PhredQuality(b)), as.character(b)) + expect_equal(as.character(SolexaQuality(b)), as.character(b)) + expect_equal(as.character(IlluminaQuality(b)), as.character(b)) + expect_equal(as.character(PhredQuality(as.character(b))), as.character(b)) + expect_equal(as.character(SolexaQuality(as.character(b))), as.character(b)) + expect_equal(as.character(IlluminaQuality(as.character(b))), as.character(b)) +}) \ No newline at end of file diff --git a/tests/testthat/test-XStringSet-class.R b/tests/testthat/test-XStringSet-class.R new file mode 100644 index 00000000..276012f8 --- /dev/null +++ b/tests/testthat/test-XStringSet-class.R @@ -0,0 +1,275 @@ +dnastr <- paste(DNA_ALPHABET, collapse='') +rnastr <- paste(RNA_ALPHABET, collapse='') +aastr <- paste(AA_ALPHABET, collapse='') +bstr <- rawToChar(as.raw(32:126)) + +d <- DNAStringSet(dnastr) +r <- RNAStringSet(rnastr) +a <- AAStringSet(aastr) +b <- BStringSet(bstr) + +## testing functions from old testing files +### '.eltAddresses(x)' collects the addresses of the elements in 'x' (in +### practice 'x' will be a list of external pointers or environments). +.eltAddresses <- function(x) sapply(x, XVector:::address) + +### 'x' and 'y' must be XVectorList vectors. +.haveIdenticalPools <- function(x, y) + identical(.eltAddresses(x@pool@xp_list), .eltAddresses(y@pool@xp_list)) + +### 'x' must be an XVectorList vector. +.poolEltLengths <- function(x) +{ + pool_len <- length(x@pool) + if (pool_len == 0L) + return(integer(0)) + sapply(seq_len(pool_len), function(i) length(x@pool[[i]])) +} + +test_that("seqtype correctly infers types", { + expect_equal(seqtype(d), "DNA") + expect_equal(seqtype(r), "RNA") + expect_equal(seqtype(a), "AA") + expect_equal(seqtype(b), "B") + + expect_equal(seqtype(d[[1]]), "DNA") + expect_equal(seqtype(r[[1]]), "RNA") + expect_equal(seqtype(a[[1]]), "AA") + expect_equal(seqtype(b[[1]]), "B") +}) + +test_that("seqtype conversion works as expected", { + ## conversion between RNA and DNA + expect_equal({x <- d; seqtype(x) <- "RNA"; x}, r) + expect_equal({x <- r; seqtype(x) <- "DNA"; x}, d) + + ## conversion to BStringSets + expect_equal({x <- d; seqtype(x) <- "B"; x}, BStringSet(dnastr)) + expect_equal({x <- r; seqtype(x) <- "B"; x}, BStringSet(rnastr)) + expect_equal({x <- a; seqtype(x) <- "B"; x}, BStringSet(aastr)) + + expect_equal({x <- BStringSet(dnastr); seqtype(x) <- "DNA"; x}, d) + expect_equal({x <- BStringSet(rnastr); seqtype(x) <- "RNA"; x}, r) + expect_equal({x <- BStringSet(aastr); seqtype(x) <- "AA"; x}, a) + + ## invalid conversions + expect_error(seqtype(d) <- "AA", "incompatible sequence types") + expect_error(seqtype(r) <- "AA", "incompatible sequence types") + expect_error(seqtype(a) <- "DNA", "incompatible sequence types") + expect_error(seqtype(a) <- "RNA", "incompatible sequence types") + expect_error(seqtype(b) <- "AA", "not in lookup table") + expect_error(seqtype(b) <- "DNA", "not in lookup table") + expect_error(seqtype(b) <- "RNA", "not in lookup table") +}) + +test_that("unlisting works", { + expect_equal(unlist(d), d[[1]]) + expect_equal(unlist(r), r[[1]]) + expect_equal(unlist(a), a[[1]]) + expect_equal(unlist(b), b[[1]]) +}) + +test_that("width and nchar are correct", { + expect_equal(width(d), nchar(dnastr)) + expect_equal(width(r), nchar(rnastr)) + expect_equal(width(a), nchar(aastr)) + expect_equal(width(b), nchar(bstr)) + + expect_equal(nchar(d), nchar(dnastr)) + expect_equal(nchar(r), nchar(rnastr)) + expect_equal(nchar(a), nchar(aastr)) + expect_equal(nchar(b), nchar(bstr)) + + expect_error(width(NA_character_), "NAs in 'x' are not supported") +}) + +test_that("concatenation and character/vector conversion are correct", { + expect_s4_class(c(d,d), "DNAStringSet") + expect_s4_class(c(r,r), "RNAStringSet") + expect_s4_class(c(a,a), "AAStringSet") + expect_s4_class(c(b,b), "BStringSet") + + dd <- c(d,d) + rr <- c(r,r) + aa <- c(a,a) + bb <- c(b,b) + + expect_equal(dd == DNAStringSet(c(dnastr, dnastr)), c(TRUE, TRUE)) + expect_equal(rr == RNAStringSet(c(rnastr, rnastr)), c(TRUE, TRUE)) + expect_equal(aa == AAStringSet(c(aastr, aastr)), c(TRUE, TRUE)) + expect_equal(bb == BStringSet(c(bstr, bstr)), c(TRUE, TRUE)) + + expect_equal(as.character(dd), c(dnastr, dnastr)) + expect_equal(as.character(rr), c(rnastr, rnastr)) + expect_equal(as.character(aa), c(aastr, aastr)) + expect_equal(as.character(bb), c(bstr, bstr)) + + expect_equal(as.vector(dd), c(dnastr, dnastr)) + expect_equal(as.vector(rr), c(rnastr, rnastr)) + expect_equal(as.vector(aa), c(aastr, aastr)) + expect_equal(as.vector(bb), c(bstr, bstr)) +}) + +test_that("matrix conversion is correct", { + dd <- c(d,d) + rr <- c(r,r) + aa <- c(a,a) + bb <- c(b,b) + + md <- do.call(rbind, rep(strsplit(dnastr, ''), 2)) + mr <- do.call(rbind, rep(strsplit(rnastr, ''), 2)) + ma <- do.call(rbind, rep(strsplit(aastr, ''), 2)) + mb <- do.call(rbind, rep(strsplit(bstr, ''), 2)) + + + expect_equal(as.matrix(dd), md) + expect_equal(as.matrix(rr), mr) + expect_equal(as.matrix(aa), ma) + expect_equal(as.matrix(bb), mb) + + md <- as.matrix(DNAStringSet('')) + mr <- as.matrix(RNAStringSet('')) + ma <- as.matrix(AAStringSet('')) + mb <- as.matrix(BStringSet('')) + + m_base <- matrix('', nrow=1, ncol=0) + expect_equal(md, m_base) + expect_equal(mr, m_base) + expect_equal(ma, m_base) + expect_equal(mb, m_base) +}) + +test_that("factor conversion is correct", { + expect_equal(as.factor(d), as.factor(dnastr)) + expect_equal(as.factor(r), as.factor(rnastr)) + expect_equal(as.factor(a), as.factor(aastr)) + expect_equal(as.factor(b), as.factor(bstr)) +}) + +test_that("data.frame conversion is correct", { + expect_equal(as.data.frame(c(d,d)), data.frame(x=c(dnastr, dnastr))) + expect_equal(as.data.frame(c(r,r)), data.frame(x=c(rnastr, rnastr))) + expect_equal(as.data.frame(c(a,a)), data.frame(x=c(aastr, aastr))) + expect_equal(as.data.frame(c(b,b)), data.frame(x=c(bstr, bstr))) +}) + +test_that("toString conversion is correct", { + expect_equal(toString(c(d,d)), paste(dnastr, dnastr, sep=', ')) + expect_equal(toString(c(r,r)), paste(rnastr, rnastr, sep=', ')) + expect_equal(toString(c(a,a)), paste(aastr, aastr, sep=', ')) + expect_equal(toString(c(b,b)), paste(bstr, bstr, sep=', ')) +}) + +test_that("show methods output correctly", { + expect_output(show(c(d,d)), "^DNAStringSet object of length 2:\\n") + expect_output(show(c(r,r)), "^RNAStringSet object of length 2:\\n") + expect_output(show(c(a,a)), "^AAStringSet object of length 2:\\n") + expect_output(show(c(b,b)), "^BStringSet object of length 2:\\n") +}) + +test_that("showAsCell works correctly", { + d <- DNAStringSet(paste(rep(dnastr, 10), collapse='')) + r <- RNAStringSet(paste(rep(rnastr, 10), collapse='')) + a <- AAStringSet(paste(rep(aastr, 10), collapse='')) + + expect_equal(nchar(showAsCell(d)), 23L) + expect_equal(nchar(showAsCell(r)), 23L) + expect_equal(nchar(showAsCell(a)), 23L) + expect_equal(nchar(showAsCell(b)), 23L) +}) + +test_that("comparison operators are correct", { + dna <- DNAStringSet(DNA_ALPHABET) + rna <- RNAStringSet(RNA_ALPHABET) + aaa <- AAStringSet(AA_ALPHABET) + bbb <- BStringSet(LETTERS) + + expect_true(!any(is.na(dna))) + expect_true(!anyNA(dna)) + expect_equal(match(dna, dna), seq_along(dna)) + expect_equal(aaa[seq_len(26)] < bbb, AA_ALPHABET[seq_len(26L)] < LETTERS) + + expect_equal(match(sort(aaa), bbb, nomatch=0), c(rep(0L, 4L), seq_len(26L))) + expect_true(all(dna == as.character(dna))) + + expect_error(aaa == dna, "is not supported") + expect_error(aaa == rna, "is not supported") + expect_true(all(dna == BStringSet(DNA_ALPHABET))) + expect_true(all(rna == BStringSet(RNA_ALPHABET))) + expect_true(all(aaa == BStringSet(AA_ALPHABET))) + expect_equal(dna == NULL, logical(0L)) +}) + +## Porting RUnit tests +test_that("short RUnit tests continue to pass", { + ## test_width_character + x <- safeExplode(rawToChar(as.raw(1:255))) + expect_equal(width(x), rep.int(1L, 255)) + + ## DNAStringSet internal elements ## + dna <- DNAStringSet(DNA_ALPHABET) + + ## DNAStringSet_constructor + expect_equal(.poolEltLengths(dna), length(DNA_ALPHABET)) + + ## DNAStringSet_width + expect_equal(width(dna), width(DNA_ALPHABET)) + + ## DNAStringSet_unlist + expect_equal(as.character(unlist(dna)), dnastr) + + ## DNAStringSet_showAsCell + expect_equal(showAsCell(DNAStringSet()), character(0L)) + expect_equal(showAsCell(dna), DNA_ALPHABET) +}) + +test_that("RUnit test_DNAStringSet_subsetting", { + dna <- DNAStringSet(DNA_ALPHABET) + elementMetadata(dna) <- DataFrame(C1=dna) + + dna0 <- dna[FALSE] + expect_equal(length(dna0), 0L) + ## Checking internal representation. + expect_equal(.poolEltLengths(dna0), integer(0)) + expect_true(.haveIdenticalPools(elementMetadata(dna0)$C1, dna0)) + expect_equal(elementMetadata(dna0)$C1@ranges, dna0@ranges) + + idx <- rep.int((8:6)*2L, 100L) + dna300 <- dna[idx] + expect_equal(length(dna300), length(idx)) + ## Checking internal representation. + expect_true(.haveIdenticalPools(dna300, dna)) + expect_true(.haveIdenticalPools(elementMetadata(dna300)$C1, dna300)) + expect_equal(elementMetadata(dna300)$C1@ranges, dna300@ranges) +}) + +test_that("RUnit test_DNAStringSet_combining", { + dna <- DNAStringSet(DNA_ALPHABET) + elementMetadata(dna) <- DataFrame(C1=dna) + + dna2a <- c(dna, dna) + dna2b <- rep(dna, 2L) + expect_equal(dna2a, dna2b) + + ## Checking internal representation. + expect_true(.haveIdenticalPools(dna2a, dna)) + expect_true(.haveIdenticalPools(dna2a, dna2b)) + expect_true(.haveIdenticalPools(elementMetadata(dna2a)$C1, dna2a)) + expect_equal(elementMetadata(dna2a)$C1@ranges, dna2a@ranges) +}) + +test_that("RUnit test_DNAStringSet_compaction", { + dna <- DNAStringSet(DNA_ALPHABET) + elementMetadata(dna) <- DataFrame(C1=dna) + + idx <- rep.int((8:6)*2L, 100L) + dna300 <- dna[idx] + compact_dna300 <- compact(dna300) + expect_equal(as.character(compact_dna300), as.character(dna300)) + ## Checking internal representation. + expect_equal(.poolEltLengths(compact_dna300), 3L) + expect_equal(.poolEltLengths(elementMetadata(compact_dna300)$C1), + .poolEltLengths(compact_dna300)) + expect_equal(elementMetadata(compact_dna300)$C1@ranges, + compact_dna300@ranges) +}) \ No newline at end of file diff --git a/tests/testthat/test-XStringSet-io.R b/tests/testthat/test-XStringSet-io.R new file mode 100644 index 00000000..ae4fcf63 --- /dev/null +++ b/tests/testthat/test-XStringSet-io.R @@ -0,0 +1,172 @@ +## TODO: Add in I/O tests for FASTQ files +# (will revisit after doing QualityScaledXStringSet testing) + +check_rw_strings_fasta <- function(XStringSetRFUN, XStringSetMAKEFUN, X_ALPH){ + # setup + set.seed(123L) + NUM_STRINGS <- 10L + seq_lens <- sample(100L, NUM_STRINGS, replace=TRUE) + f <- tempfile() + if(file.exists(f)) unlink(f) + + # make some test strings + strings <- character(NUM_STRINGS) + for(i in seq_len(NUM_STRINGS)) + strings[i] <- paste(sample(X_ALPH, seq_lens[i], replace=TRUE), collapse='') + + # make an XStringSet and name it + ss <- XStringSetMAKEFUN(strings) + names(ss) <- as.character(seq_len(NUM_STRINGS)) + + # write, then ensure it wrote + writeXStringSet(ss, f, format="fasta") + expect_true(file.exists(f)) + + # check that seq lengths in the fasta file are correct + ssn <- fasta.seqlengths(f, seqtype=seqtype(ss), use.names=TRUE) + expected_lengths <- lengths(ss) + names(expected_lengths) <- names(ss) + expect_equal(ssn, expected_lengths) + + # check that optional parameters work + expect_equal(fasta.seqlengths(f, seqtype=seqtype(ss), nrec=2, use.names=FALSE), seq_lens[seq_len(2)]) + expect_equal(fasta.seqlengths(f, seqtype=seqtype(ss), skip=2, nrec=3, use.names=FALSE), seq_lens[3:5]) + + # check indexing + indices <- fasta.index(f, seqtype=seqtype(ss)) + indices_sub <- fasta.index(f, skip=2L, nrec=3L, seqtype=seqtype(ss)) + + expect_s3_class(indices, "data.frame") + expect_equal(colnames(indices), c("recno", "fileno", "offset", "desc", "seqlength", "filepath")) + expect_true(all(indices$filepath == f)) + expect_true(all(indices$recno == names(ss))) + expect_equal(indices$seqlength, seq_lens) + expect_equal(nrow(indices_sub), 3L) + + # rownames will screw this up + rownames(indices_sub) <- 3:5 + expect_equal(indices[3:5,], indices_sub) + + # check that the actual values are correct + ss_read <- XStringSetRFUN(f, format="fasta") + ss_read_sub <- XStringSetRFUN(f, nrec=3, skip=2) + + # double checking with equals.XStringSet and standard character + expect_equal(class(ss_read), class(ss)) + expect_true(all(ss_read == ss)) + expect_equal(as.character(ss_read), as.character(ss)) + expect_true(all(ss_read_sub == ss[3:5])) + expect_equal(as.character(ss_read_sub), as.character(ss[3:5])) + + expect_equal(names(ss_read), names(ss)) + expect_equal(names(ss_read_sub), names(ss)[3:5]) + + # check that we can append to file + writeXStringSet(ss, f, format="fasta", append=TRUE) + ss_read <- XStringSetRFUN(f, format="fasta") + expect_true(all(ss_read == c(ss, ss))) + expect_equal(as.character(ss_read), as.character(c(ss, ss))) + + unlink(f) +} + +check_rw_strings_serial <- function(XStringSetMAKEFUN, X_ALPH){ + # setup + set.seed(345L) + NUM_STRINGS <- 10L + seq_lens <- sample(100L, NUM_STRINGS, replace=TRUE) + d <- tempdir() + f <- "example_xs" + outfile <- file.path(d, paste0(f, '.rda')) + if(file.exists(outfile)) unlink(outfile) + + # make some test strings + strings <- character(NUM_STRINGS) + for(i in seq_len(NUM_STRINGS)) + strings[i] <- paste(sample(X_ALPH, seq_lens[i], replace=TRUE), collapse='') + + ss <- XStringSetMAKEFUN(strings) + + expect_output(saveXStringSet(ss, f, dirpath=d, save.dups=FALSE), "Saving .+/example_xs.rda") + expect_true(file.exists(outfile)) + + if("example_xs" %in% ls()) rm("example_xs") + load(outfile) + expect_equal(as.character(example_xs), as.character(ss)) + expect_true(all(example_xs == ss)) + unlink(outfile) + + # unsure how this is supposed to work + # expect_output(saveXStringSet(XStringSetMAKEFUN(rep(strings, 2)), f, dirpath=d, save.dups=TRUE), "Saving .+/example_xs.rda") + + ## testing standard save() + old_ss <- ss + save(ss, file=outfile) + expect_true(file.exists(outfile)) + rm(ss) + load(outfile) + expect_equal(as.character(ss), as.character(old_ss)) + expect_true(all(ss == old_ss)) + unlink(outfile) +} + +test_that("All XStringSets can be read/written to a FASTA", { + B_ALPHABET <- strsplit(rawToChar(as.raw(32:126)), '')[[1]] + check_rw_strings_fasta(readDNAStringSet, DNAStringSet, DNA_ALPHABET) + check_rw_strings_fasta(readRNAStringSet, RNAStringSet, RNA_ALPHABET) + check_rw_strings_fasta(readAAStringSet, AAStringSet, AA_ALPHABET) + check_rw_strings_fasta(readBStringSet, BStringSet, B_ALPHABET) +}) + +test_that("All XStringSets can be serialized and subseqeuntly read", { + B_ALPHABET <- strsplit(rawToChar(as.raw(32:126)), '')[[1]] + check_rw_strings_serial(DNAStringSet, DNA_ALPHABET) + check_rw_strings_serial(RNAStringSet, RNA_ALPHABET) + check_rw_strings_serial(AAStringSet, AA_ALPHABET) + check_rw_strings_serial(BStringSet, B_ALPHABET) +}) + +## Previous RUnit testing +test_that("XStringSet read/write is correct", { + ## setup initial values + tmpdir <- tempdir() + tmpfilefa <- file.path(tmpdir,"whee.fa") + tmpfilefq <- file.path(tmpdir,"whee.fq") + if(file.exists(tmpfilefa)) unlink(tmpfilefa) + if(file.exists(tmpfilefq)) unlink(tmpfilefq) + + + x1 <- DNAStringSet(c("TTGA", "CTCN")) + q1 <- PhredQuality(c("*+,-", "6789")) + qdna1 <- QualityScaledDNAStringSet(x1, q1) + names_dna <- c("A", "B") + names(qdna1) <- names_dna + + names(x1) <- names_dna + writeXStringSet(x1, format="fastq", file=tmpfilefq) + expect_warning(qdna2 <- readQualityScaledDNAStringSet(tmpfilefq), + "metadata columns on input DNAStringSet object were dropped") + + expect_true(!all(quality(qdna2) == quality(qdna1))) + expect_true(all(all(as(quality(qdna2),"IntegerList") == 26L))) + + writeXStringSet(x1, format="fastq", file=tmpfilefq, qualities = q1) + expect_warning(qdna2 <- readQualityScaledDNAStringSet(tmpfilefq), + "metadata columns on input DNAStringSet object were dropped") + expect_true(all(quality(qdna2) == quality(qdna1))) + + writeXStringSet(qdna1, format="fastq", file=tmpfilefq) + expect_warning(qdna2 <- readQualityScaledDNAStringSet(tmpfilefq), + "metadata columns on input DNAStringSet object were dropped") + expect_true(all(names(qdna2) == names_dna)) + expect_true(all(quality(qdna2) == quality(qdna1))) + + qdna2 <- readDNAStringSet(tmpfilefq, format="fastq") + expect_true(is.null(mcols(qdna2))) + + qdna2 <- readDNAStringSet(tmpfilefq, format="fastq", with.qualities = TRUE) + expect_true(all(mcols(qdna2)$qualities == quality(qdna1))) + + unlink(tmpfilefa) + unlink(tmpfilefq) +}) diff --git a/tests/testthat/test-XStringSetList-class.R b/tests/testthat/test-XStringSetList-class.R new file mode 100644 index 00000000..cc9cbdd8 --- /dev/null +++ b/tests/testthat/test-XStringSetList-class.R @@ -0,0 +1,198 @@ +## WARNING: Do *NOT* use checkIdentical() on XString, XStringSet, or +## XStringSetList objects. It is *NOT* reliable. Use checkTrue(all.equal()) +## instead. + +## TODO: Maybe "all.equal" should be made an S4 generic with S4/S3 method +## combos for XVector and XVectorList object? + +B_ALPHABET <- strsplit(rawToChar(as.raw(32:126)), '')[[1]] + +## Unclass the XStringSet object by setting its "pool" and "ranges" slots +## to NULL first. +.unclass_XStringSet <- function(x) +{ + slot(x, "pool", check=FALSE) <- NULL + slot(x, "ranges", check=FALSE) <- NULL + unclass(x) +} + +all.equal.XStringSet <- function(target, current, ...) +{ + ## Compare the sequences (and their names if they have). + target_seqs <- as.character(target) + current_seqs <- as.character(current) + ok1 <- all.equal(target_seqs, current_seqs) + ## Compare the rest. + + ## this was previously commented out, but works with testthat + target <- .unclass_XStringSet(target) + current <- .unclass_XStringSet(current) + ok2 <- all.equal(target, current) + ## + + ok2 <- identical(class(target), class(current)) + if (!ok2) + ok2 <- "class mismatch" + ok3 <- all.equal(target@metadata, current@metadata) + ok4 <- all.equal(target@elementMetadata, current@elementMetadata) + ans <- character(0) + if (!isTRUE(ok1)) + ans <- c(ans, ok1) + if (!isTRUE(ok2)) + ans <- c(ans, ok2) + if (!isTRUE(ok3)) + ans <- c(ans, ok3) + if (!isTRUE(ok4)) + ans <- c(ans, ok4) + if (length(ans) == 0L) + return(TRUE) + ans +} + +all.equal.XStringSetList <- function(target, current, ...) +{ + ok1 <- identical(class(target), class(current)) + if (!ok1) + ok1 <- "class mismatch" + target2 <- as(target, "CharacterList") + current2 <- as(current, "CharacterList") + ## Temporary workaround until coercion from XStringSet to CharacterList + ## is fixed to propagate metadata and metadata columns. + metadata(target2) <- metadata(target) + mcols(target2) <- mcols(target) + metadata(current2) <- metadata(current) + mcols(current2) <- mcols(current) + ok2 <- all.equal(target2, current2) + ans <- character(0) + if (!isTRUE(ok1)) + ans <- c(ans, ok1) + if (!isTRUE(ok2)) + ans <- c(ans, ok2) + if (length(ans) == 0L) + return(TRUE) + ans +} + +.XStringSetList_constructor <- function(XStringSetFUN, XStringSetListFUN, XS_ALPHABET){ + xs1 <- XStringSetFUN(XS_ALPHABET[1:8]) + xs2 <- XStringSetFUN(XS_ALPHABET[9:17]) + + lst1 <- XStringSetListFUN(xs1, xs2) + lst2 <- XStringSetListFUN(as.character(XS_ALPHABET[1:8]), + as.character(XS_ALPHABET[9:17])) + + expect_true(all.equal.XStringSetList(lst1, lst2)) + expect_equal(length(XStringSetListFUN()), 0L) +} + +.XStringSetList_unlist <- function(XStringSetFUN, XStringSetListFUN, XS_ALPHABET){ + lst <- XStringSetListFUN(XS_ALPHABET, XS_ALPHABET) + expected <- XStringSetFUN(c(XS_ALPHABET, XS_ALPHABET)) + expect_true(all.equal.XStringSet(unlist(lst), expected)) +} + +.XStringSetList_append <- function(XStringSetFUN, XStringSetListFUN, XS_ALPHABET){ + xs <- XStringSetFUN(XS_ALPHABET) + lst <- XStringSetListFUN(XS_ALPHABET, XS_ALPHABET) + elementMetadata(lst) <- DataFrame(C1=c("list1", "list2")) + + xs2a <- c(lst, lst) + xs2b <- rep(lst, 2L) + xs2c <- append(lst, lst) + expect_true(all.equal.XStringSetList(xs2a, xs2b)) + expect_true(all.equal.XStringSetList(xs2a, xs2c)) +} + +.XStringSetList_subset <- function(XStringSetFUN, XStringSetListFUN, XS_ALPHABET){ + lst1 <- XStringSetListFUN(XS_ALPHABET[-1], XS_ALPHABET[-2], XS_ALPHABET[-3], XS_ALPHABET[-4]) + lst2 <- XStringSetListFUN(XS_ALPHABET[-2], XS_ALPHABET[-1], XS_ALPHABET[-4]) + + expect_true(all.equal.XStringSetList(lst1[1:2], lst2[2:1])) + expect_true(all.equal.XStringSetList(lst1[-c(2:3)], lst2[-1])) +} + +.XStringSetList_from_list <- function(XStringSetFUN, XStringSetListFUN, XS_ALPHABET){ + lst1 <- list(XStringSetFUN(XS_ALPHABET), XStringSetFUN(XS_ALPHABET)) + lst2 <- XStringSetListFUN(XS_ALPHABET, XS_ALPHABET) + + expect_true(all.equal.XStringSetList(XStringSetListFUN(lst1), lst2)) +} + +## DNAStringSet +test_that("XStringSetList constructor works correctly", { + .XStringSetList_constructor(DNAStringSet, DNAStringSetList, DNA_ALPHABET) + .XStringSetList_constructor(RNAStringSet, RNAStringSetList, RNA_ALPHABET) + .XStringSetList_constructor(AAStringSet, AAStringSetList, AA_ALPHABET) + .XStringSetList_constructor(BStringSet, BStringSetList, B_ALPHABET) +}) + +test_that("XStringSetList unlist works correctly", { + .XStringSetList_unlist(DNAStringSet, DNAStringSetList, DNA_ALPHABET) + .XStringSetList_unlist(RNAStringSet, RNAStringSetList, RNA_ALPHABET) + .XStringSetList_unlist(AAStringSet, AAStringSetList, AA_ALPHABET) + .XStringSetList_unlist(BStringSet, BStringSetList, B_ALPHABET) +}) + +test_that("XStringSetList append works correctly", { + .XStringSetList_append(DNAStringSet, DNAStringSetList, DNA_ALPHABET) + .XStringSetList_append(RNAStringSet, RNAStringSetList, RNA_ALPHABET) + .XStringSetList_append(AAStringSet, AAStringSetList, AA_ALPHABET) + .XStringSetList_append(BStringSet, BStringSetList, B_ALPHABET) +}) + +test_that("XStringSetList subset works correctly", { + .XStringSetList_subset(DNAStringSet, DNAStringSetList, DNA_ALPHABET) + .XStringSetList_subset(RNAStringSet, RNAStringSetList, RNA_ALPHABET) + .XStringSetList_subset(AAStringSet, AAStringSetList, AA_ALPHABET) + .XStringSetList_subset(BStringSet, BStringSetList, B_ALPHABET) +}) + +test_that("XStringSetList showAsCell works correctly", { + expect_equal(showAsCell(DNAStringSetList()), character(0L)) + expect_equal(showAsCell(DNAStringSetList(DNA_ALPHABET)), "A,C,G,...") +}) + +test_that("XStringSetList seqtype is correct", { + expect_equal(seqtype(DNAStringSetList()), "DNA") + expect_equal(seqtype(RNAStringSetList()), "RNA") + expect_equal(seqtype(AAStringSetList()), "AA") + expect_equal(seqtype(BStringSetList()), "B") + + # downgrade sequences + x <- BStringSetList(DNA_ALPHABET) + seqtype(x) <- "DNA" + all.equal.XStringSetList(x, DNAStringSetList(DNA_ALPHABET)) + seqtype(x) <- "RNA" + all.equal.XStringSetList(x, RNAStringSetList(RNA_ALPHABET)) + + x <- BStringSetList(AA_ALPHABET) + seqtype(x) <- "AA" + all.equal.XStringSetList(x, AAStringSetList(AA_ALPHABET)) + + # invalid conversions + x <- DNAStringSetList(DNA_ALPHABET) + expect_error({seqtype(x) <- "AA"}, 'incompatible sequence types "DNA" and "AA"') + seqtype(x) <- "RNA" + expect_error({seqtype(x) <- "AA"}, 'incompatible sequence types "RNA" and "AA"') +}) + +test_that("XStringSetLists show correctly", { + ## TODO: add some values in case printing only breaks on output + expect_output(show(DNAStringSetList(DNA_ALPHABET)), "DNAStringSetList of length 1\\n\\[\\[1\\]\\]") + expect_output(show(RNAStringSetList(RNA_ALPHABET)), "RNAStringSetList of length 1\\n\\[\\[1\\]\\]") + expect_output(show(AAStringSetList(AA_ALPHABET)), "AAStringSetList of length 1\\n\\[\\[1\\]\\]") + expect_output(show(BStringSetList(B_ALPHABET)), "BStringSetList of length 1\\n\\[\\[1\\]\\]") +}) + +test_that("XStringSetLists coerce from list correctly", { + .XStringSetList_from_list(DNAStringSet, DNAStringSetList, DNA_ALPHABET) + .XStringSetList_from_list(RNAStringSet, RNAStringSetList, RNA_ALPHABET) + .XStringSetList_from_list(AAStringSet, AAStringSetList, AA_ALPHABET) + .XStringSetList_from_list(BStringSet, BStringSetList, B_ALPHABET) +}) + +test_that("XStringSetList uses nchar correctly", { + dna <- DNAStringSetList(DNA_ALPHABET) + expect_s4_class(nchar(dna), "IntegerList") + expect_equal(unlist(nchar(dna)), rep(1L, length(DNA_ALPHABET))) +}) \ No newline at end of file diff --git a/tests/testthat/test-XStringSetList.R b/tests/testthat/test-XStringSetList.R new file mode 100644 index 00000000..dd1ae2ac --- /dev/null +++ b/tests/testthat/test-XStringSetList.R @@ -0,0 +1,36 @@ +## XStringSetList-class.R exports the following: +## - DNAStringSetList +## - AAStringSetList +## note that RNAStringSetList and BStringSetList are not supported + + +test_that("XStringSetList constuctors work properly", { + seq_lengths <- c(2,3,4) + seqs <- list(DNA=paste(DNA_ALPHABET, collapse=''), + RNA=paste(RNA_ALPHABET, collapse=''), + AA=paste(AA_ALPHABET, collapse=''), + B=rawToChar(as.raw(32:126))) + + for(s in c("DNA", "RNA", "AA", "B")){ + seq <- seqs[[s]] + all_seqs <- lapply(seq_along(seq_lengths), \(i){ + get(paste0(s, "StringSet"))(rep(seq, seq_lengths[i])) + }) + cl_name <- paste0(s, "StringSetList") + CONSTRUCTOR <- get(cl_name) + expect_s4_class(CONSTRUCTOR(all_seqs), cl_name) + + seqlist <- CONSTRUCTOR(all_seqs) + expect_equal(length(seqlist), length(seq_lengths)) + expect_equal(lengths(seqlist), seq_lengths) + expect_equal(as.list(seqlist), all_seqs) + expect_equal(seqtype(seqlist), s) + + # empty constructor + expect_equal(length(CONSTRUCTOR()), 0L) + + names(all_seqs) <- LETTERS[seq_along(seq_lengths)] + expect_equal(names(CONSTRUCTOR(all_seqs)), names(all_seqs)) + expect_null(names(CONSTRUCTOR(all_seqs, use.names=FALSE))) + } +}) \ No newline at end of file diff --git a/tests/testthat/test-XStringViews.R b/tests/testthat/test-XStringViews.R new file mode 100644 index 00000000..a0970d7d --- /dev/null +++ b/tests/testthat/test-XStringViews.R @@ -0,0 +1,49 @@ +## XStringViews-class.R exports the following: +## - `==` for XStringViews against XStringViews, XString, character +## - `!=` for the same as above +## - as.character +## - as.matrix +## - toString + + +slen <- 25L +dnas <- sample(DNA_BASES, slen, replace=TRUE) +d <- DNAString(paste(dnas, collapse='')) + +test_that('XStringViews equality works correctly', { + n_ranges <- 5L + starts <- sample(slen-5L, n_ranges) + ends <- starts + sample(5L, n_ranges) + strs <- vapply(seq_len(n_ranges), \(i) paste(dnas[seq(starts[i], ends[i])], collapse=''), character(1L)) + v <- Views(d, start=starts, end=ends) + expect_equal(seqtype(v), seqtype(d)) + + ## only the diagonal should be true + exp_opt <- matrix(FALSE, nrow=n_ranges, ncol=n_ranges) + diag(exp_opt) <- TRUE + test1 <- matrix(unlist(v == DNAStringSet(strs)), nrow=n_ranges, byrow=TRUE) + expect_identical(test1, exp_opt) + + ## only the diagonal should be false + exp_opt <- !exp_opt + test2 <- matrix(unlist(v != DNAStringSet(strs)), nrow=n_ranges, byrow=TRUE) + expect_identical(test2, exp_opt) + + ## comparison against Views objects should be supported + expect_identical(v==v, rep(TRUE, n_ranges)) + + ## comparison with non-BString is not supported + expect_error(v == strs, "is not supported") + expect_error(strs == v, "is not supported") + + ## comparing BString against character + v2 <- v + expect_identical((seqtype(v2) <- "B"), "B") + expect_identical(v2 == strs, rep(TRUE, n_ranges)) + expect_identical(strs == v2, rep(TRUE, n_ranges)) + expect_identical(v2 != strs, rep(FALSE, n_ranges)) + expect_identical(strs != v2, rep(FALSE, n_ranges)) + + expect_identical(toString(v2), paste(strs, collapse=', ')) + expect_identical(nchar(v2), nchar(strs)) +}) diff --git a/tests/testthat/test-chartr.R b/tests/testthat/test-chartr.R new file mode 100644 index 00000000..e665cceb --- /dev/null +++ b/tests/testthat/test-chartr.R @@ -0,0 +1,42 @@ +## chartr.R exports the following: +## - chartr +## - replaceAmbiguities +## +## chartr() is defined for XString, XStringSet, +## XStringViews, MaskedXString + + +test_that("chartr has correct behavior on Biostrings objects", { + s1 <- BString("apple AppLE potato") + s2 <- BString("peels AeeLE eotpto") + expect_true(chartr("apple", "peels", s1) == s2) + expect_true(all(chartr("apple", "peels", BStringSet(list(s1,s1))) == BStringSet(list(s2,s2)))) + + v1 <- Views(s1, start=c(1,7,13), width=c(5,5,6)) + v2 <- Views(s2, start=c(1,7,13), width=c(5,5,6)) + expect_equal(as.character(chartr("apple", "peels", v1)), as.character(v2)) + + ## will break when we implement this to remind me to write tests + ms1 <- s1 + ms2 <- s2 + m1 <- Mask(length(s1), start=7, width=5) + masks(ms1) <- m1 + expect_error(chartr("apple", "peels", ms1), "Please complain!") +}) + +test_that('replaceAmbiguities works as expected', { + dna1 <- DNAString(paste(DNA_ALPHABET, collapse='')) + dna2 <- DNAString(paste(c(DNA_BASES, rep("N", length(dna1)-7L), "-+."), collapse='')) + + expect_equal(replaceAmbiguities(dna1), dna2) + expect_equal(replaceAmbiguities(as(dna1, "RNAString")), as(dna2, "RNAString")) + + aa <- AAString(paste(AA_ALPHABET, collapse='')) + bb <- BString(paste(LETTERS, collapse='')) + expect_error(replaceAmbiguities(aa), "only supported for DNA and RNA") + expect_error(replaceAmbiguities(bb), "only supported for DNA and RNA") + expect_error(replaceAmbiguities("test"), "only supported for DNA and RNA") + + expect_true(all(replaceAmbiguities(DNAStringSet(list(dna1, dna1))) == DNAStringSet(list(dna2, dna2)))) + expect_true(all(replaceAmbiguities(Views(dna1, start=c(1,4,7), width=3)) == Views(dna2, start=c(1,4,7), width=3))) +}) \ No newline at end of file diff --git a/tests/testthat/test-dinucleotideFrequencyTest.R b/tests/testthat/test-dinucleotideFrequencyTest.R new file mode 100644 index 00000000..9f6724d0 --- /dev/null +++ b/tests/testthat/test-dinucleotideFrequencyTest.R @@ -0,0 +1,32 @@ +## dinucleotideFrequencyTest exports dinucleotideFrequencyTest() +## for the following classes: +## +## - DNAStringSet +## - RNAStringSet + +test_that("examples in dinucleotideFrequencyTest.Rd maintain functionality", { + ## TODO: better tests. + ## This is adapted from current functionality, do we know that's correct? + data(HNF4alpha) + expect_s3_class(dinucleotideFrequencyTest(HNF4alpha, 1, 2), "htest") + + ## why doesn't p.value have a name? + test1 <- dinucleotideFrequencyTest(HNF4alpha, 1, 2) + expect_equal(c(round(test1$statistic,4L), test1$parameter, round(test1$p.value, 4L)), + c("X-squared"=19.0727, "df"=9, 0.0246)) + + test2 <- dinucleotideFrequencyTest(HNF4alpha, 1, 2, test = "G") + expect_equal(c(round(test2$statistic,4L), test2$parameter, round(test2$p.value,4L)), + c("Log likelihood ratio statistic (G)"=17.2609, + "X-squared df"=9, + "p.value"=0.0448)) + + expect_false(grepl("Williams' correction", test2$method)) + + test3 <- dinucleotideFrequencyTest(HNF4alpha, 1, 2, test = "adjG") + expect_equal(c(round(test3$statistic,4L), test3$parameter, round(test3$p.value,4L)), + c("Log likelihood ratio statistic (G)"=10.8064, + "X-squared df"=9, + "p.value"=0.2892)) + expect_true(grepl("Williams' correction", test3$method)) +}) \ No newline at end of file diff --git a/tests/testthat/test-findPalindromes.R b/tests/testthat/test-findPalindromes.R new file mode 100644 index 00000000..a6a5e601 --- /dev/null +++ b/tests/testthat/test-findPalindromes.R @@ -0,0 +1,175 @@ +## findPalindromes.R exports the following: +## - findPalindromes() +## - palindromeArmLength() +## - palindromeLeftArm() +## - palindromeRightArm() +## +## TODOs: missing MaskedXString, XStringViews tests + + +## new tests +test_that("palindromeLeftArm and *RightArm identify the right arms", { + text4 <- BString("i45hgfe7d321c3b4a56789uvwWVU98765A4B3C123D7EFGH54I") + current <- findPalindromes(text4, min.armlength = 5, max.looplength = length(text4), max.mismatch = 3) + leftarm <- palindromeLeftArm(current) + expect_equal(start(leftarm), start(current)) + expect_equal(width(leftarm), c(5,1,1,3,1,0)) + + rightarm <- palindromeRightArm(current) + expect_equal(end(rightarm), end(current)) + expect_equal(width(rightarm), width(leftarm)) +}) + +## For now I'm just going to port the old tests +test_that("findPalindromes works on BString objects", { + text1 <- BString("ABCDCBA") + current <- findPalindromes(text1, min.armlength=2) + expect_identical(IRanges(1, 7), ranges(current)) + current <- findPalindromes(text1, min.armlength=2, max.looplength=0) + expect_identical(IRanges(), ranges(current)) + + text2 <- BString("xesopeReste etseReposeygohangasalamiImalasagnahogz") + current <- findPalindromes(text2, max.looplength=2) + expect_identical(IRanges(c(2, 24), c(22, 49)), ranges(current)) + expect_identical(c(10L, 12L), palindromeArmLength(current)) + + text3 <- BString("AATAAACTNTCAAATYCCY") + current <- findPalindromes(text3, min.armlength=2) + expect_identical(IRanges(c(1, 3, 16), c(5, 15, 19)), ranges(current)) + + text4 <- BString("i45hgfe7d321c3b4a56789uvwWVU98765A4B3C123D7EFGH54I") + current <- findPalindromes(text4, min.armlength = 5, max.looplength = length(text4), max.mismatch = 3) + expect_identical(IRanges(c(18, 16, 14, 10, 8, 7), c(33, 35, 37, 41, 43, 44)), ranges(current)) +}) + +test_that("findPalindromes works on nucleotide sequences", { + x1 <- DNAString("AATAAACTNTCAAATYCCY") # same sequence as 'text3' + for (class in c("DNAString", "RNAString")) { + x <- as(x1, class) + current <- findPalindromes(x, min.armlength=2) + expect_identical(IRanges(), ranges(current)) + current <- findPalindromes(x, min.armlength=2, max.looplength=10) + expect_identical(IRanges(2, 15), ranges(current)) + } + + x2 <- DNAString("AATTT") + for (class in c("DNAString", "RNAString")) { + x <- as(x2, class) + current <- findPalindromes(x, min.armlength=2) + target <- IRanges(1, 4:5) + expect_identical(target, ranges(current)) + current <- findPalindromes(x, min.armlength=2, max.looplength=0) + expect_identical(target[1], ranges(current)) + } + + x3 <- DNAString("TTTAA") + for (class in c("DNAString", "RNAString")) { + x <- as(x3, class) + current <- findPalindromes(x, min.armlength=2) + target <- IRanges(1:2, 5) + expect_identical(target, ranges(current)) + current <- findPalindromes(x, min.armlength=2, max.looplength=0) + expect_identical(target[2], ranges(current)) + } + + x4 <- DNAString("AAATTT") + for (class in c("DNAString", "RNAString")) { + x <- as(x4, class) + current <- findPalindromes(x, min.armlength=2) + target <- IRanges(c(1, 1, 2), c(5, 6, 6)) + expect_identical(target, ranges(current)) + current <- findPalindromes(x, min.armlength=3) + expect_identical(target[2], ranges(current)) + current <- findPalindromes(x, min.armlength=4) + expect_identical(target[0], ranges(current)) + } + + ## With nested palindromes. + x5 <- DNAString("CTTGAAATTTGAAG") + for (class in c("DNAString", "RNAString")) { + x <- as(x5, class) + target0 <- IRanges(c(2, 2, 5, 5, 1, 6, 8, 9), + c(6, 7, 9, 10, 14, 10, 13, 13)) + current <- findPalindromes(x, min.armlength=2, max.looplength=0) + expect_identical(target0[4], ranges(current)) + current <- findPalindromes(x, min.armlength=2, max.looplength=1) + expect_identical(target0[-c(2, 5, 7)], ranges(current)) + for (i in 2:7) { + current <- findPalindromes(x, min.armlength=2, max.looplength=i) + expect_identical(target0[-5], ranges(current)) + } + current <- findPalindromes(x, min.armlength=2, max.looplength=8) + expect_identical(target0, ranges(current)) + } + + x6 <- DNAString("AAGAATTGTT") + for (class in c("DNAString", "RNAString")) { + x <- as(x6, class) + target0 <- IRanges(c(1, 4, 1, 4), c(7, 7, 10, 10)) + for (i in 0:2) { + current <- findPalindromes(x, min.armlength=2, max.looplength=i) + expect_identical(target0[2], ranges(current)) + } + for (i in 3:5) { + current <- findPalindromes(x, min.armlength=2, max.looplength=i) + expect_identical(target0[-3], ranges(current)) + } + current <- findPalindromes(x, min.armlength=2, max.looplength=6) + expect_identical(target0, ranges(current)) + } + + # With IUPAC ambiguity codes. Ambiguity codes are not allowed in the arms + # of a palindrome! + x7 <- DNAString("NNNNNN") + for (class in c("DNAString", "RNAString")) { + x <- as(x7, class) + current <- findPalindromes(x, min.armlength=2) + expect_identical(IRanges(), ranges(current)) + } + + x8 <- DNAString("ACCGNAAATTTNCGGT") + for (class in c("DNAString", "RNAString")) { + x <- as(x8, class) + current <- findPalindromes(x, min.armlength=2) + expect_identical(IRanges(c(6, 6, 7), c(10, 11, 11)), ranges(current)) + current <- findPalindromes(x, min.armlength=4) + expect_identical(IRanges(), ranges(current)) + current <- findPalindromes(x, min.armlength=3, max.looplength=8) + expect_identical(IRanges(c(6, 1), c(11, 16)), ranges(current)) + } +}) + +test_that("palindromeArmLength functions correctly for nucleotide sequences", { + x2 <- DNAString("AATTT") + for (class in c("DNAString", "RNAString")) { + x <- as(x2, class) + expect_identical(2L, palindromeArmLength(x)) + pals <- findPalindromes(x, min.armlength=2) + expect_identical(c(2L, 2L), palindromeArmLength(pals)) + } + + x3 <- DNAString("TTTAA") + for (class in c("DNAString", "RNAString")) { + x <- as(x3, class) + expect_identical(2L, palindromeArmLength(x)) + pals <- findPalindromes(x, min.armlength=2) + expect_identical(c(2L, 2L), palindromeArmLength(pals)) + } + + x4 <- DNAString("AAATTT") + for (class in c("DNAString", "RNAString")) { + x <- as(x4, class) + expect_identical(3L, palindromeArmLength(x)) + pals <- findPalindromes(x, min.armlength=2) + expect_identical(c(2L, 3L, 2L), palindromeArmLength(pals)) + } + + x5 <- DNAString("CTTGAAATTTGAAG") + for (class in c("DNAString", "RNAString")) { + x <- as(x5, class) + expect_identical(3L, palindromeArmLength(x)) + pals <- findPalindromes(x, min.armlength=2, max.looplength=8) + expect_identical(c(2L, 2L, 2L, 3L, 3L, 2L, 2L, 2L), + palindromeArmLength(pals)) + } +}) diff --git a/tests/testthat/test-letter.R b/tests/testthat/test-letter.R new file mode 100644 index 00000000..bca65936 --- /dev/null +++ b/tests/testthat/test-letter.R @@ -0,0 +1,46 @@ +## letter.R exports `letter` for the following classes: +## - character +## - XString +## - XStringViews +## - MaskedXString + + +test_that("letter generic works properly for all classes", { + s <- paste(letters, collapse='') + sdna <- sample(DNA_BASES, 20L, replace=TRUE) + + dna <- DNAString(paste(sdna, collapse='')) + viewdna <- Views(dna, start=rep(1L,3L), width=10) + + m <- Mask(mask.width=20L, start=1, end=5L) + maskdna <- dna + masks(maskdna) <- m + + ## happy path testing + test_cases <- list(1.0, 1:3, 1:3*2L, integer(0L)) + for(i in seq_along(test_cases)){ + ## character + expect_equal(letter(s, test_cases[[i]]), paste(letters[test_cases[[i]]], collapse='')) + + ## XString + expect_equal(letter(dna, test_cases[[i]]), paste(sdna[test_cases[[i]]], collapse='')) + + ## XStringViews + expect_equal(letter(viewdna, test_cases[[i]]), rep(paste(sdna[test_cases[[i]]], collapse=''), 3L)) + + ## MaskedXString -- note that this ignores the mask + expect_equal(letter(maskdna, test_cases[[i]]), paste(sdna[test_cases[[i]]], collapse='')) + } + + ## sad path testing + sad_list <- list('a', NA_real_, 10000) + error_msgs <- c("must be an NA-free numeric vector", + "must be an NA-free numeric vector", + "out of bounds") + for(i in seq_along(sad_list)){ + expect_error(letter(s, sad_list[[i]]), error_msgs[i]) + expect_error(letter(dna, sad_list[[i]]), error_msgs[i]) + expect_error(letter(viewdna, sad_list[[i]]), error_msgs[i]) + expect_error(letter(maskdna, sad_list[[i]]), error_msgs[i]) + } +}) \ No newline at end of file diff --git a/tests/testthat/test-letterFrequency.R b/tests/testthat/test-letterFrequency.R new file mode 100644 index 00000000..2b8dedd5 --- /dev/null +++ b/tests/testthat/test-letterFrequency.R @@ -0,0 +1,265 @@ +## letterFrequency.R exports the following: +## - alphabetFrequency +## - hasOnlyBaseLetters +## - uniqueLetters +## - letterFrequency +## - *letterFrequencyInSlidingView +## - consensusMatrix +## - consensusString (matrix, DNAStringSet, RNAStringSet) +## - mkAllStrings +## - oligonucleotideFrequency +## - dinucleotideFrequency +## - trinucleotideFrequency +## - *oligonucleotideTransitions +## - *twoWayAlphabetFrequency +## +## starred functions are still lacking tests + +B_ALPHABET <- strsplit(rawToChar(as.raw(32:126)), '')[[1]] + +check_uniqueLetters <- function(XStringFUN, X_ALPH){ + set.seed(123L) + s <- sample(X_ALPH, 100L, replace=TRUE) + u_v <- unique(s) + + s <- paste(s, collapse='') + expect_equal(sort(uniqueLetters(XStringFUN(s))), sort(u_v)) +} + +check_alphfreq <- function(XStringFUN, X_ALPH, isMatrix=FALSE, checkBaseOnly=NULL){ + set.seed(456L) + tab <- integer(length(X_ALPH)) + names(tab) <- X_ALPH + + if(isMatrix){ + exp_res <- matrix(0L, nrow=5, ncol=length(tab)) + colnames(exp_res) <- names(tab) + strs <- character(5L) + for(i in seq_len(5)){ + tab[] <- 0L + s <- sample(X_ALPH, 100L, replace=TRUE) + u_v <- table(s) + tab[names(u_v)] <- u_v + exp_res[i,] <- tab + strs[i] <- paste(s, collapse='') + } + norm_res <- exp_res / rowSums(exp_res) + ss <- XStringFUN(strs) + } else { + strs <- sample(X_ALPH, 100L, replace=TRUE) + u_v <- table(strs) + tab[names(u_v)] <- u_v + exp_res <- tab + ss <- XStringFUN(paste(strs, collapse='')) + norm_res <- exp_res / sum(exp_res) + } + + # Check alphabetFrequency + v <- alphabetFrequency(ss) + expect_equal(v, exp_res) + expect_equal(sum(v), sum(nchar(strs))) + expect_equal(alphabetFrequency(ss, as.prob=TRUE), norm_res) + + # Check letterFrequency + expect_equal(letterFrequency(ss, X_ALPH), exp_res) + expect_equal(letterFrequency(ss, X_ALPH, as.prob=TRUE), norm_res) + + if(!is.null(checkBaseOnly)){ + # provide the alphabet + b <- checkBaseOnly + if(isMatrix){ + p_in <- match(colnames(exp_res), b, nomatch=0) + exp_res <- cbind(exp_res[,p_in!=0], other=rowSums(exp_res[,p_in==0])) + norm_res <- exp_res / rowSums(exp_res) + } else { + p_in <- match(names(exp_res), b, nomatch=0) + exp_res <- c(exp_res[p_in!=0], sum(exp_res[p_in==0])) + names(exp_res) <- c(b, "other") + norm_res <- exp_res / sum(exp_res) + } + + v <- alphabetFrequency(ss, baseOnly=TRUE) + expect_equal(v, exp_res) + expect_equal(sum(v), sum(nchar(strs))) + expect_error(alphabetFrequency(ss, baseOnly=NA), "'baseOnly' must be TRUE or FALSE") + expect_error(alphabetFrequency(ss, baseOnly=1), "'baseOnly' must be TRUE or FALSE") + expect_equal(alphabetFrequency(ss, baseOnly=TRUE, as.prob=TRUE), norm_res) + if(isMatrix){ + expect_equal(letterFrequency(ss, b), exp_res[,seq_along(b)]) + expect_equal(letterFrequency(ss, b, as.prob=TRUE), norm_res[,seq_along(b)]) + } else { + expect_equal(letterFrequency(ss, b), exp_res[seq_along(b)]) + expect_equal(letterFrequency(ss, b, as.prob=TRUE), norm_res[seq_along(b)]) + } + } +} + +test_that("uniqueLetters works properly", { + check_uniqueLetters(DNAString, DNA_ALPHABET) + check_uniqueLetters(RNAString, RNA_ALPHABET) + check_uniqueLetters(AAString, AA_ALPHABET) + check_uniqueLetters(BString, B_ALPHABET) + + check_uniqueLetters(DNAStringSet, DNA_ALPHABET) + check_uniqueLetters(RNAStringSet, RNA_ALPHABET) + check_uniqueLetters(AAStringSet, AA_ALPHABET) + check_uniqueLetters(BStringSet, B_ALPHABET) +}) + +test_that("letterFrequency, alphabetFrequency functions for all input types", { + check_alphfreq(DNAString, DNA_ALPHABET, checkBaseOnly=DNA_BASES) + check_alphfreq(RNAString, RNA_ALPHABET, checkBaseOnly=RNA_BASES) + check_alphfreq(AAString, AA_ALPHABET, checkBaseOnly=AA_STANDARD) + + check_alphfreq(DNAStringSet, DNA_ALPHABET, isMatrix=TRUE, checkBaseOnly=DNA_BASES) + check_alphfreq(RNAStringSet, RNA_ALPHABET, isMatrix=TRUE, checkBaseOnly=RNA_BASES) + check_alphfreq(AAStringSet, AA_ALPHABET, isMatrix=TRUE, checkBaseOnly=AA_STANDARD) + + ## checking for ambiguity tabulation + d <- DNAString("ATGCATGC.+Y") + exp_output <- c(2L, 4L, 2L) + names(exp_output) <- c("A", "G|C", "T") + expect_equal(letterFrequency(d, letters=c("A", "GC", "T")), exp_output) + names(exp_output)[2] <- "G*C" + expect_equal(letterFrequency(d, letters=c("A", "GC", "T"), OR="*"), exp_output) + + r <- RNAString("AUGCAUGC.+Y") + exp_output <- 8L + names(exp_output) <- c("A|U|G|C") + expect_equal(letterFrequency(r, letters="AUGC"), exp_output) + exp_output <- rep(2L, 4) + names(exp_output) <- c("A","U","G","C") # intentionally different from RNA_BASES + expect_equal(letterFrequency(r, letters="AUGC", OR=0), exp_output) + + ## BStrings have their own check + set.seed(999L) + bstr <- sample(B_ALPHABET, 100, replace=TRUE) + bstr_counts <- table(match(bstr, B_ALPHABET)) + r <- integer(256L) + r[as.integer(names(bstr_counts)) + 32L] <- bstr_counts + bstr <- paste(bstr, collapse='') + expect_equal(alphabetFrequency(BString(bstr)), r) + expect_equal(alphabetFrequency(BStringSet(bstr)), matrix(r, nrow=1L, ncol=256L)) + + expect_error(alphabetFrequency(BString(), baseOnly=TRUE), "unused argument (baseOnly = TRUE)", fixed=TRUE) + + ## checking for Views objects and maskedString objects + ds <- sample(DNA_ALPHABET, 100L, replace=TRUE) + exp_out <- matrix(0L, nrow=2, ncol=length(DNA_ALPHABET)) + colnames(exp_out) <- DNA_ALPHABET + exp_out[1,names(table(ds[seq(1,50)]))] <- table(ds[seq(1,50)]) + exp_out[2,names(table(ds[seq(51,100)]))] <- table(ds[seq(51,100)]) + v <- Views(DNAString(paste(ds, collapse='')), start=c(1,51), end=c(50,100)) + expect_equal(alphabetFrequency(v), exp_out) + expect_equal(letterFrequency(v, "A"), exp_out[,"A",drop=FALSE]) + + mask_d <- DNAString(paste(ds, collapse='')) + masks(mask_d) <- Mask(length(ds), start=c(5,50), end=c(10,60)) + exp_out <- rep(0L, length(DNA_ALPHABET)) + names(exp_out) <- DNA_ALPHABET + subdstr <- ds[-c(5:10,50:60)] + exp_out[names(table(subdstr))] <- table(subdstr) + expect_equal(alphabetFrequency(mask_d), exp_out) + expect_equal(letterFrequency(mask_d, "A"), exp_out["A"]) +}) + +test_that("hasOnlyBaseLetters works as expected", { + ## XStringSet + expect_true(hasOnlyBaseLetters(DNAStringSet(DNA_BASES))) + expect_true(hasOnlyBaseLetters(RNAStringSet(RNA_BASES))) + expect_true(hasOnlyBaseLetters(AAStringSet(AA_STANDARD))) + + expect_false(hasOnlyBaseLetters(DNAStringSet(DNA_ALPHABET))) + expect_false(hasOnlyBaseLetters(RNAStringSet(RNA_ALPHABET))) + expect_false(hasOnlyBaseLetters(AAStringSet(AA_ALPHABET))) + expect_error(hasOnlyBaseLetters(BStringSet()), 'unable to find an inherited method') + + ## Views + d1 <- DNAString(paste(DNA_BASES, collapse='')) + d2 <- DNAString(paste(DNA_ALPHABET, collapse='')) + expect_true(hasOnlyBaseLetters(Views(d1, start=1, end=length(DNA_BASES)))) + expect_false(hasOnlyBaseLetters(Views(d2, start=1, end=length(DNA_ALPHABET)))) + + ## MaskedXString + md1 <- d1 + md2 <- d2 + masks(md1) <- Mask(length(d1), start=2, end=3) + masks(md2) <- Mask(length(d2), start=2, end=5) + expect_true(hasOnlyBaseLetters(md1)) + expect_false(hasOnlyBaseLetters(md2)) + +}) + +test_that("miscellaneous letterFrequency methods work", { + ## mkAllStrings + alph <- c("A", "B", "C") + exp_output <- expand.grid(alph, alph, alph) + exp_output <- apply(exp_output, 1L, paste, collapse='') + expect_equal(sort(mkAllStrings(alph, 3L)), sort(exp_output)) + + ## uniqueLetters + d <- paste(rep(DNA_ALPHABET, each=3), collapse='') + expect_equal(uniqueLetters(DNAString(d)), DNA_ALPHABET) + expect_equal(uniqueLetters(DNAStringSet(c(d,d))), DNA_ALPHABET) + expect_equal(uniqueLetters(Views(DNAString(d), start=c(1,6), end=c(5,nchar(d)))), DNA_ALPHABET) + md <- DNAString(d) + masks(md) <- Mask(nchar(d), start=seq(1,nchar(d),3), end=seq(2,nchar(d),3)) + expect_equal(uniqueLetters(md), DNA_ALPHABET) +}) + +test_that("nucleotideFrequency methods work properly", { + set.seed(999L) + l <- 102L + ds <- sample(DNA_BASES, l, replace=TRUE) + d <- DNAString(paste(ds, collapse='')) + dss <- DNAStringSet(c(ds,ds)) + + ## oligonucleotideFrequency(x, 1L) should be the same as alphabetFrequency(...,baseOnly=TRUE) + ## (ignoring the 'other' arg) + expect_equal(oligonucleotideFrequency(d, 1L), alphabetFrequency(d, baseOnly=TRUE)[seq_len(4L)]) + expect_equal(oligonucleotideFrequency(dss, 1L), alphabetFrequency(dss, baseOnly=TRUE)[,seq_len(4L)]) + + ## dinucleotide correctness + all_doubles <- table(paste0(ds[seq_len(l-1L)], ds[seq_len(l-1L)+1L])) + exp_res <- rep(0L, length(DNA_BASES)**2) + names(exp_res) <- mkAllStrings(DNA_BASES, 2L) + exp_res[names(all_doubles)] <- all_doubles + expect_equal(oligonucleotideFrequency(d, 2L), exp_res) + expect_equal(dinucleotideFrequency(d), exp_res) + + ## trinucleotide correctness + all_triples <- table(paste0(ds[seq_len(l-2L)], ds[seq_len(l-2L)+1L], ds[seq_len(l-2L)+2L])) + exp_res <- rep(0L, length(DNA_BASES)**3) + names(exp_res) <- mkAllStrings(DNA_BASES, 3L) + exp_res[names(all_triples)] <- all_triples + expect_equal(oligonucleotideFrequency(d, 3L), exp_res) + expect_equal(trinucleotideFrequency(d), exp_res) + + ## using step + all_codons <- codons(d) + exp_res[] <- 0L + codon_table <- table(as.character(all_codons)) + exp_res[names(codon_table)] <- codon_table + expect_equal(oligonucleotideFrequency(d, 3L, step=3L), exp_res) + + ## frequency on a Views object + exp_res <- matrix(0L, nrow=l/3, ncol=length(DNA_BASES)**3) + colnames(exp_res) <- mkAllStrings(DNA_BASES, 3L) + exp_res[cbind(seq_len(nrow(exp_res)), match(as.character(all_codons), colnames(exp_res)))] <- 1L + expect_equal(oligonucleotideFrequency(codons(d), 3L), exp_res) + + ## frequency on a MaskedXString object + mask_d <- d + masks(mask_d) <- Mask(l, start=c(10,50), end=c(19,57)) + mask_str <- strsplit(as.character(mask_d), '')[[1]] + all_doubles <- table(paste0(mask_str[seq_len(l-1L)], mask_str[seq_len(l-1L)+1L])) + all_doubles <- all_doubles[!grepl("#", names(all_doubles))] + exp_res <- rep(0L, length(DNA_BASES)**2) + names(exp_res) <- mkAllStrings(DNA_BASES, 2L) + exp_res[names(all_doubles)] <- all_doubles + expect_equal(oligonucleotideFrequency(mask_d, 2L), exp_res) + + ## disallowed types + expect_error(oligonucleotideFrequency(AAString(), "must contain sequences of type DNA or RNA")) + expect_error(oligonucleotideFrequency(BStringSet(), "must contain sequences of type DNA or RNA")) +}) \ No newline at end of file diff --git a/tests/testthat/test-matchLRPatterns.R b/tests/testthat/test-matchLRPatterns.R new file mode 100644 index 00000000..a146d276 --- /dev/null +++ b/tests/testthat/test-matchLRPatterns.R @@ -0,0 +1,47 @@ +## matchLRPatterns.R exports matchLRPatterns for the following: +## - XString +## - XStringViews +## - MaskedXString +## +## note that Lfixed, Rfixed are only defined for DNAString, RNAString + +test_that("matchLRPatterns works for all inputs", { + d <- DNAString("AAATTAACCCTT") + + ## different values of max.mismatch + expect_equal(matchLRPatterns("AA", "TT", 0, d), Views(d, start=2, end=5)) + expect_equal(matchLRPatterns("AA", "TT", 1, d), Views(d, start=c(1,2), end=c(5,5))) + expect_equal(matchLRPatterns("AA", "TT", 3, d), Views(d, start=c(1,2,6), end=c(5,5,12))) + expect_equal(matchLRPatterns("AA", "TT", 7, d), Views(d, start=c(1,2,2,6), end=c(5,5,12,12))) + + ## L,Rmismatch + expect_equal(matchLRPatterns("AA", "TT", 0, subject=d, max.Lmismatch=2), Views(d, start=c(2,9), end=c(5,12))) + expect_equal(matchLRPatterns("AA", "TT", 0, subject=d, max.Rmismatch=2), Views(d, start=c(1,2,6), end=c(4,5,9))) + + ## L,Rindels + d2 <- DNAString("ATATTA") + expect_equal(matchLRPatterns("AA", "TT", 0, d2, max.Lmismatch=1, with.Lindels=FALSE), Views(d2, start=2, end=5)) + expect_equal(matchLRPatterns("AA", "TT", 0, d2, max.Lmismatch=1, with.Lindels=TRUE), Views(d2, start=3, end=5)) + d3 <- DNAString("AATAT") + expect_equal(matchLRPatterns("AA", "TT", 0, subject=d3, max.Rmismatch=1, with.Rindels=FALSE), Views(d3,start=1,end=4)) + expect_equal(matchLRPatterns("AA", "TT", 0, subject=d3, max.Rmismatch=1, with.Rindels=TRUE), Views(d3,start=1,end=3)) + + ## Lfixed, Rfixed + expect_equal(matchLRPatterns("NN", "TT", 0, subject=d), Views(d)) + expect_equal(matchLRPatterns("MM", "TT", 0, subject=d, Lfixed="subject"), Views(d, start=c(2,9), end=c(5,12))) + expect_equal(matchLRPatterns("AA", "WY", 0, subject=d, Rfixed="subject"), Views(d, start=c(1,2), end=c(4,5))) + + ## with a mask + dm <- d + masks(dm) <- Mask(length(dm), start=8, end=10) + expect_equal(matchLRPatterns("AA", "TT", 0, dm), Views(unmasked(dm), start=2, end=5)) + expect_equal(matchLRPatterns("AA", "TT", 1, dm), Views(unmasked(dm), start=c(1,2), end=c(5,5))) + expect_equal(matchLRPatterns("AA", "TT", 3, dm), Views(unmasked(dm), start=c(1,2), end=c(5,5))) + + ## on a Views object + v <- Views(d, start=c(1,6), end=c(5,12)) + expect_equal(matchLRPatterns("AA", "TT", 0, v), Views(d, start=2, end=5)) + expect_equal(matchLRPatterns("AA", "TT", 1, v), Views(d, start=c(1,2), end=c(5,5))) + expect_equal(matchLRPatterns("AA", "TT", 3, v), Views(d, start=c(1,2,6), end=c(5,5,12))) + expect_equal(matchLRPatterns("AA", "TT", 7, v), Views(d, start=c(1,2,6), end=c(5,5,12))) +}) \ No newline at end of file diff --git a/tests/testthat/test-matchPDict.R b/tests/testthat/test-matchPDict.R new file mode 100644 index 00000000..a92e3206 --- /dev/null +++ b/tests/testthat/test-matchPDict.R @@ -0,0 +1,224 @@ +## matchPDict.R exports the following: +## - matchPDict +## - countPDict +## - whichPDict +## - vmatchPDict +## - vcountPDict +## - vwhichPDict +## These are all S4s defined for "subject", XString, +## XStringSet, XStringViews, MaskedXString + +test_that("general match/count/whichPDict behavior works as expected", { + d1 <- DNAString("ATGGATACACAA") + all_codons <- mkAllStrings(DNA_BASES, 3L) + + ## non-pdict with matchPDict + p1 <- matchPDict(DNAStringSet("ATA"), d1) + expect_equal(as.integer(c(start(p1), end(p1))), c(5,7)) + + pd <- PDict(all_codons) + + ## pd against a DNAStringSet + p2 <- lengths(matchPDict(pd, d1)) + pos_good <- which(all_codons %in% as.character(Views(d1, start=seq_len(length(d1)-2L), width=3))) + expect_true(all(p2[pos_good] > 0) && all(p2[-pos_good] == 0)) + expect_equal(whichPDict(pd, d1), pos_good) + expect_equal(countPDict(pd, d1), p2) + + ## pd against an XStringViews object + p3 <- lengths(matchPDict(pd, codons(d1))) + pos_good <- which(all_codons %in% as.character(codons(d1))) + expect_true(all(p3[pos_good] > 0) && all(p3[-pos_good] == 0)) + expect_equal(whichPDict(pd, codons(d1)), pos_good) + expect_equal(countPDict(pd, codons(d1)), p3) + + ## pd against a MaskedXString object + m1 <- Mask(length(d1), start=4, end=8) + d2 <- d1 + masks(d2) <- m1 + p4 <- lengths(matchPDict(pd, d2)) + pos_good <- which(all_codons %in% as.character(Views(unmasked(d2), start=c(1,9,10), width=3))) + expect_true(all(p4[pos_good] > 0) && all(p4[-pos_good] == 0)) + expect_equal(whichPDict(pd, d2), pos_good) + expect_equal(countPDict(pd, d2), p4) + + + ## matchPDict() with AA, RNA, B + aa1 <- AAStringSet(c("DARC", "EGH")) + aa2 <- AAString("KMFPRNDEGHSTTWTEE") + paa <- matchPDict(aa1, aa2) + expect_equal(c(unlist(startIndex(paa)), unlist(endIndex(paa))), c(8,10)) + expect_equal(countPDict(aa1, aa2), c(0,1)) + expect_equal(whichPDict(aa1, aa2), 2) + + b1 <- BStringSet(c("DARC", "EGH")) + b2 <- BString("KMFPRNDEGHSTTWTEE") + pb <- matchPDict(b1, b2) + expect_equal(c(unlist(startIndex(pb)), unlist(endIndex(pb))), c(8,10)) + expect_equal(countPDict(b1, b2), c(0,1)) + expect_equal(whichPDict(b1, b2), 2) +}) + +test_that("inexact PDict matching works", { + pdict <- PDict(c("acgt", "gt", "cgt", "ac"), tb.end=2) + d <- DNAString("acggaccg") + expect_equal(lengths(endIndex(matchPDict(pdict, d, max.mismatch=0))), c(0L,0L,0L,2L)) + expect_equal(lengths(endIndex(matchPDict(pdict, d, max.mismatch=1))), c(1L,0L,2L,2L)) + expect_equal(lengths(endIndex(matchPDict(pdict, d, max.mismatch=2))), c(2L,0L,2L,2L)) + + expect_equal(countPDict(pdict, d, max.mismatch=0), c(0,0,0,2)) + expect_equal(countPDict(pdict, d, max.mismatch=1), c(1,0,2,2)) + expect_equal(countPDict(pdict, d, max.mismatch=2), c(2,0,2,2)) +}) + +test_that("vectorized PDict lookup works", { + ## canary tests for future changes + pdict <- PDict(c("acgt", "gt", "cgt", "ac"), tb.end=2) + d <- DNAString("acggaccg") + dss <- DNAStringSet(list(d,d)) + dvv <- Views(d, start=rep(1,2), width=rep(length(d), 2)) + + # Disallowed cases + expect_error(matchPDict(pdict, dss), "please use vmatchPDict") + expect_error(vmatchPDict(pdict, d), "please use matchPDict") + expect_error(vmatchPDict(pdict, as(d, "MaskedDNAString")), "please use matchPDict") + expect_error(vwhichPDict(pdict, d), "please use whichPDict") + expect_error(vwhichPDict(pdict, as(d, "MaskedDNAString")), "please use whichPDict") + expect_error(vcountPDict(pdict, d), "please use countPDict") + expect_error(vcountPDict(pdict, as(d, "MaskedDNAString")), "please use countPDict") + + exp_out <- matrix(rep(c(0,2), times=c(6,2)), nrow=4L, byrow=TRUE) + expect_equal(vcountPDict(pdict, dss), exp_out) + expect_equal(vwhichPDict(pdict, dss), list(4,4)) + expect_equal(vcountPDict(pdict, dvv), exp_out) + expect_equal(vwhichPDict(pdict, dvv), list(4,4)) + + exp_out[1,] <- 1 + exp_out[3,] <- 2 + expect_equal(vcountPDict(pdict, dss, max.mismatch=1), exp_out) + expect_equal(vwhichPDict(pdict, dss, max.mismatch=1), list(c(1,3,4),c(1,3,4))) + expect_equal(vcountPDict(pdict, dvv, max.mismatch=1), exp_out) + expect_equal(vwhichPDict(pdict, dvv, max.mismatch=1), list(c(1,3,4),c(1,3,4))) + + exp_out[1,] <- 2 + expect_equal(vcountPDict(pdict, dss, max.mismatch=2), exp_out) + expect_equal(vwhichPDict(pdict, dss, max.mismatch=2), list(c(1,3,4),c(1,3,4))) + expect_equal(vcountPDict(pdict, dvv, max.mismatch=2), exp_out) + expect_equal(vwhichPDict(pdict, dvv, max.mismatch=2), list(c(1,3,4),c(1,3,4))) + + ## this will fail when we implement it to remind me to add tests for it + expect_error(vmatchPDict(pdict, dss), "vmatchPDict() is not ready yet", fixed=TRUE) +}) + +test_that("MIndex functionality works", { + ## set up an MIndex object + s <- mkAllStrings(c("A", "T"), 2L) + names(s) <- s + pd <- PDict(s) + d <- DNAString("ATTATTAATTAA") + m <- matchPDict(pd, d) + exp_match_starts <- list(c(7,11), c(1,4,8), c(3,6,10), c(2,5,9)) + expect_equal(length(m), 2^2) + expect_equal(names(m), names(s)) + expect_equal(startIndex(m), exp_match_starts) + expect_equal(endIndex(m), lapply(exp_match_starts, \(x) x+1L)) + expect_equal(elementNROWS(m), c(2,3,3,3)) + expect_equal(m[[1L]], IRanges(start=c(7,11), end=c(8,12))) + expect_s4_class(as(m, "CompressedIRangesList"), "CompressedIRangesList") + expect_equal(start(unlist(m)), unlist(exp_match_starts)) + ux <- extractAllMatches(d, m) + exp_ux <- rep(s, times=c(2,3,3,3)) + expect_equal(start(ux), unlist(exp_match_starts)) + expect_equal(as.character(ux), exp_ux) +}) + +### All old tests +## Helper functions ported from old tests +randomDNASequences <- function(n, w) +{ + alphabet <- DNA_BASES + w <- rep(w, length=n) + sequences <- sapply(seq(1, n, length=n), + function(x) { + s <- sample(alphabet, w[x], replace=TRUE) + s <- paste(s, collapse="") + return(s) + }) + return(Biostrings::DNAStringSet(sequences)) +} + +msubseq <- function(x, ir) +{ + ## differs from subseq in the sense that several subsequences + ## from the same sequence are extracted + ## x: XString + ## ir: IRanges + res <- vector("character", length = length(ir)) + for (i in seq(along=res)) { + res[i] <- as.character(subseq(x, start=ir@start[i], width=width(ir)[i])) + ## forced cast: chain of tools for DNAString seems interupted for + ## some use cases (or I missed something) + } + res <- DNAStringSet(res) + return(res) +} + +test_that("matchPDict works for constant width lookup", { + set.seed(1) + l <- 150 + dna_target <- randomDNASequences(1, l)[[1]] + W <- 20 + L <- 6 + ir <- successiveIRanges(rep(W, L), gapwidth = 1) + short_sequences <- msubseq(dna_target, ir) + # shuffle the sequences (so they are not in consecutive order) + o <- sample(seq(along=short_sequences)) + + dna_short <- DNAStringSet(short_sequences[o]) + pdict <- PDict(dna_short) + + res <- matchPDict(pdict, dna_target) + + # mostly a sanity check + expect_equal(L, length(res)) + + for (i in seq(along=res)) { + m_start <- ir[o][i]@start + expect_equal(m_start, start(res[[i]])) + expect_equal(W, width(res[[i]])) + expect_equal(m_start + W - 1, end(res[[i]])) # mostly a sanity check + } +}) + +test_that("matchPDict works for variable width lookup", { + set.seed(1) + l <- 150 + dna_target <- randomDNASequences(1, l)[[1]] + W <- 20 + L <- 6 + n_cut <- sample(0:5, L, replace=TRUE) + ir <- successiveIRanges(rep(W, L) - n_cut, gapwidth = 1 + n_cut[-length(n_cut)]) + short_sequences <- msubseq(dna_target, ir) + # shuffle the sequences (they are not in consecutive order) + o <- sample(seq(along=short_sequences)) + + dna_var_short <- DNAStringSet(short_sequences[o]) + + pdict <- PDict(dna_var_short, + tb.start=1, # can't this be + tb.width=min(width(dna_var_short)) # the default for + # variable width ? + ) + + res <- matchPDict(pdict, dna_target) + + # mostly a sanity check + expect_equal(L, length(res)) + + iro <- ir[o] + for (i in seq(along=res)) { + expect_equal(start(iro)[i], start(res[[i]])) + expect_equal(width(iro)[i], width(res[[i]])) + expect_equal(end(iro)[i], end(res[[i]])) # mostly a sanity check + } +}) diff --git a/tests/testthat/test-matchPWM.R b/tests/testthat/test-matchPWM.R new file mode 100644 index 00000000..2434f905 --- /dev/null +++ b/tests/testthat/test-matchPWM.R @@ -0,0 +1,24 @@ +## matchPWM.R exports the following: +## - PWM +## - matchPWM +## - countPWM +## - PWMscoreStartingAt +## - maxWeights +## - minWeights +## - minScore +## - unitScale +## - reverseComplement + +## TODO: more tests, this is really just a first pass +test_that("PWMs can be initialized", { + d <- DNAStringSet(c("AAAATTC", + "ATTAAAG")) + + p <- PWM(d, type='prob') + expect_equal(dim(p), c(4L, width(d)[1])) + ## have to use all.equal because of floating point error + expect_true(all.equal(colSums(p), rep(sum(p[,1L]), ncol(p)))) + expect_true(all(p[c("C", "G"),-7] == 0)) + expect_true(maxScore(p) == 1) + expect_true(minScore(p) == 0) +}) \ No newline at end of file diff --git a/tests/testthat/test-matchPattern.R b/tests/testthat/test-matchPattern.R new file mode 100644 index 00000000..ce6036b2 --- /dev/null +++ b/tests/testthat/test-matchPattern.R @@ -0,0 +1,120 @@ +## matchPattern.R exports the following: +## - gregexpr2 +## - matchPattern generic +## - countPattern generic +## - vmatchPattern generic +## - vcountPattern generic +## +## classes for generics: character, XString, +## XStringSet, XStringViews, MaskedXString + +sss <- "ATGCATGCATGC" +dna <- DNAString(sss) +dss <- DNAStringSet(list(dna,dna)) +maskpos <- c(2L,9L) +m1 <- Mask(length(dna), start=maskpos, end=maskpos+2L) +mdna <- dna +masks(mdna) <- m1 +vdna <- Views(dna, start=maskpos, end=maskpos+2L) +# note that gregexpr and gregexpr2 are supported but not documented +all_algos <- c("naive-exact", "naive-inexact", "boyer-moore", "shift-or", "indels") + +.test_match_output <- function(subject, pattern, algo, exp_match, ...){ + m <- matchPattern(subject=subject, pattern=pattern, algorithm=algo, ...) + mc <- countPattern(subject=subject, pattern=pattern, algorithm=algo, ...) + res <- list(start=start(m), end=end(m), width=width(m), stype=seqtype(m)) + expect_s4_class(m, "XStringViews") + expect_equal(res, exp_match) + expect_equal(mc, length(exp_match$start)) +} + +.test_vmatch_output <- function(subject, pattern, algo, exp_match, ...){ + m <- vmatchPattern(subject=subject, pattern=pattern, algorithm=algo, ...) + mc <- vcountPattern(subject=subject, pattern=pattern, algorithm=algo, ...) + res <- lapply(m, \(x) list(start=start(x), end=end(x), width=width(x))) + expect_s4_class(m, "MIndex") + expect_equal(res, exp_match) + expect_equal(mc, vapply(m, \(x) length(start(x)), integer(1L))) +} + +test_that("gregexpr2 correctly returns overlapping matches", { + ## should find overlapping matches + expect_equal(unlist(gregexpr2("aa", c("XaaaYaa", "a"))), c(2L, 3L, 6L, -1L)) + + ## only supports character vectors of length 1 for 'pattern' + expect_error(gregexpr2(c("aa", "aaa"), "a"), "invalid pattern") + expect_error(gregexpr2(NA_character_, "a"), "invalid pattern") + expect_error(gregexpr2(1, "a"), "invalid pattern") + expect_error(gregexpr2(character(0L), "a"), "invalid pattern") +}) + +test_that("matchPattern, countPattern work as expected", { + ## exact matching + for(algo in all_algos[c(1:4)]){ + .test_match_output(subject=sss, pattern="ATG", algo=algo, + exp_match=list(start=c(1,5,9), end=c(3,7,11), width=rep(3,3), stype="B")) + .test_match_output(subject=dna, pattern="ATG", algo=algo, + exp_match=list(start=c(1,5,9), end=c(3,7,11), width=rep(3,3), stype="DNA")) + .test_match_output(subject=mdna, pattern="ATG", algo=algo, + exp_match=list(start=5, end=7, width=3, stype="DNA")) + .test_match_output(subject=vdna, pattern="ATG", algo=algo, + exp_match=list(start=9, end=11, width=3, stype="DNA")) + } + + ## inexact matching + .test_match_output(subject=dna, pattern="ATC", algo="auto", + exp_match=list(start=c(1,2,5,6,9,10), end=c(3,4,7,8,11,12), width=rep(3,6), stype="DNA"), + max.mismatch=2) + .test_match_output(subject=dna, pattern="ATC", algo="auto", + exp_match=list(start=c(2,6,10), end=c(4,8,12), width=rep(3,3), stype="DNA"), + max.mismatch=2, min.mismatch=2) + .test_match_output(subject=dna, pattern="ATC", algo="auto", + exp_match=list(start=c(1,5,9), end=c(2,6,10), width=rep(2,3), stype="DNA"), + max.mismatch=1, with.indels=TRUE) + + ## fixed args + dna2 <- DNAString("ACGTNMRWSYKVHDB") + .test_match_output(subject=dna2, pattern="GTN", algo="auto", + exp_match=list(start=c(3), end=c(5), width=rep(3,1), stype="DNA"), fixed=TRUE) + .test_match_output(subject=dna2, pattern="GTN", algo="auto", + exp_match=list(start=c(3,7,9,12), end=c(5,9,11,14), width=rep(3,4), stype="DNA"), fixed=FALSE) + + ## sad paths + expect_error(matchPattern("ATG", dna, algorithm="indels"), "valid algos for your string matching problem") + expect_error(matchPattern("ATG", dss), "please use vmatchPattern") + expect_error(countPattern("ATG", dna, algorithm="indels"), "valid algos for your string matching problem") + expect_error(countPattern("ATG", dss), "please use vcountPattern") + expect_error(matchPattern("", dna), "empty patterns are not supported") + expect_error(matchPattern(1, dna), "'pattern' must be a single string or an XString object") + expect_error(matchPattern("test", '', algorithm="gregexpr"), "'subject' must be a single (non-empty) string", fixed=TRUE) + expect_error(matchPattern("test", '', algorithm="gregexpr2"), "'subject' must be a single (non-empty) string", fixed=TRUE) + + ## edge cases from matchPattern.R + ## unsure if these are intended functionality, but at least we'll track changes to it + .test_match_output(subject=DNAString("ACGTGCA"), pattern="---", algo="auto", + exp_match=list(start=seq(-1,7), end=seq(1,9), width=rep(3,9), stype='DNA'), max.mismatch=3) + .test_match_output(subject=DNAString("A"), pattern="---", algo="auto", + exp_match=list(start=integer(0L), end=integer(0L), width=integer(0L), stype='DNA')) +}) + +test_that("vmatchPattern, vcountPattern work correctly", { + for(algo in all_algos[c(1:4)]){ + .test_vmatch_output(subject=dss, pattern="ATG", algo=algo, + exp_match=lapply(seq_len(2), \(x) list(start=c(1,5,9), end=c(3,7,11), width=rep(3,3)))) + .test_vmatch_output(subject=rep(sss,2L), pattern="ATG", algo=algo, + exp_match=lapply(seq_len(2), \(x) list(start=c(1,5,9), end=c(3,7,11), width=rep(3,3)))) + # XStringViews objects aren't supported yet + # .test_vmatch_output(subject=vdna, pattern="ATG", algo=algo, + # exp_match=lapply(seq_len(2), \(x) list(start=c(1,5,9), end=c(3,7,11), width=rep(3,3)))) + } + + ## sad path + expect_error(vmatchPattern("ATG", dna), "please use matchPattern") + expect_error(vmatchPattern("ATG", mdna), "please use matchPattern") + expect_error(vcountPattern("ATG", dna), "please use countPattern") + expect_error(vcountPattern("ATG", mdna), "please use countPattern") + + ## TODO: more sad path options + expect_equal(vcountPattern("TG", vdna), c(1L,1L)) +}) + diff --git a/tests/testthat/test-miscXStringMethods.R b/tests/testthat/test-miscXStringMethods.R new file mode 100644 index 00000000..9fc4cdf3 --- /dev/null +++ b/tests/testthat/test-miscXStringMethods.R @@ -0,0 +1,111 @@ +## Miscellaneous XString Methods +## Currently includes: +## - toComplex +## - xscat +## - strsplit, unstrsplit (for XStringSet, XStringSetList) +## - N50 + +test_that("toComplex conversion works correctly", { + seq <- DNAString("agtcagtcagtcagtc") + baseValues1 <- c(A=1+0i, G=0+1i, T=-1+0i, C=0-1i) + expect_equal(toComplex(seq, baseValues1), unname(rep(baseValues1, 4))) + + ## GC content: + baseValues2 <- c(A=0, C=1, G=1, T=0) + expect_equal(sum(as.integer(toComplex(seq, baseValues2))), 8L) + + expect_error(toComplex(seq, 1:4), "'baseValues' must have names") + expect_error(toComplex(seq, c(W=1,X=1,Y=1,Z=1)), "'baseValues' names must be valid DNA letters") +}) + +test_that("xscat concatenates all types correctly", { + s1 <- "ATGATG" + s2 <- "CCACCA" + + ## character + expect_true(xscat(s1, s2) == BString(paste0(s1, s2))) + expect_error(xscat(s1, NA), "arguments must be character vectors (with no NAs)", fixed=TRUE) + + ## XString + d1 <- DNAString(s1) + d2 <- DNAString(s2) + d3 <- DNAString(paste0(s1, s2)) + d4 <- DNAString(paste0(s2, s1)) + expect_true(xscat(d1, d2) == d3) + + ## XStringSet + dss1 <- DNAStringSet(list(d1, d2)) + dss2 <- DNAStringSet(list(d2, d1)) + dss3 <- DNAStringSet(list(d3, d4)) + expect_true(all(xscat(dss1, dss2) == dss3)) + + v1 <- Views(d3, start=c(1,7), width=3) + v2 <- Views(d3, start=c(4,10), width=3) + expect_true(all(xscat(v1, v2) == DNAStringSet(c(s1, s2)))) +}) + +test_that("padAndClip, stackStrings have correct functionality", { + ## just replicating the stuff in the examples + ## can add more tests later if necessary, I don't see these used super often + x <- BStringSet(c(seq1="ABCD", seq2="abcdefghijk", seq3="", seq4="XYZ")) + + p1 <- padAndClip(x, IRanges(3, 8:5), Lpadding.letter=">", Rpadding.letter="<") + expect_equal(unname(as.character(p1)), c("CD<<<<", "cdefg", "<<<<", "Z<<")) + + p2 <- padAndClip(x, IRanges(1:-2, 7), Lpadding.letter=">", Rpadding.letter="<") + expect_equal(unname(as.character(p2)), c("ABCD<<<", ">abcdefg", ">><<<<<<<", ">>>XYZ<<<<")) + + expect_equal(width(stackStrings(x, 2, 8)), rep(7L, 4L)) + + exp_out <- c("###ABCD....", "ijk........", "#########..", "##########X") + s1 <- stackStrings(x, -2, 8, shift=c(0, -11, 6, 7), Lpadding.letter="#", Rpadding.letter=".") + expect_equal(width(s1), rep(11, 4L)) + expect_equal(unname(as.character(s1)), exp_out) + + s2 <- stackStrings(x, -2, 8, shift=c(0, -14, 6, 7), + Lpadding.letter="#", Rpadding.letter=".", + remove.out.of.view.strings=TRUE) + expect_equal(names(s2), paste0("seq", c(1,3,4))) + expect_equal(unname(as.character(s2)), exp_out[c(1,3,4)]) +}) + +test_that("Biostrings strsplit, unstrsplit methods work correctly", { + ds <- "ATGCATGC" + d1 <- DNAStringSet(ds) + d2 <- DNAStringSet("ATCATC") + d_spl <- DNAStringSet(c("AT", "CAT", "C")) + + expect_true(all(strsplit(d1, "G") == d_spl)) + + expect_true(unstrsplit(strsplit(d1, "G")) == d2) + + ## round trip identity + expect_true(unstrsplit(strsplit(d1, "G"), "G") == d1) + + ## unstrsplit on a stringset should be a noop + expect_identical(unstrsplit(d1), d1) +}) + +test_that("N50 has correct behavior", { + set.seed(6L) + n_seq <- 20L + possible_len <- seq(100,10000) + lens <- sample(possible_len, n_seq) + many_seqs <- DNAStringSet( + vapply(seq_len(n_seq), \(i){ + paste(sample(DNA_BASES, lens[i], replace=TRUE), collapse='') + }, character(1L)) + ) + + ## N50 is calculated by: + ## 1. add sizes of contigs until half the total size is reached + all_widths <- sort(width(many_seqs), decreasing=TRUE) + total_width <- sum(all_widths) + cs <- cumsum(all_widths) + pos_first <- which.max(cs > total_width/2) + + ## 2. take the size of the contig that was added last + N50_exp <- all_widths[pos_first] + + expect_equal(N50(width(many_seqs)), N50_exp) +}) \ No newline at end of file diff --git a/tests/testthat/test-miscellaneous.R b/tests/testthat/test-miscellaneous.R new file mode 100644 index 00000000..f8b9dff5 --- /dev/null +++ b/tests/testthat/test-miscellaneous.R @@ -0,0 +1,46 @@ +## Miscellaneous tests +## these are all relatively low priority and/or for files with only a few things +## some tests are just for internal functions + +test_that("coloring works for DNA, RNA, and AA", { + ## not a super important test + dna_rna_expected <- c(DNA_BASES, "U", DNA_ALPHABET[-c(1:4,16:18)]) + expect_true(!any(duplicated(Biostrings:::make_DNA_AND_RNA_COLORED_LETTERS()))) + expect_equal(sort(names(Biostrings:::make_DNA_AND_RNA_COLORED_LETTERS())), sort(dna_rna_expected)) + + aa_expected <- AA_ALPHABET[-c(27:30)] + expect_true(!any(duplicated(Biostrings:::make_AA_COLORED_LETTERS()))) + expect_equal(sort(names(Biostrings:::make_AA_COLORED_LETTERS())), sort(aa_expected)) +}) + +test_that("utils functions work as they should", { + expect_true(Biostrings:::isNumericOrNAs(NA_character_)) + expect_true(Biostrings:::isNumericOrNAs(NA_real_)) + expect_true(Biostrings:::isNumericOrNAs(1)) + expect_true(Biostrings:::isNumericOrNAs(1L)) + expect_true(Biostrings:::isNumericOrNAs(1:2)) + expect_false(Biostrings:::isNumericOrNAs(NULL)) + expect_false(Biostrings:::isNumericOrNAs('a')) + + expect_error(Biostrings:::pow.int('a', 1), "must be a numeric vector") + expect_identical(Biostrings:::pow.int(3,5), as.integer(3**5)) + + expect_error(Biostrings:::normargUseNames(NA), "must be TRUE or FALSE") + expect_true(Biostrings:::normargUseNames(NULL)) + expect_true(Biostrings:::normargUseNames(TRUE)) + expect_false(Biostrings:::normargUseNames(FALSE)) +}) + +test_that("longestConsecutive still functions", { + ## adapted from the examples in the man page + v <- c("AAACTGTGFG", "GGGAATT", "CCAAAAAAAAAATT") + expect_equal(longestConsecutive(v, "A"), c(3L, 2L, 10L)) + expect_equal(longestConsecutive(v, "C"), c(1L, 0L, 2L)) + expect_equal(longestConsecutive(v, "C"), c(1L, 0L, 2L)) + expect_error(longestConsecutive(v, NA), "'letter' must be a character variable") + expect_error(longestConsecutive(NA, "A"), "'x' must be a string") +}) + +test_that("matchprobes is deprecated", { + expect_warning(matchprobes("A","A"), "matchprobes() is deprecated.", fixed=TRUE) +}) diff --git a/tests/testthat/test-replaceAt.R b/tests/testthat/test-replaceAt.R new file mode 100644 index 00000000..39634e43 --- /dev/null +++ b/tests/testthat/test-replaceAt.R @@ -0,0 +1,192 @@ +## testing for both replaceAt.R and replaceLetterAt.R +## +## these export the following: +## - extractAt +## - replaceAt +## - replaceLetterAt +## - (.inplaceReplaceLetterAt) +## These functions work like subseq(<-) but without copying data +## replaceLetterAt will make a copy of the sequence. +## .inplaceReplaceLetterAt will not, but its use is discouraged. +## note that replaceLetterAt is only defined for character, DNAString, DNAStringSet + +test_that("replaceLetterAt has expected behavior", { + str <- "AGAGAGAGAGAG" + d <- DNAString(str) + dss <- DNAStringSet(list(d,d)) + pos_replace <- c(1,3,5,7,8) + mat_replace <- matrix(rep(FALSE, length(d)*2), nrow=2L, byrow=TRUE) + mat_replace[,pos_replace] <- TRUE + r_str <- "CGCGCGCCAGAG" + replace_dss <- DNAStringSet(rep(paste(rep("C", length(pos_replace)), collapse=''), 2)) + #expect_equal(replaceLetterAt(str, pos_replace, "C"), replaced_str) + expect_equal(replaceLetterAt(d, pos_replace, rep("C", length(pos_replace))), DNAString(r_str)) + expect_equal(replaceLetterAt(d, mat_replace[1,], rep("C", length(pos_replace))), DNAString(r_str)) + expect_equal(replaceLetterAt(dss, mat_replace, replace_dss), DNAStringSet(c(r_str,r_str))) + + d2 <- DNAString("RMWSYK") + expect_equal(replaceLetterAt(d2, 1:6, rep(c("A","M"), each=3)), DNAString("AAAMMM")) + expect_equal(replaceLetterAt(d2, 1:6, rep(c("A","M"), each=3), if.not.extending="merge"), DNAString("RMWVHN")) + expect_equal(replaceLetterAt(d2, 1:6, rep(c("A","M"), each=3), if.not.extending="skip"), d2) + expect_error(replaceLetterAt(d2, 1:6, rep(c("A","M"), each=3), if.not.extending="error"), "does not extend old letter") +}) + +## old tests + +### From an XStringSet object +### ========================= + +### Only compare the classes, the shapes (i.e. lengths + elementNROWS + +### names), the inner names, and the sequence contents. Doesn't look at +### the metadata or the metadata columns (outer or inner). +identical_XStringSetList <- function(target, current){ + ok1 <- identical(class(target), class(current)) + ok2 <- identical(elementNROWS(target), elementNROWS(current)) + unlisted_target <- unlist(target, use.names=FALSE) + unlisted_current <- unlist(current, use.names=FALSE) + ok3 <- identical(names(unlisted_target), names(unlisted_current)) + ok4 <- all(unlisted_target == unlisted_current) + ok1 && ok2 && ok3 && ok4 +} + +### Only compare the classes, lengths, names, and sequence contents. +### Doesn't look at the metadata or the metadata columns. +identical_XStringSet <- function(target, current){ + ok1 <- identical(class(target), class(current)) + ok2 <- identical(names(target), names(current)) + ok3 <- all(target == current) + ok1 && ok2 && ok3 +} + +# randomDisjointRanges <- function(min_start, max_end, min_width, max_width, n){ +# offset <- min_start - 1L +# n1 <- n * (1.1 + max_width/(max_end-offset+1L)) # very approximative + +# some_starts <- sample(max_end-offset+1L, n1, replace=TRUE) + offset +# some_widths <- sample(min_width:max_width, n1, replace=TRUE) +# some_ranges <- IRanges(some_starts, width=some_widths) +# some_ranges <- some_ranges[end(some_ranges) <= max_end] +# ans <- disjoin(some_ranges) +# if (min_width == 0L) { +# is_empty <- width(some_ranges) == 0L +# some_empty_ranges <- some_ranges[is_empty] +# ans <- sample(c(ans, some_empty_ranges)) +# } +# if (length(ans) < n) +# stop("internal error, sorry") +# head(ans, n=n) +# } + + +test_that("old extractAt tests still work", { + x <- BStringSet(c(seq1="ABCD", seq2="abcdefghijk", seq3="XYZ")) + at <- IRangesList(IRanges(c(2, 1), c(3, 0)), + IRanges(c(7, 2, 12, 7), c(6, 5, 11, 8)), + IRanges(2, 2)) + ### Set inner names on 'at'. + unlisted_at <- unlist(at) + names(unlisted_at) <- paste0("rg", sprintf("%02d", seq_along(unlisted_at))) + at <- relist(unlisted_at, at) + + res1a <- extractAt(x, at) + res1b <- BStringSetList(mapply(extractAt, x, at)) + expect_true(identical_XStringSetList(res1a, res1b)) + + res2a <- extractAt(x, at[3]) + res2b <- BStringSetList(mapply(extractAt, x, at[3])) + expect_true(identical_XStringSetList(res2a, res2b)) + res2c <- extractAt(x, at[[3]]) + expect_true(identical_XStringSetList(res2a, res2c)) + + ## performance testing is cut, this is too time consuming + ### Testing performance with 2 millions small extractions at random + ### locations in Fly upstream sequences: + # dm3_upstream_filepath <- system.file("extdata", + # "dm3_upstream2000.fa.gz", + # package="Biostrings") + # dm3_upstream <- readDNAStringSet(dm3_upstream_filepath) + # dm3_upstream <- dm3_upstream[width(dm3_upstream) == 2000L] + + # set.seed(33) + # ## cut this down significantly from 2,000,000 + # ## performance is not important for unit testing + # NE <- 2000000L # total number of extractions + + # sample_size <- NE * 1.1 + # some_ranges <- IRanges(sample(2001L, sample_size, replace=TRUE), + # width=sample(0:75, sample_size, replace=TRUE)) + # some_ranges <- head(some_ranges[end(some_ranges) <= 2000L], n=NE) + # split_factor <- factor(sample(length(dm3_upstream), NE, replace=TRUE), + # levels=seq_along(dm3_upstream)) + # at <- unname(split(some_ranges, split_factor)) + + # ### Takes < 1s on my machine. + # res3a <- extractAt(dm3_upstream, at) + + ### Equivalent but about 250x slower than the above on my machine. + #system.time(res3b <- DNAStringSetList(mapply(extractAt, dm3_upstream, at))) + #expect_true(identical_XStringSetList(res3a, res3b)) +}) + +test_that("old replaceAt tests still work", { + ### On an XString object + ### ==================== + + ### Testing performance with half-million small substitutions at random + ### locations in Celegans chrI: + # library(BSgenome.Celegans.UCSC.ce2) + # genome <- BSgenome.Celegans.UCSC.ce2 + # chrI <- genome$chrI + # at4 <- randomDisjointRanges(1L, nchar(chrI), 0L, 20L, 500000L) + # ### Takes < 1s on my machine (64-bit Ubuntu). + # system.time(current <- replaceAt(chrI, at4, Views(chrI, at4))) + # stopifnot(current == chrI) + + # ### Testing performance with half-million single-letter insertions at random + # ### locations in Celegans chrI: + # at5 <- randomDisjointRanges(1L, nchar(chrI), 0L, 0L, 500000L) + # ### Takes < 1s on my machine (64-bit Ubuntu). + # system.time(current <- replaceAt(chrI, at5, value="-")) + # m <- matchPattern("-", current) + # stopifnot(identical(sort(start(at5)), start(m) - seq_along(at5) + 1L)) + + # system.time(current2 <- replaceAt(chrI, start(at5), value="-")) + # stopifnot(identical(current, current2)) + + # matchPattern("---", current) + + ### On an XStringSet object + ### ======================= + + x <- BStringSet(c(seq1="ABCD", seq2="abcdefghijk", seq3="XYZ")) + at <- IRangesList(IRanges(c(2, 1), c(3, 0)), + IRanges(c(7, 2, 12, 7), c(6, 5, 11, 8)), + IRanges(2, 2)) + ### Set inner names on 'at'. + unlisted_at <- unlist(at) + names(unlisted_at) <- paste0("rg", sprintf("%02d", seq_along(unlisted_at))) + at <- relist(unlisted_at, at) + + current <- replaceAt(x, at, value=extractAt(x, at)) # no-op + expect_true(identical_XStringSet(x, current)) + + res1a <- replaceAt(x, at) # deletions + res1b <- mendoapply(replaceAt, x, at) + expect_true(identical_XStringSet(res1a, res1b)) + + ## cutting this as well + ### Testing performance with half-million single-letter insertions at random + ### locations in Fly upstream sequences: + ## + # set.seed(33) + # split_factor <- factor(sample(length(dm3_upstream), 500000L, replace=TRUE), + # levels=seq_along(dm3_upstream)) + # at5 <- unname(split(sample(2001L, 500000L, replace=TRUE), + # split_factor)) + # ### Takes < 1s on my machine. + # system.time(res5a <- replaceAt(dm3_upstream, at5, value="-")) + # ### Equivalent but about 1400x slower than the above on my machine. + # system.time(res5b <- mendoapply(replaceAt, + # dm3_upstream, as(at5, "List"), as("-", "List"))) + # stopifnot(identical_XStringSet(res5a, res5b)) +}) \ No newline at end of file diff --git a/tests/testthat/test-translate.R b/tests/testthat/test-translate.R new file mode 100644 index 00000000..293d2462 --- /dev/null +++ b/tests/testthat/test-translate.R @@ -0,0 +1,125 @@ +## translate.R exports the following: +## - translate (DNAString, RNAString, DNAStringSet, RNAStringSet, +## MaskedDNAString, MaskedRNAString, Default) +## - codons (DNAString, RNAString, MaskedDNAString, MaskedRNAString, Default) + +d <- DNAString(paste(names(GENETIC_CODE), collapse='')) +r <- as(d, "RNAString") +a <- AAString(paste(GENETIC_CODE, collapse='')) + +test_that("translate works properly for all dna,rna", { + ## basic conversion + expect_identical(translate(d), a) + expect_identical(translate(r), a) + + dd <- DNAStringSet(list(d,d)) + rr <- RNAStringSet(list(r,r)) + aa <- AAStringSet(list(a,a)) + expect_identical(translate(dd) == aa, c(TRUE, TRUE)) + expect_identical(translate(rr) == aa, c(TRUE, TRUE)) + + ## errors + expect_error(translate(BString()), "unable to find an inherited method") + expect_error(translate(AAString()), "unable to find an inherited method") + expect_error(translate("ATG"), "unable to find an inherited method") + + alt_gen_code <- GENETIC_CODE + alt_gen_code[] <- rep(c("A", "B"), each=32L) + alt_a <- AAString(paste(rep(c("A", "B"), each=32L), collapse='')) + alt_aa <- AAStringSet(list(alt_a, alt_a)) + expect_identical(translate(d, genetic.code=alt_gen_code), alt_a) + expect_identical(translate(r, genetic.code=alt_gen_code), alt_a) + expect_identical(translate(dd, genetic.code=alt_gen_code) == alt_aa, c(TRUE, TRUE)) + expect_identical(translate(rr, genetic.code=alt_gen_code) == alt_aa, c(TRUE, TRUE)) + + ## fault paths + expect_error(translate(d, genetic.code=character(0L)), + "'genetic.code' must have the same names as predefined constant GENETIC_CODE", fixed=TRUE) + expect_error(translate(d, genetic.code=GENETIC_CODE[seq_len(10L)]), + "'genetic.code' must have the same names as predefined constant GENETIC_CODE", fixed=TRUE) + alt_gen_code[] <- '' + expect_error(translate(d, genetic.code=alt_gen_code), + "'genetic.code' must contain 1-letter strings", fixed=TRUE) + + ## this is promoted to an error with this test case + alt_gen_code[] <- '/' + expect_error(translate(d, genetic.code=alt_gen_code), + "some codons in 'genetic.code' are mapped to letters not in the Amino Acid\n alphabet", fixed=TRUE) + + ## alt_init_codon args + ## error message could be more informative here + error_msg <- "'genetic.code' must have an \"alt_init_codons\" attribute" + alt_gen_code <- GENETIC_CODE + attr(alt_gen_code, "alt_init_codons")[] <- c("BLAH", "NOTANAMINOACID") + expect_error(translate(d, genetic.code=alt_gen_code), error_msg, fixed=TRUE) + attr(alt_gen_code, "alt_init_codons")[] <- c("AAA", "AAA") + expect_error(translate(d, genetic.code=alt_gen_code), error_msg, fixed=TRUE) + attr(alt_gen_code, "alt_init_codons")[] <- c(NA_character_, "AAA") + expect_error(translate(d, genetic.code=alt_gen_code), error_msg, fixed=TRUE) + attr(alt_gen_code, "alt_init_codons")[] <- c(1,2) + expect_error(translate(d, genetic.code=alt_gen_code), error_msg, fixed=TRUE) + attr(alt_gen_code, "alt_init_codons") <- NULL + expect_error(translate(d, genetic.code=alt_gen_code), error_msg, fixed=TRUE) + + ## alternate init codons + dna1 <- DNAString("TTGATATGGCCCTTATAA") + expect_identical(translate(dna1), AAString("MIWPL*")) + expect_identical(translate(dna1, no.init.codon=TRUE), AAString("LIWPL*")) + rna1 <- as(dna1, "RNAString") + expect_identical(translate(rna1), AAString("MIWPL*")) + expect_identical(translate(rna1, no.init.codon=TRUE), AAString("LIWPL*")) + + ## fuzzy codons + dna1 <- DNAString("TTYTTTTTC") + expect_error(translate(dna1), "not a base at pos 3") + expect_identical(translate(dna1, if.fuzzy.codon='solve'), AAString("FFF")) + expect_identical(translate(dna1, if.fuzzy.codon='X'), AAString("XFF")) + expect_identical(translate(dna1, if.fuzzy.codon='error.if.X'), AAString("XFF")) + + dna2 <- DNAString("TTNTTYTTC") + expect_identical(translate(dna2, if.fuzzy.codon='solve'), AAString("XFF")) + expect_identical(translate(dna2, if.fuzzy.codon='X'), AAString("XXF")) + expect_error(translate(dna2, if.fuzzy.codon='error.if.X'), "ambiguous fuzzy codon starting at pos 1") + + expect_error(translate(dna2, if.fuzzy.codon='blahblahblaherror'), '\'arg\' should be one of ', fixed=TRUE) +}) + +test_that("translate, codons work properly for masked strings and string sets", { + mask0 <- Mask(mask.width=nchar(d), start=c(4,10), width=3) + mask1 <- Mask(mask.width=nchar(d), start=c(4,10), width=4) + mask2 <- Mask(mask.width=nchar(d), start=1, width=5) + + dm <- d + rm <- r + masks(dm) <- masks(rm) <- mask0 + + m_a <- AAString(paste(GENETIC_CODE[-c(2,4)], collapse='')) + expect_identical(translate(dm), m_a) + expect_identical(translate(rm), m_a) + + codon_views_d <- codons(dm) + codon_views_r <- codons(rm) + svec <- c(1L, 7L, seq(13L,nchar(d),3L)) + expect_identical(start(codon_views_d), svec) + expect_identical(start(codon_views_r), svec) + expect_identical(end(codon_views_d), svec+2L) + expect_identical(end(codon_views_r), svec+2L) + + masks(d) <- mask1 + expect_warning(translate(d), "last base was ignored") + +}) + +test_that("codon function works properly", { + expect_identical(codons(d), Views(d, start=seq(1,nchar(d),3), width=3)) + expect_identical(codons(r), Views(r, start=seq(1,nchar(d),3), width=3)) + + expect_warning(codons(DNAString("AT")), "the number of nucleotides in 'x' is not a multiple of 3") + expect_warning(codons(DNAString("ATATA")), "the number of nucleotides in 'x' is not a multiple of 3") + expect_warning(codons(RNAString("AU")), "the number of nucleotides in 'x' is not a multiple of 3") + expect_warning(codons(RNAString("AUAUA")), "the number of nucleotides in 'x' is not a multiple of 3") + + expect_error(codons(AAString()), "unable to find an inherited method for function") + expect_error(codons(BString()), "unable to find an inherited method for function") + expect_error(codons("ATG"), "unable to find an inherited method for function") +}) \ No newline at end of file diff --git a/tests/testthat/test-trimLRPatterns.R b/tests/testthat/test-trimLRPatterns.R new file mode 100644 index 00000000..e6374291 --- /dev/null +++ b/tests/testthat/test-trimLRPatterns.R @@ -0,0 +1,62 @@ +## trimLRPatterns.R exports trimLRPatterns for the following classes: +## - character +## - XString +## - XStringSet + +test_that("trimLRPatterns works correctly on supported input types", { + ## tests based on the examples in the docs + L <- "TTCTGCTTG" + R <- "GATCGGAAG" + DChr <- "TTCTGCTTGACGTGATCGGA" + DStr <- DNAString(DChr) + DSet <- DNAStringSet(c(DChr, DChr)) + + exp1 <- "ACGTGATCGGA" + exp2 <- "TTCTGCTTGACGT" + exp3 <- "ACGT" + + DSet2 <- DNAStringSet(c("TGCTTGACGGCAGATCGG", "TTCTGCTTGGATCGGAAG")) + + ## Perfect matches on the flanks + ### left match + expect_true(trimLRPatterns(Lpattern = L, subject = DChr) == exp1) + expect_true(trimLRPatterns(Lpattern = L, subject = DStr) == DNAString(exp1)) + expect_true(all(trimLRPatterns(Lpattern = L, subject = DSet) == DNAStringSet(rep(exp1, 2L)))) + + ### right match + expect_true(trimLRPatterns(Rpattern = R, subject = DChr) == exp2) + expect_true(trimLRPatterns(Rpattern = R, subject = DStr) == DNAString(exp2)) + expect_true(all(trimLRPatterns(Rpattern = R, subject = DSet) == DNAStringSet(rep(exp2, 2L)))) + + ### both match, perfect matches on flanking overlaps + expect_true(trimLRPatterns(Lpattern=L, Rpattern = R, subject = DChr) == exp3) + expect_true(trimLRPatterns(Lpattern=L, Rpattern = R, subject = DStr) == DNAString(exp3)) + expect_true(all(trimLRPatterns(Lpattern=L, Rpattern = R, subject = DSet) == DNAStringSet(rep(exp3, 2L)))) + + ## Mismatches on the flanks + + ## Allow for mismatches on the flanks + t1 <- trimLRPatterns(Lpattern = L, Rpattern = R, subject = DSet2, + max.Lmismatch = 0.2, max.Rmismatch = 0.2) + expect_true(all(t1 == DNAStringSet(c("ACGGCA", "")))) + + maxMismatches <- as.integer(0.2 * 1:9) + maxMismatches + t2 <- trimLRPatterns(Lpattern = L, Rpattern = R, subject = DSet2, + max.Lmismatch = maxMismatches, max.Rmismatch = maxMismatches) + ## This should have a better example + expect_true(all(t1 == t2)) + + t3 <- trimLRPatterns(Lpattern = L, Rpattern = R, subject = DSet2[1], + max.Lmismatch = maxMismatches, max.Rmismatch = maxMismatches, + with.Rindels=TRUE) + expect_true(t3 == DNAStringSet("ACGGC")) + t4 <- trimLRPatterns(Lpattern = L, Rpattern = R, subject = DSet2[1], + max.Lmismatch = maxMismatches, max.Rmismatch = maxMismatches, + with.Rindels=TRUE, with.Lindels=TRUE) + expect_true(t4 == DNAStringSet("CGGC")) + + + ## Produce ranges that can be an input into other functions + expect_s4_class(trimLRPatterns(Lpattern=L, Rpattern=R, subject=DSet2, ranges=TRUE), "IRanges") +}) \ No newline at end of file From f292546c438dbac1caff67fc2b4be287630afeb6 Mon Sep 17 00:00:00 2001 From: Aidan Lakshman Date: Thu, 17 Oct 2024 10:53:31 -0400 Subject: [PATCH 3/5] fix: Issue #101 (#111) * fix: fixes bug with 0-length input to consensusMatrix * adds some unit tests --- R/letterFrequency.R | 12 +++++++--- tests/testthat/test-letterFrequency.R | 34 +++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/R/letterFrequency.R b/R/letterFrequency.R index 39d754e8..5aa6dd07 100644 --- a/R/letterFrequency.R +++ b/R/letterFrequency.R @@ -794,9 +794,15 @@ setMethod("consensusMatrix", "XStringSet", } else { removeUnused <- FALSE } - ans <- .Call2("XStringSet_consensus_matrix", - x, shift, width, baseOnly, codes, - PACKAGE="Biostrings") + if(length(x) == 0){ + ans <- matrix(ifelse(as.prob, 0, 0L), nrow=0, + ncol=ifelse(removeUnused, 0, length(codes))) + if(!removeUnused) colnames(ans) <- names(codes) + } else { + ans <- .Call2("XStringSet_consensus_matrix", + x, shift, width, baseOnly, codes, + PACKAGE="Biostrings") + } if (removeUnused) { ans <- ans[rowSums(ans) > 0, , drop=FALSE] } diff --git a/tests/testthat/test-letterFrequency.R b/tests/testthat/test-letterFrequency.R index 2b8dedd5..ce2ce655 100644 --- a/tests/testthat/test-letterFrequency.R +++ b/tests/testthat/test-letterFrequency.R @@ -262,4 +262,38 @@ test_that("nucleotideFrequency methods work properly", { ## disallowed types expect_error(oligonucleotideFrequency(AAString(), "must contain sequences of type DNA or RNA")) expect_error(oligonucleotideFrequency(BStringSet(), "must contain sequences of type DNA or RNA")) +}) + +test_that("zero-length input doesn't cause errors in consensusMatrix, consensusString", { + ## verify previous functionality + exp_output <- matrix(0L, nrow=5, ncol=4) + diag(exp_output) <- 1L + rownames(exp_output) <- c(DNA_BASES, "other") + expect_identical(consensusMatrix(DNAStringSet("ACGT"), baseOnly=TRUE), exp_output) + expect_identical(consensusString(DNAStringSet(c("ATGC", "ATTC"))), "ATKC") + + ## DNA_ALPHABET is a subset of AA_ALPHABET + all_letters <- do.call(union, list(AA_ALPHABET, RNA_ALPHABET)) + exp_zero_output <- matrix(nrow=0, ncol=length(all_letters)) + colnames(exp_zero_output) <- all_letters + + ## having issues testing for matrix equality manually + expect_equal_matrix <- function(m1, m2){ + expect_identical(nrow(m1), nrow(m2)) + expect_identical(colnames(m1), colnames(m2)) + } + expect_equal_matrix(consensusMatrix(DNAStringSet()), exp_zero_output[,DNA_ALPHABET]) + expect_equal_matrix(consensusMatrix(RNAStringSet()), exp_zero_output[,RNA_ALPHABET]) + expect_equal_matrix(consensusMatrix( AAStringSet()), exp_zero_output[, AA_ALPHABET]) + expect_equal_matrix(consensusMatrix( BStringSet()), exp_zero_output[,NULL]) + + ## zero length input for consensusString + expect_identical(consensusString(DNAStringSet()), character(0L)) + expect_identical(consensusString(RNAStringSet()), character(0L)) + expect_identical(consensusString(AAStringSet()), character(0L)) + expect_identical(consensusString(BStringSet()), character(0L)) + expect_identical(consensusString(character(0L)), character(0L)) + + expect_true(is.integer(consensusMatrix(DNAStringSet(), as.prob=FALSE))) + expect_false(is.integer(consensusMatrix(DNAStringSet(), as.prob=TRUE))) }) \ No newline at end of file From 13ae854011ce41cad0f5e499ebbde55631cc39b5 Mon Sep 17 00:00:00 2001 From: Aidan Lakshman Date: Thu, 17 Oct 2024 17:27:39 -0400 Subject: [PATCH 4/5] fix: Issue #106 (#110) * fix: clear qualities from mcol to remove needless warning --- R/QualityScaledXStringSet.R | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/R/QualityScaledXStringSet.R b/R/QualityScaledXStringSet.R index 45b3724d..2ef886ea 100644 --- a/R/QualityScaledXStringSet.R +++ b/R/QualityScaledXStringSet.R @@ -186,6 +186,13 @@ readQualityScaledDNAStringSet <- function(filepath, nrec, skip, seek.first.rec, use.names, with.qualities=TRUE) qualities <- mcols(x)[ , "qualities"] + ## Clear out the qualities parameter from the DNAStringSet, + ## since it gets passed into the QualityScaledDNAStringSet + ## constructor via quals anyway + ## (otherwise we get a warning that doesn't make sense) + mcols(x)[,"qualities"] <- NULL + if(ncol(mcols(x)) == 0) + mcols(x) <- NULL quals <- switch(quality.scoring, phred=PhredQuality(qualities), solexa=SolexaQuality(qualities), @@ -201,4 +208,3 @@ writeQualityScaledXStringSet <- function(x, filepath, writeXStringSet(x, filepath, append, compress, compression_level, format="fastq", qualities=quality(x)) } - From 80bb49c9bdc15175c6721be77ce929d12f3cc7ba Mon Sep 17 00:00:00 2001 From: Aidan Lakshman Date: Fri, 18 Oct 2024 11:03:36 -0400 Subject: [PATCH 5/5] fix: remove expected warning in test --- tests/testthat/test-XStringSet-io.R | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/tests/testthat/test-XStringSet-io.R b/tests/testthat/test-XStringSet-io.R index ae4fcf63..484e3c21 100644 --- a/tests/testthat/test-XStringSet-io.R +++ b/tests/testthat/test-XStringSet-io.R @@ -144,20 +144,16 @@ test_that("XStringSet read/write is correct", { names(x1) <- names_dna writeXStringSet(x1, format="fastq", file=tmpfilefq) - expect_warning(qdna2 <- readQualityScaledDNAStringSet(tmpfilefq), - "metadata columns on input DNAStringSet object were dropped") - + qdna2 <- readQualityScaledDNAStringSet(tmpfilefq) expect_true(!all(quality(qdna2) == quality(qdna1))) expect_true(all(all(as(quality(qdna2),"IntegerList") == 26L))) writeXStringSet(x1, format="fastq", file=tmpfilefq, qualities = q1) - expect_warning(qdna2 <- readQualityScaledDNAStringSet(tmpfilefq), - "metadata columns on input DNAStringSet object were dropped") + qdna2 <- readQualityScaledDNAStringSet(tmpfilefq) expect_true(all(quality(qdna2) == quality(qdna1))) writeXStringSet(qdna1, format="fastq", file=tmpfilefq) - expect_warning(qdna2 <- readQualityScaledDNAStringSet(tmpfilefq), - "metadata columns on input DNAStringSet object were dropped") + qdna2 <- readQualityScaledDNAStringSet(tmpfilefq) expect_true(all(names(qdna2) == names_dna)) expect_true(all(quality(qdna2) == quality(qdna1)))