danforthcenter · joshqsumner · Nov 26, 2024 · Nov 25, 2024 · Nov 25, 2024 · Nov 26, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -33,12 +33,12 @@ Imports:
     ComplexUpset,
     plotly,
     Hmisc,
-    compositions
+    compositions,
+    ShortRead
 Suggests:
     knitr,
     brms,
     NetCoMi,
-    ShortRead,
     Biostrings,
     seqinr,
     ape,

diff --git a/NAMESPACE b/NAMESPACE
@@ -33,10 +33,15 @@ import(patchwork)
 import(stats)
 import(uwot)
 import(viridis)
+importClassesFrom(ShortRead,ShortReadQ)
 importFrom(ComplexUpset,upset)
 importFrom(FactoMineR,PCA)
 importFrom(Hmisc,rcorr)
 importFrom(MASS,glm.nb)
+importFrom(ShortRead,countFastq)
+importFrom(ShortRead,readFastq)
+importFrom(ShortRead,sread)
+importFrom(ShortRead,writeFastq)
 importFrom(compositions,clr)
 importFrom(dbscan,dbscan)
 importFrom(igraph,E)
@@ -72,3 +77,4 @@ importFrom(utils,write.csv)
 importFrom(uwot,umap)
 importFrom(vegan,vegdist)
 importFrom(viridis,scale_fill_viridis)
+importMethodsFrom(ShortRead,append)
diff --git a/R/demultiplex.R b/R/demultiplex.R
@@ -1,6 +1,8 @@
 #' Function to demultiplex sequences either to single files or to multiple files for use with dada2.
 #'
-#' @param reads A ShortReadQ object or a file path to a fastq file that will be read in with
+#' @param fwd_reads A ShortReadQ object or a file path to a fastq file that will be read in with
+#' \code{ShortRead::readFastq}.
+#' @param rev_reads A ShortReadQ object or a file path to a fastq file that will be read in with
 #' \code{ShortRead::readFastq}.
 #' @param barcodes A dataframe of barcodes and metadata for naming or a file path to such a flat file
 #' readable by \code{utils::read.delim}.
@@ -9,12 +11,8 @@
 #' @param rev The column name of the barcodes dataframe representing reverse barcodes.
 #' Defaults to "REV".
 #' @param name One or more column names from the barcodes dataframe to be used in naming samples.
-#' @param mode One of "dada" or "general". This controls how files are written out.
-#' If mode is "dada" then a fastq file will be written out for
-#' each row of the barcodes dataframe and the files will be named using the name argument.
-#' If mode is "general" then each input fastq file will yield one
-#' output fastq file with sample names in that file corresponding to the name argument.
-#' Defaults to "dada".
+#' @param mode Currently only "dada" is supported. This exists for tentative plans to have several
+#' output modes.
 #' @param cores Optionally the number of cores to run in parallel. This defaults to 1 if the "mc.cores"
 #' option is not set.
 #' @param writeOut Logical, should fastq/stats files be written out. Defaults to TRUE.
@@ -27,57 +25,27 @@
 #'
 #' @importFrom utils write.csv
 #' @importFrom methods is
+#' @importClassesFrom ShortRead ShortReadQ
+#' @importMethodsFrom ShortRead append
+#' @importFrom ShortRead readFastq writeFastq sread countFastq
 #' @import parallel
 #'
 #' @return Writes out fq files and summary statistics depending on writeOut and stat arguments.
 #' If both are TRUE then summary stats are returned as a dataframe.
 #'
-#' @examples
-#' # these examples are not run because they require specific fastq files.
-#' \dontrun{
-#' library(ShortRead)
-#' library(Biostrings)
-#' library(parallel)
-#' setwd("~/scripts/SINC/sincUtils/syncomBuilder/nastya96WellEx")
-#'
-#' barcodes <- read.delim("barcode_tab.tsv")
-#' name <- c("Name", "Well")
-#' reads <- ShortRead::readFastq("Plate11_R1_001.fastq.gz")
-#'
-#' demultiplex(reads, barcodes[1:10, ],
-#'   fwd = "FWD", rev = "REV",
-#'   name = name, mode = "dada", cores = 10, writeOut = "tests", stat = FALSE
-#' )
-#'
-#' path <- paste0(
-#'   "/run/user/1000/gvfs/smb-share:server=nas01,share=research/bart_lab/",
-#'   "Previous_people/Mingsheng_Qi/research_dr/SINC/limiting_dilution/20210314/p2p11_rawseq"
-#' )
-#' barcodes <- read.csv(paste0(path, "/barcodetable1.csv"))
-#' barcodes$exp <- "exp"
-#' reads <- ShortRead::readFastq(paste0(path, "/210314S05P10_merge.fq"))
-#'
-#' x <- demultiplex(reads, barcodes,
-#'   fwd = "FWD_primer", rev = "REV_primer", name = c("exp"), mode = "general",
-#'   writeOut = TRUE, stat = TRUE, cores = 10
-#' )
-#' x
-#' # benchmarking against:
-#' # "plate","totalReads","groupedReads","ratio"
-#' # "P10",234509,154507,0.6589
-#' }
-#'
 #' @export
 
-
-demultiplex <- function(reads, barcodes, fwd = "FWD", rev = "REV", name = c("Name", "Sample_name"),
-                        mode = c("dada", "general"), cores = getOption("mc.cores", 1), writeOut = TRUE,
-                        stat = FALSE) {
-  if (is.character(reads)) {
-    reads <- ShortRead::readFastq(reads)
+demultiplex <- function(fwd_reads, rev_reads, barcodes, fwd = "FWD", rev = "REV",
+                        name = c("Name", "Sample_name"), mode = c("dada", "general"),
+                        cores = getOption("mc.cores", 1), writeOut = TRUE, stat = FALSE) {
+  if (is.character(fwd_reads)) {
+    fwd_reads <- ShortRead::readFastq(fwd_reads)
   }
-  if (!methods::is(reads, "ShortReadQ")) {
-    stop("reads must be a ShortReadQ object or a file path to a fastq file.")
+  if (is.character(rev_reads)) {
+    rev_reads <- ShortRead::readFastq(rev_reads)
+  }
+  if (!methods::is(fwd_reads, "ShortReadQ") || !methods::is(rev_reads, "ShortReadQ")) {
+    stop("fwd and rev reads must be a ShortReadQ object or a file path to a fastq file.")
   }
   if (is.character(barcodes)) {
     barcodes <- utils::read.delim(barcodes)
@@ -90,9 +58,9 @@ demultiplex <- function(reads, barcodes, fwd = "FWD", rev = "REV", name = c("Nam
   }
   mode <- mode[1]
   if (mode == "dada") {
-    out <- .demultiplex_for_dada(reads, barcodes, fwd, rev, name, writeOut, stat, cores)
+    out <- .demultiplex_for_dada(fwd_reads, rev_reads, barcodes, fwd, rev, name, writeOut, stat, cores)
   } else if (mode == "general") {
-    out <- .demultiplex_general(reads, barcodes, fwd, rev, name, writeOut, stat, cores)
+    out <- .demultiplex_general(fwd_reads, rev_reads, barcodes, fwd, rev, name, writeOut, stat, cores)
   } else {
     stop("mode must be one of 'dada' or 'general'")
   }
@@ -113,44 +81,33 @@ demultiplex <- function(reads, barcodes, fwd = "FWD", rev = "REV", name = c("Nam
 #' @keywords internal
 #' @noRd
 
-.demultiplex_for_dada <- function(reads, barcodes, fwd = "FWD", rev = "REV",
+.demultiplex_for_dada <- function(fwd_reads, rev_reads, barcodes, fwd = "FWD", rev = "REV",
                                   name = c("Name", "Sample_name"), writeOut = TRUE,
                                   stat = FALSE, cores = 1) {
   # format barcodes to uppercase
   barcodes[[fwd]] <- toupper(barcodes[[fwd]])
   barcodes[[rev]] <- toupper(barcodes[[rev]])
-  # per barcode write out a fq and optionally a summary table
+  fwd_reads_char <- as.character(ShortRead::sread(fwd_reads))
+  rev_reads_char <- as.character(ShortRead::sread(rev_reads))
+
   test <- parallel::mclapply(seq_len(nrow(barcodes)), function(i) {
     # get a name for the sub sample
-    name <- paste(barcodes[i, c(name)], collapse = ".")
+    file_name <- paste(barcodes[i, c(name)], collapse = "_")
     # get barcodes
     fwd_bar <- barcodes[i, fwd]
     rev_bar <- barcodes[i, rev]
-    # filter for barcodes at beginning of sequence
-    rowReads <- reads[grepl(paste0("^", fwd_bar), ShortRead::sread(reads))]
-    columnReads <- reads[grepl(paste0("^", rev_bar), ShortRead::sread(reads))]
-    # make reverse complements of fwd and reverse barcodes
-    fwd_bar_reverse_complement <- as.character(
-      Biostrings::reverseComplement(
-        Biostrings::DNAString(fwd_bar)
-      )
-    )
-    rev_bar_reverse_complement <- as.character(
-      Biostrings::reverseComplement(
-        Biostrings::DNAString(rev_bar)
-      )
-    )
-    # search for reverse complements of opposite barcode at end of sequences
-    rowReads <- rowReads[grepl(
-      paste0(rev_bar_reverse_complement, "$"),
-      ShortRead::sread(rowReads)
-    )]
-    columnReads <- columnReads[grepl(
-      paste0(fwd_bar_reverse_complement, "$"),
-      ShortRead::sread(columnReads)
-    )]
-    # append row and column reads
-    clone <- append(rowReads, columnReads)
+
+    fwd_reads_w_fwd_bars <- which(grepl(paste0("^", fwd_bar), fwd_reads_char))
+    rev_reads_w_rev_bars <- which(grepl(paste0("^", rev_bar), rev_reads_char))
+    fwd_index_both <- intersect(fwd_reads_w_fwd_bars, rev_reads_w_rev_bars) # logical AND
+
+    fwd_reads_w_rev_bars <- which(grepl(paste0("^", rev_bar), fwd_reads_char))
+    rev_reads_w_fwd_bars <- which(grepl(paste0("^", fwd_bar), rev_reads_char))
+    rev_index_both <- intersect(fwd_reads_w_rev_bars, rev_reads_w_fwd_bars) # logical AND
+    # union of both groups of reads
+    total_index <- union(fwd_index_both, rev_index_both) # logical OR
+    fwd_clone <- fwd_reads[total_index]
+    rev_clone <- rev_reads[total_index]
     # write fastq
     if (!is.logical(writeOut)) {
       write <- writeOut
@@ -163,18 +120,23 @@ demultiplex <- function(reads, barcodes, fwd = "FWD", rev = "REV", name = c("Nam
     }
     if (writeOut) {
       ShortRead::writeFastq(
-        object = clone, file = paste0(write, name, ".fq"),
+        object = fwd_clone, file = paste0(write, file_name, "_f.fastq.gz"),
+        mode = "w", compress = TRUE
+      )
+      ShortRead::writeFastq(
+        object = rev_clone, file = paste0(write, file_name, "_r.fastq.gz"),
         mode = "w", compress = TRUE
       )
     }
     if (stat & writeOut) {
       stats <- data.frame(
-        name = name,
-        totalReads = length(reads),
-        groupedReads = ShortRead::countFastq(paste0(name, ".fq"))[1, 1]
+        name = file_name,
+        totalReads = length(fwd_reads),
+        groupedReads_fwd = ShortRead::countFastq(paste0(write, file_name, "_f.fastq.gz"))[1, 1],
+        groupedReads_rev = ShortRead::countFastq(paste0(write, file_name, "_r.fastq.gz"))[1, 1]
       )
-      stats$ratio <- signif(stats$groupedReads / stats$totalReads, digits = 4)
-      utils::write.csv(stats, file = paste0(name, ".csv"), row.names = FALSE)
+      stats$ratio <- signif(stats$groupedReads_fwd / stats$totalReads, digits = 4)
+      utils::write.csv(stats, file = paste0(write, file_name, ".csv"), row.names = FALSE)
       return(stats)
     }
   }, mc.cores = cores)
@@ -195,22 +157,26 @@ demultiplex <- function(reads, barcodes, fwd = "FWD", rev = "REV", name = c("Nam
 #' @keywords internal
 #' @noRd
 
-.demultiplex_general <- function(reads, barcodes, fwd = "FWD", rev = "REV",
-                                 name = c("Name", "Sample_name"),
-                                 writeOut = TRUE, stat = FALSE, cores = 1) {
+.demultiplex_general <- function(fwd_reads, rev_reads, barcodes, fwd = "FWD", rev = "REV",
+                                 name = c("Name", "Sample_name"), writeOut = TRUE,
+                                 stat = FALSE, cores = 1) {
+  # placeholder until I change how general works
+  return(.demultiplex_for_dada(fwd_reads, rev_reads, barcodes, fwd, rev,
+                               name, writeOut,
+                               stat, cores))
   # format barcodes to uppercase
   barcodes[[fwd]] <- toupper(barcodes[[fwd]])
   barcodes[[rev]] <- toupper(barcodes[[rev]])
   # per barcode write out a fq and optionally a summary table
   lapply(seq_len(nrow(barcodes)), function(i) {
     # get a name for the sub sample
-    name <- paste(barcodes[i, c(name)], collapse = ".")
+    name <- paste(barcodes[i, c(name)], collapse = "_")
     # get barcodes
     fwd_bar <- barcodes[i, fwd]
     rev_bar <- barcodes[i, rev]
     # filter for barcodes at beginning of sequence
-    rowReads <- reads[grepl(paste0("^", fwd_bar), ShortRead::sread(reads))]
-    columnReads <- reads[grepl(paste0("^", rev_bar), ShortRead::sread(reads))]
+    rowReads <- fwd_reads[grepl(paste0("^", fwd_bar), ShortRead::sread(fwd_reads))]
+    columnReads <- fwd_reads[grepl(paste0("^", rev_bar), ShortRead::sread(fwd_reads))]
     # make reverse complements of fwd and reverse barcodes
     fwd_bar_reverse_complement <- as.character(
       Biostrings::reverseComplement(
@@ -247,16 +213,16 @@ demultiplex <- function(reads, barcodes, fwd = "FWD", rev = "REV", name = c("Nam
         md <- "a"
       }
       ShortRead::writeFastq(
-        object = clone, file = paste0(write, name, ".fq"),
+        object = clone, file = paste0(write, name, ".fastq.gz"),
         mode = md, compress = TRUE
       )
     }
   })
   if (stat && writeOut) {
     stats <- data.frame(
       name = name,
-      totalReads = length(reads),
-      groupedReads = ShortRead::countFastq(paste0(name, ".fq"))[1, 1]
+      totalReads = length(fwd_reads),
+      groupedReads = ShortRead::countFastq(paste0(name, ".fastq.gz"))[1, 1]
     )
     stats$ratio <- signif(stats$groupedReads / stats$totalReads, digits = 4)
     utils::write.csv(stats, file = paste0(name, ".csv"), row.names = FALSE)

diff --git a/man/demultiplex.Rd b/man/demultiplex.Rd