diff --git a/DESCRIPTION b/DESCRIPTION index f135a6f..173bee1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -33,12 +33,12 @@ Imports: ComplexUpset, plotly, Hmisc, - compositions + compositions, + ShortRead Suggests: knitr, brms, NetCoMi, - ShortRead, Biostrings, seqinr, ape, diff --git a/NAMESPACE b/NAMESPACE index 9ffa5c4..a9c507d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -33,10 +33,15 @@ import(patchwork) import(stats) import(uwot) import(viridis) +importClassesFrom(ShortRead,ShortReadQ) importFrom(ComplexUpset,upset) importFrom(FactoMineR,PCA) importFrom(Hmisc,rcorr) importFrom(MASS,glm.nb) +importFrom(ShortRead,countFastq) +importFrom(ShortRead,readFastq) +importFrom(ShortRead,sread) +importFrom(ShortRead,writeFastq) importFrom(compositions,clr) importFrom(dbscan,dbscan) importFrom(igraph,E) @@ -72,3 +77,4 @@ importFrom(utils,write.csv) importFrom(uwot,umap) importFrom(vegan,vegdist) importFrom(viridis,scale_fill_viridis) +importMethodsFrom(ShortRead,append) diff --git a/R/demultiplex.R b/R/demultiplex.R index ddc6480..ae8ec91 100644 --- a/R/demultiplex.R +++ b/R/demultiplex.R @@ -1,6 +1,8 @@ #' Function to demultiplex sequences either to single files or to multiple files for use with dada2. #' -#' @param reads A ShortReadQ object or a file path to a fastq file that will be read in with +#' @param fwd_reads A ShortReadQ object or a file path to a fastq file that will be read in with +#' \code{ShortRead::readFastq}. +#' @param rev_reads A ShortReadQ object or a file path to a fastq file that will be read in with #' \code{ShortRead::readFastq}. #' @param barcodes A dataframe of barcodes and metadata for naming or a file path to such a flat file #' readable by \code{utils::read.delim}. @@ -9,12 +11,8 @@ #' @param rev The column name of the barcodes dataframe representing reverse barcodes. #' Defaults to "REV". #' @param name One or more column names from the barcodes dataframe to be used in naming samples. -#' @param mode One of "dada" or "general". This controls how files are written out. -#' If mode is "dada" then a fastq file will be written out for -#' each row of the barcodes dataframe and the files will be named using the name argument. -#' If mode is "general" then each input fastq file will yield one -#' output fastq file with sample names in that file corresponding to the name argument. -#' Defaults to "dada". +#' @param mode Currently only "dada" is supported. This exists for tentative plans to have several +#' output modes. #' @param cores Optionally the number of cores to run in parallel. This defaults to 1 if the "mc.cores" #' option is not set. #' @param writeOut Logical, should fastq/stats files be written out. Defaults to TRUE. @@ -27,57 +25,27 @@ #' #' @importFrom utils write.csv #' @importFrom methods is +#' @importClassesFrom ShortRead ShortReadQ +#' @importMethodsFrom ShortRead append +#' @importFrom ShortRead readFastq writeFastq sread countFastq #' @import parallel #' #' @return Writes out fq files and summary statistics depending on writeOut and stat arguments. #' If both are TRUE then summary stats are returned as a dataframe. #' -#' @examples -#' # these examples are not run because they require specific fastq files. -#' \dontrun{ -#' library(ShortRead) -#' library(Biostrings) -#' library(parallel) -#' setwd("~/scripts/SINC/sincUtils/syncomBuilder/nastya96WellEx") -#' -#' barcodes <- read.delim("barcode_tab.tsv") -#' name <- c("Name", "Well") -#' reads <- ShortRead::readFastq("Plate11_R1_001.fastq.gz") -#' -#' demultiplex(reads, barcodes[1:10, ], -#' fwd = "FWD", rev = "REV", -#' name = name, mode = "dada", cores = 10, writeOut = "tests", stat = FALSE -#' ) -#' -#' path <- paste0( -#' "/run/user/1000/gvfs/smb-share:server=nas01,share=research/bart_lab/", -#' "Previous_people/Mingsheng_Qi/research_dr/SINC/limiting_dilution/20210314/p2p11_rawseq" -#' ) -#' barcodes <- read.csv(paste0(path, "/barcodetable1.csv")) -#' barcodes$exp <- "exp" -#' reads <- ShortRead::readFastq(paste0(path, "/210314S05P10_merge.fq")) -#' -#' x <- demultiplex(reads, barcodes, -#' fwd = "FWD_primer", rev = "REV_primer", name = c("exp"), mode = "general", -#' writeOut = TRUE, stat = TRUE, cores = 10 -#' ) -#' x -#' # benchmarking against: -#' # "plate","totalReads","groupedReads","ratio" -#' # "P10",234509,154507,0.6589 -#' } -#' #' @export - -demultiplex <- function(reads, barcodes, fwd = "FWD", rev = "REV", name = c("Name", "Sample_name"), - mode = c("dada", "general"), cores = getOption("mc.cores", 1), writeOut = TRUE, - stat = FALSE) { - if (is.character(reads)) { - reads <- ShortRead::readFastq(reads) +demultiplex <- function(fwd_reads, rev_reads, barcodes, fwd = "FWD", rev = "REV", + name = c("Name", "Sample_name"), mode = c("dada", "general"), + cores = getOption("mc.cores", 1), writeOut = TRUE, stat = FALSE) { + if (is.character(fwd_reads)) { + fwd_reads <- ShortRead::readFastq(fwd_reads) } - if (!methods::is(reads, "ShortReadQ")) { - stop("reads must be a ShortReadQ object or a file path to a fastq file.") + if (is.character(rev_reads)) { + rev_reads <- ShortRead::readFastq(rev_reads) + } + if (!methods::is(fwd_reads, "ShortReadQ") || !methods::is(rev_reads, "ShortReadQ")) { + stop("fwd and rev reads must be a ShortReadQ object or a file path to a fastq file.") } if (is.character(barcodes)) { barcodes <- utils::read.delim(barcodes) @@ -90,9 +58,9 @@ demultiplex <- function(reads, barcodes, fwd = "FWD", rev = "REV", name = c("Nam } mode <- mode[1] if (mode == "dada") { - out <- .demultiplex_for_dada(reads, barcodes, fwd, rev, name, writeOut, stat, cores) + out <- .demultiplex_for_dada(fwd_reads, rev_reads, barcodes, fwd, rev, name, writeOut, stat, cores) } else if (mode == "general") { - out <- .demultiplex_general(reads, barcodes, fwd, rev, name, writeOut, stat, cores) + out <- .demultiplex_general(fwd_reads, rev_reads, barcodes, fwd, rev, name, writeOut, stat, cores) } else { stop("mode must be one of 'dada' or 'general'") } @@ -113,44 +81,33 @@ demultiplex <- function(reads, barcodes, fwd = "FWD", rev = "REV", name = c("Nam #' @keywords internal #' @noRd -.demultiplex_for_dada <- function(reads, barcodes, fwd = "FWD", rev = "REV", +.demultiplex_for_dada <- function(fwd_reads, rev_reads, barcodes, fwd = "FWD", rev = "REV", name = c("Name", "Sample_name"), writeOut = TRUE, stat = FALSE, cores = 1) { # format barcodes to uppercase barcodes[[fwd]] <- toupper(barcodes[[fwd]]) barcodes[[rev]] <- toupper(barcodes[[rev]]) - # per barcode write out a fq and optionally a summary table + fwd_reads_char <- as.character(ShortRead::sread(fwd_reads)) + rev_reads_char <- as.character(ShortRead::sread(rev_reads)) + test <- parallel::mclapply(seq_len(nrow(barcodes)), function(i) { # get a name for the sub sample - name <- paste(barcodes[i, c(name)], collapse = ".") + file_name <- paste(barcodes[i, c(name)], collapse = "_") # get barcodes fwd_bar <- barcodes[i, fwd] rev_bar <- barcodes[i, rev] - # filter for barcodes at beginning of sequence - rowReads <- reads[grepl(paste0("^", fwd_bar), ShortRead::sread(reads))] - columnReads <- reads[grepl(paste0("^", rev_bar), ShortRead::sread(reads))] - # make reverse complements of fwd and reverse barcodes - fwd_bar_reverse_complement <- as.character( - Biostrings::reverseComplement( - Biostrings::DNAString(fwd_bar) - ) - ) - rev_bar_reverse_complement <- as.character( - Biostrings::reverseComplement( - Biostrings::DNAString(rev_bar) - ) - ) - # search for reverse complements of opposite barcode at end of sequences - rowReads <- rowReads[grepl( - paste0(rev_bar_reverse_complement, "$"), - ShortRead::sread(rowReads) - )] - columnReads <- columnReads[grepl( - paste0(fwd_bar_reverse_complement, "$"), - ShortRead::sread(columnReads) - )] - # append row and column reads - clone <- append(rowReads, columnReads) + + fwd_reads_w_fwd_bars <- which(grepl(paste0("^", fwd_bar), fwd_reads_char)) + rev_reads_w_rev_bars <- which(grepl(paste0("^", rev_bar), rev_reads_char)) + fwd_index_both <- intersect(fwd_reads_w_fwd_bars, rev_reads_w_rev_bars) # logical AND + + fwd_reads_w_rev_bars <- which(grepl(paste0("^", rev_bar), fwd_reads_char)) + rev_reads_w_fwd_bars <- which(grepl(paste0("^", fwd_bar), rev_reads_char)) + rev_index_both <- intersect(fwd_reads_w_rev_bars, rev_reads_w_fwd_bars) # logical AND + # union of both groups of reads + total_index <- union(fwd_index_both, rev_index_both) # logical OR + fwd_clone <- fwd_reads[total_index] + rev_clone <- rev_reads[total_index] # write fastq if (!is.logical(writeOut)) { write <- writeOut @@ -163,18 +120,23 @@ demultiplex <- function(reads, barcodes, fwd = "FWD", rev = "REV", name = c("Nam } if (writeOut) { ShortRead::writeFastq( - object = clone, file = paste0(write, name, ".fq"), + object = fwd_clone, file = paste0(write, file_name, "_f.fastq.gz"), + mode = "w", compress = TRUE + ) + ShortRead::writeFastq( + object = rev_clone, file = paste0(write, file_name, "_r.fastq.gz"), mode = "w", compress = TRUE ) } if (stat & writeOut) { stats <- data.frame( - name = name, - totalReads = length(reads), - groupedReads = ShortRead::countFastq(paste0(name, ".fq"))[1, 1] + name = file_name, + totalReads = length(fwd_reads), + groupedReads_fwd = ShortRead::countFastq(paste0(write, file_name, "_f.fastq.gz"))[1, 1], + groupedReads_rev = ShortRead::countFastq(paste0(write, file_name, "_r.fastq.gz"))[1, 1] ) - stats$ratio <- signif(stats$groupedReads / stats$totalReads, digits = 4) - utils::write.csv(stats, file = paste0(name, ".csv"), row.names = FALSE) + stats$ratio <- signif(stats$groupedReads_fwd / stats$totalReads, digits = 4) + utils::write.csv(stats, file = paste0(write, file_name, ".csv"), row.names = FALSE) return(stats) } }, mc.cores = cores) @@ -195,22 +157,26 @@ demultiplex <- function(reads, barcodes, fwd = "FWD", rev = "REV", name = c("Nam #' @keywords internal #' @noRd -.demultiplex_general <- function(reads, barcodes, fwd = "FWD", rev = "REV", - name = c("Name", "Sample_name"), - writeOut = TRUE, stat = FALSE, cores = 1) { +.demultiplex_general <- function(fwd_reads, rev_reads, barcodes, fwd = "FWD", rev = "REV", + name = c("Name", "Sample_name"), writeOut = TRUE, + stat = FALSE, cores = 1) { + # placeholder until I change how general works + return(.demultiplex_for_dada(fwd_reads, rev_reads, barcodes, fwd, rev, + name, writeOut, + stat, cores)) # format barcodes to uppercase barcodes[[fwd]] <- toupper(barcodes[[fwd]]) barcodes[[rev]] <- toupper(barcodes[[rev]]) # per barcode write out a fq and optionally a summary table lapply(seq_len(nrow(barcodes)), function(i) { # get a name for the sub sample - name <- paste(barcodes[i, c(name)], collapse = ".") + name <- paste(barcodes[i, c(name)], collapse = "_") # get barcodes fwd_bar <- barcodes[i, fwd] rev_bar <- barcodes[i, rev] # filter for barcodes at beginning of sequence - rowReads <- reads[grepl(paste0("^", fwd_bar), ShortRead::sread(reads))] - columnReads <- reads[grepl(paste0("^", rev_bar), ShortRead::sread(reads))] + rowReads <- fwd_reads[grepl(paste0("^", fwd_bar), ShortRead::sread(fwd_reads))] + columnReads <- fwd_reads[grepl(paste0("^", rev_bar), ShortRead::sread(fwd_reads))] # make reverse complements of fwd and reverse barcodes fwd_bar_reverse_complement <- as.character( Biostrings::reverseComplement( @@ -247,7 +213,7 @@ demultiplex <- function(reads, barcodes, fwd = "FWD", rev = "REV", name = c("Nam md <- "a" } ShortRead::writeFastq( - object = clone, file = paste0(write, name, ".fq"), + object = clone, file = paste0(write, name, ".fastq.gz"), mode = md, compress = TRUE ) } @@ -255,8 +221,8 @@ demultiplex <- function(reads, barcodes, fwd = "FWD", rev = "REV", name = c("Nam if (stat && writeOut) { stats <- data.frame( name = name, - totalReads = length(reads), - groupedReads = ShortRead::countFastq(paste0(name, ".fq"))[1, 1] + totalReads = length(fwd_reads), + groupedReads = ShortRead::countFastq(paste0(name, ".fastq.gz"))[1, 1] ) stats$ratio <- signif(stats$groupedReads / stats$totalReads, digits = 4) utils::write.csv(stats, file = paste0(name, ".csv"), row.names = FALSE) diff --git a/man/demultiplex.Rd b/man/demultiplex.Rd index 46e467e..e665b8e 100644 --- a/man/demultiplex.Rd +++ b/man/demultiplex.Rd @@ -5,7 +5,8 @@ \title{Function to demultiplex sequences either to single files or to multiple files for use with dada2.} \usage{ demultiplex( - reads, + fwd_reads, + rev_reads, barcodes, fwd = "FWD", rev = "REV", @@ -17,7 +18,10 @@ demultiplex( ) } \arguments{ -\item{reads}{A ShortReadQ object or a file path to a fastq file that will be read in with +\item{fwd_reads}{A ShortReadQ object or a file path to a fastq file that will be read in with +\code{ShortRead::readFastq}.} + +\item{rev_reads}{A ShortReadQ object or a file path to a fastq file that will be read in with \code{ShortRead::readFastq}.} \item{barcodes}{A dataframe of barcodes and metadata for naming or a file path to such a flat file @@ -31,12 +35,8 @@ Defaults to "REV".} \item{name}{One or more column names from the barcodes dataframe to be used in naming samples.} -\item{mode}{One of "dada" or "general". This controls how files are written out. -If mode is "dada" then a fastq file will be written out for -each row of the barcodes dataframe and the files will be named using the name argument. -If mode is "general" then each input fastq file will yield one -output fastq file with sample names in that file corresponding to the name argument. -Defaults to "dada".} +\item{mode}{Currently only "dada" is supported. This exists for tentative plans to have several +output modes.} \item{cores}{Optionally the number of cores to run in parallel. This defaults to 1 if the "mc.cores" option is not set.} @@ -53,42 +53,6 @@ If both are TRUE then summary stats are returned as a dataframe. } \description{ Function to demultiplex sequences either to single files or to multiple files for use with dada2. -} -\examples{ -# these examples are not run because they require specific fastq files. -\dontrun{ -library(ShortRead) -library(Biostrings) -library(parallel) -setwd("~/scripts/SINC/sincUtils/syncomBuilder/nastya96WellEx") - -barcodes <- read.delim("barcode_tab.tsv") -name <- c("Name", "Well") -reads <- ShortRead::readFastq("Plate11_R1_001.fastq.gz") - -demultiplex(reads, barcodes[1:10, ], - fwd = "FWD", rev = "REV", - name = name, mode = "dada", cores = 10, writeOut = "tests", stat = FALSE -) - -path <- paste0( - "/run/user/1000/gvfs/smb-share:server=nas01,share=research/bart_lab/", - "Previous_people/Mingsheng_Qi/research_dr/SINC/limiting_dilution/20210314/p2p11_rawseq" -) -barcodes <- read.csv(paste0(path, "/barcodetable1.csv")) -barcodes$exp <- "exp" -reads <- ShortRead::readFastq(paste0(path, "/210314S05P10_merge.fq")) - -x <- demultiplex(reads, barcodes, - fwd = "FWD_primer", rev = "REV_primer", name = c("exp"), mode = "general", - writeOut = TRUE, stat = TRUE, cores = 10 -) -x -# benchmarking against: -# "plate","totalReads","groupedReads","ratio" -# "P10",234509,154507,0.6589 -} - } \keyword{dada2} \keyword{demultiplex,}