Skip to content

Commit

Permalink
Merge pull request #13 from danforthcenter/local
Browse files Browse the repository at this point in the history
Local
  • Loading branch information
joshqsumner authored Nov 26, 2024
2 parents 9241adf + 06df97f commit d24aa73
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 141 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,12 @@ Imports:
ComplexUpset,
plotly,
Hmisc,
compositions
compositions,
ShortRead
Suggests:
knitr,
brms,
NetCoMi,
ShortRead,
Biostrings,
seqinr,
ape,
Expand Down
6 changes: 6 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,15 @@ import(patchwork)
import(stats)
import(uwot)
import(viridis)
importClassesFrom(ShortRead,ShortReadQ)
importFrom(ComplexUpset,upset)
importFrom(FactoMineR,PCA)
importFrom(Hmisc,rcorr)
importFrom(MASS,glm.nb)
importFrom(ShortRead,countFastq)
importFrom(ShortRead,readFastq)
importFrom(ShortRead,sread)
importFrom(ShortRead,writeFastq)
importFrom(compositions,clr)
importFrom(dbscan,dbscan)
importFrom(igraph,E)
Expand Down Expand Up @@ -72,3 +77,4 @@ importFrom(utils,write.csv)
importFrom(uwot,umap)
importFrom(vegan,vegdist)
importFrom(viridis,scale_fill_viridis)
importMethodsFrom(ShortRead,append)
156 changes: 61 additions & 95 deletions R/demultiplex.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#' Function to demultiplex sequences either to single files or to multiple files for use with dada2.
#'
#' @param reads A ShortReadQ object or a file path to a fastq file that will be read in with
#' @param fwd_reads A ShortReadQ object or a file path to a fastq file that will be read in with
#' \code{ShortRead::readFastq}.
#' @param rev_reads A ShortReadQ object or a file path to a fastq file that will be read in with
#' \code{ShortRead::readFastq}.
#' @param barcodes A dataframe of barcodes and metadata for naming or a file path to such a flat file
#' readable by \code{utils::read.delim}.
Expand All @@ -9,12 +11,8 @@
#' @param rev The column name of the barcodes dataframe representing reverse barcodes.
#' Defaults to "REV".
#' @param name One or more column names from the barcodes dataframe to be used in naming samples.
#' @param mode One of "dada" or "general". This controls how files are written out.
#' If mode is "dada" then a fastq file will be written out for
#' each row of the barcodes dataframe and the files will be named using the name argument.
#' If mode is "general" then each input fastq file will yield one
#' output fastq file with sample names in that file corresponding to the name argument.
#' Defaults to "dada".
#' @param mode Currently only "dada" is supported. This exists for tentative plans to have several
#' output modes.
#' @param cores Optionally the number of cores to run in parallel. This defaults to 1 if the "mc.cores"
#' option is not set.
#' @param writeOut Logical, should fastq/stats files be written out. Defaults to TRUE.
Expand All @@ -27,57 +25,27 @@
#'
#' @importFrom utils write.csv
#' @importFrom methods is
#' @importClassesFrom ShortRead ShortReadQ
#' @importMethodsFrom ShortRead append
#' @importFrom ShortRead readFastq writeFastq sread countFastq
#' @import parallel
#'
#' @return Writes out fq files and summary statistics depending on writeOut and stat arguments.
#' If both are TRUE then summary stats are returned as a dataframe.
#'
#' @examples
#' # these examples are not run because they require specific fastq files.
#' \dontrun{
#' library(ShortRead)
#' library(Biostrings)
#' library(parallel)
#' setwd("~/scripts/SINC/sincUtils/syncomBuilder/nastya96WellEx")
#'
#' barcodes <- read.delim("barcode_tab.tsv")
#' name <- c("Name", "Well")
#' reads <- ShortRead::readFastq("Plate11_R1_001.fastq.gz")
#'
#' demultiplex(reads, barcodes[1:10, ],
#' fwd = "FWD", rev = "REV",
#' name = name, mode = "dada", cores = 10, writeOut = "tests", stat = FALSE
#' )
#'
#' path <- paste0(
#' "/run/user/1000/gvfs/smb-share:server=nas01,share=research/bart_lab/",
#' "Previous_people/Mingsheng_Qi/research_dr/SINC/limiting_dilution/20210314/p2p11_rawseq"
#' )
#' barcodes <- read.csv(paste0(path, "/barcodetable1.csv"))
#' barcodes$exp <- "exp"
#' reads <- ShortRead::readFastq(paste0(path, "/210314S05P10_merge.fq"))
#'
#' x <- demultiplex(reads, barcodes,
#' fwd = "FWD_primer", rev = "REV_primer", name = c("exp"), mode = "general",
#' writeOut = TRUE, stat = TRUE, cores = 10
#' )
#' x
#' # benchmarking against:
#' # "plate","totalReads","groupedReads","ratio"
#' # "P10",234509,154507,0.6589
#' }
#'
#' @export


demultiplex <- function(reads, barcodes, fwd = "FWD", rev = "REV", name = c("Name", "Sample_name"),
mode = c("dada", "general"), cores = getOption("mc.cores", 1), writeOut = TRUE,
stat = FALSE) {
if (is.character(reads)) {
reads <- ShortRead::readFastq(reads)
demultiplex <- function(fwd_reads, rev_reads, barcodes, fwd = "FWD", rev = "REV",
name = c("Name", "Sample_name"), mode = c("dada", "general"),
cores = getOption("mc.cores", 1), writeOut = TRUE, stat = FALSE) {
if (is.character(fwd_reads)) {
fwd_reads <- ShortRead::readFastq(fwd_reads)
}
if (!methods::is(reads, "ShortReadQ")) {
stop("reads must be a ShortReadQ object or a file path to a fastq file.")
if (is.character(rev_reads)) {
rev_reads <- ShortRead::readFastq(rev_reads)
}
if (!methods::is(fwd_reads, "ShortReadQ") || !methods::is(rev_reads, "ShortReadQ")) {
stop("fwd and rev reads must be a ShortReadQ object or a file path to a fastq file.")
}
if (is.character(barcodes)) {
barcodes <- utils::read.delim(barcodes)
Expand All @@ -90,9 +58,9 @@ demultiplex <- function(reads, barcodes, fwd = "FWD", rev = "REV", name = c("Nam
}
mode <- mode[1]
if (mode == "dada") {
out <- .demultiplex_for_dada(reads, barcodes, fwd, rev, name, writeOut, stat, cores)
out <- .demultiplex_for_dada(fwd_reads, rev_reads, barcodes, fwd, rev, name, writeOut, stat, cores)
} else if (mode == "general") {
out <- .demultiplex_general(reads, barcodes, fwd, rev, name, writeOut, stat, cores)
out <- .demultiplex_general(fwd_reads, rev_reads, barcodes, fwd, rev, name, writeOut, stat, cores)
} else {
stop("mode must be one of 'dada' or 'general'")
}
Expand All @@ -113,44 +81,33 @@ demultiplex <- function(reads, barcodes, fwd = "FWD", rev = "REV", name = c("Nam
#' @keywords internal
#' @noRd

.demultiplex_for_dada <- function(reads, barcodes, fwd = "FWD", rev = "REV",
.demultiplex_for_dada <- function(fwd_reads, rev_reads, barcodes, fwd = "FWD", rev = "REV",
name = c("Name", "Sample_name"), writeOut = TRUE,
stat = FALSE, cores = 1) {
# format barcodes to uppercase
barcodes[[fwd]] <- toupper(barcodes[[fwd]])
barcodes[[rev]] <- toupper(barcodes[[rev]])
# per barcode write out a fq and optionally a summary table
fwd_reads_char <- as.character(ShortRead::sread(fwd_reads))
rev_reads_char <- as.character(ShortRead::sread(rev_reads))

test <- parallel::mclapply(seq_len(nrow(barcodes)), function(i) {
# get a name for the sub sample
name <- paste(barcodes[i, c(name)], collapse = ".")
file_name <- paste(barcodes[i, c(name)], collapse = "_")
# get barcodes
fwd_bar <- barcodes[i, fwd]
rev_bar <- barcodes[i, rev]
# filter for barcodes at beginning of sequence
rowReads <- reads[grepl(paste0("^", fwd_bar), ShortRead::sread(reads))]
columnReads <- reads[grepl(paste0("^", rev_bar), ShortRead::sread(reads))]
# make reverse complements of fwd and reverse barcodes
fwd_bar_reverse_complement <- as.character(
Biostrings::reverseComplement(
Biostrings::DNAString(fwd_bar)
)
)
rev_bar_reverse_complement <- as.character(
Biostrings::reverseComplement(
Biostrings::DNAString(rev_bar)
)
)
# search for reverse complements of opposite barcode at end of sequences
rowReads <- rowReads[grepl(
paste0(rev_bar_reverse_complement, "$"),
ShortRead::sread(rowReads)
)]
columnReads <- columnReads[grepl(
paste0(fwd_bar_reverse_complement, "$"),
ShortRead::sread(columnReads)
)]
# append row and column reads
clone <- append(rowReads, columnReads)

fwd_reads_w_fwd_bars <- which(grepl(paste0("^", fwd_bar), fwd_reads_char))
rev_reads_w_rev_bars <- which(grepl(paste0("^", rev_bar), rev_reads_char))
fwd_index_both <- intersect(fwd_reads_w_fwd_bars, rev_reads_w_rev_bars) # logical AND

fwd_reads_w_rev_bars <- which(grepl(paste0("^", rev_bar), fwd_reads_char))
rev_reads_w_fwd_bars <- which(grepl(paste0("^", fwd_bar), rev_reads_char))
rev_index_both <- intersect(fwd_reads_w_rev_bars, rev_reads_w_fwd_bars) # logical AND
# union of both groups of reads
total_index <- union(fwd_index_both, rev_index_both) # logical OR
fwd_clone <- fwd_reads[total_index]
rev_clone <- rev_reads[total_index]
# write fastq
if (!is.logical(writeOut)) {
write <- writeOut
Expand All @@ -163,18 +120,23 @@ demultiplex <- function(reads, barcodes, fwd = "FWD", rev = "REV", name = c("Nam
}
if (writeOut) {
ShortRead::writeFastq(
object = clone, file = paste0(write, name, ".fq"),
object = fwd_clone, file = paste0(write, file_name, "_f.fastq.gz"),
mode = "w", compress = TRUE
)
ShortRead::writeFastq(
object = rev_clone, file = paste0(write, file_name, "_r.fastq.gz"),
mode = "w", compress = TRUE
)
}
if (stat & writeOut) {
stats <- data.frame(
name = name,
totalReads = length(reads),
groupedReads = ShortRead::countFastq(paste0(name, ".fq"))[1, 1]
name = file_name,
totalReads = length(fwd_reads),
groupedReads_fwd = ShortRead::countFastq(paste0(write, file_name, "_f.fastq.gz"))[1, 1],
groupedReads_rev = ShortRead::countFastq(paste0(write, file_name, "_r.fastq.gz"))[1, 1]
)
stats$ratio <- signif(stats$groupedReads / stats$totalReads, digits = 4)
utils::write.csv(stats, file = paste0(name, ".csv"), row.names = FALSE)
stats$ratio <- signif(stats$groupedReads_fwd / stats$totalReads, digits = 4)
utils::write.csv(stats, file = paste0(write, file_name, ".csv"), row.names = FALSE)
return(stats)
}
}, mc.cores = cores)
Expand All @@ -195,22 +157,26 @@ demultiplex <- function(reads, barcodes, fwd = "FWD", rev = "REV", name = c("Nam
#' @keywords internal
#' @noRd

.demultiplex_general <- function(reads, barcodes, fwd = "FWD", rev = "REV",
name = c("Name", "Sample_name"),
writeOut = TRUE, stat = FALSE, cores = 1) {
.demultiplex_general <- function(fwd_reads, rev_reads, barcodes, fwd = "FWD", rev = "REV",
name = c("Name", "Sample_name"), writeOut = TRUE,
stat = FALSE, cores = 1) {
# placeholder until I change how general works
return(.demultiplex_for_dada(fwd_reads, rev_reads, barcodes, fwd, rev,
name, writeOut,
stat, cores))
# format barcodes to uppercase
barcodes[[fwd]] <- toupper(barcodes[[fwd]])
barcodes[[rev]] <- toupper(barcodes[[rev]])
# per barcode write out a fq and optionally a summary table
lapply(seq_len(nrow(barcodes)), function(i) {
# get a name for the sub sample
name <- paste(barcodes[i, c(name)], collapse = ".")
name <- paste(barcodes[i, c(name)], collapse = "_")
# get barcodes
fwd_bar <- barcodes[i, fwd]
rev_bar <- barcodes[i, rev]
# filter for barcodes at beginning of sequence
rowReads <- reads[grepl(paste0("^", fwd_bar), ShortRead::sread(reads))]
columnReads <- reads[grepl(paste0("^", rev_bar), ShortRead::sread(reads))]
rowReads <- fwd_reads[grepl(paste0("^", fwd_bar), ShortRead::sread(fwd_reads))]
columnReads <- fwd_reads[grepl(paste0("^", rev_bar), ShortRead::sread(fwd_reads))]
# make reverse complements of fwd and reverse barcodes
fwd_bar_reverse_complement <- as.character(
Biostrings::reverseComplement(
Expand Down Expand Up @@ -247,16 +213,16 @@ demultiplex <- function(reads, barcodes, fwd = "FWD", rev = "REV", name = c("Nam
md <- "a"
}
ShortRead::writeFastq(
object = clone, file = paste0(write, name, ".fq"),
object = clone, file = paste0(write, name, ".fastq.gz"),
mode = md, compress = TRUE
)
}
})
if (stat && writeOut) {
stats <- data.frame(
name = name,
totalReads = length(reads),
groupedReads = ShortRead::countFastq(paste0(name, ".fq"))[1, 1]
totalReads = length(fwd_reads),
groupedReads = ShortRead::countFastq(paste0(name, ".fastq.gz"))[1, 1]
)
stats$ratio <- signif(stats$groupedReads / stats$totalReads, digits = 4)
utils::write.csv(stats, file = paste0(name, ".csv"), row.names = FALSE)
Expand Down
52 changes: 8 additions & 44 deletions man/demultiplex.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit d24aa73

Please sign in to comment.