Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Local #13

Merged
merged 5 commits into from
Nov 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,12 @@ Imports:
ComplexUpset,
plotly,
Hmisc,
compositions
compositions,
ShortRead
Suggests:
knitr,
brms,
NetCoMi,
ShortRead,
Biostrings,
seqinr,
ape,
Expand Down
6 changes: 6 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,15 @@ import(patchwork)
import(stats)
import(uwot)
import(viridis)
importClassesFrom(ShortRead,ShortReadQ)
importFrom(ComplexUpset,upset)
importFrom(FactoMineR,PCA)
importFrom(Hmisc,rcorr)
importFrom(MASS,glm.nb)
importFrom(ShortRead,countFastq)
importFrom(ShortRead,readFastq)
importFrom(ShortRead,sread)
importFrom(ShortRead,writeFastq)
importFrom(compositions,clr)
importFrom(dbscan,dbscan)
importFrom(igraph,E)
Expand Down Expand Up @@ -72,3 +77,4 @@ importFrom(utils,write.csv)
importFrom(uwot,umap)
importFrom(vegan,vegdist)
importFrom(viridis,scale_fill_viridis)
importMethodsFrom(ShortRead,append)
156 changes: 61 additions & 95 deletions R/demultiplex.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#' Function to demultiplex sequences either to single files or to multiple files for use with dada2.
#'
#' @param reads A ShortReadQ object or a file path to a fastq file that will be read in with
#' @param fwd_reads A ShortReadQ object or a file path to a fastq file that will be read in with
#' \code{ShortRead::readFastq}.
#' @param rev_reads A ShortReadQ object or a file path to a fastq file that will be read in with
#' \code{ShortRead::readFastq}.
#' @param barcodes A dataframe of barcodes and metadata for naming or a file path to such a flat file
#' readable by \code{utils::read.delim}.
Expand All @@ -9,12 +11,8 @@
#' @param rev The column name of the barcodes dataframe representing reverse barcodes.
#' Defaults to "REV".
#' @param name One or more column names from the barcodes dataframe to be used in naming samples.
#' @param mode One of "dada" or "general". This controls how files are written out.
#' If mode is "dada" then a fastq file will be written out for
#' each row of the barcodes dataframe and the files will be named using the name argument.
#' If mode is "general" then each input fastq file will yield one
#' output fastq file with sample names in that file corresponding to the name argument.
#' Defaults to "dada".
#' @param mode Currently only "dada" is supported. This exists for tentative plans to have several
#' output modes.
#' @param cores Optionally the number of cores to run in parallel. This defaults to 1 if the "mc.cores"
#' option is not set.
#' @param writeOut Logical, should fastq/stats files be written out. Defaults to TRUE.
Expand All @@ -27,57 +25,27 @@
#'
#' @importFrom utils write.csv
#' @importFrom methods is
#' @importClassesFrom ShortRead ShortReadQ
#' @importMethodsFrom ShortRead append
#' @importFrom ShortRead readFastq writeFastq sread countFastq
#' @import parallel
#'
#' @return Writes out fq files and summary statistics depending on writeOut and stat arguments.
#' If both are TRUE then summary stats are returned as a dataframe.
#'
#' @examples
#' # these examples are not run because they require specific fastq files.
#' \dontrun{
#' library(ShortRead)
#' library(Biostrings)
#' library(parallel)
#' setwd("~/scripts/SINC/sincUtils/syncomBuilder/nastya96WellEx")
#'
#' barcodes <- read.delim("barcode_tab.tsv")
#' name <- c("Name", "Well")
#' reads <- ShortRead::readFastq("Plate11_R1_001.fastq.gz")
#'
#' demultiplex(reads, barcodes[1:10, ],
#' fwd = "FWD", rev = "REV",
#' name = name, mode = "dada", cores = 10, writeOut = "tests", stat = FALSE
#' )
#'
#' path <- paste0(
#' "/run/user/1000/gvfs/smb-share:server=nas01,share=research/bart_lab/",
#' "Previous_people/Mingsheng_Qi/research_dr/SINC/limiting_dilution/20210314/p2p11_rawseq"
#' )
#' barcodes <- read.csv(paste0(path, "/barcodetable1.csv"))
#' barcodes$exp <- "exp"
#' reads <- ShortRead::readFastq(paste0(path, "/210314S05P10_merge.fq"))
#'
#' x <- demultiplex(reads, barcodes,
#' fwd = "FWD_primer", rev = "REV_primer", name = c("exp"), mode = "general",
#' writeOut = TRUE, stat = TRUE, cores = 10
#' )
#' x
#' # benchmarking against:
#' # "plate","totalReads","groupedReads","ratio"
#' # "P10",234509,154507,0.6589
#' }
#'
#' @export


demultiplex <- function(reads, barcodes, fwd = "FWD", rev = "REV", name = c("Name", "Sample_name"),
mode = c("dada", "general"), cores = getOption("mc.cores", 1), writeOut = TRUE,
stat = FALSE) {
if (is.character(reads)) {
reads <- ShortRead::readFastq(reads)
demultiplex <- function(fwd_reads, rev_reads, barcodes, fwd = "FWD", rev = "REV",
name = c("Name", "Sample_name"), mode = c("dada", "general"),
cores = getOption("mc.cores", 1), writeOut = TRUE, stat = FALSE) {
if (is.character(fwd_reads)) {
fwd_reads <- ShortRead::readFastq(fwd_reads)
}
if (!methods::is(reads, "ShortReadQ")) {
stop("reads must be a ShortReadQ object or a file path to a fastq file.")
if (is.character(rev_reads)) {
rev_reads <- ShortRead::readFastq(rev_reads)
}
if (!methods::is(fwd_reads, "ShortReadQ") || !methods::is(rev_reads, "ShortReadQ")) {
stop("fwd and rev reads must be a ShortReadQ object or a file path to a fastq file.")
}
if (is.character(barcodes)) {
barcodes <- utils::read.delim(barcodes)
Expand All @@ -90,9 +58,9 @@ demultiplex <- function(reads, barcodes, fwd = "FWD", rev = "REV", name = c("Nam
}
mode <- mode[1]
if (mode == "dada") {
out <- .demultiplex_for_dada(reads, barcodes, fwd, rev, name, writeOut, stat, cores)
out <- .demultiplex_for_dada(fwd_reads, rev_reads, barcodes, fwd, rev, name, writeOut, stat, cores)
} else if (mode == "general") {
out <- .demultiplex_general(reads, barcodes, fwd, rev, name, writeOut, stat, cores)
out <- .demultiplex_general(fwd_reads, rev_reads, barcodes, fwd, rev, name, writeOut, stat, cores)
} else {
stop("mode must be one of 'dada' or 'general'")
}
Expand All @@ -113,44 +81,33 @@ demultiplex <- function(reads, barcodes, fwd = "FWD", rev = "REV", name = c("Nam
#' @keywords internal
#' @noRd

.demultiplex_for_dada <- function(reads, barcodes, fwd = "FWD", rev = "REV",
.demultiplex_for_dada <- function(fwd_reads, rev_reads, barcodes, fwd = "FWD", rev = "REV",
name = c("Name", "Sample_name"), writeOut = TRUE,
stat = FALSE, cores = 1) {
# format barcodes to uppercase
barcodes[[fwd]] <- toupper(barcodes[[fwd]])
barcodes[[rev]] <- toupper(barcodes[[rev]])
# per barcode write out a fq and optionally a summary table
fwd_reads_char <- as.character(ShortRead::sread(fwd_reads))
rev_reads_char <- as.character(ShortRead::sread(rev_reads))

test <- parallel::mclapply(seq_len(nrow(barcodes)), function(i) {
# get a name for the sub sample
name <- paste(barcodes[i, c(name)], collapse = ".")
file_name <- paste(barcodes[i, c(name)], collapse = "_")
# get barcodes
fwd_bar <- barcodes[i, fwd]
rev_bar <- barcodes[i, rev]
# filter for barcodes at beginning of sequence
rowReads <- reads[grepl(paste0("^", fwd_bar), ShortRead::sread(reads))]
columnReads <- reads[grepl(paste0("^", rev_bar), ShortRead::sread(reads))]
# make reverse complements of fwd and reverse barcodes
fwd_bar_reverse_complement <- as.character(
Biostrings::reverseComplement(
Biostrings::DNAString(fwd_bar)
)
)
rev_bar_reverse_complement <- as.character(
Biostrings::reverseComplement(
Biostrings::DNAString(rev_bar)
)
)
# search for reverse complements of opposite barcode at end of sequences
rowReads <- rowReads[grepl(
paste0(rev_bar_reverse_complement, "$"),
ShortRead::sread(rowReads)
)]
columnReads <- columnReads[grepl(
paste0(fwd_bar_reverse_complement, "$"),
ShortRead::sread(columnReads)
)]
# append row and column reads
clone <- append(rowReads, columnReads)

fwd_reads_w_fwd_bars <- which(grepl(paste0("^", fwd_bar), fwd_reads_char))
rev_reads_w_rev_bars <- which(grepl(paste0("^", rev_bar), rev_reads_char))
fwd_index_both <- intersect(fwd_reads_w_fwd_bars, rev_reads_w_rev_bars) # logical AND

fwd_reads_w_rev_bars <- which(grepl(paste0("^", rev_bar), fwd_reads_char))
rev_reads_w_fwd_bars <- which(grepl(paste0("^", fwd_bar), rev_reads_char))
rev_index_both <- intersect(fwd_reads_w_rev_bars, rev_reads_w_fwd_bars) # logical AND
# union of both groups of reads
total_index <- union(fwd_index_both, rev_index_both) # logical OR
fwd_clone <- fwd_reads[total_index]
rev_clone <- rev_reads[total_index]
# write fastq
if (!is.logical(writeOut)) {
write <- writeOut
Expand All @@ -163,18 +120,23 @@ demultiplex <- function(reads, barcodes, fwd = "FWD", rev = "REV", name = c("Nam
}
if (writeOut) {
ShortRead::writeFastq(
object = clone, file = paste0(write, name, ".fq"),
object = fwd_clone, file = paste0(write, file_name, "_f.fastq.gz"),
mode = "w", compress = TRUE
)
ShortRead::writeFastq(
object = rev_clone, file = paste0(write, file_name, "_r.fastq.gz"),
mode = "w", compress = TRUE
)
}
if (stat & writeOut) {
stats <- data.frame(
name = name,
totalReads = length(reads),
groupedReads = ShortRead::countFastq(paste0(name, ".fq"))[1, 1]
name = file_name,
totalReads = length(fwd_reads),
groupedReads_fwd = ShortRead::countFastq(paste0(write, file_name, "_f.fastq.gz"))[1, 1],
groupedReads_rev = ShortRead::countFastq(paste0(write, file_name, "_r.fastq.gz"))[1, 1]
)
stats$ratio <- signif(stats$groupedReads / stats$totalReads, digits = 4)
utils::write.csv(stats, file = paste0(name, ".csv"), row.names = FALSE)
stats$ratio <- signif(stats$groupedReads_fwd / stats$totalReads, digits = 4)
utils::write.csv(stats, file = paste0(write, file_name, ".csv"), row.names = FALSE)
return(stats)
}
}, mc.cores = cores)
Expand All @@ -195,22 +157,26 @@ demultiplex <- function(reads, barcodes, fwd = "FWD", rev = "REV", name = c("Nam
#' @keywords internal
#' @noRd

.demultiplex_general <- function(reads, barcodes, fwd = "FWD", rev = "REV",
name = c("Name", "Sample_name"),
writeOut = TRUE, stat = FALSE, cores = 1) {
.demultiplex_general <- function(fwd_reads, rev_reads, barcodes, fwd = "FWD", rev = "REV",
name = c("Name", "Sample_name"), writeOut = TRUE,
stat = FALSE, cores = 1) {
# placeholder until I change how general works
return(.demultiplex_for_dada(fwd_reads, rev_reads, barcodes, fwd, rev,
name, writeOut,
stat, cores))
# format barcodes to uppercase
barcodes[[fwd]] <- toupper(barcodes[[fwd]])
barcodes[[rev]] <- toupper(barcodes[[rev]])
# per barcode write out a fq and optionally a summary table
lapply(seq_len(nrow(barcodes)), function(i) {
# get a name for the sub sample
name <- paste(barcodes[i, c(name)], collapse = ".")
name <- paste(barcodes[i, c(name)], collapse = "_")
# get barcodes
fwd_bar <- barcodes[i, fwd]
rev_bar <- barcodes[i, rev]
# filter for barcodes at beginning of sequence
rowReads <- reads[grepl(paste0("^", fwd_bar), ShortRead::sread(reads))]
columnReads <- reads[grepl(paste0("^", rev_bar), ShortRead::sread(reads))]
rowReads <- fwd_reads[grepl(paste0("^", fwd_bar), ShortRead::sread(fwd_reads))]
columnReads <- fwd_reads[grepl(paste0("^", rev_bar), ShortRead::sread(fwd_reads))]
# make reverse complements of fwd and reverse barcodes
fwd_bar_reverse_complement <- as.character(
Biostrings::reverseComplement(
Expand Down Expand Up @@ -247,16 +213,16 @@ demultiplex <- function(reads, barcodes, fwd = "FWD", rev = "REV", name = c("Nam
md <- "a"
}
ShortRead::writeFastq(
object = clone, file = paste0(write, name, ".fq"),
object = clone, file = paste0(write, name, ".fastq.gz"),
mode = md, compress = TRUE
)
}
})
if (stat && writeOut) {
stats <- data.frame(
name = name,
totalReads = length(reads),
groupedReads = ShortRead::countFastq(paste0(name, ".fq"))[1, 1]
totalReads = length(fwd_reads),
groupedReads = ShortRead::countFastq(paste0(name, ".fastq.gz"))[1, 1]
)
stats$ratio <- signif(stats$groupedReads / stats$totalReads, digits = 4)
utils::write.csv(stats, file = paste0(name, ".csv"), row.names = FALSE)
Expand Down
52 changes: 8 additions & 44 deletions man/demultiplex.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading