
Commit

Finished readStacks1 function.
lvclark committed Jan 9, 2018
1 parent 3844281 commit 72ae496
Showing 7 changed files with 100 additions and 7 deletions.
2 changes: 1 addition & 1 deletion NAMESPACE
@@ -10,7 +10,7 @@ GetBlankTaxa, GetContamRate, GetDonorParent, GetLikelyGen,
 GetLocDepth, GetLoci, GetRecurrentParent, GetTaxa,
 GetWeightedMeanGenotypes, IterateHWE, IteratePopStruct,
 MakeTasselVcfFilter, nTaxa, nAlleles, OneAllelePerMarker,
-PipelineMapping2Parents, RADdata, readHMC, readTagDigger,
+PipelineMapping2Parents, RADdata, readHMC, readStacks1, readTagDigger,
 SetBlankTaxa, SetContamRate, SetDonorParent, SetRecurrentParent,
 VCF2RADdata)

13 changes: 9 additions & 4 deletions R/data_import.R
@@ -807,7 +807,7 @@ readStacks1 <- function(allelesFile, matchesFolder,
   # read in catalog.alleles.tsv file
   af <- scan(allelesFile, what = list(NULL, NULL, integer(0), character(0),
                                       NULL, NULL),
-             sep = "\t", comment.char = "#")
+             sep = "\t", comment.char = "#", na.strings = character(0))
   # get locus names (numbers) and haplotypes for variable sites
   keep <- af[[4]] != ""
   locNames <- af[[3]][keep]
@@ -824,19 +824,24 @@
   alleleDepth <- matrix(0L, nrow = length(sampleNames),
                         ncol = length(alleleNames),
                         dimnames = list(sampleNames, alleleNames))
+  # vector for reordering samples according to numbers in matches files
+  reorder <- integer(length(sampleNames))
 
   # read sstacks files
   for(i in 1:length(sampleNames)){
     mf <- scan(sstacksFiles[i],
-               what = list(NULL, NULL, integer(0), NULL, NULL, character(0),
-                           integer(0), NULL), sep = "\t", comment.char = "#")
+               what = list(NULL, NULL, integer(0), integer(0), NULL, character(0),
+                           integer(0), NULL), sep = "\t", comment.char = "#",
+               na.strings = character(0))
     keep <- mf[[6]] != "consensus"
     theseLocNames <- mf[[3]][keep]
     theseAlNuc <- mf[[6]][keep]
     theseDepth <- mf[[7]][keep]
     theseAlNames <- paste(theseLocNames, theseAlNuc, sep = "_")
     alleleDepth[i, theseAlNames] <- theseDepth
+    reorder[mf[[4]][1]] <- i
   }
+  alleleDepth <- alleleDepth[reorder,]
 
   # filter loci
   alRemove <- integer(0)
@@ -875,7 +880,7 @@ readStacks1 <- function(allelesFile, matchesFolder,
                what = list(NULL, NULL, integer(0), character(0),
                            integer(0), character(0), NULL, NULL, NULL, NULL,
                            NULL, NULL, NULL, NULL),
-               sep = "\t", comment.char = "#")
+               sep = "\t", comment.char = "#", na.strings = character(0))
   keeprows <- fastmatch::fmatch(uniqueLocNames, tf[[3]])
   locTable <- data.frame(row.names = as.character(uniqueLocNames),
                          Chr = tf[[4]][keeprows],
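The core of this change is the new `reorder` step: the sstacks matches files are read in whatever order they are found, but column 4 of each file records that sample's own number, and the rows of `alleleDepth` are permuted afterward so they follow that numbering. A toy sketch of the permutation trick (made-up sample numbers standing in for column 4, not real Stacks output):

```r
# Three "matches files" read in alphabetical order; each reports its own
# sample number (hypothetical values standing in for column 4 of the file).
depth <- matrix(1:6, nrow = 3,
                dimnames = list(c("sampleA", "sampleB", "sampleC"), NULL))
fileSampleNums <- c(2L, 3L, 1L)

reorder <- integer(3)
for(i in 1:3){
  reorder[fileSampleNums[i]] <- i  # row i holds the sample numbered fileSampleNums[i]
}
depth <- depth[reorder, ]
rownames(depth)  # "sampleC" "sampleA" "sampleB": rows now follow sample numbers
```

The inverse-permutation assignment (`reorder[num] <- i` rather than `reorder[i] <- num`) is what makes a single subscripting call put every row in its numbered position.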
1 change: 1 addition & 0 deletions README.md
@@ -17,6 +17,7 @@ polyRAD requires as input the sequence read depth at each allele for each sample
 * Variant Call Format (VCF). The allele depth (AD) genotype field must be present. I have tested the import function on files produced by the TASSEL GBSv2 pipeline. It should also work for GATK.
 * TagDigger. This is another piece of software I created, which reads FASTQ files and expected tag sequences and outputs a CSV file with read counts at samples x tags.
 * UNEAK. The UNEAK pipeline outputs read depth in a file called HapMap.hmc.txt, which can be read by polyRAD. (Beware that read depth is capped at 127 by UNEAK; TagDigger can help you if you expect high depth to be common in your dataset.)
+* Stacks version 1. If you have catalog files (catalog.alleles.tsv etc.) and matches files (matches.tsv) generated by Stacks, they can be imported by polyRAD.

Currently there are export functions for the following software. Genotypes are exported as continuous rather than discrete variables.

3 changes: 3 additions & 0 deletions man/VCF2RADdata.Rd
@@ -124,6 +124,9 @@ Lindsay V. Clark
 \seealso{
 \code{\link{MakeTasselVcfFilter}} for filtering to a smaller VCF file before
 reading with \code{VCF2RADdata}.
+
+Other data import functions: \code{\link{readStacks1}}, \code{\link{readHMC}},
+\code{\link{readTagDigger}}
 }
 \examples{
 # get the example VCF installed with polyRAD
2 changes: 1 addition & 1 deletion man/readHMC.Rd
@@ -62,7 +62,7 @@ archived versions of TASSEL.
 }
 
 \seealso{
-\code{\link{readTagDigger}}
+\code{\link{readTagDigger}}, \code{\link{VCF2RADdata}}, \code{\link{readStacks1}}
 }
 \examples{
 # for this example we'll create dummy files rather than using real ones
84 changes: 84 additions & 0 deletions man/readStacks1.Rd
@@ -0,0 +1,84 @@
\name{readStacks1}
\alias{readStacks1}
\title{
Import Read Depth from Stacks Version 1
}
\description{
Using the catalog files output by cstacks and the matches files output by sstacks,
this function imports read depth into a \code{\link{RADdata}} object. If
genomic alignments were used, alignment data can optionally be imported.
}
\usage{
readStacks1(allelesFile, matchesFolder, min.ind.with.reads = 200,
            min.ind.with.minor.allele = 10, readAlignmentData = FALSE,
            possiblePloidies = list(2), contamRate = 0.001)
}
\arguments{
\item{allelesFile}{
Path to the "alleles" file from the Stacks catalog.
}
\item{matchesFolder}{
Path to the folder containing "matches" files to import.
}
\item{min.ind.with.reads}{
For filtering loci. A locus must have at least this many samples with
reads in order to be retained.
}
\item{min.ind.with.minor.allele}{
For filtering loci. A locus must have at least this many samples with
reads for the minor allele in order to be retained. For loci with more
than two alleles, at least two alleles must be present in at least this
many individuals.
}
\item{readAlignmentData}{
If \code{TRUE}, the "tags" file from the Stacks catalog will be read,
and chromosome, position, and strand will be imported to the \code{locTable}
slot of the output. It is assumed that the "tags" file is in the same
directory as the "alleles" file.
}
\item{possiblePloidies}{
A list indicating possible inheritance modes in the dataset.
See \code{\link{RADdata}}.
}
\item{contamRate}{
A number from 0 to 1 (generally very small) indicating the expected rate of
cross contamination between samples.
}
}

\value{
A \code{\link{RADdata}} object.
}
\references{
Stacks website: \url{http://catchenlab.life.illinois.edu/stacks/}

Catchen, J., Hohenlohe, P. A., Bassham, S., Amores, A., and Cresko., W. A. (2013)
Stacks: an analysis tool set for population genomics. \emph{Molecular Ecology}
\bold{22}, 3124--3140.

Catchen, J. M., Amores, A., Hohenlohe, P., Cresko, W., and Postlethwait, J. H. (2011)
Stacks: building and genotyping loci de novo from short-read sequences.
\emph{G3: Genes, Genomes, Genetics} \bold{1}, 171--182.
}
\author{
Lindsay V. Clark
}
\note{
This function has been tested with output from Stacks 1.47.
}

\seealso{
\code{\link{VCF2RADdata}}, \code{\link{readTagDigger}},
\code{\link{readHMC}}
}
\examples{
\dontrun{

# Assuming the working directory contains the catalog and all matches files:

myStacks <- readStacks1("batch_1.catalog.alleles.tsv", ".",
                        readAlignmentData = TRUE)
}
}

\keyword{ file }
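The two filtering arguments documented above amount to a simple rule: a locus is kept only if enough samples have any reads at it, and at least two of its alleles each appear in enough samples. A toy version of that rule, separate from the package's actual internals (invented depths; variable and threshold names are illustrative only):

```r
# Toy depth matrix for one locus: 6 samples (rows) x 3 alleles (columns).
locDepth <- matrix(c(5L, 0L, 3L, 8L, 0L, 2L,   # allele 1
                     0L, 0L, 4L, 1L, 0L, 0L,   # allele 2
                     0L, 0L, 0L, 2L, 0L, 0L),  # allele 3
                   nrow = 6, ncol = 3)
min.ind.with.reads <- 3
min.ind.with.minor.allele <- 2

# How many samples have any reads at this locus?
indWithReads <- sum(rowSums(locDepth) > 0)
# How many alleles are each seen in enough samples?
allelesWellRepresented <- sum(colSums(locDepth > 0) >= min.ind.with.minor.allele)
# Keep the locus if both conditions documented above are met.
keepLocus <- indWithReads >= min.ind.with.reads && allelesWellRepresented >= 2
keepLocus  # TRUE here: 4 samples have reads; alleles 1 and 2 each appear twice
```

Here allele 3 appears in only one sample, but the locus survives because two other alleles are each present in at least `min.ind.with.minor.allele` samples.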
2 changes: 1 addition & 1 deletion man/readTagDigger.Rd
@@ -73,7 +73,7 @@ Lindsay V. Clark
 }
 
 \seealso{
-\code{\link{readHMC}}
+\code{\link{readHMC}}, \code{\link{readStacks1}}, \code{\link{VCF2RADdata}}
 }
 \examples{
 # for this example we'll create dummy files
