From 72ae496c01551eacaa2775bf841f1a51762347a3 Mon Sep 17 00:00:00 2001 From: Lindsay Clark Date: Tue, 9 Jan 2018 15:30:04 -0600 Subject: [PATCH] Finished readStacks1 function. --- NAMESPACE | 2 +- R/data_import.R | 13 ++++--- README.md | 1 + man/VCF2RADdata.Rd | 3 ++ man/readHMC.Rd | 2 +- man/readStacks1.Rd | 84 ++++++++++++++++++++++++++++++++++++++++++++ man/readTagDigger.Rd | 2 +- 7 files changed, 100 insertions(+), 7 deletions(-) create mode 100644 man/readStacks1.Rd diff --git a/NAMESPACE b/NAMESPACE index 0850eb2..ba0f2e9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,7 +10,7 @@ GetBlankTaxa, GetContamRate, GetDonorParent, GetLikelyGen, GetLocDepth, GetLoci, GetRecurrentParent, GetTaxa, GetWeightedMeanGenotypes, IterateHWE, IteratePopStruct, MakeTasselVcfFilter, nTaxa, nAlleles, OneAllelePerMarker, -PipelineMapping2Parents, RADdata, readHMC, readTagDigger, +PipelineMapping2Parents, RADdata, readHMC, readStacks1, readTagDigger, SetBlankTaxa, SetContamRate, SetDonorParent, SetRecurrentParent, VCF2RADdata) diff --git a/R/data_import.R b/R/data_import.R index 6f35bc5..02ee136 100644 --- a/R/data_import.R +++ b/R/data_import.R @@ -807,7 +807,7 @@ readStacks1 <- function(allelesFile, matchesFolder, # read in catalog.alleles.tsv file af <- scan(allelesFile, what = list(NULL, NULL, integer(0), character(0), NULL, NULL), - sep = "\t", comment.char = "#") + sep = "\t", comment.char = "#", na.strings = character(0)) # get locus names (numbers) and haplotypes for variable sites keep <- af[[4]] != "" locNames <- af[[3]][keep] @@ -824,19 +824,24 @@ readStacks1 <- function(allelesFile, matchesFolder, alleleDepth <- matrix(0L, nrow = length(sampleNames), ncol = length(alleleNames), dimnames = list(sampleNames, alleleNames)) + # vector for reordering samples according to numbers in matches files + reorder <- integer(length(sampleNames)) # read sstacks files for(i in 1:length(sampleNames)){ mf <- scan(sstacksFiles[i], - what = list(NULL, NULL, integer(0), NULL, NULL, character(0), - integer(0), NULL), sep = "\t", comment.char = "#") + what = list(NULL, NULL, integer(0), integer(0), NULL, character(0), + integer(0), NULL), sep = "\t", comment.char = "#", + na.strings = character(0)) keep <- mf[[6]] != "consensus" theseLocNames <- mf[[3]][keep] theseAlNuc <- mf[[6]][keep] theseDepth <- mf[[7]][keep] theseAlNames <- paste(theseLocNames, theseAlNuc, sep = "_") alleleDepth[i, theseAlNames] <- theseDepth + reorder[mf[[4]][1]] <- i } + alleleDepth <- alleleDepth[reorder,] # filter loci alRemove <- integer(0) @@ -875,7 +880,7 @@ readStacks1 <- function(allelesFile, matchesFolder, what = list(NULL, NULL, integer(0), character(0), integer(0), character(0), NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL), - sep = "\t", comment.char = "#") + sep = "\t", comment.char = "#", na.strings = character(0)) keeprows <- fastmatch::fmatch(uniqueLocNames, tf[[3]]) locTable <- data.frame(row.names = as.character(uniqueLocNames), Chr = tf[[4]][keeprows], diff --git a/README.md b/README.md index 2adbc95..06f0f2a 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,7 @@ polyRAD requires as input the sequence read depth at each allele for each sample * Variant Call Format (VCF). The allele depth (AD) genotype field must be present. I have tested the import function on files produced by the TASSEL GBSv2 pipeline. It should also work for GATK. * TagDigger. This is another piece of software I created, which reads FASTQ files and expected tag sequences and outputs a CSV file with read counts at samples x tags. * UNEAK. The UNEAK pipeline outputs read depth in a file called HapMap.hmc.txt, which can be read by polyRAD. (Beware that read depth is capped at 127 by UNEAK; TagDigger can help you if you expect high depth to be common in your dataset.) +* Stacks version 1. If you have catalog files (catalog.alleles.tsv etc.) and matches files (matches.tsv) generated by Stacks, they can be imported by polyRAD. Currently there are export functions for the following software. Genotypes are exported as continuous rather than discrete variables. diff --git a/man/VCF2RADdata.Rd b/man/VCF2RADdata.Rd index e17d91b..d2695b5 100644 --- a/man/VCF2RADdata.Rd +++ b/man/VCF2RADdata.Rd @@ -124,6 +124,9 @@ Lindsay V. Clark \seealso{ \code{\link{MakeTasselVcfFilter}} for filtering to a smaller VCF file before reading with \code{VCF2RADdata}. + +Other data import functions: \code{\link{readStacks1}}, \code{\link{readHMC}}, +\code{\link{readTagDigger}} } \examples{ # get the example VCF installed with polyRAD diff --git a/man/readHMC.Rd b/man/readHMC.Rd index 627692d..b2dd859 100644 --- a/man/readHMC.Rd +++ b/man/readHMC.Rd @@ -62,7 +62,7 @@ archived versions of TASSEL. } \seealso{ -\code{\link{readTagDigger}} +\code{\link{readTagDigger}}, \code{\link{VCF2RADdata}}, \code{\link{readStacks1}} } \examples{ # for this example we'll create dummy files rather than using real ones diff --git a/man/readStacks1.Rd b/man/readStacks1.Rd new file mode 100644 index 0000000..ca9dd80 --- /dev/null +++ b/man/readStacks1.Rd @@ -0,0 +1,84 @@ +\name{readStacks1} +\alias{readStacks1} +\title{ +Import Read Depth from Stacks Version 1 +} +\description{ +Using the catalog files output by cstacks and matches file output by sstacks, +this function imports read depth into a \code{\link{RADdata}} object. If +genomic alignments were used, alignment data can optionally be imported. +} +\usage{ +readStacks1(allelesFile, matchesFolder, min.ind.with.reads = 200, + min.ind.with.minor.allele = 10, readAlignmentData = FALSE, + possiblePloidies = list(2), contamRate = 0.001) +} +\arguments{ + \item{allelesFile}{ +Path to the "alleles" file from the Stacks catalog. +} + \item{matchesFolder}{ +Path to the folder containing "matches" files to import. +} + \item{min.ind.with.reads}{ +For filtering loci. A locus must have at least this many samples with +reads in order to be retained. +} + \item{min.ind.with.minor.allele}{ +For filtering loci. A locus must have at least this many samples with +reads for the minor allele in order to be retained. For loci with more +than two alleles, at least two alleles must be present in at least this +many individuals. +} + \item{readAlignmentData}{ +If \code{TRUE}, the "tags" file from the Stacks catalog will be read, +and chromosome, position, and strand will be imported to the \code{locTable} +slot of the output. It is assumed that the "tags" file is in the same +directory as the "alleles" file. +} + \item{possiblePloidies}{ +A list indicating possible inheritance modes in the dataset. +See \code{\link{RADdata}}. +} + \item{contamRate}{ +A number from 0 to 1 (generally very small) indicating the expected rate of +cross contamination between samples. +} +} + +\value{ +A \code{\link{RADdata}} object. +} +\references{ +Stacks website: \url{http://catchenlab.life.illinois.edu/stacks/} + +Catchen, J., Hohenlohe, P. A., Bassham, S., Amores, A., and Cresko., W. A. (2013) +Stacks: an analysis tool set for population genomics. \emph{Molecular Ecology} +\bold{22}, 3124--3140. + +Catchen, J. M., Amores, A., Hohenlohe, P., Cresko, W., and Postlethwait, J. H. (2011) +Stacks: building and genotyping loci de novo from short-read sequences. +\emph{G3: Genes, Genomes, Genetics} \bold{1}, 171--182. +} +\author{ +Lindsay V. Clark +} +\note{ +This function has been tested with output from Stacks 1.47. +} + +\seealso{ +\code{\link{VCF2RADdata}}, \code{\link{readTagDigger}}, +\code{\link{readHMC}} +} +\examples{ +\dontrun{ + +# Assuming the working directory contains the catalog and all matches files: + +myStacks <- readStacks1("batch_1.catalog.alleles.tsv", ".", + readAlignmentData = TRUE) +} +} + +\keyword{ file } diff --git a/man/readTagDigger.Rd b/man/readTagDigger.Rd index 4292123..8b2f327 100644 --- a/man/readTagDigger.Rd +++ b/man/readTagDigger.Rd @@ -73,7 +73,7 @@ Lindsay V. Clark } \seealso{ -\code{\link{readHMC}} +\code{\link{readHMC}}, \code{\link{readStacks1}}, \code{\link{VCF2RADdata}} } \examples{ # for this example we'll create dummy files