From 23a4dba22c77110294ce46d66d2d6950197a3311 Mon Sep 17 00:00:00 2001 From: "Adam R. Rivers" Date: Fri, 6 Jul 2018 15:44:27 -0400 Subject: [PATCH] Edited code documentation --- README.rst | 91 ++++++++------- itsxpress/definitions.py | 5 +- itsxpress/main.py | 241 ++++++++++++++++++++------------------- setup.py | 2 +- tests/test_main.py | 13 +-- 5 files changed, 181 insertions(+), 171 deletions(-) diff --git a/README.rst b/README.rst index abf247d..40545c6 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -ITSxpress: Software to rapidly trim the Internally transcribed spacer (ITS) region of FASTQ files +ITSxpress: Software to rapidly trim the Internally transcribed spacer (ITS) region of FASTQ files ================================================================================================== .. image:: https://travis-ci.org/USDA-ARS-GBRU/itsxpress.svg?branch=master :target: https://travis-ci.org/USDA-ARS-GBRU/itsxpress @@ -9,6 +9,9 @@ ITSxpress: Software to rapidly trim the Internally transcribed spacer (ITS) reg .. image:: https://api.codacy.com/project/badge/Grade/7e2a4c97cde74bccb3e84b706d7a2aa5 :target: https://www.codacy.com/app/GBRU/itsxpress?utm_source=github.com&utm_medium=referral&utm_content=USDA-ARS-GBRU/itsxpress&utm_campaign=Badge_Grade +.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.1304349.svg + :target: https://doi.org/10.5281/zenodo.1304349 + Author ------- * Adam R. Rivers, US Department of Agriculture, Agricultural Research Service @@ -19,7 +22,7 @@ Introduction The internally transcribed spacer region is a region between highly conserved the small subunit (SSU) of rRNA and the large subunit (LSU) of the rRNA. In Eukaryotes it contains -the 5.8s genes and two variable length spacer regions. In amplicon sequening studies it is +the 5.8s genes and two variable length spacer regions. In amplicon sequencing studies it is common practice to trim off the conserved (SSU, 5,8S or LSU) regions. `Bengtsson-Palme et al. 
(2013)`_ published software the software package ITSx_ to do this. @@ -29,25 +32,27 @@ sequence, so each input sequence must be trimmed. ITSXpress makes this possible taking FASTQ data, de-replicating the sequences then identifying the start and stop sites using HMMSearch. Results are parsed and the trimmed files are returned. The ITS 1, ITS2 or the entire ITS region including the 5.8s rRNA gene can be selected. ITSxpress -uses the hmm model from ITSx so results are comprable. +uses the hmm model from ITSx so results are comparable. +ITSxpress is also available as a `QIIME2 Plugin`_. -.. _`Bengtsson-Palme et al. (2013)`: https://doi.org/10.1111/2041-210X.12073 +.. _`Bengtsson-Palme et al. (2013)`: https://doi.org/10.1111/2041-210X.12073 .. _ITSx: http://microbiology.se/software/itsx/ .. _OTUs: https://doi.org/10.1038/ismej.2017.119 +.. _`QIIME2 Plugin`: https://github.com/USDA-ARS-GBRU/q2_itsxpress Installation ------------- ITSxpress can be installed from: -1. Preferred method - Bioconda (to be done): +1. Bioconda (preferred method because it handles dependencies): .. code-block:: bash conda install itsxpress -2. Pip: +2. Pip (https://pypi.org/project/itsxpress/): .. code-block:: bash @@ -63,91 +68,91 @@ ITSxpress can be installed from: Dependencies ------------- -The software requires Vsearch, BBtools, Hmmer and Biopython. Bioconda takes care of this -for you so it is the preferred installation method. +The software requires Vsearch, BBtools, Hmmer >= 3.1b and Biopython. Bioconda +takes care of this for you so it is the preferred installation method. -Usage +Usage --------- -h, --help Show this help message and exit. --fastq A ``.fastq``, ``.fq``, ``.fastq.gz`` or ``.fq.gz`` file. Interleaved - or not. - ---single_end A flag to specify if the fastq file is inteleaved. + or not. Required. + +--single_end A flag to specify that the fastq file is single-ended (not paired). Default is false.
- ---fastq2 A ``.fastq``, ``.fq``, ``.fastq.gz`` or ``.fq.gz`` file representing read 2, optional. ---outfile The trimmed Fastq file, if it ends in ``gz`` it will be gzipped. +--fastq2 A ``.fastq``, ``.fq``, ``.fastq.gz`` or ``.fq.gz`` file representing read 2 if present, optional. ---tempdir Specify the temp file directory. +--outfile The trimmed FASTQ file, if it ends in ``gz`` it will be gzipped. ---keeptemp Should intermediate files be kept? +--tempdir Specify the temp file directory. Default is None. + +--keeptemp Should intermediate files be kept? Default is false. --region Options : {ITS2, ITS1, ALL} --taxa Select the taxonomic group sequenced: {Alveolata, Bryophyta, Bacillariophyta, Amoebozoa, Euglenozoa, Fungi, Chlorophyta, - Rhodophyta, Phaeophyceae, Marchantiophyta, Metazoa, Microsporidia, + Rhodophyta, Phaeophyceae, Marchantiophyta, Metazoa, Oomycota, Haptophyceae, Raphidophyceae, Rhizaria, Synurophyceae, - Tracheophyta, Eustigmatophyceae, Apusozoa, Parabasalia} - ---log Log file + Tracheophyta, Eustigmatophyceae, All}. Default Fungi. + +--log Log file. Default is ITSxpress.log. ---threads Number of processor threads to use +--threads Number of processor threads to use. Default is 1. Examples --------- -Use case 1: Trimming the ITS2 region from a fungal amplicon sequencing dataset with -forward and reverse gzipped fastq files using two cpu threads. +Use case 1: Trimming the ITS2 region from a fungal amplicon sequencing dataset with +forward and reverse gzipped FASTQ files using two cpu threads. .. code-block:: bash - itsxpress --fastq r1.fastq.gz --fastq2 r2.fastq.gz --region ITS2 --taxa Fungi \ - --log logfile.txt --outfile trimmed_reads.fastq.gz --threads 2 + itsxpress --fastq r1.fastq.gz --fastq2 r2.fastq.gz --region ITS2 \ + --taxa Fungi --log logfile.txt --outfile trimmed_reads.fastq.gz --threads 2 + +ITSxpress can take gzipped or un-gzipped FASTQ files and it can write gzipped or +un-gzipped FASTQ files. 
It expects FASTQ files to end in: .fq, .fastq, .fq.gz or .fastq.gz. -ITSxpress can take gzipped or ungzipped fastq files and it can write gzipped or -ungzipped fastq files. It expects fastq files to end in : .fq, .fastq, .fq.gz or fastq.gz +Use case 2: Trimming the ITS2 region from a fungal amplicon sequencing dataset with +an interleaved gzipped FASTQ file using two cpu threads. -Use case 2: Trimming the ITS2 region from a fungal amplicon sequencing dataset with -an interleaved gzipped fastq files using two cpu threads. - .. code-block:: bash itsxpress --fastq interleaved.fastq.gz --region ITS2 --taxa Fungi \ --log logfile.txt --outfile trimmed_reads.fastq.gz --threads 2 -Use case 3: Trimming the ITS2 region from a fungal amplicon sequencing dataset with -an interleaved gzipped fastq files using two cpu threads. - +Use case 3: Trimming the ITS2 region from a fungal amplicon sequencing dataset with +a single-ended gzipped FASTQ file using two cpu threads. + .. code-block:: bash itsxpress --fastq single-end.fastq.gz --single_end --region ITS2 --taxa Fungi \ --log logfile.txt --outfile trimmed_reads.fastq.gz --threads 2 -Single ended data is less common and may come from a dataset where the reads have already +Single ended data is less common and may come from a dataset where the reads have already been merged. -Use case 4: Trimming the ITS1 region from a Microsporidia amplicon sequencing dataset with -an interleaved gzipped fastq files using 40 cpu threads. +Use case 4: Trimming the ITS1 region from an Alveolata amplicon sequencing dataset with +an interleaved gzipped FASTQ file using 40 cpu threads. ..
code-block:: bash - itsxpress --fastq interleaved.fastq.gz --region ITS1 --taxa Microsporidia \ + itsxpress --fastq interleaved.fastq.gz --region ITS1 --taxa Alveolata \ --log logfile.txt --outfile trimmed_reads.fastq.gz --threads 40 License information -------------------- -This software is a work of the United States Department of Agriculture, Agricultural -Research Service. 17 U.S.C. Section 105 states that "Copyright protection under this -title is not available for any work of the United States Government". While I anticipate -that this work will be released under a CC0 public domain attribution, only the USDA ARS -Office of Technology transfer has the authority to make that determination. \ No newline at end of file +This software is a work of the United States Department of Agriculture, Agricultural +Research Service. 17 U.S.C. Section 105 states that "Copyright protection under this +title is not available for any work of the United States Government". While I anticipate +that this work will be released under a CC0 public domain attribution, only the USDA ARS +Office of Technology transfer has the authority to make that determination. diff --git a/itsxpress/definitions.py b/itsxpress/definitions.py index 76d16b0..f554954 100644 --- a/itsxpress/definitions.py +++ b/itsxpress/definitions.py @@ -1,3 +1,6 @@ +"""Definitions.py: variables shared across the package. 
+ +""" import os # This is the project Root @@ -36,4 +39,4 @@ "Rhodophyta":"H.hmm","Phaeophyceae":"I.hmm","Marchantiophyta":"L.hmm","Metazoa":"M.hmm", "Oomycota":"O.hmm","Haptophyceae":"P.hmm", "Raphidophyceae":"Q.hmm"," Rhizaria":"R.hmm","Synurophyceae":"S.hmm", - "Tracheophyta":"T.hmm","Eustigmatophyceae":"U.hmm","All":"all.hmm"} \ No newline at end of file + "Tracheophyta":"T.hmm","Eustigmatophyceae":"U.hmm","All":"all.hmm"} diff --git a/itsxpress/main.py b/itsxpress/main.py index f4976aa..cef8495 100755 --- a/itsxpress/main.py +++ b/itsxpress/main.py @@ -1,23 +1,26 @@ #!/usr/bin/env python -"""ITSxpress: A python module to rapidly trim ITS amplicon sequences from Fastq files. -Author: Adam Rivers, USDA Agricultural Research Service +"""ITSxpress: A python module to rapidly trim ITS amplicon sequences from FASTQ files. -The internally transcribed spacer region is a region between highly conserved the small -subunit (SSU) of rRNA and the large subunit (LSU) of the rRNA. In Eukaryotes it contains -the 5.8s genes and two variable length spacer regions. In amplicon sequencing studies it is +Authors: Adam Rivers, Kyle weber, USDA Agricultural Research Service + +The internally transcribed spacer region is a region between the highly conserved small +subunit (SSU) of rRNA and the large subunit (LSU) of the rRNA. The eukaryotic ITS contains +the 5.8s gene and two variable length spacer regions. In amplicon sequencing studies it is common practice to trim off the conserved (SSU, 5,8S or LSU) regions. Bengtsson-Palme et al. (2013) published software the software package ITSx to do this. -ITSxpress is a high-speed implementation of the methods in ITSx than also allows FASTQ -files to be processed. It is approximately 6-9x faster than ITSx v1.1b. It also trims fastq -files Which is essential for Analizing sequences using the newer exact Sequence Variant -methods in Qiime2, Dada2, Deblur and Unoise that are replacing OTU clustering. 
+ITSxpress is a high-speed implementation of the methods in ITSx than also allows FASTQ +files to be processed. Processing FASTQ files Which is essential for analyzing +sequences using the newer exact Sequence Variant methods in Qiime2, Dada2, Deblur +and Unoise that are replacing OTU clustering. + +ITSxpress is also available as a QIIME Plugin. See +https://github.com/USDA-ARS-GBRU/q2_itsxpress for details. Process: * Merges and error corrects reads using bbduk if reads are paired-end * Deduplicates reads using Vmatch to eliminate redundant hmm searches * Searches for conserved regions using the ITSx hmms, using HMMsearch: - https://cryptogenomicon.org/2011/05/27/hmmscan-vs-hmmsearch-speed-the-numerology/ * Parses everything in python returning (optionally gzipped) fastq files. Reference: @@ -49,18 +52,18 @@ def myparser(): parser.add_argument('--fastq', '-f', type=str, required=True, help='A .fastq, .fq, .fastq.gz or .fq.gz file. Interleaved or not.') parser.add_argument('--single_end', '-s', action='store_true', default=False, - help='A flag to specify if the fastq file is interleaved single-ended (not paired). Default is false.') + help='A flag to specify that the FASTQ file is single-ended (not paired). Default is false.') parser.add_argument('--fastq2', '-f2', type=str, default=None, help='A .fastq, .fq, .fastq.gz or .fq.gz file. 
representing read 2 (optional)') parser.add_argument('--outfile', '-o', type=str, help="the trimmed Fastq file, if it \ ends in 'gz' it will be gzipped") - parser.add_argument('--tempdir', help='Specify the temp file directory', default=None) + parser.add_argument('--tempdir', help='The temp file directory', default=None) parser.add_argument('--keeptemp' ,help="Should intermediate files be kept?", action='store_true') parser.add_argument('--region', help='', choices=["ITS2", "ITS1", "ALL"], required=True) - parser.add_argument('--taxa', help='Select the taxonomic group sequenced', + parser.add_argument('--taxa', help='The taxonomic group sequenced.', choices=taxa_choices, default="Fungi") parser.add_argument('--log' ,help="Log file", default="ITSxpress.log") - parser.add_argument('--threads' ,help="Number of processor threads to use", default="1") + parser.add_argument('--threads' ,help="Number of processor threads to use.", type=int, default=1) return parser @@ -83,18 +86,18 @@ class ItsPosition: """ - + def _left_score(self, sequence, score, to_pos): """Evaluates left scores and positions from the new line of a domtable file and - updates ddict if neccicary. - + updates ddict if necessary. + Args: sequence (str): The name of the sequence. score (int): The bit score from HMMSearch. - to_pos (int): the ending position of the left seqeunce. - + to_pos (int): the ending position of the left sequence. + """ - + if "left" in self.ddict[sequence]: if score > self.ddict[sequence]["left"]["score"]: self.ddict[sequence]["left"]["score"]=score @@ -103,17 +106,17 @@ def _left_score(self, sequence, score, to_pos): self.ddict[sequence]["left"]={} self.ddict[sequence]["left"]["score"]=score self.ddict[sequence]["left"]["pos"]=to_pos - - + + def _right_score(self, sequence, score, from_pos): """Evaluates right scores and positions form the new line of a domtable file and - updates ddict if neccicary. - + updates ddict if necessary. 
+ Args: sequence (str): The name of the sequence. score (int): The bit score from HMMSearch. - from_pos (int): The beginning position of the right seqeunce. - + from_pos (int): The beginning position of the right sequence. + """ if "right" in self.ddict[sequence]: if score > self.ddict[sequence]["right"]["score"]: @@ -123,15 +126,14 @@ def _right_score(self, sequence, score, from_pos): self.ddict[sequence]["right"]={} self.ddict[sequence]["right"]["score"]=score self.ddict[sequence]["right"]["pos"]=from_pos - + def parse(self): """Parses dom table from HMMsearch. - - The dom table is parsed to record the start and stop position from the top scoring - hmm mathces. This populates the ddict attribute containing the positions at - which to trim each sequence. - + + The dom table is parsed and the start and stop position from the top scoring + hmm math is saved. The start and stop positions of reach sequence are added to the ddict attribute. + """ try: with open(self.domtable , 'r') as f: @@ -152,7 +154,7 @@ def parse(self): except Exception as e: logging.error("Exception occured when parsing HMMSearh results") raise e - + def __init__(self, domtable, region): self.domtable = domtable self.ddict = {} @@ -166,22 +168,22 @@ def __init__(self, domtable, region): self.leftprefix='1_' self.rightprefix='4_' self.parse() - - + + def get_position(self, sequence): - """ Retuns the start and stop positions for a given sequence - + """ Returns the start and stop positions for a given sequence. + Args: sequence (str): The name of the sequence. 
- + Returns: (tuple): (start position, end position) zero indexed - + Raises: KeyError: If input sequence is not present in dictionary (no ITS start or stop sites were found) - + """ - + try: if "left" in self.ddict[sequence]: start = int(self.ddict[sequence]["left"]["pos"]) @@ -195,30 +197,30 @@ def get_position(self, sequence): except KeyError: logging.debug("No ITS stop or start sites were identified for sequence {}, skipping.".format(sequence)) raise KeyError - + class Dedup: """A class to handle deduplicated sequence data. - + To speed processing Vmatch is used to remove duplicate amplicons so that the - start ansd stop sites are determened only once. - + start and stop sites are determined only once. + Attributes: matchdict (dict): a dictionary of each sequence ID as a key and its representative sequence ID as a value {seq1:rep1, seq2:rep1, seq2:rep2}. - uc_file (str): the location of the .uc file contianing matching information. + uc_file (str): the location of the .uc file containing matching information. rep_file (str): The location of the representative sequences file. - seq_file (str): Teh location of the complete sequences file. - - + seq_file (str): The location of the complete sequences file. + + """ - - + + def parse(self): """Parse the uc data file to populate the matchdict attribute. - + Raises: Exception: General exception if uc file is not parsed properly - + """ try: with open(self.uc_file, 'r') as f: @@ -235,40 +237,40 @@ def parse(self): except Exception as e: logging.exception("Could not parse the Vsearch '.uc' file.") raise e - + def __init__(self, uc_file, rep_file, seq_file): self.matchdict = None self.uc_file = uc_file self.rep_file = rep_file self.seq_file = seq_file self.parse() - - - + + + def _get_trimmed_seq_generator(self, seqgen, itspos): - """This function takes a Biopython SeqIO sequence generator of sequences, and + """This function takes a Biopython SeqIO sequence generator, and returns a generator of trimmed sequences. 
Sequences where the ITS ends could - not be determined are ommited. - + not be determined are omitted. + Args: seqgen (obj): A Biopython SeqIO generator of all input sequences - ispos (obj): a itsxpress ItsPosition object - + ispos (obj): An itsxpress ItsPosition object + Returns: - (obj): A map object generator that yeilds filtered, trimmed sequence records. - + (obj): A map object generator that yields filtered, trimmed sequence records. + """ - - + + def _filterfunc(record): """ Filters records down to those that contain a valid ITS start and stop position - + Args: record (obj): a Biopython SeqRecord object - + Returns: bool: True if an ITS start and stop positions are present false otherwise - + """ try: if record.id in self.matchdict: @@ -282,36 +284,36 @@ def _filterfunc(record): return False def map_func(record): - """Trims the record down to correct region - + """Trims the record down to the selected ITS region + Args: record (obj): a Biopython SeqRecord object - + Returns: obj: a Biopython SeqRecord object trimmed to the ITS region """ repseq = self.matchdict[record.id] start, stop = itspos.get_position(repseq) return record[start:stop] - + filt = filter(_filterfunc, seqgen) return map(map_func, filt) - + def create_trimmed_seqs(self, outfile, gzipped, itspos): - """Creates a fastq file, optionally gzipped, with the reads trimmed to the + """Creates a FASTQ file, optionally gzipped, with the reads trimmed to the selected region. - + Args: - outfile (str): the file to write the sequences to. + outfile (str): The file to write the sequences to. gzip (bool): Should the files be gzipped? 
itspos (object): an ItsPosition object - + Returns: - str: name of the file written - + str: Name of the file written + """ - + def _write_seqs(): if gzipped: with gzip.open(outfile, 'wt') as g: @@ -319,7 +321,7 @@ def _write_seqs(): else: with open(outfile, 'w') as g: SeqIO.write(seqs, g, "fastq") - + if self.seq_file.endswith(".gz"): with gzip.open(self.seq_file, 'rt') as f: seqgen = SeqIO.parse(f, 'fastq') @@ -331,20 +333,20 @@ def _write_seqs(): seqgen = SeqIO.parse(f, 'fastq') seqs = self._get_trimmed_seq_generator(seqgen, itspos) _write_seqs() - - + + class SeqSample: """The class for processing sequence data into trimmed sequences. - + Attributes: tempdir (obj): A temporary directory object - fastq (str): The path to the input fastq file + fastq (str): The path to the input FASTQ file uc_file (str): The path to the Vsearch uc mapping file - rep_file: (str) the path to the representative sequences fasta file created by Vsearch + rep_file: (str) the path to the representative sequences FASTA file created by Vsearch seq_file (str): the location of the fastq or fastq.gz sequence file used for analysis - + """ - + def __init__(self, fastq, tempdir=None): if tempdir: @@ -361,14 +363,14 @@ def __init__(self, fastq, tempdir=None): self.dom_file = None self.seq_file = None - - + + def _deduplicate(self, threads=1): - """Runs Vsearch dereplication to create a fasta file of nonredundant sequences. - + """Runs Vsearch dereplication to create a FASTA file of non-redundant sequences. + Args: threads (int or str):the number of processor threads to use - + """ try: self.uc_file=os.path.join(self.tempdir, 'uc.txt') @@ -413,14 +415,14 @@ def _search(self, hmmfile, threads=1): except FileNotFoundError as f: logging.error("hmmsearch was not found, make sure HMMER3 is installed and executable") raise f - + class SeqSamplePairedInterleaved(SeqSample): """SeqSample class extended to paired, interleaved format. 
- + """ def __init__(self, fastq, tempdir): SeqSample.__init__(self, fastq, tempdir) - + def _merge_reads(self, threads): try: seq_file = os.path.join(self.tempdir, 'seq.fq.gz') @@ -440,13 +442,13 @@ def _merge_reads(self, threads): raise f class SeqSamplePairedNotInterleaved(SeqSample): - """SeqSample class extended to paired, two fastq file format. - + """SeqSample class extended to paired, two FASTQ file format. + """ def __init__(self, fastq, tempdir, fastq2): SeqSample.__init__(self, fastq, tempdir) self.fastq2 = fastq2 - + def _merge_reads(self, threads): try: seq_file = os.path.join(self.tempdir, 'seq.fq.gz') @@ -468,20 +470,20 @@ def _merge_reads(self, threads): class SeqSampleNotPaired(SeqSample): """SeqSample class extended to unpaired format. - + """ def __init__(self, fastq, tempdir): SeqSample.__init__(self, fastq, tempdir) self.seq_file = self.fastq - + def _is_paired(fastq, fastq2, single_end): """Determines the workflow based on file inputs. - + Args: - + """ if fastq and fastq2: paired_end = True @@ -496,11 +498,11 @@ def _is_paired(fastq, fastq2, single_end): def _logger_setup(logfile): """Set up logging to a logfile and the terminal standard out. - + Args: fastq (str): The path to a fastq or fastq.gz file fastq2 (str): The path to a fastq or fastq.gz file for the reverse sequences - + """ try: logging.basicConfig(level=logging.DEBUG, @@ -518,18 +520,19 @@ def _logger_setup(logfile): # add the handler to the root logger logging.getLogger('').addHandler(console) except Exception as e: - print("An error occured setting up logging") + print("An error occurred setting up logging") raise e def _check_fastqs(fastq, fastq2=None): """Verifies the input files are valid fastq or fastq.gz files. - + Args: fastq (str): The path to a fastq or fastq.gz file fastq2 (str): The path to a fastq or fastq.gz file for the reverse sequences - + Raises: - SystemExit if invalid input sequences are found. + FileNotFound: Error if BBTools was not found. 
+ subprocess.CalledProcessError: Error if there was an issue processing files """ try: parameters=['reformat.sh', 'in='+fastq, 'reads=50'] @@ -546,8 +549,8 @@ def _check_fastqs(fastq, fastq2=None): def main(args=None): - """Run Complete ITS trimming workflow - + """Run Complete ITS trimming workflow. + """ # Set up logging t0 = time.time() @@ -557,7 +560,7 @@ def main(args=None): _logger_setup(args.log) try: - logging.info("Verifing the input sequences.") + logging.info("Verifying the input sequences.") _check_fastqs(args.fastq, args.fastq2) # Parse input types paired_end, interleaved = _is_paired(args.fastq,args.fastq2, args.single_end) @@ -565,22 +568,22 @@ def main(args=None): if paired_end and interleaved: logging.info("Sequences are paired-end and interleaved. They will be merged using BBmerge.") sobj = SeqSamplePairedInterleaved(fastq=args.fastq, tempdir=args.tempdir) - sobj._merge_reads(threads=args.threads) + sobj._merge_reads(threads=str(args.threads)) elif paired_end and not interleaved: logging.info("Sequences are paired-end in two files. They will be merged using BBmerge.") sobj = SeqSamplePairedNotInterleaved(fastq=args.fastq, fastq2=args.fastq2, tempdir=args.tempdir) - sobj._merge_reads(threads=args.threads) + sobj._merge_reads(threads=str(args.threads)) elif not paired_end and not interleaved: logging.info("Sequences are assumed to be single-end.") sobj = SeqSampleNotPaired(fastq=args.fastq, tempdir=args.tempdir) logging.info("Temporary directory is: {}".format(sobj.tempdir)) # Deduplicate - logging.info("Unique sequences are being written to a temporary fasta file with Vsearch.") - sobj._deduplicate(threads=args.threads) + logging.info("Unique sequences are being written to a temporary FASTA file with Vsearch.") + sobj._deduplicate(threads=str(args.threads)) # HMMSearch for ITS regions logging.info("Searching for ITS start and stop sites using HMMSearch. 
This step takes a while.") hmmfile = os.path.join(ROOT_DIR,"ITSx_db","HMMs", taxa_dict[args.taxa]) - sobj._search(hmmfile=hmmfile, threads=args.threads) + sobj._search(hmmfile=hmmfile, threads=str(args.threads)) # Parse Hmmsearch output logging.info("Parsing HMM results.") its_pos = ItsPosition(domtable=sobj.dom_file, region=args.region) @@ -596,7 +599,7 @@ def main(args=None): fmttime = time.strftime("%H:%M:%S",time.gmtime(t1-t0)) logging.info("ITSxpress ran in {}".format(fmttime)) except Exception as e: - logging.error("ITSXpress terminated with errors. see the log file fo details.") + logging.error("ITSXpress terminated with errors. See the log file fo details.") logging.error(e) raise SystemExit(1) finally: diff --git a/setup.py b/setup.py index c381aa7..fd5aa8c 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name='itsxpress', - version='1.5.6', + version='1.5.7', packages=['itsxpress'], license='License :: CC0 1.0 Universal (CC0 1.0) Public Domain Dedication', description="Rapidly trim sequences down to their Internally Transcribed Spacer (ITS) regions", diff --git a/tests/test_main.py b/tests/test_main.py index ed8593a..9f42526 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -95,7 +95,7 @@ def test_seq_sample_not_paired(): sobj._deduplicate(threads=1) sobj._search(hmmfile=hmmfile, threads=1) shutil.rmtree(sobj.tempdir) - + def test_seq_sample_paired_not_interleaved(): fastq = os.path.join(TEST_DIR, "test_data", "4774-1-MSITS3_R1.fastq") fastq2 = os.path.join(TEST_DIR, "test_data", "4774-1-MSITS3_R2.fastq") @@ -104,8 +104,8 @@ def test_seq_sample_paired_not_interleaved(): sobj._deduplicate(threads=1) sobj._search(hmmfile=hmmfile, threads=1) shutil.rmtree(sobj.tempdir) - - + + def test_is_paired(): paired_end, interleaved = itsxpress.main._is_paired("fastq1.fq", "fastq2.fq", single_end=False) assert paired_end == True and interleaved == False @@ -133,7 +133,7 @@ def test_main_interleaved(): n += 1 ok_(n==226) shutil.rmtree(tf) - + def 
test_main_paired(): parser = itsxpress.main.myparser() tf = tempfile.mkdtemp() @@ -141,7 +141,7 @@ def test_main_paired(): fastq2 = os.path.join(TEST_DIR, "test_data", "4774-1-MSITS3_R2.fastq") outfile = os.path.join(tf,'testout.fastq') validation = os.path.join(TEST_DIR, "test_data", "testout.fastq") - args = parser.parse_args(['--fastq', fastq, '--fastq2', fastq2, '--outfile', outfile, '--region','ITS2', '--taxa', 'Fungi']) + args = parser.parse_args(['--fastq', fastq, '--fastq2', fastq2, '--outfile', outfile, '--region','ITS2', '--taxa', 'Fungi', '--threads', '1']) itsxpress.main.main(args=args) seqs = SeqIO.parse(outfile, 'fastq') n = 0 @@ -157,7 +157,7 @@ def test_main_merged(): outfile = os.path.join(tf,'testout.fastq') validation = os.path.join(TEST_DIR, "test_data", "testout.fastq") - args = parser.parse_args(['--fastq', fastq, '--single_end', '--outfile', outfile, '--region','ITS2', '--taxa', 'Fungi']) + args = parser.parse_args(['--fastq', fastq, '--single_end', '--outfile', outfile, '--region','ITS2', '--taxa', 'Fungi', '--threads', '1']) itsxpress.main.main(args=args) seqs = SeqIO.parse(outfile, 'fastq') n = 0 @@ -165,4 +165,3 @@ def test_main_merged(): n += 1 ok_(n==226) shutil.rmtree(tf) -