diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0d20b64 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.pyc diff --git a/LICENSE b/LICENSE new file mode 100755 index 0000000..8dada3e --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright {yyyy} {name of copyright owner} + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..f46489a
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,2 @@
+include README.md version.txt
+recursive-include zippy *
diff --git a/README.md b/README.md
new file mode 100755
index 0000000..4f0c5cf
--- /dev/null
+++ b/README.md
@@ -0,0 +1,118 @@
+# ZIPPY
+### The
+### ZIppy
+### Pipeline
+### Prototyping
+### sYstem
+
+ZIPPY is a powerful, easy-to-use NGS pipeline prototyping system, with batteries included. ZIPPY helps you create JSON-based pipeline specifications, which it then uses to execute a series of pipeline stages, with no user-written code required.
+
+## With ZIPPY, you can:
+- Generate a new pipeline without any coding required
+- Auto-generate parameters files from just a list of stages
+- Use our ultra-modular system for adding new modules, which can then interface with every other module
+- Make a custom pipeline in 10 minutes or less
+
+## ZIPPY includes a variety of modules, including:
+- Bcl2Fastq
+- BWA
+- Picard alignment stats
+- RSEM
+- MACS
+- BAM subsampling
+
+There will be many more to come! (See the full list of modules [here](https://github.com/Illumina/zippy/wiki/Zippy-modules)).
+
+## Limitations:
+ZIPPY uses black magic that relies upon the CPython implementation of Python. Currently, only CPython 2.7 is supported. ZIPPY also requires several Python modules. To make life easier, an executable version of ZIPPY is available (see the releases page!).
+
+### Running ZIPPY from source
+If you would like to run ZIPPY from source, there are a couple of things to note.
+- You must use CPython 2.7
+- You must install the modules 'commentjson' and 'pyflow' (note: pyflow is not in PyPI, but can be found [here](https://github.com/Illumina/pyflow)). You may optionally install the package 'meta', which may improve the coverage of parameters from make_params if you do not have the source of some of your imported modules (e.g., only .pyc files).
+- Run the test file make_params_test.py and check that it passes.
+
+# Using ZIPPY:
+0. Install ZIPPY by using 'pip install zippy'
+
+1. Make a proto-workflow file.
+
+    A proto-workflow is a very simple json file that lists the pipeline stages you want, in the order you want them. Here's a simple proto-workflow that I use for some RNA-seq applications:
+    ```
+    {
+        "stages": ["bcl2fastq", "rsem"]
+    }
+    ```
+Yeah, that's really it.
+
+2. Compile the proto-workflow
+
+    Execute 'python -m zippy.make_params my_proto.json my_params.json'
+
+3. Fill in the blanks and/or connect the dots
+
+    Open up my_params.json and fill in all the parameters that are currently blank.
+
+4. Run ZIPPY
+
+    To run ZIPPY, execute 'python -m zippy.zippy my_params.json'
+
+
+**More information is on the git wiki.**
+
+v2.1.0 (4/11/2018)
+- First public release
+- Parameters 2.0. Support for subworkflows, parameter nesting and environments.
+- Support for running docker containers through singularity
+- Better optional parameters support. You can now create the function define_optionals(self), which returns a map of default values for optional parameters (a short example follows this README).
+- New stages (Nirvana variant annotation, minimap2 aligner, primer dimer detection)
+- New unit tests
+- Fewer known bugs
+
+v2.0.0 (2/14/2018)
+
+It's here! Release 2.0 inaugurates ZIPPY-as-a-package. You can now pip install ZIPPY, and once you do so, run it as python -m zippy.zippy. Furthermore, you can import zippy from anywhere and use the API. Special thanks to Wilfred Li for this work. As a bonus, we're finally moving to semantic-ish versioning.
+
+- ZIPPY as a package
+- API interface should be finalized
+- better docker support (can now support docker pull)
+- several small bugfixes
+- small documentation improvements
+
+v1.9{6} (12/7/2017)
+- Provisional ZIPPY API (function names are interfaces not final)
+- Removed several external dependencies
+- Parameter detection now defaults to the inspect module, instead of the meta module (i.e., we avoid decompilation when possible)
+- Support for running locally instead of on SGE
+- Memory/core requirements can now be set both per-stage and globally
+- New stages (allowing you to delete intermediate outputs, to convert a bam to a fastq, and to downsample in style with a bloom filter)
+- DataRunner improvements: can now load sample list from a samplesheet, and can now manually map file names to zippy types
+- Better support for modules in external files (make_params now supports them fully)
+- Yet fewer bugs! Or at least... different bugs.
+
+v1.99999 (8/30/17)
+- Support for external modules (Runners defined in external code)
+- Lots of workflow improvements (argument passing for nearly everything, new bcl2fastq and BWA workflows)
+- New workflows (DNA/RNA QC, edger and Salmon)
+- New help system (run 'python zippy.py --help')
+
+v1.9999 (5/1/17)
+- Unit tests
+- Wildcards in the parameters files
+- Merge stages
+- Support for optional parameters
+
+v1.999 (2/16/17)
+- Arbitrary stage chaining
+- More stages
+- Fewer bugs
+
+v1.99
+First end-to-end version of ZIPPY
+
+## License
+Copyright 2018 Illumina
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
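As a rough illustration of the define_optionals(self) hook mentioned in the v2.1.0 notes above (the hook itself lives in zippy/modular_runner.py, added later in this change), here is a minimal sketch of a custom stage. The module, class, and parameter names (my_stages.py, MyToolRunner, threads, extra_args) are hypothetical, and a real stage would at least also implement the abstract get_output method.

```
# my_stages.py -- a hypothetical external stage module for ZIPPY (CPython 2.7).
from zippy.modular_runner import ModularRunner

class MyToolRunner(ModularRunner):
    """Sketch only: shows the optional-parameter hook and nothing else."""

    def define_optionals(self):
        # Default values merged into this stage's parameters when the keys
        # are absent from the compiled params JSON (see set_up_optional_params).
        return {'threads': 4, 'extra_args': ''}
```

Per make_params.py in this change, a module like this can be listed under "imports" in the proto-workflow/params file; any class whose name contains 'Runner' is loaded, so a stage named "mytool" resolves to MyToolRunner via a case-insensitive match.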
diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..9154afb --- /dev/null +++ b/setup.py @@ -0,0 +1,43 @@ +""" +Setup for Zippy (zippy) +""" +from setuptools import setup, find_packages + +package = __import__('zippy') + +def readme(): + with open('README.md') as f: + return f.read() + +try: + long_desc = readme() +except IOError as err: + long_desc = str(err) + +try: + version_str = open('version.txt').read() +except IOError as err: + version_str = '2.0.0' + +setup( + name='zippy', + long_description=long_desc, + url='https://github.com/Illumina/zippy/wiki', + author='Aaron Wise', + author_email='quejebo@gmail.com', + license='Apache 2.0', + install_requires=[ + 'distribute', + 'commentjson', + 'ast', + 'pybloomfiltermmap' + ], + version=version_str, + description='NGS Pipeline Prototyping Tool for Python', + include_package_data=True, + packages=find_packages(), + test_suite='nose.collector', + tests_require=['nose'], + scripts=[], + requires=[], + zip_safe=False) diff --git a/version.txt b/version.txt new file mode 100644 index 0000000..715e646 --- /dev/null +++ b/version.txt @@ -0,0 +1 @@ +2.0.0.21 diff --git a/zippy/__init__.py b/zippy/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/zippy/bcl2fastq.py b/zippy/bcl2fastq.py new file mode 100755 index 0000000..80053cf --- /dev/null +++ b/zippy/bcl2fastq.py @@ -0,0 +1,25 @@ +from pyflow import WorkflowRunner + +class Bcl2Fastq(WorkflowRunner): + def __init__(self, bcl2fastq, runfolder_dir, output_dir, sample_sheet, args='', max_job_cores=16): + optional_argument_defaults = { #'demultiplexing-threads': 10, + 'processing-threads': 16} + self.max_job_cores = max_job_cores + command = "{} --runfolder-dir {} --output-dir {} --sample-sheet {}".format( + bcl2fastq, runfolder_dir, output_dir, sample_sheet) + if args != '': + command+=" {}".format(args) + default_additions = [] + for x in optional_argument_defaults.keys(): + if x not in command: + default_additions.append("--{} {}".format(x, optional_argument_defaults[x])) + if len(default_additions) > 0: + command = command+" "+" ".join(default_additions) + self.command = command + + def workflow(self): + self.flowLog('bcl2fastq command: %s' % self.command) + self.addTask('bcl2fastq', self.command, nCores=self.max_job_cores) + +if __name__ == "__main__": + pass \ No newline at end of file diff --git a/zippy/bloom_kmer_finder.py b/zippy/bloom_kmer_finder.py new file mode 100755 index 0000000..e4eb632 --- /dev/null +++ b/zippy/bloom_kmer_finder.py @@ -0,0 +1,229 @@ +import cProfile +import os +import gzip +import csv +import time +from collections import defaultdict +from itertools import combinations +from pybloomfilter import BloomFilter +#from pybloom import ScalableBloomFilter, BloomFilter #pybloom used cryptographic hashes in a bloom filter. This is a bad idea. +import numpy + +class BloomKmerFinder(): + """ + Finds all kmers that show up more than a certain number of times. Can choose to ignore dimerized reads + or do only dimerized reads. Useful for finding common kmers in unmapped reads. We use a bloom filter + to do this, so it is very fast, but requires a few GB of ram to keep the filter in memory. 
+ """ + def __init__(self, params, k, exclude_monomers=False, exclude_dimers=False): + self.params = params + self.k = k #kmer for global matching + self.primer_k = 15 #kmer for primer matching + self.exclude_monomers = exclude_monomers + self.exclude_dimers = exclude_dimers + self.run_dimer_detector = False + self.bloom = BloomFilter(1e9, 0.01, None) + self.count_map = defaultdict(int) + if exclude_monomers or exclude_dimers: + self.run_dimer_detector = True + self.reads_read = 0 # how many lines we've looked at + self.dimer_reads_read = 0 # how many lines we've looked at with at least 2 primers + self.monomer_reads_read = 0 # how many lines we've looked at with exactly 1 primer + self.out_stats_file = open(os.path.join(self.params.output_dir,'count_stats'), 'w') + + def reverse_complement(self, seq): + rev_map = {'A':'T','C':'G','G':'C','T':'A', 'N':'N'} + new_seq = ''.join([rev_map[x] for x in seq]) #sub letters + new_seq = new_seq[::-1] # reverse it + return new_seq + + def parse_probe_list(self): + """ + Creates probe map, which is a map of probe names to sequence. + """ + probe_map = {} + with open(self.params.probe_list, 'r') as f: + c = csv.reader(f, delimiter="\t") + for line in c: + probe_map[line[0]] = line[1].upper() + return probe_map + + def build_kmer_map(self, probe_map, k): + """ + Builds a map from kmer to probenames that have this kmer. + Also does reverse complements. + """ + kmer_map = defaultdict(set) + for (probe, seq) in probe_map.iteritems(): + seq_rc = self.reverse_complement(seq) + for i in range(0, len(seq)-k): + kmer_map[seq[i:i+k]].add(probe) + kmer_map[seq_rc[i:i+k]].add(probe+"rc") + return kmer_map + + def run_matcher(self, input_file, kmer_map): + """ + Goes through a fastq, and registers all the kmers in it. + """ + if input_file is None: # we don't require the input files... one of them can be undefined + return + debug = 0 + with open(input_file, 'r') as f: + counter = 0 # 0: header 1: sequence 2: junk 3: quality + for line in f: + if counter == 0: + read_name = line.strip() + if counter == 1: + line = line.upper() + if self.run_dimer_detector: + probe_matches = self.find_matches(line.strip(), kmer_map) + if len(probe_matches) > 1 and not self.exclude_dimers: + self.dimer_reads_read += 1 + self.register_kmers(line.strip()) + elif len(probe_matches) == 1 and not self.exclude_monomers: + self.monomer_reads_read += 1 + self.register_kmers(line.strip()) + elif len(probe_matches) == 0: + debug += 1 + self.register_kmers(line.strip()) + else: + self.register_kmers(line.strip()) + self.reads_read += 1 + counter += 1 + counter = counter % 4 + print '{} dimer: {}'.format(input_file, self.dimer_reads_read) + print '{} monomer: {}'.format(input_file, self.monomer_reads_read) + print '{} none: {}'.format(input_file, debug) + print '{} total: {}'.format(input_file, self.reads_read) + + def register_kmers(self, read): + """ + Adds the read and its reverse complement to our bloom filter, and if we have seen it before, + adds it to the count map. The idea is that the bloom filter can approximately determine + if we've seen something before or not, and to the count map are added all kmers that the bloom + filter reports that we've seen before. 
+ """ + for i in range(0, len(read)-self.k): + seq = read[i:i+self.k] + seq_rc = self.reverse_complement(seq) + if self.bloom.add(seq): + self.count_map[seq]+=1 + if self.bloom.add(seq_rc): + self.count_map[seq_rc]+=1 + + + def find_matches(self, line, kmer_map): + """ + For a single read, reports all found primers + """ + in_primer = None + matches = [] + for i in range(0, len(line)-self.primer_k): + sub_seq = line[i:i+self.primer_k] + if in_primer is None: #we are not currently in a primer. + if len(kmer_map[sub_seq]) == 1: # If we see a uniquely mappable kmer, we enter a primer. + (in_primer,) = kmer_map[sub_seq] + matches.append(in_primer) + else: # Otherwise, we continue + continue + else: # we are in the middle of seeing a primer sequence + if in_primer in kmer_map[sub_seq]: # we see this primer again, and are thus still reading it. Continue. + continue + elif len(kmer_map[sub_seq]) == 1: # We no longer see our current primer, but this sequence is mappable to another primer. We are now in a different primer. + (in_primer,) = kmer_map[sub_seq] + matches.append(in_primer) + else: # We aren't in our current primer, and aren't uniquely in a different primer. + in_primer = None + return matches + + + def output_stats(self, kmer_map): + """ + We print the top-two unique maximal strings, and then a sorted list of all kmers that appear + at least twice in our reads. + """ + sorted_map = sorted(self.count_map.items(), key=lambda x: -x[1]) + first_string = self.extend(sorted_map[0][0], self.count_map) + for (kmer, count) in sorted_map[1:]: + if kmer not in first_string and self.reverse_complement(kmer) not in first_string: + second_string = self.extend(kmer, self.count_map) + second_score = count + break + self.out_stats_file.write("{}\t{}\n".format(sorted_map[0][1], first_string)) + self.out_stats_file.write("{}\t{}\n".format(second_score, second_string)) + for (kmer, count) in sorted_map: + probe_matches = self.find_matches(kmer, kmer_map) + if len(probe_matches) == 0: + self.out_stats_file.write("{}\t{}\n".format(kmer, count)) + else: + self.out_stats_file.write("{}\t{}\t{}\n".format(probe_matches, kmer, count)) + + def extend(self, seed, kmer_map): + """ + Given a kmer, we greedily extend it in both directions by looking for kmers that differ by 1 on either side. We add + the new kmer if its count is at least half of our peak kmer. 
+ """ + final_string = [seed] + value = kmer_map[seed] + forward_extend = True + current_seed = seed + while forward_extend: + extender = current_seed[1:] + new_kmers = [extender+x for x in 'ACGT'] + new_scores = [kmer_map[x] for x in new_kmers] + if numpy.max(new_scores)>value*0.5: #we extend + new_kmer = new_kmers[numpy.argmax(new_scores)] + if new_kmer == current_seed: #we hit a pathological (recursive) read + forward_extend = False + final_string.append(new_kmer[-1]) + current_seed = new_kmer + else: + forward_extend = False + reverse_extend = True + current_seed = seed + while reverse_extend: + extender = current_seed[:-1] + new_kmers = [x+extender for x in 'ACGT'] + new_scores = [kmer_map[x] for x in new_kmers] + if numpy.max(new_scores)>value*0.5: #we extend + new_kmer = new_kmers[numpy.argmax(new_scores)] + if new_kmer == current_seed: #we hit a pathological read + reverse_extend = False + final_string = [new_kmer[0]]+final_string + current_seed = new_kmer + else: + reverse_extend = False + return ''.join(final_string) + + + def run(self): + """ + Main execution function for kmer finder + """ + if self.run_dimer_detector: + probe_map = self.parse_probe_list() + kmer_map = self.build_kmer_map(probe_map, self.primer_k) #kmer-map for dimer detection + else: + kmer_map = defaultdict(set) + for input_file in [self.params.input_file1, self.params.input_file2]: + self.run_matcher(input_file, kmer_map) + self.output_stats(kmer_map) + self.out_stats_file.close() + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--probe_list', type=str, help="Needed if you want to filter by dimers. tsv with 2 columns: (probe_name, sequence). If you are looking for adapters or other short sequences, they should be added to the probe list.") + parser.add_argument('--kmer', type=int, default=30, help="How big a fragment size to count") + parser.add_argument('--input_file1', help="A fastq file with reads to analyze") + parser.add_argument('--input_file2', help="Another fastq file (Optional)") + parser.add_argument('--output_dir') + parser.add_argument('--exclude_monomers', dest='exclude_monomers', action='store_true', help="Whether we exclude primer monomers from kmer counting") + parser.set_defaults(exclude_monomers=False) + parser.add_argument('--exclude_dimers', dest='exclude_dimers', action='store_true', help="Whether we exclude primer dimers from kmer counting") + parser.set_defaults(exclude_dimers=False) + params = parser.parse_args() + bloomy = BloomKmerFinder(params, params.kmer, params.exclude_monomers, params.exclude_dimers) + start = time.time() + bloomy.run() + print time.time()-start \ No newline at end of file diff --git a/zippy/build_all.spec b/zippy/build_all.spec new file mode 100644 index 0000000..4acd092 --- /dev/null +++ b/zippy/build_all.spec @@ -0,0 +1,67 @@ +# -*- mode: python -*- + +#just an example. With full paths, this file can be used to create a binary using pyinstaller. 
+block_cipher = None + + +a = Analysis(['make_params.py'], + pathex=['path/to/zippy'], + binaries=[], + datas=[('/path/to/python/lib/python2.7/site-packages/pyflow/pyflow.py', './pyflow'), + ('/path/to/python/lib/python2.7/site-packages/pyflow/pyflow.py', './site-packages/pyflow')], + hiddenimports=[], + hookspath=[], + runtime_hooks=[], + excludes=[], + win_no_prefer_redirects=False, + win_private_assemblies=False, + cipher=block_cipher) + +b = Analysis(['zippy.py'], + pathex=['path/to/zippy'], + binaries=[], + datas=[], + hiddenimports=[], + hookspath=[], + runtime_hooks=[], + excludes=[], + win_no_prefer_redirects=False, + win_private_assemblies=False, + cipher=block_cipher) +MERGE((a, 'make_params', 'make_params'), (b, 'zippy', 'zippy')) +pyz = PYZ(a.pure, a.zipped_data, + cipher=block_cipher) +pyz2 = PYZ(b.pure, b.zipped_data, + cipher=block_cipher) +exe = EXE(pyz, + a.scripts, + exclude_binaries=True, + name='make_params', + debug=False, + strip=False, + upx=True, + console=True ) +exe2 = EXE(pyz2, + b.scripts, + exclude_binaries=True, + name='zippy', + debug=False, + strip=False, + upx=True, + console=True ) +coll = COLLECT(exe, + a.binaries, + a.zipfiles, + a.datas, + strip=False, + upx=True, + name='make_params') +coll2 = COLLECT(exe2, + b.binaries, + b.zipfiles, + b.datas, + strip=False, + upx=True, + name='zippy') + + diff --git a/zippy/bwa.py b/zippy/bwa.py new file mode 100755 index 0000000..c012f56 --- /dev/null +++ b/zippy/bwa.py @@ -0,0 +1,63 @@ +import os.path +from pyflow import WorkflowRunner + +class BWAWorkflow(WorkflowRunner): + def __init__(self, output_dir, + bwa_exec, samtools_exec, genome_fa, cores, mem, + fastq, sample='', args=''): + self.output_dir = output_dir + self.bwa_exec = bwa_exec + self.samtools_exec = samtools_exec + self.genome_fa = genome_fa + self.cores = cores + self.mem = mem + self.fastq = fastq + self.sample = sample + self.args = args + + def workflow(self): + # Create the output directory and create a dummy samplesheet + cmd = "mkdir -p {}".format(self.output_dir) + self.addTask(label="make_out_dir", command=cmd, isForceLocal=True) + out_bam = os.path.join(self.output_dir, "out.bam") + + + # from fastq: + if len(self.fastq) == 2: + fastq = " ".join(self.fastq) + elif len(self.fastq) == 1: + fastq = self.fastq + else: + raise("More than two FASTQs passed to bwamem!") + if self.args != "": + self.args=" "+self.args + else: + self.args = " -M -R \'@RG\\tID:1\\tLB:{0}\\tPL:ILLUMINA\\tSM:{0}\'".format(self.sample) + cmd = "%s mem" % self.bwa_exec \ + + " -t %i" % (self.cores*2) \ + + self.args \ + + " %s %s" % (self.genome_fa, fastq) \ + + " | %s view -b -o %s -" % (self.samtools_exec, out_bam) + self.flowLog(cmd) + self.addTask(label="bwamem", command=cmd, nCores=self.cores, + memMb=self.mem, dependencies="make_out_dir") + + + # Sort BAM + out_sorted_bam = os.path.join(self.output_dir, "out.sorted.bam") + out_temp = os.path.join(self.output_dir, "tmp") + cmd = self.samtools_exec \ + + " sort %s" % out_bam \ + + " -O bam" \ + + " -o " + out_sorted_bam \ + + " -T " + out_temp \ + + " -@ %i" % self.cores + self.addTask(label="sort_bam", command=cmd, nCores=self.cores, memMb=min(1024 * 32, self.mem), dependencies="bwamem") + + # Clean up the unsorted BAM + cmd = "rm {}".format(out_bam) + self.addTask(label="del_unsorted_bam", command=cmd, dependencies="sort_bam") + + +if __name__ == "__main__": + pass \ No newline at end of file diff --git a/zippy/deseq.r b/zippy/deseq.r new file mode 100755 index 0000000..e744745 --- /dev/null +++ b/zippy/deseq.r @@ 
-0,0 +1,26 @@ +### +#Runs deseq2 on our data. +#TODOS: +##We can only normalize against a single sample +##We have chosen a fittype that may not make sense for real data +#args +#1: data directory +#2: control sample +#3: output filename +#source("http://bioconductor.org/biocLite.R") +#biocLite("DESeq2") +### +library("DESeq2") +args <- commandArgs(trailingOnly = TRUE) +#get output from htseq +sampleFiles <- grep("hts.txt",list.files(args[1]),value=TRUE) +#condition is currently based on file name. We may eventually want to take in a list for the condition +sampleCondition <- sub("(.*).hts.txt","\\1",sampleFiles) +sampleTable <- data.frame(sampleName = sampleFiles, fileName = sampleFiles, condition = sampleCondition) +dds <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable, directory = args[1], design= ~ condition) + +dds <- dds[ rowSums(counts(dds)) > 1, ] # filter to genes with at least 2 reads +dds <- DESeq(dds, fitType="mean") +dds$condition <- relevel(dds$condition, ref=args[2]) # define the control sample. +res <- results(dds) +write.csv(as.data.frame(res), file=paste(args[3], "deseq.csv", sep="/")) diff --git a/zippy/downsampling_bloom.py b/zippy/downsampling_bloom.py new file mode 100755 index 0000000..7f69f40 --- /dev/null +++ b/zippy/downsampling_bloom.py @@ -0,0 +1,234 @@ +# -*- coding: utf-8 -*- +""" +Downsampling script for paired end data that retains information about non-linear alignments. +If the specified value for downsampling is less than the number of reads in the original bam, +then it will simply copy the old bam to the new path while omitting any pairs that have no +alignment whatsoever. This makes it so that regardless if downsampling was actually performed, +the output is in the same "style" (i.e. includes no pairs that are complete unmapped). + +Author: Kevin Wu, Jan. 2017 +""" + +import argparse +import os +import random + +import pybloomfilter +import pysam +import time +import multiprocessing +import shutil + + +BF_PROB = 0.0001 +MAX_ITERS = 10 # Maximum times we'll loop through a bam + +def partition(lst, n): + """Given a list, partition it into n chunks of approximately equal size""" + # https://stackoverflow.com/questions/2659900/python-slicing-a-list-into-n-nearly-equal-length-partitions + return [lst[i::n] for i in xrange(n)] + + +class allReadNamesProcess(multiprocessing.Process): + """ + Outputs all read names from given chromosomes and writes to a centralized queue + This is meant for the non-downsampling branch of this script. 
+ """ + def __init__(self, bamfile, chromosome_list, outqueue): + multiprocessing.Process.__init__(self) + self.out_queue = outqueue + self.chromosomes = chromosome_list + self.bam = bamfile + + def run(self): + """ + Walks through each chromosome, outputting every single readname + """ + for chromosome in self.chromosomes: + with pysam.AlignmentFile(self.bam, 'rb') as alignment: + for read in alignment.fetch(chromosome): + self.out_queue.put(read.query_name) + + +class sampleReadNamesProcess(multiprocessing.Process): + """ + Sample read names from the given chromosomes and writes them to a centralized queue + """ + def __init__(self, bamfile, perChromSingleEndReads, perChromUniqueReadnamesTarget, outQueue): + multiprocessing.Process.__init__(self) + self.bam = bamfile + # The dictionary keys in the perChrom things should be the same + assert isinstance(perChromSingleEndReads, dict) + assert isinstance(perChromUniqueReadnamesTarget, dict) + for key in perChromSingleEndReads.keys(): + assert key in perChromUniqueReadnamesTarget + + self.chromosomes = perChromSingleEndReads.keys() + self.perChromSingleEndReads = perChromSingleEndReads + self.perChromReadnamesTarget = perChromUniqueReadnamesTarget + + # assert isinstance(outQueue, multiprocessing.Queue) + self.outQueue = outQueue + + def run(self): + """ + Walk through this thread's assigned chromosomes, getting readnames for each. + """ + for chromosome in self.chromosomes: + # Sanity checks + assert chromosome in self.perChromSingleEndReads + assert chromosome in self.perChromReadnamesTarget + + # Initialize values for this chromosome's sampling + targetReadnamesCount = self.perChromReadnamesTarget[chromosome] + numSingleEndReads = self.perChromSingleEndReads[chromosome] + acceptedProbability = float(targetReadnamesCount) / float(numSingleEndReads) + acceptedReadnames = pybloomfilter.BloomFilter(targetReadnamesCount * 2, BF_PROB) + + numIters = 0 + keepGoing = True + with pysam.AlignmentFile(self.bam, 'rb') as alignment: + while keepGoing and numIters < MAX_ITERS: + for read in alignment.fetch(region=chromosome): + read_name, flag = read.query_name, int(read.flag) + # If read is supplementary/seconadary, skip it + if (flag & 2048) or (flag & 256): + continue + if random.random() <= acceptedProbability: + # False output from add indicates that item was not already in bloom filter + # So we tehn add it to the output queue + if not acceptedReadnames.add(read.query_name): + self.outQueue.put(read_name) + if len(acceptedReadnames) >= targetReadnamesCount: + keepGoing = False + break + numIters += 1 + # Write to stdout + print("Processed %s %s: (readnames selected/readnames desired) (%i/%i)" % (os.path.basename(self.bam), chromosome, len(acceptedReadnames), targetReadnamesCount)) + + +class readnameConsumer(multiprocessing.Process): + """ + Consumes readnames and writes the output bam. 
+ """ + def __init__(self, totalTargetReadnameCount, bamfileIn, bamfileOut, readnameQueue): + multiprocessing.Process.__init__(self) + # assert isinstance(readnameQueue, multiprocessing.Queue) + assert isinstance(totalTargetReadnameCount, int) + self.bamIn = bamfileIn + self.bamOut = bamfileOut + self.queue = readnameQueue + self.readnameCount = totalTargetReadnameCount + + def run(self): + # Combine the readnames from each of the child processes into one single bloom filter + masterBF = pybloomfilter.BloomFilter(self.readnameCount * 2, BF_PROB) + counter = 0 + while True: + readname = self.queue.get() + if readname is None: + break + masterBF.add(readname) + counter += 1 + + print("Finished sampling readnames. Writing output bam...") + # Finished getting all the reads, now write the output bam + with pysam.AlignmentFile(self.bamIn, 'rb') as bamSource: + with pysam.AlignmentFile(self.bamOut, 'wb', template=bamSource) as bamOutput: + for read in bamSource.fetch(): + if read.query_name in masterBF: + bamOutput.write(read) + print("Finished downsampling %s to %s. Outputted (selected/desired) %i/%i readnames." % (self.bamIn, self.bamOut, counter, self.readnameCount)) + +def getTotalReads(bam): + totalReads = 0 + perChromCount = {} + stats = pysam.idxstats(bam) + for line in stats.split('\n'): + tokenized = line.split() + if len(tokenized) == 0 or tokenized[0] == "*": continue + c = int(tokenized[2]) + int(tokenized[3]) # mapped + unmapped reads + perChromCount[tokenized[0]] = c + totalReads += c + return totalReads, perChromCount + + +def downsample(inputBam, outputBam, targetNumUniqueReadnames, numThreads=4): + # Sanity checks + if not os.path.isfile(inputBam): + raise RuntimeError("Cannot find %s" % inputBam) + indexFile = inputBam + ".bai" + if not os.path.isfile(indexFile): + raise RuntimeError("Input bam %s must be sorted" % inputBam) + + totalSingleEndReads, perChromSingleEndReads = getTotalReads(inputBam) + assert totalSingleEndReads != -1 + + # Skip downsampling if we already have fewer than downsampled amount + if targetNumUniqueReadnames * 2 >= totalSingleEndReads: + print("%s does not need downsampling. 
Copying old bam to new path while omitting completely unaligned pairs" % inputBam) + # Copy the file over + needs_downsampling = False + else: + print("Starting downsampling for %s" % inputBam) + needs_downsampling = True + + # Get the chromosomes included in this bam + with pysam.AlignmentFile(inputBam, 'rb') as bIn: + chromosomes = bIn.references + + # Split the chromosomes into per-thread lists of chromosomes + chromosomeChunks = partition(chromosomes, numThreads) + readnamesQueue = multiprocessing.Queue() + # Initialize the readnames consumer + consumerProcess = readnameConsumer(targetNumUniqueReadnames, inputBam, outputBam, readnamesQueue) + consumerProcess.start() + producerProcesses = [] + for chunk in chromosomeChunks: + if needs_downsampling: + perThreadSingleEndReads = {} + for chrom in chunk: + perThreadSingleEndReads[chrom] = perChromSingleEndReads[chrom] + perThreadTargetReadnames = {} + for chrom in chunk: + perThreadTargetReadnames[chrom] = int(float(perChromSingleEndReads[chrom]) / float(totalSingleEndReads) * float(targetNumUniqueReadnames)) + chunk_process = sampleReadNamesProcess(inputBam, perThreadSingleEndReads, perThreadTargetReadnames, readnamesQueue) + else: + chunk_process = allReadNamesProcess(inputBam, chunk, readnamesQueue) + + chunk_process.start() + producerProcesses.append(chunk_process) + + # Wait for all the per-chrom threads to terminate + for proc in producerProcesses: + proc.join() + + # Signal to the writer that we're done + readnamesQueue.put(None) + + # Wait for the writer to finish + consumerProcess.join() + + +def buildParser(): + parser = argparse.ArgumentParser( + description = __doc__, + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("--bam", type=str, required=True, + help="Bam file to downsample") + parser.add_argument("--downsampled_pairs", type=int, required=True, + help="Number of unique pairs to downsample to") + parser.add_argument("--output", type=str, required=True, + help="Path to output bam file") + parser.add_argument("--threads", type=int, default=4, + help="Number of threads to use") + return parser + + +if __name__ == "__main__": + # main() + parser = buildParser() + args = parser.parse_args() + + downsample(args.bam, args.output, args.downsampled_pairs, args.threads) diff --git a/zippy/edger.r b/zippy/edger.r new file mode 100755 index 0000000..a4c9dc4 --- /dev/null +++ b/zippy/edger.r @@ -0,0 +1,34 @@ +### +#Runs edger on our data. +#args +#1: data file +#2: number of samples in group 1 +#3: number of samples in group 2 +#4: output filename +#5: temp directory +args <- commandArgs(trailingOnly = TRUE) +source("http://bioconductor.org/biocLite.R") +biocLite("edgeR", lib=args[5]) +### +library("edgeR", lib.loc=args[5]) +#build sample groupings +groups <- c(rep(1,args[2]), rep(2,args[3])) +#read data into edger +print(args[1]) +x <- read.delim(args[1], row.names="Symbol", sep=",") +print(groups) +print(colnames(x)) +y <- DGEList(counts=x, group=groups) +#filter +keep <- rowSums(cpm(y)>1) >= 1 #very minimal filter step here, requiring a cpm of 1 in at least 1 sample +y <- y[keep, , keep.lib.sizes=FALSE] +y <- calcNormFactors(y) #performs TMM +design <- model.matrix(~groups) +y <- estimateDisp(y,design) +if(args[2] == 1 || args[3] == 1){ + fit <- glmFit(y,design,dispersion=0.04) #TODO: currently this is somewhat less than the recommended 'human' level of 0.16. This should be a user-defineable param. 
+} else { + fit <- glmFit(y,design) +} +lrt <- glmLRT(fit,coef=2) +write.csv(lrt$table, file=args[4]) \ No newline at end of file diff --git a/zippy/examples/chipseq.json b/zippy/examples/chipseq.json new file mode 100755 index 0000000..5fe0278 --- /dev/null +++ b/zippy/examples/chipseq.json @@ -0,0 +1,40 @@ +{ + // This simple workflow could be used to analyze Chip-seq data: it aligns the reads to the genome, + // and then finds peaks with a large number of reads (corresponding to binding sites). + "wildcards" : { + "path": "/path/to/run/folder" + }, + "stages": [ + { + "identifier": "bcl2fastq", + "output_dir": "{path}/fastq", + "stage": "bcl2fastq" + }, + { + "identifier": "bwa", + "output_dir": "{path}/align", + "previous_stage": "bcl2fastq", + "stage": "bwa" + }, + { + "identifier": "bwaalignstats", + "output_dir": "{path}/stats", + "previous_stage": "bwa", + "stage": "picardalignstats" + }, + { + "identifier": "macs", + "output_dir": "{path}/peak", + "previous_stage": "bwa", + "stage": "macs" + } + ], + "sample_sheet": "{path}/SampleSheet.csv", + "sample_path": "{path}", + "bwa_path": "", + "samtools_path": "", + "macs_path": "", + "picard": "", + "genome": "", + "scratch_path": "/path/to/scratch" +} diff --git a/zippy/examples/docker.json b/zippy/examples/docker.json new file mode 100755 index 0000000..6ecaba4 --- /dev/null +++ b/zippy/examples/docker.json @@ -0,0 +1,34 @@ +{ //Here's a simple docker example that uses a dockerized copy of bwa + "wildcards": {"path": "/home/path/to/output", + "sample_path": "/home/path/to/data", + "scratch": "/scratch"}, + "docker": { //The docker section of the config contains a map from identifiers to docker configurations. + "docker_bwa": { + "image": "some/dockerhub/path/example_bwa_image", + "mount_points": ["/home:/home"], //Mount points contains a list of docker-style mounts, in the format mount_from:mount_to + } + }, + "stages": [ + { + "identifier": "bcl2fastq", + "output_dir": "{path}/fastq", + "stage": "bcl2fastq", + "skip": false + }, + { + "identifier": "bwa1", + "output_dir": "{path}/bwa1_docker", + "previous_stage": "bcl2fastq", + "stage": "bwa", + "skip": false, + "use_docker": "docker_bwa" //to run a stage with docker, simply specify the name of the docker image to use in the stage dictionary. 
+ } + ], + "sample_sheet": "{sample_path}/SampleSheet.csv", + "bcl2fastq_path": "/home/path/to/bcl2fastq", + "sample_path": "{sample_path}", //root of your sample folder for bcl2fastq + "bwa_path": "bwa", + "samtools_path": "samtools", + "genome": "/home/path/to/genome", + "scratch_path": "{scratch}" +} \ No newline at end of file diff --git a/zippy/examples/rna.json b/zippy/examples/rna.json new file mode 100755 index 0000000..c327a5f --- /dev/null +++ b/zippy/examples/rna.json @@ -0,0 +1,28 @@ +{ + // A very simple workflow that goes from a run folder (with bcl files) to quantified results using RSEM + "wildcards": { + "path": "/path/to/run/folder" + }, + "stages": [ + { + "identifier": "bcl2fastq", + "output_dir": "{path}/fastq", + "stage": "bcl2fastq", + "skip": false + }, + { + "stage": "rsem", + "identifier": "rsem", + "output_dir": "{path}/quant", + "previous_stage": "bcl2fastq", + "skip": false + } + ], + "sample_sheet": "{path}/SampleSheet.csv", + "bcl2fastq_path": "", + "sample_path" : "{path}", + "rsem_path": "", + "rsem_annotation": "", + "genome": "", + "scratch_path": "/path/to/scratch", +} \ No newline at end of file diff --git a/zippy/examples/subworkflow.json b/zippy/examples/subworkflow.json new file mode 100755 index 0000000..7a757bd --- /dev/null +++ b/zippy/examples/subworkflow.json @@ -0,0 +1,53 @@ +{ +//simple example of subworkflows, which runs a simple DNA pipeline with/without downsampling + //all curly-bracket-enclosed wildcard strings will be replaced at json load-time + "wildcards": {"path": "/path/to/output_root", + "input_path":"/path/to/input_root"}, + "stages": [ + { + "identifier": "fastq", + "output_dir": "{path}/fastq", + "stage": "bcl2fastq", + "args": "--ignore-missing-bcls" //most stages for which command like arguments make sense support the 'args' tag + }, + { + "identifier": "alignment", + "output_dir": "{path}/align", + "previous_stage": "fastq", + "stage": "bwa" + }, + { + "identifier": "subsample", + "output_dir": "{path}/subsample", + "previous_stage": "alignment", + "stage": "bloomsubsamplebam", + "reads": 50000000, + "skip": false //all stages can be set to skip: true if you do not want that stage to be executed (for debugging or other purposes) + }, + {//downstream processing direct from alignment + "subworkflow": "subworkflow_downstream.json", + "identifier": "", + "previous_stage": {"pisces": "alignment"} //for subworkflows, previous stage is a dictionary mapping + // subworkflow stages to the top-level stage they inherit from. So here pisces is called directly on the output from the alignment stage + }, + {//downstream processing afer subsampling to 50M reads + "subworkflow": "subworkflow_downstream.json", + "identifier": "_down", //We want to run the same subworkflow, so we give it an identifier _suffix_ that disambiguates it from the other instance of the subworkflow. + "previous_stage": {"pisces": "subsample"} + } + ], + "sample_sheet": "{input_path}/SampleSheet.csv", + "bcl2fastq_path": "", + "sample_path": "{input_path}", + "bwa_path": "", + "samtools_path": "", + "genome": "", + "dotnet": "", + "pisces_path": "", + "python": "", + "scratch_path": "/path/to/scratch", + "nirvana_path": "", + "nirvana_cache": "", + "nirvana_supplement": "", + "nirvana_ref": "", +} diff --git a/zippy/examples/subworkflow_down.json b/zippy/examples/subworkflow_down.json new file mode 100755 index 0000000..e8c0c16 --- /dev/null +++ b/zippy/examples/subworkflow_down.json @@ -0,0 +1,21 @@ +{ + // Subworkflow that calls variants, and then annotates them. 
Note that the subworkflow can only + // contain wildcards, stages and dockers. Other top-level properties (e.g., pisces_path) must + // be defined in the top-level workflow. + "wildcards": {"path": "/path/to/output_root", + "input_path":"/path/to/input_root"}, + "stages": [ + { + "identifier": "pisces", + "output_dir": "{path}/pisces", //in subworkflows, output_dir will be appended with the _subworkflow_ identifier automatically + "previous_stage": "", // previous stage here does not need to be defined, as pisces will be fed-in at the top-level + "stage": "pisces", + }, + { + "identifier": "nirvana", + "output_dir": "{path}/nirvana", + "previous_stage": "pisces", + "stage": "nirvana", + } + ] +} \ No newline at end of file diff --git a/zippy/make_params.py b/zippy/make_params.py new file mode 100755 index 0000000..20d6020 --- /dev/null +++ b/zippy/make_params.py @@ -0,0 +1,276 @@ +import argparse +import ast +import _ast +import imp +import inspect +try: + import meta #meta is optional, and used for parameter detection if the source code is not found. +except ImportError: + pass +import sys +import re +from collections import OrderedDict, namedtuple +import commentjson as json +from .modular_runner import * +from .params import get_params + +param_tuple = namedtuple('param_tuple', ['param', 'is_optional']) +class ParamFinder(ast.NodeVisitor): + ''' + AST graph walker designed to find all uses of the params namespace in the ast graph. + There is some magic in handling the 'self' keyword, which is used to denote class-specific params. + For example, params.self.output_dir in class RSEMRunner will be mapped to the tuple ('RSEMRunner', 'output_dir') + ''' + def __init__(self): + self.params_found = [] + + def filter_private(self): + print self.params_found + self.params_found = [x for x in self.params_found if x.param[-1][0]!='_'] + + def register_param(self, raw_tuple): + ''' + Creates a list of params that keeps the order they were seen, but removes duplicates. + ''' + if self.is_valid_param(raw_tuple): + self.params_found.append(self.format_param(raw_tuple)) + + def is_valid_param(self, x): + ''' + Given a raw tuple formed from an AST path, we see if it's well formed. A well formed + tuple is currently defined as any param that is not wholly containing self and optional + tags. More sophisticated validation here is possible. + ''' + if len(x) == 1 and x[0] in ['self', 'optional']: + return False + if len(x) == 2 and x[0] in ['self', 'optional'] and x[1] in ['self', 'optional']: + return False + return True + + def format_param(self, raw_tuple): + ''' + Converts a raw ast trace from the ast graph into a param_tuple named tuple. It: + -strips out optional and puts it in its own field + -converts self into the current class name. + ''' + try: + raw_tuple.remove('optional') + is_optional = True + except ValueError: #optional is not in the param. 
+ is_optional = False + try: + while True: + self_index = raw_tuple.index('self') + raw_tuple[self_index] = self.current_class + except ValueError: + pass + if len(raw_tuple) > 2: + raise ValueError('Malformed parameter tuple: {}'.format(raw_tuple)) + return param_tuple(tuple(raw_tuple), is_optional) + + + + + def uniqueify(self): + seen = set() + self.params_found = [x for x in self.params_found if x not in seen and not seen.add(x)] + + def old_visit_Attribute(self, node): + ''' + TODO: check that it's read and not written + ''' + if isinstance(node.value, _ast.Attribute) and node.value.attr == 'params' and node.attr != 'self': + self.params_found.append(node.attr) + #This next bit identifies lines two steps removed from params. In that case, we know that the middle attribute is 'self' and we + #replace it with the relevant class name. + elif isinstance(node.value, _ast.Attribute) and isinstance(node.value.value, _ast.Attribute) and node.value.value.attr == 'params': + self.params_found.append((self.current_class, node.attr)) + self.generic_visit(node) + + def visit_Attribute(self, node): + ''' + TODO: check that it's read and not written + ''' + if isinstance(node, _ast.Attribute): + current_node = node + param_builder = [] + param_builder.append(node.attr) + while isinstance(current_node.value, _ast.Attribute): + if current_node.value.attr == 'params': + self.register_param(param_builder[::-1]) + break + param_builder.append(current_node.value.attr) + current_node = current_node.value + self.generic_visit(node) + +def case_insensitive_list_match(query, l): + """ + Returns the first case-insensitive matching item to the query + """ + query = query.lower() + l_low = [x.lower() for x in l] + for (i,x) in enumerate(l_low): + if query == x: + return l[i] + return None + +def walk_all_methods(pf, class_object): + ''' + ''' + for func in dir(class_object): + try: + try: + #we get each method from source, deindent it, and then parse it + source_lines = inspect.getsourcelines(getattr(class_object,func))[0] + indent = len(source_lines[0]) - len(source_lines[0].lstrip()) + source_lines = [line[indent:] for line in source_lines] + ast_tree = ast.parse(''.join(source_lines)) + except IOError: + print('Module source code not found: ({}, {}). Decompiling instead.'.format(class_object, func)) + try: + ast_tree = meta.decompile(getattr(class_object,func).__code__) + except AssertionError: + print 'meta failed to decompile function {} in class {}. Some parameters may be missing from the generated file.'.format(func, class_object.__name__) + continue + except NameError: + print 'meta is not installed. Parameters may be missing from the generated file.' + except TypeError: + continue + pf.visit(ast_tree) + except AttributeError: + continue + +class JsonBuilderMain(): + """ + Compiles a params proto file into a template that must be filled out for running ZIPPY. 
+ """ + def __init__(self, input_args): + self.input_args = input_args + if self.input_args.out: + self.output_file = self.input_args.out + else: + arg_split = self.input_args.proto.split('.') + self.output_file = '.'.join(arg_split[:-1]+['compiled']+ [arg_split[-1]]) + self.params = get_params(self.input_args.proto, proto=True) + self.modules_loaded = 0 + for x in getattr(self.params, 'imports', []): + self.load_external_modules(x) + + def load_external_modules(self, module_path): + modules_to_add = {} + m = imp.load_source('user_module_import_{}'.format(self.modules_loaded), module_path) + self.modules_loaded += 1 + for k in vars(m).keys(): + if 'Runner' in k: + modules_to_add[k] = vars(m)[k] + globals().update(modules_to_add) + + def make_params_file(self): + ''' + Given the params.stages list in the input params file, finds all the needed params to run the pipeline. + Each stage will have an identifier, and may have a previous stage. We also add a 'scratch_path' for the pyflow workspace. + ''' + pf = ParamFinder() + for stage in self.params.stages: + class_name = case_insensitive_list_match('{}Runner'.format(stage), globals().keys()) + try: + class_object = globals()[class_name] + except KeyError: + raise KeyError("One of your workflow stages, {}, is not found. If it is defined in a custom file, make sure that file is imported in your params file. If it is built-in, make sure you are spelling it correctly!".format(stage)) + pf.current_class = stage + walk_all_methods(pf, class_object) + pf.uniqueify() + pf.filter_private() + try: + defaults = get_params(self.input_args.defaults, proto=True) + except IOError: + print 'Warning: No defaults file found.' + defaults = None + output_map = OrderedDict() + if hasattr(self.params, 'imports'): + output_map["imports"] = self.params.imports + identifiers = set() + optional_params = filter(lambda x: x.is_optional, pf.params_found) + required_params = filter(lambda x: not x.is_optional, pf.params_found) + output_map["stages"] = [] + #stage params + for (i,stage) in enumerate(self.params.stages): + identifier = get_identifier(stage, identifiers) + stage_map = {"stage": stage, "identifier": identifier} + if i > 0: + print output_map + stage_map['previous_stage'] = output_map["stages"][i-1]["identifier"] + for param in required_params: + param = param.param + print param + if param[0] == stage and len(param)>1: + if hasattr(defaults, 'stages') and stage in defaults.stages and param[1] in defaults.stages[stage]: + stage_map[param[1]] = defaults.stages[stage][param[1]] + else: + stage_map[param[1]] = '' + for param in optional_params: + if self.input_args.default_behavior == 'ignore': + continue + param = param.param + if param[0] == stage: + if self.input_args.default_behavior == 'include': + if stage in defaults.stages and param[1] in defaults.stages[stage]: + stage_map[param[1]] = defaults.stages[stage][param[1]] + else: + stage_map[param[1]] = '' + elif self.input_args.default_behavior == 'warn': + if not hasattr(defaults, 'stages') or stage not in defaults.stages or param[1] not in defaults.stages[stage]: + print "Warning: parameter {} is not included in stage {} defaults".format(param[1], stage) + output_map["stages"].append(stage_map) + #global params + for param in required_params: + if len(param.param) > 1: + continue + param = param.param[0] + if hasattr(defaults, param): + output_map[param] = getattr(defaults, param) + else: + output_map[param] = '' + for param in optional_params: + if len(param.param) > 1: + continue + if 
self.input_args.default_behavior == 'ignore': + continue + param = param.param[0] + if self.input_args.default_behavior == 'include': + if hasattr(defaults, param): + output_map[param] = getattr(defaults, param) + else: + output_map[param] = '' + elif self.input_args.default_behavior == 'warn': + if not hasattr(defaults, param): + print "Warning: global parameter {} is not included".format(param) + output_map["scratch_path"] = '' + with open(self.output_file, 'w') as f: + json.dump(output_map, f, indent=4) + +def get_identifier(stage, identifiers): + ''' + Called when beginning the output for a new pipeline stage. Returns a unique id for that pipeline stage. + ''' + while stage in identifiers: + instance = re.search('(.+_)(\d+)$', stage) + if instance is not None: #we are an identifer of the form stagename.# + stage = instance.group(1)+str(int(instance.group(2))+1) + else: #we are an identifier of the form stagename + stage = stage+'_1' + identifiers.add(stage) + return stage + +def get_argparser(): + parser = argparse.ArgumentParser(usage='make_params is used to compile a proto-workflow into a form useable by ZIPPY', formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('proto', help='The proto-workflow to compile.') + parser.add_argument('out', help='The name of the compiled workflow to create. If none, prepends compiled to the filetype as output (e.g., foo.compiled.json)', nargs='?') + parser.add_argument('--defaults', default='defaults.json', help='File with default parameter values (in json format)') + parser.add_argument('--default_behavior', default='warn', choices=['include', 'warn', 'ignore'], help='How to treat optional parameters. them all in your file, you if they are not in the specified defaults file, or them.') + return parser +if __name__ == '__main__': + parser = get_argparser() + input_params = parser.parse_args() + jsb = JsonBuilderMain(input_params) + jsb.make_params_file() diff --git a/zippy/modular_helper.py b/zippy/modular_helper.py new file mode 100755 index 0000000..d27be9e --- /dev/null +++ b/zippy/modular_helper.py @@ -0,0 +1,46 @@ +from __future__ import print_function +from .modular_runner import * +from .make_params import case_insensitive_list_match, ParamFinder, walk_all_methods + +def run_help(arg): + print('Welcome to the ZIPPY help system.') + if arg is None: + print('ZIPPY is a powerful, easy-to-use NGS pipeline prototyping system, with batteries included.') + print('This help system can be used to learn more about the various modules included with ZIPPY') + print(' ') + print('To see a list of all modules, use "python zippy.py --help all"') + print('To get help for a specific module, use "python zippy.py --help "') + print(' ') + print('ZIPPY command line arguments:') + print(' ') + print('zippy.py workflow [--defaults defaults_file]') + print('workflow\tThe ZIPPY workflow json file.') + print('--defaults\tDEFAULTS\tFile with default parameter values (in json format)\n\t\t(default: defaults.json)') + print('--run_local\tIf set, we will run things on the local node, rather than attempt to use SGE.') + elif arg == 'all': + ignore_set = set(['ModularRunner', 'WorkflowRunner']) + print('All built-in ZIPPY modules:') + print(' ') + print('\n'.join(sorted([x for x in globals().keys() if 'Runner' in x and x not in ignore_set]))) + else: + if 'Runner' not in arg: + arg = case_insensitive_list_match('{}Runner'.format(arg), globals().keys()) + module = globals()[arg] + print(module) + print(module.__doc__) + print("\tParameters for 
{}:".format(arg)) + pf = ParamFinder() + pf.current_class = arg + walk_all_methods(pf, module) + pf.uniqueify() + for param in pf.params_found: + if param.param[0] == arg: + if param.is_optional: + print("\t\t{}\t(optional)".format(param.param[1])) + else: + print("\t\t{}".format(param.param[1])) + + + +if __name__ == '__main__': + main() diff --git a/zippy/modular_runner.py b/zippy/modular_runner.py new file mode 100755 index 0000000..80b3a5b --- /dev/null +++ b/zippy/modular_runner.py @@ -0,0 +1,1154 @@ +import resource +import glob +import string +import os +import sys +import fnmatch +from collections import defaultdict, namedtuple +from functools import wraps +from abc import ABCMeta, abstractmethod +import itertools +import numpy + +from .utils import sub_check_wilds +from pyflow import WorkflowRunner + +from star import SingleStarFlow +from bwa import BWAWorkflow +from samplesheet import SampleSheet, check_valid_samplename +from bcl2fastq import Bcl2Fastq + +SampleTuple = namedtuple('SampleTuple', ['id', 'name']) +zippy_dir = os.path.dirname(__file__) + + +def passthrough(*all_pass): + """ + Function decorator used to forward outputs. + """ + def passthrough_decorator(get_output): + @wraps(get_output) + def func_wrapper(self, sample): + output = get_output(self, sample) + for to_pass in all_pass: + pass_in = self.collect_input(sample, to_pass) + if pass_in is not None and len(pass_in) > 0: + output[to_pass] = pass_in + return output + return func_wrapper + return passthrough_decorator + +def organize_fastqs(fastq_files, is_paired_end=True): + ''' + Helper function used to separate R1/R2 + ''' + r1_files = [x for x in fastq_files if '_R1_' in x] + r2_files = [x for x in fastq_files if '_R2_' in x] + if is_paired_end and len(r1_files)>0 and len(r2_files)>0: + assert not set(r1_files).intersection(set(r2_files)) + return (r1_files, r2_files) + else: + return fastq_files + +class ModularRunner(): + ''' + Naming convention: a collect method is called at the front end of a stage to take in data. + A get method is called at the back end of a stage to give data to the next stage. + typically, collect methods should never need to be overridden. Get methods must be changed when + the stage does something unusual (such as merging or creating samples). + ''' + __metaclass__ = ABCMeta + + def __init__(self, identifier, params, previous_stages): + self.identifier = identifier + self.params = params + self.previous_stages = previous_stages + self.set_up_optional_params() + try: + self.sample_sheet = SampleSheet(self.params.sample_sheet, fix_dup_sample_names=self.params.optional.zippy_fix_duplicate_sample_names) + except AttributeError: + self.sample_sheet = None + + def set_up_optional_params(self): + overrides = self.define_optionals() + self.params.self._update_from_dict(overrides) + overrides = self.define_zippy_optionals() + self.params._update_from_dict(overrides) + + def define_optionals(self): + ''' + Overrideable method that can be used to return a map of default values. If these parameters + are not set in the json file, they will be added here + ''' + return {} + + def define_zippy_optionals(self): + ''' + Zippy's built-in parameters have their defaults given here. + ''' + return {'zippy_fix_duplicate_sample_names': False} + + @abstractmethod + def get_output(self, sample): + ''' + This function is called to return the output generated by this stage. + 'sample' is the sample named_tuple for that sample + The form of the output is a dictionary from zippy type to file path. 
See the + wiki for more details. + ''' + pass + + def get_dependencies(self, sample): + ''' + This function is called for a stage to return its own outgoing tasks related to a sample. + I.e., this is the list of workflow tasks that must be run before the stage is complete for + that sample. By default, we assume that self.task is a + dict (or, often, a defaultdict of lists). This can be overloaded for more complicated tasks. + ''' + return self.task[sample] + + @abstractmethod + def workflow(self, workflowRunner): + ''' + Add all of this stage's tasks to the workflow graph using the input workflowRunner. + ''' + pass + + def get_memory_count(self, count): + ''' + Checks the default core count through the filter of a global max memory, and a in-stage set parameter. Memory is defined in MB + ''' + if hasattr(self.params.self, 'memory'): + count = self.params.self.optional.memory + if hasattr(self.params, 'max_memory'): + count = min(count, self.params.optional.max_memory) + return count + + def get_core_count(self, count): + ''' + Checks the default core count through the filter of a global max cores count, and a in-stage set parameter + ''' + if hasattr(self.params.self, 'cores'): + count = self.params.self.optional.cores + if hasattr(self.params, 'max_cores'): + count = min(count, self.params.optional.max_cores) + return count + + def get_samples(self): + ''' + Returns a list of all sample tuples that this stage returns AS OUTPUT. By default, stages do not change the + sample list, and so we just chain our samples along our previous stages by calling collect samples. + ''' + return self.collect_samples() + + def collect_samples(self): + ''' + Returns a list of all sample tuples that this stage must process by taking the union of all previous stages. + ''' + samples = set() + for previous_stage in self.previous_stages: + samples |= set(previous_stage.get_samples()) + return samples + + def setup_workflow(self, workflowRunner): + ''' + If the skip param for this stage is set to true, we don't add its workflow. + ''' + try: + skip = self.params.self.optional.skip + except AttributeError: + self.workflow(workflowRunner) + else: + if skip: + self.get_dependencies=lambda x: [] + else: + self.workflow(workflowRunner) + + def collect_input(self, sample, file_type, as_list=False): + ''' + For input, we transparently return either a single instance or a list of 2 or more instances, depending on what we have. 
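+        With as_list=True a single match is still wrapped in a list, duplicates reaching this stage
+        through multiple upstream paths (e.g. passthroughs) are collapsed, and None is returned when
+        no previous stage provides the requested file_type.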
+ ''' + results = [] + for previous_stage in self.previous_stages: + try: + stage_results = previous_stage.get_output(sample)[file_type] + if isinstance(stage_results, list): + results.extend(stage_results) + elif isinstance(stage_results, str): + results.append(stage_results) + elif isinstance(stage_results, unicode): + results.append(str(stage_results)) + else: + raise TypeError('Input for {} received neither a string or list: {}'.format(self.identifier, stage_results)) + except (KeyError,TypeError): + continue + if len(results) > 1: + results = list(set(results)) #we want to remove duplicates that got to this point through multiple paths (e.g., passthroughs) + if len(results) > 1 or as_list: + return results + else: + return results[0] + if len(results) == 1: + if as_list: + return results + else: + return results[0] + else: + return None + + def collect_dependencies(self, sample): + dependencies = [] + for previous_stage in self.previous_stages: + new_dependencies = previous_stage.get_dependencies(sample) + if isinstance(new_dependencies, list): + dependencies.extend(new_dependencies) + elif isinstance(new_dependencies, str): + dependencies.append(new_dependencies) + elif isinstance(new_dependencies, unicode): #python 2 = derp + dependencies.append(str(new_dependencies)) + else: + raise TypeError('Dependencies for {} received neither a string or list: {}'.format(self.identifier, new_dependencies)) + return dependencies + + +class Bcl2FastQRunner(ModularRunner): + """ + Produces fastqs for the samples in your samplesheet. Built-in parameter turns off lane splitting, as the framework makes the assumption + one sample = one set of fastqs. + Input: none explicitly! + Output: fastq + """ + def get_samples(self): + sample_id_map = {} + samples = [] + for line in self.sample_sheet.get("Data"): + if line.get("Sample_ID") == '' and line.get("Sample_Name") == '': + continue + sample = line.get("Sample_ID") + if sample not in sample_id_map: + #samples.append(SampleTuple(sample, self.sample_sheet.sample_id_to_sample_name(sample))) + samples.append(SampleTuple(sample, self.sample_sheet.unique_sample_name_map[sample])) + sample_id_map[sample] = True + return samples + + + def get_output(self, sample): + """ + Limitations: bcl2fastq2 has insanely varied output given the input. Currently, we assume that sample_index is consecutively numeric. 
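+        With --no-lane-splitting in effect, the paths produced here follow the pattern
+        <output_dir>/<Sample_Name>_S<index>_R1_001.fastq.gz (and _R2_), one pair per sample.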
+ #TODO: detect paired end automatically + """ + sample_first_instance = {} + samples_to_return = [] + for line in self.sample_sheet.get("Data"): + if line.get("Sample_ID") == '' and line.get("Sample_Name") == '': + continue + if sample.id == line.get("Sample_ID"): + if not sample in sample_first_instance: + sample_first_instance[sample] = line.data_i + else: #if no_lane_splitting=True this is needed + continue + #if line.has("Lane"): #this is not necessary when no_lane_splitting=True + # samples_to_return.append("{path}/{sample_name}_S{sample_index}_L{lane:03d}_R1_001.fastq.gz".format( + # path=self.params.self.output_dir, sample_name=line.get("Sample_Name"), sample_index=sample_first_instance[sample], lane=int(line.get("Lane")))) + # samples_to_return.append("{path}/{sample_name}_S{sample_index}_L{lane:03d}_R2_001.fastq.gz".format( + # path=self.params.self.output_dir, sample_name=line.get("Sample_Name"), sample_index=sample_first_instance[sample], lane=int(line.get("Lane")))) + #else: #no lane information + samples_to_return.append("{path}/{sample_name}_S{sample_index}_R1_001.fastq.gz".format( + path=self.params.self.output_dir, sample_name=line.get("Sample_Name"), sample_index=sample_first_instance[sample])) + samples_to_return.append("{path}/{sample_name}_S{sample_index}_R2_001.fastq.gz".format( + path=self.params.self.output_dir, sample_name=line.get("Sample_Name"), sample_index=sample_first_instance[sample])) + print 'z' + print samples_to_return + print 'z' + return {'fastq': samples_to_return} + + def get_dependencies(self, sample): + return self.bcl2fastq_task + + def workflow(self, workflowRunner): + if hasattr(self.params.self, 'args'): + args = " " + self.params.self.optional.args + if "no-lane-splitting" not in args: + args+=" --no-lane-splitting" + else: + args = '--no-lane-splitting' + bcl2fastq_wf = Bcl2Fastq(self.params.bcl2fastq_path, self.params.sample_path, self.params.self.output_dir, self.params.sample_sheet, args=args, max_job_cores=self.get_core_count(16)) + self.bcl2fastq_task = workflowRunner.addWorkflowTask(self.identifier, bcl2fastq_wf) + +class RSEMRunner(ModularRunner): + """ + Currently, the RSEM runner only goes from fastq to quants. + TODOs: + -Add support for chaining based on the RSEM bam file + -Add support for running from bams + Input: fastqs. The fastqs are assumed to contain r1/r2 information. + Output: .gene.results files. + """ + + def get_output(self, sample): + return {'rsem': os.path.join(self.params.self.output_dir,sample.name+".genes.results"), + 'rsem_model': os.path.join(self.params.self.output_dir,sample.name+".stat",sample.name+".model"), + 'transcript_bam': os.path.join(self.params.self.output_dir,sample.name+".transcript.bam")} + + def define_optionals(self): + return {'args': ''} + + def workflow(self, workflowRunner): + self.task = defaultdict(list) + if not os.path.exists(self.params.self.output_dir): + os.makedirs(self.params.self.output_dir) + cores = self.get_core_count(16) + for sample in self.collect_samples(): + sample_name = sample.name + fastq_files = self.collect_input(sample, 'fastq', as_list=True) + fastq_files = organize_fastqs(fastq_files, self.sample_sheet.is_paired_end()) + if self.sample_sheet.is_paired_end(): + workflowRunner.flowLog(fastq_files) + rsem_command = "{rsem_path} --paired-end {r1} {r2} {reference} {sample_name} --star -p {cores} --star-path {star_path} --star-gzipped-read-file --temporary-folder {temp_folder}".format( #I have no idea why we need to specify a directory below reference. So weird. 
+ rsem_path=self.params.rsem_path, + r1=",".join(fastq_files[0]), + r2=",".join(fastq_files[1]), + reference=self.params.rsem_annotation, + sample_name=os.path.join(self.params.self.output_dir,sample_name), + star_path=self.params.star_path[:-5], #-5 due to the star_path variable having the /STAR suffix, which rsem doesn't want + genome_dir=self.params.genome, + cores=cores, + temp_folder=os.path.join(self.params.scratch_path,sample.id)) + else: + rsem_command = "{rsem_path} {r1} {reference} {sample_name} --star -p {cores} --star-path {star_path} --star-gzipped-read-file --temporary-folder {temp_folder}".format( #I have no idea why we need to specify a directory below reference. So weird. + rsem_path=self.params.rsem_path, + r1=",".join(fastq_files), + reference=self.params.rsem_annotation, + sample_name=os.path.join(self.params.self.output_dir,sample_name), + star_path=self.params.star_path[:-5], #-5 due to the star_path variable having the /STAR suffix, which rsem doesn't want + genome_dir=self.params.genome, + cores=cores, + temp_folder=os.path.join(self.params.scratch_path,sample.id)) + rsem_command += ' ' + self.params.self.optional.args + workflowRunner.flowLog(rsem_command) + workflowRunner.flowLog("output: {}".format(self.get_output(sample))) + dependencies = self.collect_dependencies(sample) + self.task[sample].append(workflowRunner.addTask('{}_{}'.format(self.identifier, sample.id), rsem_command, dependencies=dependencies, nCores=cores, memMb=self.get_memory_count(1024*100))) + +class BWARunner(ModularRunner): + ''' + Runs BWA. + Input: fastq + Output: bam + ''' + def get_output(self, sample): + return {'bam': os.path.join(self.params.self.output_dir, sample.name, sample.name+".raw.bam")} + + def define_optionals(self): + return {'args': ''} + + def workflow(self, workflowRunner): + self.task = defaultdict(list) + if not os.path.exists(self.params.self.output_dir): + os.makedirs(self.params.self.output_dir) + cores = self.get_core_count(20) + mem = self.get_memory_count(1024 * 127) + args = self.params.self.optional.args + for sample in self.collect_samples(): + sample_name = sample.name + dependencies = self.collect_dependencies(sample) + fastq_files = self.collect_input(sample, 'fastq', as_list=True) + fastq_files = organize_fastqs(fastq_files, self.sample_sheet.is_paired_end()) + if self.sample_sheet.is_paired_end(): + if len(fastq_files[0]) != 1 or len(fastq_files[1]) != 1: + raise NotImplementedError("bwa only supports one fastq per sample") + #put R1, then R2 in a list + fastq_files = [x for x in itertools.chain(*fastq_files)] + bwa_wf = BWAWorkflow(os.path.join(self.params.self.output_dir, sample_name), + self.params.bwa_path, self.params.samtools_path, self.params.genome+"/genome.fa", cores, mem, fastq_files, sample=sample.id, args=args) + else: + if len(fastq_files) != 1: + raise NotImplementedError("bwa only supports one fastq per sample: {}".format(fastq_files)) + bwa_wf = BWAWorkflow(os.path.join(self.params.self.output_dir, sample_name), + self.params.bwa_path, self.params.samtools_path, self.params.genome+"/genome.fa", cores, mem, fastq_files, sample=sample.id, args=args) + bwa_task = workflowRunner.addWorkflowTask('bwa_{}_{}'.format(self.identifier, sample.id), bwa_wf, dependencies=dependencies) + mv_task = workflowRunner.addTask('mv_{}_{}'.format(self.identifier, sample.id), "mv {} {}".format(os.path.join(self.params.self.output_dir, sample_name, "out.sorted.bam"), os.path.join(self.params.self.output_dir, sample_name, sample_name+".raw.bam")), 
dependencies=bwa_task, isForceLocal=True) + self.task[sample].append(workflowRunner.addTask('index_{}_{}'.format(self.identifier, sample.id), "{} index {}".format(self.params.samtools_path, os.path.join(self.params.self.output_dir,sample_name, sample_name+".raw.bam")), dependencies=mv_task)) + +class CommandLineRunner(ModularRunner): + ''' + The CommandLineRunner allows the execution of code which is not modularized for zippy. It uses a simple templating system to create + sample specific commands. To run command line runner, you need to specify: + -input_format and output_format: the filetypes you will require and hand off + -output from the stage path, how to get to the file outputs. This command can be templated to use the sample id (sample.id) or sample name (sample.name) + -command The general command you wish to run. This command can be templated to use the sample id (sample.id) or sample name (sample.name) for + per-sample command execution. It can also be templated with 'self.output' to put the output string into the command + -input_delimiter: in the case where there is more than one input sample + TODO: handle cases of non-per-sample execution (merge stages), or initial stages. + TODO: handle input + ''' + def get_output(self, sample): + return {self.params.self.output_format : os.path.join(self.params.self.output_dir, self.create_output_string(sample))} + + def create_output_string(self, sample): + sample_dict = {"sample.id": sample.id, + "sample.name": sample.name} + return sub_check_wilds(sample_dict, self.params.self.output) + + def create_command_string(self, sample, input_files): + sample_dict = {"sample.id": sample.id, + "sample.name": sample.name, + "self.output": create_output_string(sample)} + return sub_check_wilds(sample_dict, self.params.self.command) + + + def workflow(self, workflowRunner): + self.task = defaultdict(list) + if not os.path.exists(self.params.self.output_dir): + os.makedirs(self.params.self.output_dir) + cores = self.get_core_count(4) + mem = self.get_memory_count(1024 * 32) + for sample in self.collect_samples(): + dependencies = self.collect_dependencies(sample) + input_files = self.collect_input(sample, self.params.self.input_format) + custom_command = create_command_string(sample, input_files) + self.task[sample].append(workflowRunner.addTask('{}_{}'.format(self.identifier, sample.id), + custom_command, dependencies=dependencies, nCores=cores, memMb=mem)) + +class SubsampleBAMRunner(ModularRunner): + ''' + Uses samtools to sample a percentage of reads from an input bam. + Input: bam + Output: bam + ''' + def get_output(self, sample): + return {'bam': os.path.join(self.params.self.output_dir, sample.name+".sub.bam")} + + def workflow(self, workflowRunner): + """ + TODO: might the aligner return more than 1 BAM? 
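+        Note that the command is built as 'samtools view -s <random seed><fraction>', so
+        subsample_fraction is presumably meant to be given as a fraction such as '.25'
+        (value illustrative).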
+ """ + self.task = defaultdict(list) + if not os.path.exists(self.params.self.output_dir): + os.makedirs(self.params.self.output_dir) + for sample in self.collect_samples(): + dependencies = self.collect_dependencies(sample) + bam_file = self.collect_input(sample, 'bam') + new_bam_file = os.path.join(self.params.self.output_dir, sample.name+".sub.bam") + subsample_command = '{4} view -s {3}{0} -b {1} > {2}'.format( + self.params.self.subsample_fraction, bam_file, new_bam_file, random.randint(0,100000), self.params.samtools_path) + self.task[sample].append(workflowRunner.addTask('{}_{}'.format(self.identifier, sample.id), + subsample_command, dependencies=dependencies)) + +class BAMtoFASTQRunner(ModularRunner): + ''' + Uses samtools to convert a bam to a fastq.gz + TODO: currently paired-end only + Input: bam + Output: fastq + ''' + def get_output(self, sample): + return {'fastq': [os.path.join(self.params.self.output_dir, sample.name+"_R1_.fastq.gz"), os.path.join(self.params.self.output_dir, sample.name+"_R2_.fastq.gz")]} + + def workflow(self, workflowRunner): + """ + TODO: might the aligner return more than 1 BAM? + """ + self.task = defaultdict(list) + if not os.path.exists(self.params.self.output_dir): + os.makedirs(self.params.self.output_dir) + for sample in self.collect_samples(): + dependencies = self.collect_dependencies(sample) + bam_file = self.collect_input(sample, 'bam') + new_r1 = os.path.join(self.params.self.output_dir, sample.name+"_R1_.fastq") + new_r2 = os.path.join(self.params.self.output_dir, sample.name+"_R2_.fastq") + command = '{samtools} fastq -1 {r1} -2 {r2} {input_bam}'.format( + samtools=self.params.samtools_path, r1=new_r1, r2=new_r2, input_bam=bam_file) + fastq_task = workflowRunner.addTask('{}_{}'.format(self.identifier, sample.id), + command, dependencies=dependencies) + self.task[sample].append(workflowRunner.addTask('gzip1_{}_{}'.format(self.identifier, sample.id), + 'gzip -f {}'.format(new_r1), dependencies=fastq_task)) + self.task[sample].append(workflowRunner.addTask('gzip2_{}_{}'.format(self.identifier, sample.id), + 'gzip -f {}'.format(new_r2), dependencies=fastq_task)) + +class PicardAlignStatsRunner(ModularRunner): + """ + Produces a bunch of picard stats from bwa output. 
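+    CollectHsMetrics is only scheduled when both bait_intervals and target_intervals are set.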
Currently includes: + -CollectAlignmentSummaryMetrics + -CollectInsertSizeMetrics + -CollectHsMetrics + + """ + def get_output(self, sample): + return {'align_stats': os.path.join(self.params.self.output_dir,"{}.stats.txt".format(sample.name)), + 'insert_stats': os.path.join(self.params.self.output_dir,"{}.insert.txt".format(sample.name)), + 'insert_plot': os.path.join(self.params.self.output_dir,"{}.insert.plot.pdf".format(sample.name))} + + def workflow(self, workflowRunner): + self.task = defaultdict(list) + if not os.path.exists(self.params.self.output_dir): + os.makedirs(self.params.self.output_dir) + mem = self.get_memory_count(24 * 1024) + for sample in self.collect_samples(): + sample_scratch = os.path.join(self.params.scratch_path, sample.name+self.identifier) + if not os.path.exists(sample_scratch): + os.makedirs(sample_scratch) + sample_name = sample.name + bam_file = self.collect_input(sample, 'bam') + dependencies = self.collect_dependencies(sample) + picard_command = "java -Xmx4G -Djava.io.tmpdir={} -jar {} CollectAlignmentSummaryMetrics R={}/genome.fa I={} O={} TMP_DIR={}".format( + sample_scratch, self.params.picard, self.params.genome, bam_file, os.path.join(self.params.self.output_dir,"{}.stats.txt".format(sample_name)), sample_scratch) + stats_task = workflowRunner.addTask('alignstats_{}_{}'.format(self.identifier, sample.id), picard_command, memMb=mem, dependencies=dependencies) + self.task[sample].append(stats_task) + picard_command = "java -Xmx4G -Djava.io.tmpdir={} -jar {} CollectInsertSizeMetrics I={} O={} H={} M=0.1 TMP_DIR={}".format( + sample_scratch, self.params.picard, bam_file, os.path.join(self.params.self.output_dir,"{}.insert.txt".format(sample_name)), os.path.join(self.params.self.output_dir,"{}.insert.plot.pdf".format(sample_name)), sample_scratch) + stats_task = workflowRunner.addTask('insertstats_{}_{}'.format(self.identifier, sample.id), picard_command, memMb=mem, dependencies=dependencies) + self.task[sample].append(stats_task) + if hasattr(self.params.self, 'target_intervals') and hasattr(self.params.self, 'bait_intervals'): + picard_command = "java -Xmx4G -Djava.io.tmpdir={} -jar {} CollectHsMetrics R={}/genome.fa I={} O={} TMP_DIR={} BAIT_INTERVALS={} TARGET_INTERVALS={}".format( + sample_scratch, self.params.picard, self.params.genome, bam_file, os.path.join(self.params.self.output_dir,"{}.stats.txt".format(sample_name)), sample_scratch, self.params.self.optional.bait_intervals, self.params.self.optional.target_intervals) + stats_task = workflowRunner.addTask('hsstats_{}_{}'.format(self.identifier, sample.id), picard_command, memMb=mem, dependencies=dependencies) + self.task[sample].append(stats_task) + +class MarkDuplicatesRunner(ModularRunner): + """ + Runs picard markduplicates. Currently fixed java heap size to 8GB, as it seems to fail with smaller heap sizes + Input/output are both bams. + """ + @passthrough('starlog') + def get_output(self, sample): + ''' + Passes through starlog if available. 
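+        For example, when an upstream STAR stage supplies a log, the returned map may look like
+        {'bam': '<out>/<sample>.dedup.bam', 'starlog': '<out>/<sample>Log.final.out'} (paths illustrative).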
+ ''' + return {'bam': os.path.join(self.params.self.output_dir,"{}.dedup.bam".format(sample.name))} + + def workflow(self, workflowRunner): + self.task = defaultdict(list) + if not os.path.exists(self.params.self.output_dir): + os.makedirs(self.params.self.output_dir) + mem = self.get_memory_count(50 * 1024) + for sample in self.collect_samples(): + sample_name = sample.name + bam_file = self.collect_input(sample, 'bam') + dependencies = self.collect_dependencies(sample) + sample_scratch = os.path.join(self.params.scratch_path, sample.name+self.identifier) + if self.params.self.optional.use_mate_cigar: + function = 'MarkDuplicatesWithMateCigar' + else: + function = 'MarkDuplicates' + picard_command = "java -Xmx8G -Djava.io.tmpdir={} -jar {} {} I={} O={} M={} TMP_DIR={}".format( + sample_scratch, self.params.picard, function, bam_file, os.path.join(self.params.self.output_dir,"{}.dedup.bam".format(sample_name)), + os.path.join(self.params.self.output_dir,"{}.dedup.stats.txt".format(sample_name)), sample_scratch) + dedup_task = workflowRunner.addTask('{}_{}'.format(self.identifier, sample.id), picard_command, memMb=mem, dependencies=dependencies) + self.task[sample].append(workflowRunner.addTask('index_{}_{}'.format(self.identifier, sample.id), "{} index {}".format(self.params.samtools_path, os.path.join(self.params.self.output_dir, "{}.dedup.bam".format(sample_name))), dependencies=dedup_task)) + + +class MACSRunner(ModularRunner): + ''' + Calls macs for peak detection. Defaults to --format BAMPE unless arguments are specified. + Input: bams + Output: none consumable by ZIPPY + ''' + def get_output(self, sample): + return None + + def define_optionals(self): + return {'args': '-g hs --format BAMPE'} + + def workflow(self, workflowRunner): + self.task = defaultdict(list) + if not os.path.exists(self.params.self.output_dir): + os.makedirs(self.params.self.output_dir) + cores = self.get_core_count(6) + mem = self.get_memory_count(32 * 1024) + for sample in self.collect_samples(): + sample_name = sample.name + bam_file = self.collect_input(sample, 'bam') + dependencies = self.collect_dependencies(sample) + macs_command = "{} callpeak --name {} --treatment {} --outdir {}".format( + self.params.macs_path, sample_name, bam_file, self.params.self.output_dir) + macs_command += ' ' + self.params.self.optional.args + workflowRunner.flowLog(macs_command) + task = workflowRunner.addTask('{}_{}'.format(self.identifier, sample.id), macs_command, nCores=cores, memMb=mem, dependencies=dependencies) + self.task[sample].append(task) + +class DataRunner(ModularRunner): + ''' + Datarunner loads all the files in its given directory, and provides it as output. It assumes that sample names will be in the name of relevant files. + It performs no workflow. + + To get its samples, you must either define self.params.self.samples, a list of samples to load, or self.params.self.sample_sheet, a csv sample sheet that has the + information about what samples to load. + For file types that need to be mapped to ZIPPY types, you can use self.params.self.optional.type_map, which is a map from raw file types to types that zippy expects. + For example, for rsem files going to edger, rsem produces '.genes.results' files, but the proper ZIPPY type is 'rsem'. 
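+    In the compiled json that could be written as, e.g., "type_map": {"genes.results": "rsem"};
+    keys are matched as filename suffixes.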
+ TODO: it should have an option to look recursively through a directory + ''' + + def get_samples(self): + if hasattr(self.params.self, 'sample_sheet'): + sample_sheet = SampleSheet(self.params.self.optional.sample_sheet) + sample_id_map = {} + samples = [] + for line in sample_sheet.get("Data"): + if line.get("Sample_ID") == '' and line.get("Sample_Name") == '': + continue + sample = line.get("Sample_ID") + if sample not in sample_id_map: + samples.append(SampleTuple(sample, sample_sheet.sample_id_to_sample_name(sample))) + sample_id_map[sample] = True + return samples + else: + for sample_name in self.params.self.samples: + check_valid_samplename(sample_name) + return [SampleTuple(i,x) for (i,x) in enumerate(self.params.self.samples)] + + def type_map_match(self, fname): + type_map = self.params.self.optional.type_map + for (raw_type, zippy_type) in type_map.iteritems(): + if fname.endswith(raw_type): + return zippy_type + return False + + def get_output(self, sample): + output_map = {} + #print os.path.join(self.params.self.output_dir, '*') + #print glob.glob(os.path.join(self.params.self.output_dir, '*')) + for fname in glob.glob(os.path.join(self.params.self.output_dir, '*')): + sample_name = sample.name + if sample_name in fname: + file_split = fname.split('.') + if hasattr(self.params.self, 'type_map') and self.type_map_match(fname) != False: + file_type = self.type_map_match(fname) + elif file_split[-1] == 'gz': + file_type = file_split[-2] + else: + file_type = file_split[-1] + if file_type in output_map: + if not isinstance(output_map[file_type], list): + output_map[file_type] = [output_map[file_type]] + output_map[file_type].append(fname) + else: + output_map[file_type] = fname + return output_map + + def get_dependencies(self, sample): + return [] + + def workflow(self, workflowRunner): + pass + +class IndelRealignmentRunner(ModularRunner): + ''' + Needs to be tested! + ''' + def get_output(self, sample): + base_file_name = os.path.basename(self.collect_input(sample.name, 'bam')) + return {'bam': os.path.join(self.params.self.output_dir,base_file_name)} + + def workflow(self, workflowRunner): + self.task = defaultdict(list) + if not os.path.exists(self.params.self.output_dir): + os.makedirs(self.params.self.output_dir) + cores = self.get_core_count(6) + mem = self.get_memory_count(12 * 1024) + for sample in self.collect_samples(): + bam_file = self.collect_input(sample, 'bam') + base_file_name = os.path.basename(self.collect_input(sample, 'bam')) + dependencies = self.collect_dependencies(sample) + realign_command = "source {mono_source}; {mono} {RI} -bamFiles {bam} -genomeFolders {genome} -outFolder \ + {outFolder}".format(mono_source = self.params.mono_source, mono = self.params.mono, RI = self.params.indelRealign, \ + bam = bam_file, genome = self.params.genome, outFolder=self.params.self.output_dir) + ri_task = workflowRunner.addTask('{}_{}'.format(self.identifier, sample.id), realign_command, nCores=cores, memMb=mem, dependencies=dependencies) + self.task[sample].append(workflowRunner.addTask('index_{}_{}'.format(self.identifier, sample.id), "{} index {}".format(self.params.samtools_path, os.path.join(self.params.self.output_dir, base_file_name)), dependencies=ri_task)) + + +class PiscesRunner(ModularRunner): + ''' + Runs the pisces variant caller. + Note: Pisces takes as input a directory, so all the samples are processed at once. + TODO: gvcf False support, better cores support, per-sample mode. 
+ Input: bam + Output: vcf + ''' + def get_output(self, sample): + base_file_name = os.path.basename(self.collect_input(sample, 'bam')) + out_file_name = '.'.join(base_file_name.split('.')[:-1])+'.genome.vcf' + return {'vcf': os.path.join(self.params.self.output_dir,out_file_name)} + + def get_dependencies(self, sample): + return self.task + + def define_optionals(self): + return {'args': ''} + + def workflow(self, workflowRunner): + if not os.path.exists(self.params.self.output_dir): + os.makedirs(self.params.self.output_dir) + mem = self.get_memory_count(32 * 1024) + cores = self.get_core_count(8) + args = self.params.self.optional.args + dependencies = [] + for sample in self.collect_samples(): + bam_file = self.collect_input(sample, 'bam') + dependencies.extend(self.collect_dependencies(sample)) + bam_dir = os.path.normpath(os.path.join(bam_file, '..')) + command = "{dotnet} {pisces} -BAMFolder {input_path} -G {genome} -OutFolder {output_path} {args}".format( + dotnet=self.params.dotnet, pisces=self.params.pisces_path, input_path=bam_dir, genome=self.params.genome, output_path=self.params.self.output_dir, args=args) + workflowRunner.flowLog(command) + self.task = workflowRunner.addTask('pisces_{}_{}'.format(self.identifier, sample.id), command, dependencies=dependencies, nCores=cores, memMb=mem) + + +class StarRunner(ModularRunner): + ''' + Runs the STAR aligner. Uses a minimally useful set of default parameters. Further parameters can be + passed via the command line using the 'args' param. + Input: fastq + Output: a sorted, indexed bam. + ''' + def get_output(self, sample): + return {'bam': os.path.join(self.params.self.output_dir,'{}.raw.bam'.format(sample.name)), + 'starlog': os.path.join(self.params.self.output_dir,'{}Log.final.out'.format(sample.name))} + + def define_optionals(self): + return {'args': ''} + + def workflow(self, workflowRunner): + self.task = defaultdict(list) + if not os.path.exists(self.params.self.output_dir): + os.makedirs(self.params.self.output_dir) + cores = self.get_core_count(16) + mem = self.get_memory_count(12 * 1024) + args = self.params.self.optional.args + for sample in self.collect_samples(): + fastq = self.collect_input(sample, 'fastq') + dependencies = self.collect_dependencies(sample) + star_wf = SingleStarFlow(self.params.star_path, self.params.star_index, sample.name, fastq, self.params.self.output_dir, + max_job_cores=cores, tmp_path=os.path.join(self.params.scratch_path, 'star{}'.format(sample.name)), command_args=args) + self.task[sample].append(workflowRunner.addWorkflowTask('star_{}_{}'.format(self.identifier, sample.id), star_wf, dependencies=dependencies)) + + +class FastQCRunner(ModularRunner): + ''' + Runs fastqc. + Input: fastq + Output: none consumable by ZIPPY + ''' + def get_output(self, sample): + pass + + def workflow(self, workflowRunner): + self.task = defaultdict(list) + if not os.path.exists(self.params.self.output_dir): + os.makedirs(self.params.self.output_dir) + for sample in self.collect_samples(): + fastq_files = self.collect_input(sample, 'fastq') + dependencies = self.collect_dependencies(sample) + command = "{} {} -o {}".format(self.params.fastqc_path, ' '.join(fastq_files), self.params.self.output_dir) + self.task[sample].append(workflowRunner.addTask('fastqc_{}_{}'.format(self.identifier, sample.id), command, dependencies=dependencies, memMb=self.get_memory_count(8*1024))) + +class MergeBamRunner(ModularRunner): + ''' + Uses samtools merge to combine a set of bams. 
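+    Only bams whose sample name appears in params.self.samples are merged; the merged bam is indexed
+    and exposed downstream as a single sample named after this stage's identifier.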
This is our first merge stage, and hence is currently considered experimental. + Currently, it takes as input a list of sample names, so it does not explicitly depend on the samplesheet. + TODO: make a merge be many-to-many, instead of many-to-one. This would involve taking in a list of lists as input. + TODO: currently we merge to 1 by only returning get_output for our first sample name. The right way to do this is to modify + the sample information downstream stages see. So we must decouple our pipeline from the samplesheet. Which is probably a good idea anyway. No offense, Isas. + ''' + def get_samples(self): + return [SampleTuple('1', self.identifier)] + + def get_output(self, sample): + return {'bam': os.path.join(self.params.self.output_dir,'{}.merged.bam'.format(self.identifier))} + + def get_dependencies(self, sample): + return self.task + + def workflow(self, workflowRunner): + bams = [] + dependencies = [] + if not os.path.exists(self.params.self.output_dir): + os.makedirs(self.params.self.output_dir) + for sample in self.collect_samples(): + if sample.name not in self.params.self.samples: + continue + bams.append(self.collect_input(sample, 'bam')) + dependencies.extend(self.collect_dependencies(sample)) + output_file_path = os.path.join(self.params.self.output_dir,'{}.merged.bam'.format(self.identifier)) + merge_command = '{} merge -f {} {}'.format(self.params.samtools_path, output_file_path, " ".join(bams)) + merge_task = workflowRunner.addTask('mergebam_{}'.format(self.identifier), merge_command, dependencies=dependencies) + index_command = '{} index {}'.format(self.params.samtools_path, output_file_path) + self.task = [workflowRunner.addTask('indexbam_{}'.format(self.identifier), index_command, dependencies=merge_task)] + +class RNAQCRunner(ModularRunner): + ''' + @TODO: make literally everything optional + Produces RNA-seq stats for your run. Tested with star. 
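+    When an upstream stage provides a 'starlog' or 'rsem_model' file it is forwarded to rna_stats.py,
+    as are the manifest_bed and dup_stats parameters when set.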
+ Input: bam + ''' + def get_output(self, sample): + pass + + def workflow(self, workflowRunner): + self.task = defaultdict(list) + if not os.path.exists(self.params.self.output_dir): + os.makedirs(self.params.self.output_dir) + for sample in self.collect_samples(): + workflowRunner.flowLog('sample {}'.format(sample)) + dependencies = self.collect_dependencies(sample) + bam_path = self.collect_input(sample, 'bam') + rsem_model_path = self.collect_input(sample, 'rsem_model') + sample_scratch = os.path.join(self.params.scratch_path, sample.name+self.identifier) + if not os.path.exists(sample_scratch): + os.makedirs(sample_scratch) + starlog_path = self.collect_input(sample, 'starlog') + if starlog_path is None: #we did not use star, so we can't get the star stats + command = "{python} rna_stats.py {bam_path} {stat_path}/{sample_name}.summary.txt --ribosome_bed {ribosome_bed} --intron_bed {intron_bed} --temp_path {temp} ".format( + python=self.params.python, bam_path=bam_path, stat_path=self.params.self.output_dir, sample_name=sample.name, ribosome_bed=self.params.ribosome_bed, intron_bed=self.params.intron_bed, temp=sample_scratch+'a.out') + else: + command = "{python} rna_stats.py {bam_path} {stat_path}/{sample_name}.summary.txt --ribosome_bed {ribosome_bed} --intron_bed {intron_bed} --starlog_path {starlog_path} --temp_path {temp}".format( + python=self.params.python, bam_path=bam_path, stat_path=self.params.self.output_dir, sample_name=sample.name, ribosome_bed=self.params.ribosome_bed, intron_bed=self.params.intron_bed, starlog_path=starlog_path, temp=sample_scratch+'a.out') + if hasattr(self.params, 'manifest_bed'): + command += " --manifest_bed {}".format(self.params.optional.manifest_bed) + if hasattr(self.params.self, 'dup_stats') and self.params.self.optional.dup_stats: + command += " --dup_stats" + if rsem_model_path is not None: + command += " --rsem_model {}".format(rsem_model_path) + self.task[sample].append(workflowRunner.addTask('stats_combine_{}'.format(sample.id), command, memMb=40*1024, dependencies=dependencies)) + workflowRunner.flowLog(command) + + +class SalmonRunner(ModularRunner): + """ + Input: fastqs. The fastqs are assumed to contain r1/r2 information. + Output: .gene.results files. 
+ """ + def get_output(self, sample): + #TODO + return {'rsem': os.path.join(self.params.self.output_dir,sample.name+".genes.results")} + + def workflow(self, workflowRunner): + self.task = defaultdict(list) + if not os.path.exists(self.params.self.output_dir): + os.makedirs(self.params.self.output_dir) + cores = self.get_core_count(16) + mem = self.get_memory_count(100 * 1024) + for sample in self.collect_samples(): + sample_name = sample.name + fastq_files = self.collect_input(sample, 'fastq') + if self.sample_sheet.is_paired_end(): + r1_files = [x for x in fastq_files if '_R1_' in x] + r2_files = [x for x in fastq_files if '_R2_' in x] + salmon_command = "{salmon_path} quant -i {salmon_index} -l A -1 {r1} -2 {r2} -o {output_path} --numThreads {cores}".format( + salmon_path=self.params.salmon_path, + r1=" ".join(r1_files), + r2=" ".join(r2_files), + reference=self.params.salmon_index, + output_path=self.params.self.output_dir) + else: + salmon_command = "{salmon_path} quant -i {salmon_index} -l A -1 {r1} -o {output_path} --numThreads {cores}".format( + salmon_path=self.params.salmon_path, + r1=" ".join(fastq_files), + reference=self.params.salmon_index, + output_path=self.params.self.output_dir) + if hasattr(self.params.self, 'args'): + salmon_command += ' ' + self.params.self.optional.args + workflowRunner.flowLog(salmon_command) + dependencies = self.collect_dependencies(sample) + self.task[sample].append(workflowRunner.addTask('{}_{}'.format(self.identifier, sample.id), salmon_command, dependencies=dependencies, nCores=cores, memMb=mem)) + +class StrelkaRunner(ModularRunner): + """ + Strelka call variants from either tumor/normal or germline samples. To select the mode of strelka, you must specify is_somatic as true or false. + Input: bam + Output: vcf + """ + def get_output(self, sample): + if self.params.self.is_somatic: + return {'vcf': os.path.join(self.params.self.output_dir, sample.name+".somatic.vcf.gz")} + else: + return {'vcf': os.path.join(self.params.self.output_dir, sample.name+".germline.vcf.gz")} + + def define_optionals(self): + return {'args': ''} + + def workflow(self, workflowRunner): + self.task = defaultdict(list) + if not os.path.exists(self.params.self.output_dir): + os.makedirs(self.params.self.output_dir) + cores = self.get_core_count(16) + mem = self.get_memory_count(1024 * 32) + args = self.params.self.optional.args + for sample in self.collect_samples(): + #we do this because strelka gets very upset if your directory already exists. So if you stop your pipeline between strelka configure + #and strelka run, you would get stuck with an error. 
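+            #A fresh 6-character suffix keeps the per-sample scratch dir unique across reruns, e.g.
+            #<scratch_path>/<sample>4k2x9q (suffix illustrative); the dir is deleted once results are copied out.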
+ random_id = ''.join(numpy.random.choice(list(string.digits+string.ascii_lowercase), 6)) + scratch_path = os.path.join(self.params.scratch_path, sample.name+random_id) + if os.path.exists(os.path.join(scratch_path, 'runWorkflow.py')): + os.remove(os.path.join(scratch_path, 'runWorkflow.py')) + if not os.path.exists(scratch_path): + os.makedirs(scratch_path) + dependencies = self.collect_dependencies(sample) + bam_file = self.collect_input(sample, 'bam') + workflowRunner.flowLog('Strelka bams for sample {}: {}'.format(sample.name, bam_file)) + if self.params.self.is_somatic: + strelka_config = '{} {}/bin/configureStrelkaSomaticWorkflow.py'.format(self.params.python, self.params.strelka_path) + assert len(bam_file) == 2 + if hasattr(self.params.self, 'normal_pattern'): + normal_matches = [x for x in bam_file if self.params.self.optional.normal_pattern in x] + tumor_matches = [x for x in bam_file if not self.params.self.optional.normal_pattern in x] + elif hasattr(self.params.self, 'tumor_pattern'): + normal_matches = [x for x in bam_file if not self.params.self.optional.tumor_pattern in x] + tumor_matches = [x for x in bam_file if self.params.self.optional.tumor_pattern in x] + else: + raise AttributeError('For somatic strelka, either "normal_pattern" or "tumor_pattern" must be defined. This argument contains a string that identifies the normal/tumor sample respectively.') + if len(normal_matches)!=1 or len(tumor_matches)!=1: + raise AttributeError('Pattern {} was unable to differentiate bam files: {}'.format(self.params.self.optional.normal_pattern, bam_file)) + bam_string = "--normalBam {} --tumorBam {}".format(normal_matches[0], tumor_matches[0]) + else: + strelka_config = '{} {}/bin/configureStrelkaGermlineWorkflow.py'.format(self.params.python, self.params.strelka_path) + bam_string = '--bam {}'.format(bam_file) + configure_command = '{strelka_config} {bam_string} \ + --referenceFasta={genome}/genome.fa --runDir={scratch_path} --callMemMb={mem} {args}'.format( + strelka_config=strelka_config, bam_string=bam_string, genome=self.params.genome, + scratch_path=scratch_path, mem=mem, args=args) + sub_task = workflowRunner.addTask('configure_{}_{}'.format(self.identifier, sample.id), + configure_command, dependencies=dependencies, isForceLocal=True) + strelka_command = '{python} {scratch_path}/runWorkflow.py -m local -j {core_count}'.format( + python=self.params.python, scratch_path=scratch_path, core_count=cores) + strelka_task = workflowRunner.addTask('{}_{}'.format(self.identifier, sample.id), + strelka_command, dependencies=sub_task, nCores=cores) + if self.params.self.is_somatic: + module_dir = os.path.abspath(os.path.dirname(__file__)) + merge_path = os.path.join(module_dir, 'vcf_merge.py') + merge_command = "{python} {merge_path} {scratch_path}/somatic.snvs.vcf.gz {scratch_path}/somatic.indels.vcf.gz {output_dir}/{sample_name}.somatic.vcf.gz".format( + python=self.params.python, scratch_path=os.path.join(scratch_path, 'results', 'variants'), output_dir=self.params.self.output_dir, sample_name=sample.name, + merge_path=merge_path) + move_task = workflowRunner.addTask('merge_{}_{}'.format(self.identifier, sample.id), + merge_command, dependencies=strelka_task, nCores=1, memMb=4*1024) + else: + move_command = "cp {scratch_path}/variants.vcf.gz {output_dir}/{sample_name}.germline.vcf.gz".format( + scratch_path=os.path.join(scratch_path, 'results', 'variants'), output_dir=self.params.self.output_dir, sample_name=sample.name) + move_task = 
workflowRunner.addTask('move_{}_{}'.format(self.identifier, sample.id), + move_command, dependencies=strelka_task, nCores=1, memMb=4*1024, isForceLocal=True) + self.task[sample].append(workflowRunner.addTask('clean_temp_{}_{}'.format(self.identifier, sample.id), + "rm -r {}".format(scratch_path), dependencies=move_task, isForceLocal=True)) + + +class EdgerRunner(ModularRunner): + ''' + This is a merge stage that runs edger to perform differential expression analysis. + Requires sample_groups, a two element list. Each element of the list is a list of sample names of one group of the analysis. The sample + names can also be wildcards matching unix filename wildcard syntax. + ''' + def get_samples(self): + return [SampleTuple('1', self.identifier)] + + def get_output(self, sample): + pass + #return {'rsem': os.path.join(self.params.self.output_dir,sample.name+".genes.results")} + + def get_dependencies(self, sample): + return self.task + + def sample_in_group(self, sample_name, sample_group): + for group_pattern in sample_group: + if fnmatch.fnmatch(sample_name, group_pattern): + return True + return False + + def workflow(self, workflowRunner): + rsems_by_group = [ [] , [] ] #it's... an owl? + dependencies = [] + if not os.path.exists(self.params.self.output_dir): + os.makedirs(self.params.self.output_dir) + temp_path = os.path.join(self.params.scratch_path, self.identifier) + if not os.path.exists(temp_path): + os.makedirs(temp_path) + for sample in self.collect_samples(): + for (i,sample_group) in enumerate(self.params.self.sample_groups): + if not self.sample_in_group(sample.name, sample_group): + continue + rsems_by_group[i].append(self.collect_input(sample, 'rsem')) + dependencies.extend(self.collect_dependencies(sample)) + workflowRunner.flowLog('Edger groups: {}'.format(rsems_by_group)) + workflowRunner.flowLog('dependencies: {}'.format(dependencies)) + script_path = os.path.join(zippy_dir, 'run_edger.py') + output_file_path = os.path.join(self.params.self.output_dir,'{}.edger.out'.format(self.identifier)) + command = '{python} {script_path} --group1 {group1} --group2 {group2} --out_file {out_file} --temp_folder {temp_folder} --r_path {r_path}'.format( + group1=' '.join(rsems_by_group[0]), + group2=' '.join(rsems_by_group[1]), + out_file=output_file_path, + temp_folder=temp_path, + r_path=self.params.r_path, + python=self.params.python, + script_path=script_path) + self.task = [workflowRunner.addTask('edger_{}'.format(self.identifier), command, dependencies=dependencies)] + +class DeleteRunner(ModularRunner): + def get_output(self, sample): + pass + + def workflow(self, workflowRunner): + self.task = defaultdict(list) + for sample in self.collect_samples(): + files_to_delete = [] + dependencies = self.collect_dependencies(sample) + for file_type in self.params.self.file_types: + output_for_format = self.collect_input(sample, file_type) + if isinstance(output_for_format, list): + files_to_delete.extend(output_for_format) + else: + files_to_delete.append(output_for_format) + command = 'rm -f {}'.format(' '.join(files_to_delete)) + workflowRunner.flowLog(command) + self.task[sample].append(workflowRunner.addTask('delete_{}_{}'.format(self.identifier, sample.id), command, dependencies=dependencies)) + +class BloomSubsampleBAMRunner(ModularRunner): + ''' + Uses a bloom filter to sample a NUMBER of reads from an input bam. Bloom filter requires pybloomfilter-mmap package. 
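+    Unlike SubsampleBAMRunner, the target is an absolute read-pair count taken from params.self.reads
+    (e.g. 1000000; value illustrative) rather than a fraction.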
+ Input: bam + Output: bam + ''' + def get_output(self, sample): + return {'bam': os.path.join(self.params.self.output_dir, sample.name+".sub.bam")} + + def workflow(self, workflowRunner): + """ + TODO: might the aligner return more than 1 BAM? + """ + self.task = defaultdict(list) + if not os.path.exists(self.params.self.output_dir): + os.makedirs(self.params.self.output_dir) + cores = self.get_core_count(16) + for sample in self.collect_samples(): + dependencies = self.collect_dependencies(sample) + bam_file = self.collect_input(sample, 'bam') + new_bam_file = os.path.join(self.params.self.output_dir, sample.name+".sub.bam") + script_path = os.path.join(zippy_dir, 'downsampling_bloom.py') + subsample_command = '{python} {script_path} --bam {bam} --downsampled_pairs {count} --output {output_path} --threads {threads}'.format( + python=self.params.python, script_path=script_path, bam=bam_file, count=self.params.self.reads, output_path=new_bam_file, threads=cores) + mem = self.get_memory_count(1024 * 32) + sub_task = workflowRunner.addTask('{}_{}'.format(self.identifier, sample.id), + subsample_command, dependencies=dependencies, nCores=cores, memMb=mem) + self.task[sample].append(workflowRunner.addTask('index_{}_{}'.format(self.identifier, sample.id), + "{} index {}".format(self.params.samtools_path, os.path.join(self.params.self.output_dir, sample.name+".sub.bam")), dependencies=sub_task)) + +class Minimap2Runner(ModularRunner): + ''' + Runs miniMap2. Experimental! + Limitations: single char arg chaining (i.e., -ax instead of -a -x) is not supported. + args: + genome_filename (optional): override this to change the name of the genome (default: genome.fa) + Input: fastq + Output: if '-a' is in args, it produces sorted, indexed bams. Otherwise it produces pafs + ''' + def get_output(self, sample): + if self.make_bams: + return {'bam': os.path.join(self.params.self.output_dir, sample.name+".raw.bam")} + else: + return {'paf': os.path.join(self.params.self.output_dir, sample.name+".paf")} + + def define_optionals(self): + return {'genome_filename': 'genome.fa', 'args': ''} + + def workflow(self, workflowRunner): + self.task = defaultdict(list) + if not os.path.exists(self.params.self.output_dir): + os.makedirs(self.params.self.output_dir) + cores = self.get_core_count(20) + mem = self.get_memory_count(1024 * 127) + genome_suffix = self.params.self.optional.genome_filename + args = self.params.self.optional.args + if '-a' in args: + self.make_bams = True + for sample in self.collect_samples(): + sample_name = sample.name + dependencies = self.collect_dependencies(sample) + fastq_files = self.collect_input(sample, 'fastq', as_list=True) + fastq_files = organize_fastqs(fastq_files, self.sample_sheet.is_paired_end()) + if self.sample_sheet.is_paired_end(): + #if r1/r2 are separate, we wish to iterleave them: + #r1_1, r2_1, r1_2, r2_2 etc. This does that. 
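+                #e.g. ([r1_a, r1_b], [r2_a, r2_b]) becomes [r1_a, r2_a, r1_b, r2_b]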
+ fastq_files = [x for x in itertools.chain(*zip(*fastq_files))] + if self.make_bams: + #we pipe to bam and sort + unsorted_file = os.path.join(self.params.self.output_dir, sample.name+'.unsorted.bam') + sorted_file = os.path.join(self.params.self.output_dir, sample.name+'.raw.bam') + output_command = '| {samtools} view -u > '.format(samtools=self.params.samtools_path) + output_command += unsorted_file + output_command += ' && {samtools} sort -@ 32 -m 16G {unsorted} > {sorted}'.format( + samtools=self.params.samtools_path, unsorted=unsorted_file, sorted=sorted_file) + output_command += ' && rm {unsorted}'.format(unsorted=unsorted_file) + else: + output_command = ' > '+os.path.join(self.params.self.output_dir, sample.name+'.paf') + command = '{minimap2} {args} {ref} {fastq_files} {output_command}'.format( + minimap2=self.params.minimap2_path, args=args, + ref=os.path.join(self.params.genome, genome_suffix), + fastq_files=" ".join(fastq_files), output_command=output_command) + minimap_task = workflowRunner.addTask('minimap2_{}_{}'.format(self.identifier, sample.id), command, nCores=cores, memMb=mem, dependencies=dependencies) + if self.make_bams: + self.task[sample].append(workflowRunner.addTask('index_{}_{}'.format(self.identifier, sample.id), "{} index {}".format(self.params.samtools_path, os.path.join(self.params.self.output_dir,sample_name+".raw.bam")), dependencies=minimap_task)) + else: + self.task[sample].append(minimap_task) + +class NirvanaRunner(ModularRunner): + ''' + Uses Nirvana to annotate variants in a vcf file + ''' + def get_output(self, sample): + return {'json': os.path.join(self.params.self.output_dir, sample.name + ".json.gz"), + 'json_index': os.path.join(self.params.self.output_dir, sample.name + ".json.gz.jsi"), + 'vcf': os.path.join(self.params.self.output_dir, sample.name + ".vcf.gz")} + + def workflow(self, workflowRunner): + self.task = defaultdict(list) + if not os.path.exists(self.params.self.output_dir): + os.makedirs(self.params.self.output_dir) + + args = '' + if hasattr(self.params.self, 'args'): + args = self.params.self.optional.args + + for sample in self.collect_samples(): + dependencies = self.collect_dependencies(sample) + input_vcf = self.collect_input(sample, 'vcf') + output_path = os.path.join(self.params.self.output_dir, sample.name) + command = "{dotnet} {nirvana_path} -i {input_vcf} -c {nirvana_cache} -sd {nirvana_supplement} -r {nirvana_ref} -o {output_path} {args}".format( + dotnet=self.params.dotnet, + nirvana_path=self.params.nirvana_path, + input_vcf=input_vcf, + nirvana_cache=self.params.nirvana_cache, + nirvana_supplement=self.params.nirvana_supplement, + nirvana_ref=self.params.nirvana_ref, + output_path=output_path, + args=args) + mem = self.get_memory_count(1024 * 32) + self.task[sample].append(workflowRunner.addTask('{}_{}'.format(self.identifier, sample.id), + command, dependencies=dependencies, memMb=mem)) + +class PrimerDimerMinerRunner(ModularRunner): + ''' + Runs the included primer dimer miner script. + Detects reads with more than one of a provided list of primers. + Input: fastq + Output: None exposed in ZIPPY. 
Produces a file containing pairs of primers that seem to be dimerizing + ''' + def get_output(self, sample): + return {} + + def define_optionals(self): + return {'kmer': 15} + + def workflow(self, workflowRunner): + self.task = defaultdict(list) + if not os.path.exists(self.params.self.output_dir): + os.makedirs(self.params.self.output_dir) + for sample in self.collect_samples(): + dependencies = self.collect_dependencies(sample) + script_path = os.path.join(zippy_dir, 'downsampling_bloom.py') + fastq_files = self.collect_input(sample, 'fastq', as_list=True) + pdm_command = '{python} {script_path} --probe_list {probe_list} --kmer {kmer}'.format( + python=self.params.python, script_path=script_path, probe_list=self.params.self.probe_list, + kmer=self.params.self.optional.kmer) + if len(fastq_files) > 2 or len(fastq_files) < 1: + raise NotImplementedError('PrimerDimerMiner must take 1 or 2 fastqs as input') + else: + for (i,x) in enumerate(fastq_files): + pdm_command+=' --input_file{} {}'.format(i+1, x) + self.task[sample].append(workflowRunner.addTask('{}_{}'.format(self.identifier, sample.id), + pdm_command, dependencies=dependencies)) \ No newline at end of file diff --git a/zippy/params.py b/zippy/params.py new file mode 100755 index 0000000..5878a74 --- /dev/null +++ b/zippy/params.py @@ -0,0 +1,298 @@ +''' +params.py contains the code to parse a compiled ZIPPY .json file. It performs several layers of functionality +compared to a pure json loader, such as setting up special keywords 'self' and 'optional', and filling in wildcards. +''' +from __future__ import print_function +import copy +import commentjson as json +import inspect +import sys +from .utils import sub_check_wilds +from .modular_runner import ModularRunner + +class ObjectDict(object): + ''' + It is required that all bound methods of ObjectDict be private (start with _) + Note: need to figure out how to deal with nested function names. Like an if not in dir(this object) or something? + ''' + def __init__(self, **kwargs): + self._self_flag = False + for (k,v) in kwargs.iteritems(): + setattr(self, k, v) + + def _lookup_identifier_doublestack(self): + try: + stack = inspect.stack() + parentframe = stack[2][0] + print(parentframe.f_locals['self']) + if isinstance(parentframe.f_locals['self'], ModularRunner): + return parentframe.f_locals['self'].identifier + else: + return None + except (AttributeError,KeyError): + return None + + + def _update_from_dict(self, d): + if self._self_flag: + identifier = self._lookup_identifier_doublestack() + if identifier is not None: + stage_params = object.__getattribute__(self, identifier) + stage_params._update_from_dict(d) + self._self_flag = False + else: + for (k,v) in d.iteritems(): + if not hasattr(self, k): + setattr(self, k, v) + + + def __getattribute__(self, attr): + ''' + When .self is encountered, a flag (self.self_mode) is set so that the next non-'self' getattribute will be in 'self mode'. + Thus, we first check in the params.self namespace, and then (if not in self mode) look in the broader namespace. + After every parameter lookup, it sets the self_mode flag to false + Downside: lookups to the params object are not threadsafe. This shouldn't be a big issue. 
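+        For illustration, from inside a stage whose identifier happens to be 'bwa' (an arbitrary example name):
+            params.self.output_dir   -> resolved only against the 'bwa' stage block; AttributeError if missing there
+            params.python            -> tries the 'bwa' stage block first, then falls back to the top-level parameters
+            params.self.optional.x   -> '.optional' is a pass-through at this level and does not change the lookup itself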
+ ''' + if attr == 'self': + self._self_flag = True + return self + elif attr == 'optional': + return self + elif attr[0] == '_': # no magic for private references + return object.__getattribute__(self, attr) + else: + try: + stack = inspect.stack() + parentframe = stack[1][0] + if isinstance(parentframe.f_locals['self'], ModularRunner): + identifier = parentframe.f_locals['self'].identifier + else: + self._self_flag = False + return object.__getattribute__(self, attr) + except (AttributeError,KeyError): # This handles other failures where the calling frame of reference is not a stage + self._self_flag = False + return object.__getattribute__(self, attr) + stage_params = object.__getattribute__(self, identifier) + try: #check in self.params.self + value = object.__getattribute__(stage_params, attr) + self._self_flag = False + return value + except AttributeError: + if not self._self_flag: # if we have not required self, we can check the top level + self._self_flag = False + return object.__getattribute__(self, attr) + else: + self._self_flag = False + raise AttributeError('Stage level parameter {} not found'.format(attr)) + + def __repr__(self): + return str(self.__dict__) + + +def _json_object_hook(d, fname, proto): + """ + Turns the dictionary into a simple object, emulating a Namespace object from argparse. + """ + if not proto and 'stages' in d: + for (i,stage) in enumerate(copy.copy(d['stages'])): + if stage['identifier'] in d: + raise KeyError('Stage identifier {} is the same as a parameter name. This is not allowed.'.format(stage['identifier'])) + print(stage) + d[stage['identifier']] = ObjectDict(**stage) + d['stages'][i] = d[stage['identifier']] + if 'docker' in d: + for docker in copy.copy(d['docker']).keys(): + d['docker'][docker] = ObjectDict(**d['docker'][docker]) + d['docker'] = ObjectDict(**d['docker']) + d['params'] = fname + return ObjectDict(**d) + +def merge_defaults(defaults, params): + """ + There's some subtlety to this. Currently, we merge over: + 1. All top-level keys + 2. Every stage matched by it's .stage + """ + for k in defaults.keys(): + if k not in params: + params[k] = defaults[k] + for (i,stage) in enumerate(copy.copy(params['stages'])): + if stage['stage'] not in defaults['stages']: + continue + for (k,v) in defaults['stages'][stage['stage']].iteritems(): + if k not in stage: + params['stages'][i][k] = v + return params + +def resolve_wildcards(wildcard_map, params): + new_params_map = {} + for (k, v) in params.iteritems(): + k = sub_check_wilds(wildcard_map, k) + if isinstance(v, dict): + v = resolve_wildcards(wildcard_map, v) + elif isinstance(v, list): + v = resolve_wildcards_list(wildcard_map, v) + elif isinstance(v, str) or isinstance(v, unicode): + v = sub_check_wilds(wildcard_map, v) + new_params_map[k] = v + return new_params_map + +def resolve_wildcards_list(wildcard_map, params_list): + ''' + Helper function for resolve wildcards that iterates over a list, not a dict. + ''' + new_members = [] + for x in params_list: + if isinstance(x, dict): + new_members.append(resolve_wildcards(wildcard_map, x)) + elif isinstance(x, list): + new_members.append(resolve_wildcards_list(wildcard_map, x)) + else: + new_members.append(sub_check_wilds(wildcard_map, x)) + return new_members + + +def old_get_params(fname, defaults_fname=None, proto=False): + """ + Returns an object with fields defined for every value in the json file. Hardcodes its own file location + to the 'params' entry. + proto: whether this is a prototype params file or not. 
If it is, then we do not unroll stages (because it is flat) + """ + with open(fname, 'rb') as f: + params = json.load(f) + if 'wildcards' in params: + params = resolve_wildcards(params['wildcards'], params) + if defaults_fname is not None: + with open(defaults_fname, 'rb') as def_f: + default_params = json.load(def_f) + params = merge_defaults(default_params, params) + params = _json_object_hook(params, fname, proto=proto) + return params + +def get_params(fname, proto=False): + ''' + To build a params object: + 0. Turn the json into a python object + 1. Resolve wildcards + 2. Build dockers + 3. Build stages. + 3a. Expand subworkflows + 3b. Turn stage_dict into an ObjectDict + 3c. Update stage_dict and docker_dict from stage environment + 4. Build global + 4a. Turn d into an ObjectDict + 4b. Update global environment + ''' + subworkflow_index = 0 + with open(fname, 'rb') as f: + d = json.load(f) + if proto: + return ObjectDict(**d) + if 'wildcards' in d: + d = resolve_wildcards(d['wildcards'], d) + if 'docker' in d: + for docker in copy.copy(d['docker']).keys(): + d['docker'][docker] = ObjectDict(**d['docker'][docker]) + d['docker'] = ObjectDict(**d['docker']) + if 'stages' in d: + stages = copy.deepcopy(d['stages']) + d['stages'] = [] + for stage in stages: + if 'subworkflow' in stage: + handle_subworkflow(stage, d, subworkflow_index) + subworkflow_index += 1 + elif 'identifier' in stage: + stage_dict = ObjectDict(**stage) + handle_stage(stage_dict, d) + else: + raise ValueError('Stage must either define identifier (and be a stage) or subworkflow (and be a subworkflow)') + d['params'] = fname + d = ObjectDict(**d) + if hasattr(d, 'environment'): + env = get_params(d.environment) + if hasattr(env, 'docker'): + d.docker._update_from_dict(env.docker.__dict__) + d._update_from_dict(env.__dict__) + return d + +def handle_stage(stage_dict, d): + ''' + Adds a stage to the params workflow representation, and uses the environment + file to fill in any missing parameters (if defined) + ''' + if stage_dict.identifier in d: + raise KeyError('Stage identifier {} is the same as a parameter name. This is not allowed.'.format(stage['identifier'])) + # set up an alias for the stage at params.stage_identifier. + d[stage_dict.identifier] = stage_dict + d['stages'].append(stage_dict) + if hasattr(stage_dict, 'environment'): + env = get_params(stage_dict.environment) + stage_dict._update_from_dict(getattr(env, stage_dict.identifier).__dict__) + if hasattr(env, 'docker'): + d['docker']._update_from_dict(env.docker.__dict__) + +def handle_subworkflow(stage, d, subworkflow_index): + ''' + Adds a subworkflow to the params representation by iteratively adding its stages. 
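+    For illustration, a subworkflow entry in the stages list looks like the one in
+    zippy/test/subworkflow_params_test.json:
+        {"subworkflow": "zippy/test/dummy_params_test.json",
+         "previous_stage": {"dummy": "super_dummy"},
+         "identifier": ""}
+    Each stage of the referenced workflow is imported with the subworkflow identifier (or its index, when
+    no identifier is given) appended to its identifier, previous_stage references and output_dir (unless
+    zippy_do_not_append_identifier_to_paths is set), and the previous_stage feed-ins are merged into the
+    matching stages' dependencies.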
+ ''' + subworkflow = get_params(stage['subworkflow']) + subworkflow_feedins = stage['previous_stage'] if 'previous_stage' in stage else {} + subworkflow_identifier = stage['identifier'] if 'identifier' in stage else subworkflow_index + for stage_dict in subworkflow.stages: + # we add the subworkflow identifier to internal previous_stage references + if hasattr(stage_dict, 'previous_stage') and stage_dict.previous_stage !='': + if isinstance(stage_dict.previous_stage, list): + stage_dict.previous_stage = [x+'{}'.format(subworkflow_identifier) for x in stage_dict.previous_stage] + else: + stage_dict.previous_stage+='{}'.format(subworkflow_identifier) + # we add external previous_stages defined in the feedins + if stage_dict.identifier in subworkflow_feedins: + add_feedins_to_previous_stage(stage_dict, subworkflow_feedins[stage_dict.identifier]) + # by default, we append the subworkflow identifier to output paths. + if not getattr(stage_dict, 'zippy_do_not_append_identifier_to_paths', False): + if hasattr(stage_dict, 'output_dir'): + stage_dict.output_dir+='{}'.format(subworkflow_identifier) + # now we add the stage to the workflow + stage_dict.identifier+='{}'.format(subworkflow_identifier) + handle_stage(stage_dict, d) + + +def add_feedins_to_previous_stage(stage_dict, feedins): + ''' + We add the new feedins into the dependencies of the curent stage. + ''' + if feedins == '': #there's a feedin defined, but it's not an identifier. We can ignore it. + return '' + if not isinstance(feedins, list): + feedins = [feedins] + if isinstance(stage_dict.previous_stage, list): # formerly, there were multiple previous stages + stage_dict.previous_stage.extend(feedins) + elif stage_dict.previous_stage != '': # formerly, there was one previous stage + stage_dict.previous_stage = [stage_dict.previous_stage]+feedins + else: # formerly, there was no prvious stage + stage_dict.previous_stage = feedins + +def save_params_as_json(params, fname): + params = rejsonify(params) + for stage in params['stages']: #we delete the duplicate copies of the stages, to match the input json style + del params[stage['identifier']] + with open(fname, 'w') as f: + json.dump(params, f, indent=4) + +def rejsonify(params): + ''' + Used to convert ObjectDicts to serializable dicts + ''' + if isinstance(params, ObjectDict): + to_return = params.__dict__ + #strip out private members + to_return = {k:v for (k,v) in to_return.iteritems() if k[0]!='_'} + for (k,v) in copy.copy(to_return).iteritems(): + to_return[k] = rejsonify(v) + elif isinstance(params, list): + for (i,v) in enumerate(copy.copy(params)): + params[i] = rejsonify(v) + to_return = params + else: + to_return = params + return to_return diff --git a/zippy/primer_dimer_miner.py b/zippy/primer_dimer_miner.py new file mode 100755 index 0000000..ab39246 --- /dev/null +++ b/zippy/primer_dimer_miner.py @@ -0,0 +1,167 @@ +import os +import gzip +import csv +from collections import defaultdict +from itertools import combinations + +class PrimerDimerMiner(): + """ + A primer dimer detector. Looks for kmers that can be uniquely mapped to a primer or its reverse complement, + and reports all reads that have kmers belonging to at least 2 such primers. Note that a primer and its reverse + complement are counted separately. We also report autodimers, defined as non-contiguous kmers that map to the same + primer. + #TODO: we ignore whether reads are paired end; not sure if there's anything smart we want to do there. + #TODO: all matches must be exact. 
if there's a read error somewhere, we won't catch it, or we might call it an autodimer + # if it results in 15bp matches on each end of the error. + """ + def __init__(self, params): + self.params = params + self.k = self.params.kmer + self.reads_read = 0 # how many lines we've looked at + self.dimer_reads_read = 0 # how many lines we've looked at with at least 2 primers + self.monomer_reads_read = 0 # how many lines we've looked at with exactly 1 primer + self.pairs_map = defaultdict(int) # map from tuple of 2 primers (alphabetical) to dimer count + self.out_reads_file = open(os.path.join(self.params.output_dir,'matched_reads'), 'w') + self.out_pairs_file = open(os.path.join(self.params.output_dir,'pair_stats'), 'w') + self.out_stats_file = open(os.path.join(self.params.output_dir,'run_stats'), 'w') + + def reverse_complement(self, seq): + rev_map = {'A':'T','C':'G','G':'C','T':'A'} + new_seq = ''.join([rev_map[x] for x in seq]) #sub letters + new_seq = new_seq[::-1] # reverse it + return new_seq + + def parse_probe_list(self): + """ + Creates probe map, which is a map of probe names to sequence. + """ + probe_map = {} + with open(self.params.probe_list, 'r') as f: + c = csv.reader(f, delimiter="\t") + for line in c: + probe_map[line[0]] = line[1].upper() + return probe_map + + def build_kmer_map(self, probe_map, k): + """ + Builds a map from kmer to probenames that have this kmer. + Also does reverse complements. + """ + kmer_map = defaultdict(set) + for (probe, seq) in probe_map.iteritems(): + seq_rc = self.reverse_complement(seq) + for i in range(0, len(seq)-k): + kmer_map[seq[i:i+k]].add(probe) + kmer_map[seq_rc[i:i+k]].add(probe+"rc") + return kmer_map + + def run_matcher_helper(self, f, kmer_map): + counter = 0 # 0: header 1: sequence 2: junk 3: quality + for line in f: + if counter == 0: + read_name = line.strip() + if counter == 1: + line = line.upper() + probe_matches = self.find_matches(line.strip(), kmer_map) + if len(probe_matches) > 1: + self.output_matches(read_name, probe_matches, line.strip()) + self.register_pairs(probe_matches) + self.dimer_reads_read += 1 + elif len(probe_matches) == 1: + self.monomer_reads_read += 1 + self.reads_read += 1 + counter += 1 + counter = counter % 4 + + def run_matcher(self, input_file, kmer_map): + """ + Goes through a fastq, and finds all dimerized reads + """ + if input_file is None: # we don't require the input files... one of them can be undefined + return + try: + with gzip.open(input_file, 'r') as f: + self.run_matcher_helper(f, kmer_map) + except IOError: #not a gzip + with open(input_file, 'r') as f: + self.run_matcher_helper(f, kmer_map) + + + def find_matches(self, line, kmer_map): + """ + For a single read, reports all found primers + """ + in_primer = None + matches = [] + for i in range(0, len(line)-self.k): + sub_seq = line[i:i+self.k] + if in_primer is None: #we are not currently in a primer. + if len(kmer_map[sub_seq]) == 1: # If we see a uniquely mappable kmer, we enter a primer. + (in_primer,) = kmer_map[sub_seq] + matches.append(in_primer) + else: # Otherwise, we continue + continue + else: # we are in the middle of seeing a primer sequence + if in_primer in kmer_map[sub_seq]: # we see this primer again, and are thus still reading it. Continue. + continue + elif len(kmer_map[sub_seq]) == 1: # We no longer see our current primer, but this sequence is mappable to another primer. We are now in a different primer. 
+ (in_primer,) = kmer_map[sub_seq] + matches.append(in_primer) + else: # We aren't in our current primer, and aren't uniquely in a different primer. + in_primer = None + return matches + + def output_matches(self, read_name, probe_matches, line): + self.out_reads_file.write(read_name+"\n") + self.out_reads_file.write(line+"\n") + self.out_reads_file.write("\t".join(probe_matches)+"\n") + + def register_pairs(self, probe_matches): + """ + Increments the occurrence count of all unique dimer pairs found in a line. + """ + seen_pairs = set() + probe_matches = sorted(probe_matches) + for pair in combinations(probe_matches, 2): + if pair not in seen_pairs: + self.pairs_map[pair] += 1 + seen_pairs.add(pair) + + def output_stats(self): + self.out_pairs_file.write("\t".join(["Primer 1", "Primer 2", "# reads containing both", "% reads containing both"])+"\n") + for (pair, count) in sorted(self.pairs_map.items(), key=lambda x: -x[1]): + self.out_pairs_file.write("\t".join([pair[0], pair[1], str(count), "{:.4%}".format(float(count)/float(self.dimer_reads_read))])+"\n") + self.out_stats_file.write("Number of reads seen: {}".format(self.reads_read)+"\n") + self.out_stats_file.write("Number of reads with dimers: {}, ({:.4%})\n".format(self.dimer_reads_read, float(self.dimer_reads_read)/float(self.reads_read))) + self.out_stats_file.write("Number of reads with exactly 1 primer: {}, ({:.4%})\n".format(self.monomer_reads_read, float(self.monomer_reads_read)/float(self.reads_read))) + + + + + def run(self): + """ + Main execution function for PDM + """ + probe_map = self.parse_probe_list() + kmer_map = self.build_kmer_map(probe_map, self.k) + for input_file in [self.params.input_file1, self.params.input_file2]: + self.run_matcher(input_file, kmer_map) + self.output_stats() + self.out_reads_file.close() + self.out_stats_file.close() + self.out_pairs_file.close() + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--probe_list', type=str, help="tsv with 2 columns: (probe_name, sequence). If you are looking for adapters or other short sequences, they should be added to the probe list.") + parser.add_argument('--kmer', type=int, default=15, help="How big a fragment size to require for a match") + parser.add_argument('--input_file1', help="A fastq file with reads to analyze") + parser.add_argument('--input_file2', help="Another fastq file (Optional)") + parser.add_argument('--output_dir') + params = parser.parse_args() + if not os.path.exists(params.output_dir): + os.makedirs(params.output_dir) + pdm = PrimerDimerMiner(params) + pdm.run() \ No newline at end of file diff --git a/zippy/rna_stats.py b/zippy/rna_stats.py new file mode 100755 index 0000000..a9ac314 --- /dev/null +++ b/zippy/rna_stats.py @@ -0,0 +1,146 @@ +""" +Combines star_stats, rseqc insert size and rrna count into a single easy to read file. 
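+The output is a plain-text report with one tab-separated 'metric<tab>value' line per statistic.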
+""" +import subprocess +import numpy +import csv +import os +import pysam + +def star_keep_line(line): + if 'Started job on' in line: + return False + if 'Started mapping on' in line: + return False + if 'Finished on' in line: + return False + if 'Mapping speed' in line: + return False + if '|' in line: + return True + +def parse_starlog(fname): + results = [] + number_reads = -1 + unique_reads = -1 + with open(fname, 'r') as f: + for line in f: + line = line.strip() + if star_keep_line(line): + split_line = line.split('|') + split_line = [x.strip() for x in split_line] + if 'Number of input reads' in line: + number_reads = int(split_line[1]) + if 'Uniquely mapped reads number' in line: + unique_reads = int(split_line[1]) + results.append("\t".join(split_line)) #we swap the | divider with : + return (results, number_reads, unique_reads) + +def parse_insert_size(fname): + data = [] + results = [] + with open(fname, 'rb') as f: + c = csv.reader(f, delimiter='\t') + for line in c: + if 'sameTranscript=No' in line or 'sameChrom=No' in line or 'dist=genomic' in line: #we remove non-traditional transcripts + continue + if int(line[1]) > 1000: #we remove the extreme large insert sizes as not 'real' + continue + data.append(int(line[1])) + results.append('Average insert size\t{}'.format(numpy.mean(data))) + results.append('Standard deviation insert size\t{}'.format(numpy.std(data))) + return results + +def get_intron_read_count(params): + subprocess.call("bedtools intersect -a {bam_path} -b {intron_bed} -u -wa -f 0.1 -sorted > {temp_path}.temp.bam".format(bam_path=params.bam_path, intron_bed=params.intron_bed, temp_path=params.temp_path), shell=True) + read_names = set() + samfile = pysam.AlignmentFile("{}.temp.bam".format(params.temp_path), "rb") + for line in samfile: + read_names.add(line.query_name) + samfile.close() + #count = subprocess.check_output("samtools view -c {temp_path}.temp".format(temp_path=params.temp_path), shell=True) + #subprocess.call("rm {temp_path}.temp".format(temp_path=params.temp_path), shell=True) + return len(read_names) + +def get_ribosome_read_count(params): + ''' deprecated ''' + subprocess.call("samtools view {bam_path} -b -L {ribosome_bed} -o {temp_path}.bam".format(bam_path=params.bam_path, ribosome_bed=params.ribosome_bed, temp_path=params.temp_path), shell=True) + read_names = set() + samfile = pysam.AlignmentFile("{}.bam".format(params.temp_path), "rb") + for line in samfile: + read_names.add(line.query_name) + samfile.close() + print "{temp_path}.bam".format(temp_path=params.temp_path) + #subprocess.call("rm {temp_path}.bam".format(temp_path=params.temp_path), shell=True) + return len(read_names) + +def get_unique_read_count(bam_path, bed_path, temp_bam_file): + exit_code = subprocess.call("samtools view {bam_path} -b -L {bed_path} -o {temp_bam_file}".format(bam_path=bam_path, bed_path=bed_path, temp_bam_file=temp_bam_file), shell=True) + if exit_code != 0: + raise RuntimeError('Samtools exited with nonzero exit code.') + read_names = set() + samfile = pysam.AlignmentFile(temp_bam_file, "rb") + for line in samfile: + read_names.add(line.query_name) + samfile.close() + subprocess.call("rm {}".format(temp_bam_file), shell=True) + return len(read_names) + +def get_dup_count(bam_path): + dup_count = subprocess.check_output("samtools view -c -f 1024 -q 255 {}".format(bam_path), shell=True) + return int(dup_count.strip())/2 #per paired-read + +def analyze_rna(params): + results = [] + sample_name = os.path.basename(params.output_path) + 
results.append('Sample\t{}'.format(sample_name)) + number_reads = -1 + if params.starlog_path is not None: + (star_results, number_reads, unique_reads) = parse_starlog(params.starlog_path) + results.extend(star_results) + if params.ribosome_bed is not None: + #ribosome_read_count = get_ribosome_read_count(params) + ribosome_read_count = get_unique_read_count(params.bam_path, params.ribosome_bed, params.temp_path+'.ribosome.bam') + results.append('Ribosomal reads\t{}'.format(ribosome_read_count)) + if number_reads > 0: + results.append('Ribosomal read percentage\t{:.2%}'.format(float(ribosome_read_count)/float(number_reads))) + print results + #if params.intron_bed is not None: + # intron_read_count = get_intron_read_count(params) + # results.append('Intron-overlapping reads\t{}'.format(intron_read_count)) + # if number_reads > 0: + # results.append('Intron-overlapping read percentage\t{:.2%}'.format(float(intron_read_count)/float(number_reads))) #2*x because the read count is paired end reads, but our ribosomal count is not. + # print results + if params.manifest_bed is not None: + manifest_read_count = get_unique_read_count(params.bam_path, params.manifest_bed, params.temp_path+'.manifest.bam') + #manifest_read_count = subprocess.check_output("samtools view {bam_path} -c -L {manifest_bed}".format(bam_path=params.bam_path, manifest_bed=params.manifest_bed), shell=True) + results.append('Manifest reads\t{}'.format(manifest_read_count)) + if number_reads > 0: + results.append('Manifest read percentage\t{:.2%}'.format(float(manifest_read_count)/float(number_reads))) #2*x because the read count is paired end reads, but our ribosomal count is not. + print results + if params.rseqc_inner_distance is not None: + results.extend(parse_insert_size(params.rseqc_inner_distance)) + if params.dup_stats: + dup_count = get_dup_count(params.bam_path) + results.append('Duplicate uniquely mapping reads\t{}'.format(dup_count)) + if unique_reads > 0: + results.append('Duplicate read percentage (of all uniquely mapped reads)\t{:.2%}'.format(float(dup_count)/float(number_reads))) #2*x because the read count is paired end reads, but our ribosomal count is not. + with open(params.output_path, 'w') as fout: + print results + fout.write('\n'.join(results)) + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser(usage='', formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('bam_path', help='The bam to analyze.') + parser.add_argument('output_path', help='Where to place the stats file.') + parser.add_argument('--ribosome_bed') + parser.add_argument('--intron_bed') + parser.add_argument('--manifest_bed') + parser.add_argument('--temp_path') + parser.add_argument('--rseqc_inner_distance') + parser.add_argument('--starlog_path', help="Star's Final.log.out file.") + parser.add_argument('--dup_stats', dest='dup_stats', action='store_true', help="Whether we generate duplicate statistics") + parser.set_defaults(dup_stats=False) + input_params = parser.parse_args() + analyze_rna(input_params) \ No newline at end of file diff --git a/zippy/run_edger.py b/zippy/run_edger.py new file mode 100755 index 0000000..ea48ec5 --- /dev/null +++ b/zippy/run_edger.py @@ -0,0 +1,75 @@ +''' +Runs edger from a set of RSEM inputs +''' +import os +import random +import subprocess +import csv + +def parse_rsem(f_name): + ''' + Returns a map from gene name to a list with one item, the expected count. 
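+    The count is read from the fifth tab-separated column (index 4), which in an RSEM *.genes.results
+    file is expected_count; e.g. a row for GENE1 with expected_count 123.0 becomes {'GENE1': [123.0]}.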
+ ''' + rsem_map = {} + with open(f_name, 'rb') as f: + c = csv.reader(f, delimiter='\t') + c.next() + for line in c: + gene = line[0] + rsem_map[gene] = [float(line[4])] + return rsem_map + +def add_to_data_grid(data_grid, sample_map): + if data_grid is None: + data_grid = sample_map + else: + for k in sample_map.keys(): + try: + data_grid[k].extend(sample_map[k]) + except KeyError: + continue + return data_grid + +def write_data_grid(data_grid, key_ordering, temp_folder): + #we write out every gene that has the + random_path_suffix = str(random.randint(0,100000)) + data_grid_path = os.path.join(temp_folder, 'datagrid'+random_path_suffix+'.csv') + print 'DGP: {}'.format(data_grid_path) + with open(data_grid_path, 'wb') as f: + c = csv.writer(f) + c.writerow(['Symbol']+key_ordering) + for symbol in data_grid.keys(): + if len(data_grid[symbol]) == len(key_ordering): + c.writerow([symbol]+data_grid[symbol]) + else: + print len(data_grid[s]) + return data_grid_path + +def run_edger(params): + data_grid = None + key_ordering = [] + for (group_index, samples) in enumerate([params.group1, params.group2]): + for sample in samples: + sample_map = parse_rsem(sample) + data_grid = add_to_data_grid(data_grid, sample_map) + key_ordering.append(os.path.basename(sample)) + data_grid_path = write_data_grid(data_grid, key_ordering, params.temp_folder) + subprocess.call('{r_path} edger.r {data_grid_path} {group1_len} {group2_len} {out_file} {temp_path}'.format( + data_grid_path=data_grid_path, + group1_len=len(params.group1), + group2_len=len(params.group2), + out_file=params.out_file, + r_path=params.r_path, + temp_path=params.temp_folder), shell=True) + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--group1', nargs='+') + parser.add_argument('--group2', nargs='+') + parser.add_argument('--temp_folder') + parser.add_argument('--out_file') + parser.add_argument('--r_path') + params = parser.parse_args() + run_edger(params) \ No newline at end of file diff --git a/zippy/samplesheet.py b/zippy/samplesheet.py new file mode 100755 index 0000000..bdf250a --- /dev/null +++ b/zippy/samplesheet.py @@ -0,0 +1,258 @@ +''' +@author jtoung +''' +import re +from collections import defaultdict + +valid_sections = set(('Header', 'Reads', 'Settings', 'Data')) + +def check_valid_samplename(name): + if "_R1_" in name or "_R2_" in name or name.endswith("_R1") or name.endswith("_R2"): + raise RuntimeError('Sample name {} contains _R1_ or endswith _R1. This interferes with how paired-end reads are formatted. Please rename your samples'.format(name)) + +def check_valid_section(section): + if section not in valid_sections: + raise Exception('invalid section %s; valid sections are %s' % (section, " ".join(valid_sections))) + +class SampleSheet(object): + + def __init__(self, sample_sheet, fix_dup_sample_names=False): + self.unique_sample_name_map = {} + if not sample_sheet: + raise AttributeError('samplesheet is required for SampleSheet') + self.sample_sheet = sample_sheet + self.fix_dup_sample_names = fix_dup_sample_names + + self._parse_sample_sheet(self.sample_sheet) + self._check_names() + + def _check_names(self): + ''' + Ensures that names are unique (it either fails, or fixes them + if fix_dup_sample_names is true. 
Also ensures that names fit in + our sample name guidelines (do not end with _R1/_R2 and do not + contain _R1_/_R2_) + ''' + name_to_ids = defaultdict(set) + ids_to_names = defaultdict(set) + for sample_id in self.get_sample_ids(): + sample_lines = self.get_samples_for_id(sample_id) + check_valid_samplename(sample_id) + for line in sample_lines: + name = line.get("Sample_Name") + name_to_ids[name].add(sample_id) + ids_to_names[sample_id].add(name) + check_valid_samplename(name) + for (identifier, name_set) in ids_to_names.iteritems(): + if len(name_set) > 1: + raise IOError('Sample ID {} has multiple sample names. Sample name and sample ID must be unique'.format(identifier)) + for (name, id_set) in name_to_ids.iteritems(): + if len(id_set) > 1: + if self.fix_dup_sample_names: + for sample_id in id_set: + self.uniqueify_sample_names(sample_id) + if name == '': + self.overwrite_sample_name(sample_id) + else: + raise IOError('Sample name {} has multiple sample IDs. Sample name and sample ID must be unique'.format(name)) + else: + self.unique_sample_name_map[list(id_set)[0]] = name + + def sample_id_to_sample_index(self, sample_id): + sample_indices = 1 + samples_seen = set() + for line in self._ss["Data"]: + current_id = line.get("Sample_ID") + if current_id == sample_id: + return sample_indices + else: + if current_id not in samples_seen: + sample_indices += 1 + samples_seen.add(current_id) + raise ValueError('Sample id {} not found in samplesheet'.format(sample_id)) + + + def uniqueify_sample_names(self, sample_id): + self.unique_sample_name_map[sample_id] = sample_id + + def overwrite_sample_name(self, sample_id): + sample_lines = self.get_samples_for_id(sample_id) + for line in sample_lines: + line.set("Sample_Name", sample_id) + + def print_sample_sheet(self, output): + """prints sample sheet to output""" + fh_out = open(output, 'wb') + for line in self._l: + fh_out.write(",".join(line) + "\n") + fh_out.close() + + def _parse_sample_sheet(self, sample_sheet): + """parse the sample sheet into two objects: _l is a list of each line in the original file; _ss is a dictionary that lets us look up the index of each line conveniently""" + """first, _ss points to sections Header, Reads, or Settings. 
the value of these keys are themselves dictionaries whose keys are the first column, and values are the line idx of the original file, which let's us get the corresponding line in _l""" + """second, _ss points to sections Data, which contains a list of "Data" elements, each of which contain a link to the original sample sheet (self), as well as the line idx of the original file""" + + header_regex = re.compile("^\[(.*)\]$") + section_name = None + # _ss is a dictionary that stores keys called "Header", "Reads", or "Settings" + # + self._ss = {} + # _l is a list that of the original file + self._l = [] + _data_header = None + _data_i = 1 + with open(sample_sheet, 'rb') as f: + for i, line in enumerate(f): + line = line.rstrip('\n\r').split(',') + self._l.append(line) + + if len(line) == 1 and line[0] == '': #skip blank lines + continue + + if header_regex.match(line[0]): + section_name = header_regex.match(line[0]).group(1) + continue + + if not section_name: + raise Exception('could not parse section name in sample sheet %s' % self.sample_sheet) + + if section_name not in self._ss: + if section_name in ["Data", "Reads"]: + self._ss[section_name] = [] + else: + self._ss[section_name] = {} + + if section_name in ["Header", "Settings"]: + (key, value) = (line[0], line[1]) + if key in self._ss[section_name]: + raise Exception('key %s observed twice in section %s' % (key, section_name)) + self._ss[section_name][key] = i + + if section_name in ["Data"]: + if not _data_header: + _data_header = line + else: + self._ss[section_name].append(Data(_data_header, _data_i, self, i)) + _data_i += 1 + + if section_name in ["Reads"]: + self._ss[section_name].append(line) + + def get(self, section, attribute=None): + + check_valid_section(section) + + if section in ["Data",]: + # returns a list of "Data" items + return self._ss[section] + else: + if attribute in self._ss[section]: + i = self.ss_[section][attribute] + return self._l[i][1] + elif attribute is None: + return self._ss[section] + else: + return None + + def set(self, section, attribute, new_value): + + check_valid_section(section) + + if section in ["Data",]: + raise Exception('set method not defined for Data') + + if not attribute in self._ss[section]: + raise Exception('%s not found in section %s' % (attribute, section)) + + i = self._ss[section][attribute] + self._l[i][1] = new_value + + def calculate_number_of_lanes(self): + lanes = set() + for d in self._ss["Data"]: + for lane in d.get("Lane").split("+"): + lanes.add(lane) + lanes = sorted([int(lane) for lane in lanes]) + return lanes + + def get_sample_ids(self): + sample_ids = set() + for line in self._ss["Data"]: + sample_ids.add(line.get("Sample_ID")) + return sample_ids + + def get_samples_for_id(self, sample_id): + sample_lines = [] + for line in self._ss["Data"]: + if line.get("Sample_ID") == sample_id: + sample_lines.append(line) + return sample_lines + + def sample_id_to_sample_name(self, sample_id): + ''' + Will return the first sample name associated with this sample id + ''' + sample_lines = self.get_samples_for_id(sample_id) + return sample_lines[0].get("Sample_Name") + + def is_paired_end(self): + count = 0 + for line in self._ss["Reads"]: + if len(line) > 0 and line[0] != '': + count += 1 + if count == 2: + return True + if count == 1: + return False + raise ValueError("Reads section of samplesheet is malformed") + +class Data(object): + + def __init__(self, + data_header, + data_i, # the ith data object (order of this data object relative to other data objects in the 
spreadsheet) + sample_sheet, # the original sample sheet object + sample_sheet_i # the line number in the original sample sheet + ): + + for i in ('data_i', 'sample_sheet', 'sample_sheet_i'): + setattr(self, i, locals()[i]) + + self.data_header_idx = {} + for (i, v) in enumerate(data_header): + if v == '': + continue + if v in self.data_header_idx: + raise Exception('observed Data header %s twice' % v) + self.data_header_idx[v] = i + + def __repr__(self): + return str(self.sample_sheet._l[self.sample_sheet_i]) + + def set(self, attribute, new_value): + # get the line from the original sample sheet + line = self.sample_sheet._l[self.sample_sheet_i] + if attribute not in self.data_header_idx: + raise AttributeError('invalid attribute for Data %s' % attribute) + line[self.data_header_idx[attribute]] = new_value + + def get(self, attribute): + + # row_idx is another synonym for the ith data object or 'data_i' + if attribute == "row_idx": + return self.data_i + + # get the line from the original sample sheet + line = self.sample_sheet._l[self.sample_sheet_i] + if attribute not in self.data_header_idx: + raise AttributeError('invalid attribute for Data %s' % attribute) + return line[self.data_header_idx[attribute]] + + def has(self, attribute): + + # row_idx is another synonym for the ith data object or 'data_i' + if attribute == "row_idx": + return True + if attribute not in self.data_header_idx: + return False + return True \ No newline at end of file diff --git a/zippy/star.py b/zippy/star.py new file mode 100755 index 0000000..e3320ee --- /dev/null +++ b/zippy/star.py @@ -0,0 +1,153 @@ +import os +import re +import random +import string +import subprocess +from collections import defaultdict +from pyflow import WorkflowRunner + +char_set = string.ascii_uppercase + string.ascii_lowercase + string.digits + +def find_fastq_files(fastq_dir, rm_undetermined=False, no_crawl=False): + ''' + @author: jtoung + ''' + fastq_files = {} + if no_crawl: + find = subprocess.Popen(['find', '-L', fastq_dir, '-maxdepth', '1', '-name', '*.fastq.gz'], stdout=subprocess.PIPE) + else: + find = subprocess.Popen(['find', '-L', fastq_dir, '-name', '*.fastq.gz'], stdout=subprocess.PIPE) + fastq_regex = re.compile("^(.*)_(S[0-9]+)_([L0-9\-]+)?_?(R[1|2])_001\.fastq\.gz$") + for fastq_file in find.stdout: + fastq_file = fastq_file.rstrip('\n').split()[0] + fastq_file_basename = os.path.basename(fastq_file) + if fastq_regex.match(fastq_file_basename): + sample_name = fastq_regex.match(fastq_file_basename).group(1) + sample_id = fastq_regex.match(fastq_file_basename).group(2) + lane = fastq_regex.match(fastq_file_basename).group(3) + read = fastq_regex.match(fastq_file_basename).group(4) + + if sample_name == "Undetermined": + continue + + if not lane: + lane = 'None' + + if lane not in fastq_files: + fastq_files[lane] = {} + + sample = (sample_id, sample_name) + if sample not in fastq_files[lane]: + fastq_files[lane][sample] = {} + + if read in fastq_files[lane][sample]: + raise Exception('already observed fastq file for lane %s, sample_id %s, sample_name %s, read %read in fastq_dir %s' % (lane, sample_id, sample_name, read, fastq_dir)) + + fastq_files[lane][sample][read] = fastq_file + + return fastq_files + +class starFlow(WorkflowRunner): + """ + Pyflow workflow to call the STAR aligner. We follow file formatting guidelines from the isaac workflow + from bioxutils. 
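+    Fastq files are discovered under fastq_dir by find_fastq_files and must match the bcl2fastq-style
+    naming it expects, e.g. MySample_S1_L001_R1_001.fastq.gz (the lane token is optional).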
+ """ + def __init__(self, star_path, star_index, fastq_dir, out_dir, max_job_cores=8): + self.star_path = star_path + self.star_index = star_index + self.fastq_dir = fastq_dir + self.out_dir = out_dir + self.max_job_cores = max_job_cores + + def workflow(self): + if not os.path.exists(self.out_dir): + os.makedirs(self.out_dir) + project_name = 'autogen_project' + fastq_files = find_fastq_files(self.fastq_dir, rm_undetermined=True) + + mv_fastq_tasks = [] + fastq_cleanup_cmds = [] + sample_id_TO_sample_name = {} + fastq_by_sample_1 = defaultdict(list) + fastq_by_sample_2 = defaultdict(list) + for (lane, sample_data) in fastq_files.iteritems(): + for (sample, reads) in sample_data.iteritems(): + self.flowLog(reads.keys()) + fastq_by_sample_1[sample].append(reads['R1']) + try: + fastq_by_sample_2[sample].append(reads['R2']) + except KeyError: + no_r2 = True + for sample in fastq_by_sample_1.keys(): + (sample_id, sample_name) = sample + sample_path = os.path.join(self.out_dir, '_'.join(sample)) + if not os.path.exists(sample_path): + os.makedirs(sample_path) + # STAR alignment + if no_r2: + read_files_string = ','.join(fastq_by_sample_1[sample]) + else: + read_files_string = ','.join(fastq_by_sample_1[sample])+" "+','.join(fastq_by_sample_2[sample]) + self.star_command = "{star_path} --genomeDir {star_index} --readFilesIn {read_files_string} --runThreadN {max_job_cores} --outFileNamePrefix {out_dir} --outTmpDir {tmp_dir} --readFilesCommand zcat --outSAMtype BAM SortedByCoordinate --outSAMunmapped Within".format( + star_path=self.star_path, star_index=self.star_index, + read_files_string=read_files_string, max_job_cores=self.max_job_cores, + out_dir=sample_path+"/", tmp_dir = os.path.join(self.out_dir, 'tmp'+str(sample_id))) + self.flowLog(self.star_command) + star_task = self.addTask("star"+str(sample_id), self.star_command, nCores=self.max_job_cores, memMb=40000, dependencies=mv_fastq_tasks) + # cleanup + self.addTask("delete_tmp"+str(sample_id), "rm -r {tmp_dir}".format(tmp_dir=os.path.join(self.out_dir, 'tmp'+str(sample_id))), dependencies=star_task) + rename_task = self.addTask("rename_star"+str(sample_id), "mv {path}/Aligned.sortedByCoord.out.bam {path}/{sample_id}_{sample_name}.raw.bam".format(path=sample_path, sample_id=sample_id, sample_name=sample_name), dependencies=star_task) + # build bai + self.addTask("bai"+str(sample_id), "samtools index {path}/{sample_id}_{sample_name}.raw.bam".format(path=sample_path, sample_id=sample_id, sample_name=sample_name), dependencies=rename_task) + +class SingleStarFlow(WorkflowRunner): + """ + Pyflow workflow to call the STAR aligner. We follow file formatting guidelines from the isaac workflow + from bioxutils. 
+ """ + def __init__(self, star_path, star_index, sample, fastqs, out_dir, max_job_cores=8, tmp_path='', command_args=''): + self.star_path = star_path + self.star_index = star_index + self.sample = sample + self.fastqs = fastqs + self.out_dir = out_dir + self.max_job_cores = max_job_cores + if tmp_path !='': + self.tmp_path = tmp_path + else: + self.tmp_path = os.path.join(self.out_dir, 'tmp'+self.sample) + self.tmp_path+=''.join(random.choice(char_set) for _ in range(4)) + self.command_args = command_args + + def workflow(self): + if not os.path.exists(self.out_dir): + os.makedirs(self.out_dir) + pre_out_prefix = os.path.join(self.out_dir, self.sample) + fastq_files = self.fastqs + fastq_cleanup_cmds = [] + fastq_by_sample_1 = [x for x in fastq_files if '_R1_' in x] + fastq_by_sample_2 = [x for x in fastq_files if '_R2_' in x] + if len(fastq_by_sample_2) > 0: + read_files_string = ','.join(fastq_by_sample_1)+" "+','.join(fastq_by_sample_2) + else: + read_files_string = ','.join(fastq_by_sample_1) + self.star_command = "{star_path} --genomeDir {star_index} --readFilesIn {read_files_string} --runThreadN {max_job_cores} --outFileNamePrefix {out_dir} --outTmpDir {tmp_dir} --readFilesCommand zcat --outSAMtype BAM SortedByCoordinate --outSAMunmapped Within".format( + star_path=self.star_path, star_index=self.star_index, + read_files_string=read_files_string, max_job_cores=self.max_job_cores, + out_dir=pre_out_prefix, tmp_dir = self.tmp_path) + if self.command_args != "": + self.star_command+=" "+self.command_args + self.flowLog(self.star_command) + star_task = self.addTask("star", self.star_command, nCores=self.max_job_cores, memMb=100*1024, dependencies=None) + #self.addTask("delete_tmp", "rm -r {tmp_dir}".format(tmp_dir=os.path.join(self.out_dir, 'tmp'+self.sample)), dependencies=star_task) + rename_task = self.addTask("rename_star"+str(self.sample), "mv {prefix}Aligned.sortedByCoord.out.bam {path}/{sample}.raw.bam".format(prefix=pre_out_prefix, path=self.out_dir, sample=self.sample), dependencies=star_task) + # build bai + self.addTask("bai", "samtools index {path}/{sample}.raw.bam".format(path=self.out_dir, sample=self.sample), dependencies=rename_task) + + + + +if __name__ == '__main__': + wflow = STARFlow() + retval = wflow.run(mode='sge') + sys.exit(retval) \ No newline at end of file diff --git a/zippy/test/OtherRunner.py b/zippy/test/OtherRunner.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/zippy/test/OtherRunner.py @@ -0,0 +1 @@ + diff --git a/zippy/test/defaults.json b/zippy/test/defaults.json new file mode 100755 index 0000000..4424c68 --- /dev/null +++ b/zippy/test/defaults.json @@ -0,0 +1,23 @@ +{ + "stages": { + "markduplicates": + { + "use_mate_cigar": true, + "test": true + } + }, + "python": "/python/bin/python", + "max_job_cores": 16, + "genome": "/path/to/WholeGenomeFasta", + "samtools_path": "/path/to/samtools/latest/bin/samtools", + "bedtools_path": "/path/to/bedtools/latest/bin/bedtools", + "bcl2fastq_path" : "/path/to/bin/bcl2fastq2-v2.18.0.6/bin/bcl2fastq", + "star_path" : "/path/to/bin/STAR_2.5.1b/STAR", + "bwa_path" : "/path/to/bwa/bwa-0.7.12/bwa", + "indelRealign": "/path/to/bin/RealignIndels_5.1.2.80/RealignIndels.exe", + "picard": "/path/to/bin/picard-tools-2.9.0/picard.jar", + "pisces": "/path/to/bin/pisces/5.1.3.48/Release/Pisces.exe", + "rsem_path": "/path/to/bin/rsem-1.2.31/bin/rsem-calculate-expression", + "rsem_annotation": "/path/to/static_files/rsem/GRCh38", + "samtools_path": "/path/to/samtools/latest/bin/samtools" + } diff --git 
a/zippy/test/defaults_ignore.json b/zippy/test/defaults_ignore.json new file mode 100644 index 0000000..974c45e --- /dev/null +++ b/zippy/test/defaults_ignore.json @@ -0,0 +1,13 @@ +{ + "stages": [ + { + "identifier": "markduplicates", + "output_dir": "", + "stage": "markduplicates" + } + ], + "sample_sheet": "", + "scratch_path": "", + "picard": "/path/to/picard", + "samtools_path": "" +} diff --git a/zippy/test/defaults_include.json b/zippy/test/defaults_include.json new file mode 100755 index 0000000..e0e8ce8 --- /dev/null +++ b/zippy/test/defaults_include.json @@ -0,0 +1,20 @@ +{ + "stages": [ + { + "skip": "", + "use_mate_cigar": false, + "output_dir": "", + "memory": "", + "cores": "", + "identifier": "markduplicates", + "stage": "markduplicates" + } + ], + "sample_sheet": "", + "scratch_path": "", + "picard": "/path/to/picard", + "samtools_path": "", + "zippy_fix_duplicate_sample_names": "", + "max_cores": "", + "max_memory": "" +} \ No newline at end of file diff --git a/zippy/test/defaults_test.proto b/zippy/test/defaults_test.proto new file mode 100644 index 0000000..da99ec6 --- /dev/null +++ b/zippy/test/defaults_test.proto @@ -0,0 +1,3 @@ +{ + "stages": ["markduplicates"] +} diff --git a/zippy/test/defaults_warn.json b/zippy/test/defaults_warn.json new file mode 100644 index 0000000..974c45e --- /dev/null +++ b/zippy/test/defaults_warn.json @@ -0,0 +1,13 @@ +{ + "stages": [ + { + "identifier": "markduplicates", + "output_dir": "", + "stage": "markduplicates" + } + ], + "sample_sheet": "", + "scratch_path": "", + "picard": "/path/to/picard", + "samtools_path": "" +} diff --git a/zippy/test/dummy_params_test.json b/zippy/test/dummy_params_test.json new file mode 100755 index 0000000..8e8e5d6 --- /dev/null +++ b/zippy/test/dummy_params_test.json @@ -0,0 +1,15 @@ +{ + "stages": [ + { + "identifier": "dummy", + "output_dir": "", + "previous_stage": "", + "stage": "dummy", + "args": "test_args", + "overridden_arg": "internal" + } + ], + "dummy_path": "path/dummy", + "overridden_arg": "external", + "missing_args": "nope!" 
+} \ No newline at end of file diff --git a/zippy/test/environment_params_test.json b/zippy/test/environment_params_test.json new file mode 100755 index 0000000..722c2e4 --- /dev/null +++ b/zippy/test/environment_params_test.json @@ -0,0 +1,12 @@ +{ + "stages": [ + { + "identifier": "super_dummy", + "output_dir": "", + "previous_stage": "", + "stage": "dummy", + "environment": "zippy/test/subworkflow_params_test.json" + } + ], + "environment": "zippy/test/subworkflow_params_test.json" +} \ No newline at end of file diff --git a/zippy/test/import_test.json b/zippy/test/import_test.json new file mode 100644 index 0000000..ba94ba7 --- /dev/null +++ b/zippy/test/import_test.json @@ -0,0 +1,7 @@ +{ + "imports": [ + "test/OtherRunner.py" + ], + "stages": [], + "scratch_path": "" +} diff --git a/zippy/test/import_test.proto b/zippy/test/import_test.proto new file mode 100755 index 0000000..cc6df70 --- /dev/null +++ b/zippy/test/import_test.proto @@ -0,0 +1,4 @@ +{ + "imports":["zippy/test/OtherRunner.py"], + "stages": [] +} diff --git a/zippy/test/merge_and_macs.json b/zippy/test/merge_and_macs.json new file mode 100644 index 0000000..dc5cb8c --- /dev/null +++ b/zippy/test/merge_and_macs.json @@ -0,0 +1,26 @@ +{ + "stages": [ + { + "identifier": "data", + "output_dir": "", + "stage": "data" + }, + { + "samples": "", + "identifier": "mergebam", + "output_dir": "", + "previous_stage": "data", + "stage": "mergebam" + }, + { + "identifier": "macs", + "output_dir": "", + "previous_stage": "mergebam", + "stage": "macs" + } + ], + "sample_sheet": "", + "samtools_path": "/path/to/samtools/latest/bin/samtools", + "macs_path": "", + "scratch_path": "" +} \ No newline at end of file diff --git a/zippy/test/merge_and_macs.proto b/zippy/test/merge_and_macs.proto new file mode 100644 index 0000000..af44a7b --- /dev/null +++ b/zippy/test/merge_and_macs.proto @@ -0,0 +1,3 @@ +{ + "stages": ["data", "mergebam", "macs"] +} diff --git a/zippy/test/rsem.json b/zippy/test/rsem.json new file mode 100644 index 0000000..dd79eb3 --- /dev/null +++ b/zippy/test/rsem.json @@ -0,0 +1,23 @@ +{ + "stages": [ + { + "identifier": "bcl2fastq", + "output_dir": "", + "stage": "bcl2fastq" + }, + { + "identifier": "rsem", + "output_dir": "", + "previous_stage": "bcl2fastq", + "stage": "rsem" + } + ], + "sample_sheet": "", + "bcl2fastq_path": "/path/to/bin/bcl2fastq2-v2.18.0.6/bin/bcl2fastq", + "sample_path": "", + "rsem_path": "/path/to/bin/rsem-1.2.31/bin/rsem-calculate-expression", + "rsem_annotation": "/path/to/static_files/rsem/GRCh38", + "star_path": "/path/to/bin/STAR_2.5.1b/STAR", + "genome": "path/to/WholeGenomeFasta", + "scratch_path": "" +} \ No newline at end of file diff --git a/zippy/test/rsem.proto b/zippy/test/rsem.proto new file mode 100644 index 0000000..08b0699 --- /dev/null +++ b/zippy/test/rsem.proto @@ -0,0 +1,3 @@ +{ + "stages": ["bcl2fastq", "rsem"] +} diff --git a/zippy/test/subworkflow_params_test.json b/zippy/test/subworkflow_params_test.json new file mode 100755 index 0000000..eea054a --- /dev/null +++ b/zippy/test/subworkflow_params_test.json @@ -0,0 +1,20 @@ +{ + "stages": [ + { + "identifier": "super_dummy", + "output_dir": "", + "previous_stage": "", + "stage": "dummy", + "args": "test_args", + "overridden_arg": "internal" + }, + { + "subworkflow": "zippy/test/dummy_params_test.json", + "previous_stage": {"dummy": "super_dummy"}, + "identifier": "" + } + ], + "dummy_path": "path/dummy2", + "overridden_arg": "external", + "missing_args": "nope!" 
+} \ No newline at end of file diff --git a/zippy/test/test_defaults.json b/zippy/test/test_defaults.json new file mode 100644 index 0000000..e0ad10b --- /dev/null +++ b/zippy/test/test_defaults.json @@ -0,0 +1,8 @@ +{ + "stages": { + "markduplicates": { + "use_mate_cigar": false + } + }, + "picard": "/path/to/picard" +} diff --git a/zippy/test/test_make_params.py b/zippy/test/test_make_params.py new file mode 100755 index 0000000..052fc5f --- /dev/null +++ b/zippy/test/test_make_params.py @@ -0,0 +1,121 @@ +import os +import unittest +from ..make_params import get_argparser, JsonBuilderMain, get_identifier, case_insensitive_list_match +import commentjson as json + +def compare_json_structures(a_path,b_path): + with open(a_path, 'r') as a_handle: + a = json.load(a_handle) + with open(b_path, 'r') as b_handle: + b = json.load(b_handle) + return recursive_compare_keys(a,b) and recursive_compare_keys(b,a) + +def recursive_compare_keys(a,b): + for k in a.keys(): + if k in b: + if isinstance(a[k], dict): + if not recursive_compare_keys(a[k], b[k]): + return False + else: + print 'missing key {}'.format(k) + return False + return True + +class ImportsTestCase(unittest.TestCase): + ''' + Asserts that we pass through the imports + ''' + def runTest(self): + parser = get_argparser() + params = parser.parse_args(['zippy/test/import_test.proto', 'test_output_12344.json', '--defaults', 'zippy/test/defaults.json']) + jsb = JsonBuilderMain(params) + jsb.make_params_file() + self.assertTrue(compare_json_structures('test_output_12344.json','zippy/test/import_test.json')) + + def tearDown(self): + try: + pass + os.remove('test_output_12344.json') + except OSError: + pass + +class MacsTestCase(unittest.TestCase): + ''' + Asserts that we produce a json file with the proper parameters exposed given the workflow + ''' + def runTest(self): + parser = get_argparser() + params = parser.parse_args(['zippy/test/merge_and_macs.proto', 'test_output_12345.json', '--defaults', 'zippy/test/defaults.json']) + jsb = JsonBuilderMain(params) + jsb.make_params_file() + self.assertTrue(compare_json_structures('test_output_12345.json','zippy/test/merge_and_macs.json')) + + def tearDown(self): + try: + pass + os.remove('test_output_12345.json') + except OSError: + pass + +class RSEMTestCase(unittest.TestCase): + ''' + Asserts that we produce a json file with the proper parameters exposed given the workflow + ''' + def runTest(self): + parser = get_argparser() + params = parser.parse_args(['zippy/test/rsem.proto', 'test_output_12346.json', '--defaults', 'zippy/test/defaults.json']) + jsb = JsonBuilderMain(params) + jsb.make_params_file() + self.assertTrue(compare_json_structures('test_output_12346.json','zippy/test/rsem.json')) + + def tearDown(self): + try: + pass + os.remove('test_output_12346.json') + except OSError: + pass + +class GetIdentifiersTestCase(unittest.TestCase): + def runTest(self): + current_identifiers = set() + get_identifier('test', current_identifiers) + self.assertEqual(current_identifiers, set(['test'])) + get_identifier('test', current_identifiers) + self.assertEqual(current_identifiers, set(['test', 'test_1'])) + get_identifier('test', current_identifiers) + self.assertEqual(current_identifiers, set(['test', 'test_1', 'test_2'])) + print current_identifiers + +class CaseInsensitiveListMatchTestCase(unittest.TestCase): + def runTest(self): + parser = get_argparser() + params = parser.parse_args(['zippy/test/rsem.proto', 'test_output_12348.json']) + jsb = JsonBuilderMain(params) + l = ['hello', 'HELLO', 
'HeLlO', 'hello1', 'ello', 'h', 'e', 'l' ,'l', 'o', 'heLLO'] + result = case_insensitive_list_match('hello', l) + self.assertEqual(result, 'hello') + l = ['HELLO', 'HeLlO', 'hello1', 'ello', 'h', 'e', 'l' ,'l', 'o', 'heLLO', 'hello'] + result = case_insensitive_list_match('hello', l) + self.assertEqual(result, 'HELLO') + l = ['ELLO', 'eLlO', 'helo1', 'ello', 'h', 'e', 'l' ,'l', 'o', 'heL', 'hllo'] + result = case_insensitive_list_match('hello', l) + self.assertEqual(result, None) + +class DefaultParameterTest(unittest.TestCase): + def runTest(self): + for behavior in ['include', 'warn', 'ignore']: + parser = get_argparser() + params = parser.parse_args(['zippy/test/defaults_test.proto', 'test_output_12347.json', '--defaults', 'zippy/test/test_defaults.json', '--default_behavior', behavior]) + jsb = JsonBuilderMain(params) + jsb.make_params_file() + self.assertTrue(compare_json_structures('test_output_12347.json','zippy/test/defaults_{}.json'.format(behavior))) + + def tearDown(self): + try: + pass + os.remove('test_output_12347.json') + except OSError: + pass + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/zippy/test/test_params.py b/zippy/test/test_params.py new file mode 100755 index 0000000..ca721cb --- /dev/null +++ b/zippy/test/test_params.py @@ -0,0 +1,105 @@ +import os +import unittest +import commentjson as json +from ..params import get_params +from ..modular_runner import ModularRunner + +THIS_DIR = os.path.dirname(os.path.abspath(__file__)) + +class DummyRunner(ModularRunner): + ''' + Does nothing. Designed for doing asserts. + ''' + + def get_output(self, sample): + pass + + def workflow(self, workflowRunner): + pass + + def get_self_parameter(self): + return self.params.self.args + + def get_global_parameter(self): + return self.params.dummy_path + + def get_overridden_parameter(self): + return self.params.overridden_arg + + def get_nonlocalized_local(self): + return self.params.self.missing_arg + + def check_hasattr_self(self, arg): + if hasattr(self.params.self, arg): + return True + return False + + def check_hasattr(self, arg): + if hasattr(self.params, arg): + return True + return False + +class ParamsTestCase(unittest.TestCase): + ''' + Asserts that we properly handle self.params.self parameters + ''' + def runTest(self): + params = get_params(os.path.join(THIS_DIR, 'dummy_params_test.json')) + test_stage = DummyRunner(params.dummy.identifier, params, None) + self.assertTrue(test_stage.check_hasattr_self('args')) + self.assertFalse(test_stage.check_hasattr_self('missing_args')) + self.assertTrue(test_stage.check_hasattr('missing_args')) + self.assertEqual(test_stage.get_self_parameter(), 'test_args') + self.assertEqual(test_stage.get_global_parameter(), 'path/dummy') + self.assertEqual(test_stage.get_overridden_parameter(), 'internal') + self.assertRaises(AttributeError, test_stage.get_nonlocalized_local) + + def tearDown(self): + pass + + +class SubworkflowTestCase(unittest.TestCase): + ''' + Asserts that we properly handle self.params.self parameters + ''' + def runTest(self): + params = get_params(os.path.join(THIS_DIR, 'subworkflow_params_test.json')) + test_stage = DummyRunner(params.super_dummy.identifier, params, None) + test_stage2 = DummyRunner(params.dummy.identifier, params, test_stage) + self.assertTrue(test_stage.check_hasattr_self('args')) + self.assertFalse(test_stage.check_hasattr_self('missing_args')) + self.assertTrue(test_stage.check_hasattr('missing_args')) + self.assertEqual(test_stage.get_self_parameter(), 
+        self.assertEqual(test_stage.get_global_parameter(), 'path/dummy2')
+        self.assertEqual(test_stage.get_overridden_parameter(), 'internal')
+        self.assertTrue(test_stage2.check_hasattr_self('args'))
+        self.assertFalse(test_stage2.check_hasattr_self('missing_args'))
+        self.assertTrue(test_stage2.check_hasattr('missing_args'))
+        self.assertEqual(test_stage2.get_self_parameter(), 'test_args')
+        self.assertEqual(test_stage2.get_global_parameter(), 'path/dummy2')
+        self.assertEqual(test_stage2.get_overridden_parameter(), 'internal')
+        self.assertRaises(AttributeError, test_stage.get_nonlocalized_local)
+
+    def tearDown(self):
+        pass
+
+class EnvironmentTestCase(unittest.TestCase):
+    '''
+    Asserts that we properly handle self.params.self parameters when environment parameters are in use.
+    '''
+    def runTest(self):
+        params = get_params(os.path.join(THIS_DIR, 'environment_params_test.json'))
+        test_stage = DummyRunner(params.super_dummy.identifier, params, None)
+        self.assertTrue(test_stage.check_hasattr_self('args'))
+        self.assertFalse(test_stage.check_hasattr_self('missing_args'))
+        self.assertTrue(test_stage.check_hasattr('missing_args'))
+        self.assertEqual(test_stage.get_self_parameter(), 'test_args')
+        self.assertEqual(test_stage.get_global_parameter(), 'path/dummy2')
+        self.assertEqual(test_stage.get_overridden_parameter(), 'internal')
+        self.assertRaises(AttributeError, test_stage.get_nonlocalized_local)
+
+    def tearDown(self):
+        pass
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file
diff --git a/zippy/utils.py b/zippy/utils.py
new file mode 100755
index 0000000..a5762f7
--- /dev/null
+++ b/zippy/utils.py
@@ -0,0 +1,8 @@
+def sub_check_wilds(wildcard_map, s):
+    '''
+    Given a dictionary of wildcard substitutions and a string in which to apply them,
+    return the substituted string.
+    '''
+    for wildcard in wildcard_map.keys():
+        s = s.replace('{'+wildcard+'}', wildcard_map[wildcard])
+    return s
diff --git a/zippy/zippy.py b/zippy/zippy.py
new file mode 100755
index 0000000..61e973e
--- /dev/null
+++ b/zippy/zippy.py
@@ -0,0 +1,223 @@
+from __future__ import print_function
+import re
+import json
+import copy
+import sys
+import subprocess
+import imp
+import os
+import types
+from functools import partial
+from pyflow import WorkflowRunner
+from .modular_runner import *
+from .modular_helper import run_help
+from .params import get_params, save_params_as_json
+from .make_params import case_insensitive_list_match
+
+
+def docker_task_reformat(params, current_stage_setup, command):
+    '''
+    Pulls docker information from the json file and wraps the command appropriately.
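+
+    As an illustration only (not a complete schema; the name, image, and paths below are
+    invented), the attributes read by this function correspond to a params block along
+    these lines, with a stage opting in via "use_docker":
+
+        "docker": {
+            "my_container": {
+                "engine": "docker",
+                "image": "my_image:latest",
+                "image_file_path": "my_image:latest",
+                "pull": true,
+                "mount_points": ["/host/data:/container/data"],
+                "intercept_mounts": false
+            }
+        }
+
+    When "pull" is false, "image_file_path" should instead point to a saved image tarball
+    that can be loaded with "docker load".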
+    '''
+    stage_params = getattr(params, current_stage_setup)
+    if hasattr(stage_params, 'use_docker'):
+        docker_config = getattr(params.docker, stage_params.use_docker)
+        mount_points = []
+        for m in docker_config.mount_points:
+            mount_points.append('{prefix} {mount_dir}'.format(
+                prefix=_get_docker_mount_prefix(docker_config), mount_dir=m))
+            if getattr(docker_config, 'intercept_mounts', False):
+                # Rewrite host-side paths in the command to their container-side equivalents.
+                (from_path, to_path) = m.split(':')
+                command = re.sub(r"(\s+)"+from_path, r'\1'+to_path, command)
+                command = re.sub("^"+from_path, to_path, command)
+        mount_points = " ".join(mount_points)
+        if getattr(docker_config, 'pull', True):
+            load_command = _get_docker_pull_command(docker_config)
+        else:
+            load_command = _get_docker_load_command(docker_config)
+        command = _get_docker_run_command(docker_config, mount_points, command)
+        command = load_command+" ; "+command
+    return command
+
+def _get_docker_mount_prefix(docker_config):
+    if docker_config.engine == 'singularity':
+        return '-B'
+    else:
+        return '-v'
+
+def _get_docker_load_command(docker_config):
+    if docker_config.engine == 'singularity':
+        raise NotImplementedError('must use "pull" mode with singularity. Set "pull" to true in the config.')
+    else:
+        return "cat {} | docker load".format(docker_config.image_file_path)
+
+def _get_docker_pull_command(docker_config):
+    if docker_config.engine == 'singularity':
+        return 'singularity pull docker://{}'.format(docker_config.image_file_path)
+    else:
+        return 'docker pull {}'.format(docker_config.image_file_path)
+
+def _get_docker_run_command(docker_config, mount_points, command):
+    if docker_config.engine == 'singularity':
+        return 'singularity exec {mount_points} docker://{docker_image} {command}'.format(
+            mount_points=mount_points, command=command, docker_image=docker_config.image)
+    else:
+        if hasattr(docker_config, 'uid'):
+            uid = docker_config.uid
+        else:
+            uid = subprocess.check_output('id -u', shell=True).strip()
+        return 'docker run --rm {mount_points} --user={uid} --entrypoint=/bin/bash {docker_image} -c "{command}"'.format(
+            mount_points=mount_points, command=command, docker_image=docker_config.image, uid=uid)
+
+def subworkflow_addTask(current_stage_setup, params, self, label, command=None, **kwargs):
+    '''
+    Used to monkey-patch pyflow's subworkflows, so that we can inject docker wrappers into them.
+    Very similar to the version used for the main workflow, but we have to hardcode current_stage_setup
+    and params using partial functions, to fix the context for the subworkflow.
+    '''
+    if current_stage_setup is not None and command is not None:
+        command = docker_task_reformat(params, current_stage_setup, command)
+        self.flowLog(command)
+    return WorkflowRunner.addTask(self, label, command=command, **kwargs)
+
+
+class ModularMain(WorkflowRunner):
+    """
+    ZIPPY, the modular pipeline execution system.
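+
+    A workflow is described by a params JSON file (compiled by make_params and then filled
+    out by hand). As a rough, illustrative sketch (the stage names and paths below are
+    invented), the parts this class relies on look like:
+
+        {
+            "stages": [
+                {"stage": "bwa", "identifier": "bwa_1"},
+                {"stage": "macs", "identifier": "macs_1", "previous_stage": "bwa_1"}
+            ],
+            "imports": ["/path/to/custom_stages.py"],
+            "scratch_path": "/path/to/scratch"
+        }
+
+    Each stage name is matched case-insensitively against a <stage>Runner class; "imports"
+    lists python files whose *Runner classes should be made available to that lookup, and
+    "scratch_path" is handed to pyflow as its data directory root.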
+ """ + def __init__(self, input_params, from_dict=False): + self.modules_loaded = 0 + if from_dict: #initiate from api + self.params = input_params + else: #initiate from command line + if not os.path.exists(input_params.defaults): + print("Warning: Defaults file {} is not found.".format(input_params.defaults)) + self.params = get_params(input_params.workflow) + else: + self.params = get_params(input_params.workflow, input_params.defaults) + self.stage_dict = {} # map from identifier to stage + for x in getattr(self.params, 'imports', []): + self.load_external_modules(x) + + def addWorkflowTask(self, label, workflowRunnerInstance, dependencies=None): + ''' + We inject a docker-aware addTask function into subworkflows. + ''' + workflowRunnerInstance.addTask = types.MethodType(partial(subworkflow_addTask, self.current_stage_setup, self.params), workflowRunnerInstance) + return WorkflowRunner.addWorkflowTask(self, label, workflowRunnerInstance, dependencies=dependencies) + + + def addTask(self, label, command=None, **kwargs): + ''' + We wrap pyflow's addTask function to inject docker commands when applicable. + self.current_stage_setup is set in run_stage, and is switched as each stage adds its tasks to the DAG. + ''' + if self.current_stage_setup is not None and command is not None: + command = docker_task_reformat(self.params, self.current_stage_setup, command) + self.flowLog(command) + return WorkflowRunner.addTask(self, label, command=command, **kwargs) + + def run_stage(self, stage, previous_stage): + ''' + So this is some crazy stuff. What this does is allow us to name classes as strings + and then instantiate them as workflow tasks. Each workflow task is self contained, and takes as + input its previous stage, which knows where its outputs will be per-sample. Thus, so long + as the input/output formats line up, stages can be arbitrarily chained. + @return the newly instantiated stage + TODO: use __import__ to allow the importation of arbitrary dependencies. + ''' + print(stage) + class_name = case_insensitive_list_match('{}Runner'.format(stage.stage), globals().keys()) + print (class_name, stage, previous_stage) + try: + stage_out = globals()[class_name](stage.identifier, self.params, previous_stage) + except KeyError: + raise KeyError("Your workflow stage {} is not found. If it is defined in a custom file, make sure that file is imported in your params file. If it is built-in, make sure you are spelling it correctly!".format(stage.stage)) + self.current_stage_setup = stage.identifier + stage_out.setup_workflow(self) + self.stage_dict[stage.identifier] = stage_out + self.current_stage_setup = None + return stage_out + + def load_external_modules(self, module_path): + modules_to_add = {} + m = imp.load_source('user_module_import_{}'.format(self.modules_loaded), module_path) + self.modules_loaded += 1 + for k in vars(m).keys(): + if 'Runner' in k: + modules_to_add[k] = vars(m)[k] + globals().update(modules_to_add) + + + + def workflow(self): + ''' + For each stage, we add its workflow, passing along its explicit dependencies, if it has any. If it does not, + we assume it depends on the previous stage run. 
+        '''
+        previous_stage = None
+        for stage in self.params.stages:
+            print("Adding stage {}".format(stage.identifier))
+            if hasattr(stage, 'previous_stage'):
+                if isinstance(stage.previous_stage, list):
+                    previous_stages = [self.stage_dict[x] for x in stage.previous_stage]
+                    previous_stage = self.run_stage(stage, previous_stages)
+                elif isinstance(stage.previous_stage, str):
+                    previous_stage = self.run_stage(stage, [self.stage_dict[stage.previous_stage]])
+                elif isinstance(stage.previous_stage, unicode):
+                    previous_stage = self.run_stage(stage, [self.stage_dict[str(stage.previous_stage)]])
+                else:
+                    raise TypeError('Previous stage for {} is neither a list nor a string'.format(stage.identifier))
+            else:
+                previous_stage = self.run_stage(stage, [previous_stage])
+
+    def run_zippy(self, mode='sge', mail_to=None):
+        """Runs the workflow. Returns 0 upon successful completion, 1 otherwise."""
+        # pyflow.WorkflowRunner's run function already returns 0/1 for success/failure.
+        return self.run(mode=mode, dataDirRoot=self.params.scratch_path, retryMax=0, mailTo=mail_to)
+
+def build_zippy(params_dictionary):
+    """
+    Experimental! Call ZIPPY from python, specifying a params dictionary instead of going through
+    make_params. Returns a ZIPPY object, which can then be run with its run_zippy() method.
+    """
+    return ModularMain(params_dictionary, from_dict=True)
+
+def load_params(fname, defaults_fname=None):
+    """
+    Loads a valid zippy parameters file or template file from disk. Represents the file as a
+    native python object (cf. the python json module).
+    """
+    if defaults_fname is None:
+        return get_params(fname)
+    return get_params(fname, defaults_fname)
+
+def save_params(params, fname):
+    """
+    Writes a python object (structured as json) to a file. Used to write files which can then be
+    run using 'python zippy.py your_file.json'.
+    """
+    save_params_as_json(copy.deepcopy(params), fname)
+
+if __name__ == '__main__':
+    import argparse
+    parser = argparse.ArgumentParser(usage='ZIPPY will run your workflow compiled by make_params and filled out by YOU! Run "python zippy.py --help" for more.', formatter_class=argparse.ArgumentDefaultsHelpFormatter, add_help=False)
+    parser.add_argument('workflow', help='The ZIPPY workflow json file.', nargs='?')
+    parser.add_argument('--defaults', default='defaults.json', help='File with default parameter values (in json format)')
+    parser.add_argument('--email', default=None, help='If specified, you will be emailed here upon job completion')
+    parser.add_argument('--run_local', dest='run_local', action='store_true', help='Setting this flag will run things on the local node, rather than attempt to use SGE.')
+    parser.add_argument('--help', dest='help', action='store_true', help='Our help system!')
+    parser.set_defaults(help=False)
+    parser.set_defaults(run_local=False)
+    input_params = parser.parse_args()
+    if input_params.help or input_params.workflow is None:
+        # Show ZIPPY's built-in help (and the argparse usage) instead of running a workflow.
+        run_help(input_params.workflow)
+        print(parser.format_help())
+    else:
+        wflow = ModularMain(input_params)
+        if input_params.run_local:
+            mode = 'local'
+        else:
+            mode = 'sge'
+        retval = wflow.run(mode=mode, dataDirRoot=wflow.params.scratch_path, retryMax=0, mailTo=input_params.email)
+        sys.exit(retval)
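+
+# Illustrative sketch of the experimental python API defined above (the file names here are
+# hypothetical, and this path is separate from the command-line entry point):
+#
+#     from zippy.zippy import load_params, build_zippy, save_params
+#
+#     params = load_params('my_workflow.json', 'defaults.json')
+#     save_params(params, 'my_workflow.archived.json')
+#     wflow = build_zippy(params)
+#     wflow.run_zippy(mode='local')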