-
Notifications
You must be signed in to change notification settings - Fork 6
/
split_fastq.py
executable file
·57 lines (43 loc) · 1.84 KB
/
split_fastq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#!/usr/bin/env python
'''
Split an input fastq file into multiple fastq files by
* opening a cycling set of output file handles
* iterating over each entry in the input
* writing the entry to an output filehandle
* cycling to the next output filehandle.
The plus line is stripped of content.
Used for early steps in the pipeline that are embarrassingly parallel.
'''
import itertools, os, os.path, sys, argparse, itertools, shutil
from Bio import SeqIO
import util
def output_filenames(input_filename, k):
    '''
    Build the destination filenames for a split.

    input_filename : string
        name of the file being split
    k : int
        number of destination files

    returns : list of strings
        input_filename with .0, .1, ... appended (k names in total)
    '''
    names = []
    for index in range(k):
        names.append('%s.%d' % (input_filename, index))
    return names
def split_fastq_entries(fastq, fhs):
    '''
    Send entries in the input to filehandles, cycling over each filehandle.

    Entries are dealt out in turn: the first entry goes to the first
    filehandle, the second entry to the second, and so on, wrapping back
    to the first filehandle after the last one has been used.

    fastq : filename or filehandle
        input
    fhs : list or iterator of filehandles
        outputs; they are written to but not closed by this function

    returns : nothing
    '''
    fhs_cycle = itertools.cycle(fhs)
    no_handle = object()

    # Pull the next filehandle by hand instead of pairing the two iterators
    # with itertools.izip, which only exists on Python 2.
    for record in SeqIO.parse(fastq, 'fastq'):
        fh = next(fhs_cycle, no_handle)
        if fh is no_handle:
            # no output filehandles were supplied, so there is nowhere to
            # write; stop quietly rather than raise
            return
        SeqIO.write(record, fh, 'fastq')
if __name__ == '__main__':
    # Command-line entry point: split (or copy) the input fastq into
    # n_files outputs named <fastq>.0, <fastq>.1, ...
    parser = argparse.ArgumentParser(description='Split a fastq file foo.fastq into multiple fastq files foo.fastq.0, foo.fastq.1, etc.', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('fastq', help='input fastq')
    parser.add_argument('n_files', type=int, help='number of split files to output')
    args = parser.parse_args()

    # Zero or negative counts would yield no output filenames, and the
    # input would be silently dropped; reject them up front.
    if args.n_files < 1:
        parser.error('n_files must be at least 1')

    filenames = output_filenames(args.fastq, args.n_files)
    # project helper; presumably aborts if a destination already exists
    # -- TODO confirm against util
    util.check_for_collisions(filenames)

    if len(filenames) == 1:
        # just copy the file
        shutil.copy(args.fastq, filenames[0])
    else:
        # split the file entry by entry
        handles = []
        try:
            # open inside the try so that a failure part-way through still
            # closes the handles that were opened successfully
            for fn in filenames:
                handles.append(open(fn, 'w'))
            split_fastq_entries(args.fastq, handles)
        finally:
            # close explicitly so buffered records are flushed to disk
            for fh in handles:
                fh.close()