-
Notifications
You must be signed in to change notification settings - Fork 0
/
fastqdumps2histo.py
executable file
·292 lines (259 loc) · 11.9 KB
/
fastqdumps2histo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
#! /usr/bin/env python
#
# fastqdumps2histo.py v1.5
# by WRF
"""
fastqdumps2histo.py v1.5 last modified 2015-08-24
first generate fastq.counts file from zipped reads with jellyfish count
gzip -dc reads.fastq.gz | jellyfish count -m 25 -o fastq.counts -C -U 1000 -s 1G /dev/fd/0
add option -t for multithreads
change -m for different kmer sizes
change -U for maximum count, 1000 is good if genome coverage is 100+
otherwise perhaps use a value around 500
using a larger kmer will lower coverage, so if k > 45, set max to 500
change -s for memory allocation (see jellyfish manual)
fastq.counts file can be several gigabytes
jellyfish dump output would normally be quite large, so it is piped
then use jellyfish dump command as stdin
jellyfish dump fastq.counts | fastqdumps2histo.py - > histo.csv
because some downstream steps require the jellyfish dump, but can exclude
entries with a value of 1, a filtered jellyfish dump can be created
as it is read in the piped input, using the -j option
jellyfish dump fastq.counts | fastqdumps2histo.py -j fastq.dumps - > histo.csv
otherwise use directly as an input file
fastqdumps2histo.py fastq.dumps > histo.csv
if using a different kmer (other than 25, the default)
change with -k to correspond to the -m of jellyfish
also change -u to correspond to the -U of jellyfish
change -u accordingly if multiple jellyfish counts are merged
histo.csv is a comma separated matrix by GC count and coverage
coverage is taken from the fasta header for the kmer, which appears as:
>123
ACTTGATCGTGATGCTAGTAGCTGT
GC counts are integer counts of all Gs plus Cs
so the max value is the kmer length
this file can be imported directly into R for generating lava lamp plots
FOR USAGE IN TRINITY MODE - count read coverage and GC
use the intermediate output files (.stats) from kmersorter.py
fastqdumps2histo.py -f *.fastq.gz -z -s *.stats -u 1000 -T - > histo.csv
-r can set the read length, otherwise it is determined automatically
-u is set as above
-f are the original fastq reads (use wildcard or list multiple)
this could also be a subset of the original reads, or gzipped files
-s are the .stats files for each read
-t to specify fasta read files (-t fasta)
-T - must set both -T for Trinity mode and - for null input
-k is unused for this step
-z used to indicate that reads are gzipped
FOR TRINITY MODE WITH LONG READS (such as Sanger ESTs)
OR FOR LIBRARIES OF DIFFERENT LENGTH READS (such as 100bp and 125bp)
use the -p option to calculate GC as percentage rather than raw count
fastqdumps2histo.py -f reads.fasta -s reads.stats -u 1000 -T -p - > histo.csv
"""
import sys
import argparse
import time
import gzip
from itertools import izip
def get_freq(line):
return int(line[1:].rstrip())
def get_gc(kmer):
return kmer.count("G")+kmer.count("C")
def get_gc_perc_int(kmer):
# N's must be substracted from total length, otherwise GC values will be too low for scaffolds
return (kmer.count("G")+kmer.count("C")) * 100 / ( len(kmer.rstrip()) - kmer.count("N") )
def get_gc_perc_float(kmer):
# N's must be substracted from total length, otherwise GC values will be too low for scaffolds
return (kmer.count("G")+kmer.count("C")) * 100.0 / ( len(kmer.rstrip()) - kmer.count("N") )
def stats_to_dict(statfile):
print >> sys.stderr, "Reading stats file %s" % statfile, time.asctime()
sd = {}
for line in open(statfile, 'r').readlines():
lsplits = line.split("\t")
# lines of stats file look like:
#12 17.6632 10.3014 58.3215 HISEQ:150:C5KTJANXX:4:1101:1469:1959/1 thread:5
# or this for SRA files:
#53 57.4143 16.8252 29.3049 SRR1032106.2000.1/H thread:5
if len(lsplits) > 5:
medCov = int(lsplits[0])
acc = lsplits[4].split(" ")[0].rsplit("/",1)[0]
else:
medCov = int(lsplits[0])
acc = lsplits[1].split(" ")[0].rsplit("/",1)[0]
# was previously using split("/")[0], caused errors with different split in fastx_line_acc
sd[acc] = medCov
statCount = len(sd)
print >> sys.stderr, "Found stats for %d sequences" % statCount, time.asctime()
return sd
def fastx_line_acc(line):
# added rsplit to work with add_cov_to_acc
return line.rstrip().split(" ")[0][1:].rsplit("/",1)[0]
def get_fastx_type(seqsFile, verbose, opentype):
with opentype(seqsFile, 'r') as lr:
headerLine = lr.readline()
headerType = headerLine[0]
seqLength = len(lr.readline().rstrip())
if verbose:
print >> sys.stderr, "Sequence header in format of: {}".format(headerLine.rstrip())
print >> sys.stderr, "Header starts with: {}".format(headerType)
return headerType, seqLength
def count_matrix(fastxfile, statfile, FH, LC, m, kmercount, maxcount, gcfunction, opentype, verbose):
statDict = stats_to_dict(statfile)
if verbose:
print >> sys.stderr, "Stats stored in format of: {}".format(statDict.iterkeys().next())
print >> sys.stderr, "Reading file %s" % (fastxfile), time.asctime()
linecount = 0
for line in opentype(fastxfile, 'r'):
linecount += 1
if linecount==1 and line[0]==FH:
### TODO linecount should be reset when a new header line is found for fasta
kmercount += 1
acc = fastx_line_acc(line)
freq = statDict.get(acc, 0)
elif linecount==2:
gc = gcfunction(line)
# if it is the last line of each sequence, so cannot be elif
if linecount==LC:
# use try except in case the wrong -u is used or files were merged
try:
m[gc][freq] += 1
#if freq == 1: # for debugging only
# print >> sys.stderr, acc
except IndexError:
# most likely reason for this is setting -k to kmer rather than read length
maxcount += 1
except UnboundLocalError:
# this happened when freq was called before assignment
print >> sys.stderr, "ERROR Check data type", time.asctime()
# possibly wrong data type, no point continuing
sys.exit()
linecount = 0
return m, kmercount, maxcount
def count_list(fastxfile, statfile, FH, LC, kmercount, gcfunction, wayout, opentype, verbose):
statDict = stats_to_dict(statfile)
if verbose:
print >> sys.stderr, "Stats stored in format of: {}".format(statDict.iterkeys().next())
print >> sys.stderr, "Reading file %s" % (fastxfile), time.asctime()
linecount = 0
for line in opentype(fastxfile, 'r'):
linecount += 1
if linecount==1 and line[0]==FH:
kmercount += 1
acc = fastx_line_acc(line)
freq = statDict.get(acc, 0)
elif linecount==2:
gc = gcfunction(line)
# if it is the last line of each sequence, so cannot be elif
if linecount==LC:
print >> wayout, acc, freq, "{:.4f}".format(gc)
linecount = 0
return kmercount
def main(argv, wayout):
if not len(argv):
argv.append("-h")
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=__doc__)
parser.add_argument('input_file', type = argparse.FileType('rU'), default='-', help="fasta format file")
parser.add_argument('-d', '--delimiter', default=",", help="use alternate delimiter [,]")
parser.add_argument('-f', '--fastx', nargs='*', help="fastx files, either fasta or fastq, can be gzipped with -z")
parser.add_argument('-j', '--jellyfish', help="optional file for the filtered jellyfish dump")
parser.add_argument('-k', '--kmer', type=int, metavar='N', default=25, help="kmer length [25]")
parser.add_argument('-l', '--lower', type=int, metavar='N', default=2, help="lower limit for optional jellyfish dump [2]")
parser.add_argument('-p', '--percentage', action="store_true", help="count GC as percentage for long reads, not by fixed read length (ignores -r)")
parser.add_argument('-L', '--list', action="store_true", help="print one line for each sequence rather than matrix, overrides -p and -T")
parser.add_argument('-r', '--read-length', type=int, metavar='N', help="read length for Trinity mode [auto-detect]")
parser.add_argument('-s', '--stats', nargs='*', help="Trinity stats files")
parser.add_argument('-t', '--type', help="sequence type, fasta, fastq or [auto-detect]")
parser.add_argument('-T', '--trinity', action="store_true", help="use Trinity method to extract reads")
parser.add_argument('-u', '--upper', type=int, metavar='N', default=1000, help="upper limit for histogram [1000]")
parser.add_argument('-z', '--gzip', action="store_true", help="reads are gzipped, .gz")
parser.add_argument('-v', '--verbose', action="store_true", help="extra output for debugging")
args = parser.parse_args(argv)
# gc and freq are set by default to 0, in case the reads cannot be parsed
gc, freq = 0, 0
kmercount = 0
maxcount = 0
kmertag = "kmers"
if args.trinity or args.list:
# checking for matching stats and fastx files
if len(args.stats) != len(args.fastx):
print >> sys.stderr, "ERROR Unequal numbers of stats and %s files" % args.type, time.asctime()
print >> sys.stderr, "Exiting", time.asctime()
sys.exit()
# use gzip open if files are specified as gzip with -z
if args.gzip:
opentype = gzip.open
else:
opentype = open
# autodetect length and header type from fastx files
FH, seqLen = get_fastx_type(args.fastx[0], args.verbose, opentype)
# checking for sequence type and getting basic parameters of sequence parsing
if (FH=="@" or args.type=="fastq"):
LC = 4
else: # otherwise args.type=="fasta"
LC = 2
# overwrite if read length is given
if args.read_length:
seqLen = args.read_length
# set up different matrix if using percentage counting
if args.percentage:
# generate a matrix of 0 to 100 x cov maximum
m = [[0 for x in range(args.upper+1)] for y in range(101)]
else:
# generate read length x cov maximum matrix, using read length as ymax
m = [[0 for x in range(args.upper+1)] for y in range(seqLen+1)]
if args.verbose:
print >> sys.stderr, "Generating matrix of %d by %d" % (len(m), len(m[0]) )
# iterate over pairs of fastx and stats files
for statfile, fastxfile in izip(args.stats, args.fastx):
if args.percentage or args.list:
if args.list:
kmercount = count_list(fastxfile, statfile, FH, LC, kmercount, get_gc_perc_float, wayout, opentype, args.verbose)
else:
m, kmercount, maxcount = count_matrix(fastxfile, statfile, FH, LC, m, kmercount, maxcount, get_gc_perc_int, opentype, args.verbose)
else:
m, kmercount, maxcount = count_matrix(fastxfile, statfile, FH, LC, m, kmercount, maxcount, get_gc, opentype, args.verbose)
if sum(x[0] for x in m)==kmercount:
print >> sys.stderr, "ERROR No stats found for reads", time.asctime()
kmertag = "reads"
else:
if args.jellyfish:
jf = open(args.jellyfish, 'w')
jfwroteback = 0
# generate kmer x maximum matrix, using kmer length as ymax
m = [[0 for x in range(args.upper+1)] for y in range(args.kmer+1)]
print >> sys.stderr, "Reading fastq dumps", time.asctime()
for line in args.input_file:
# assumes jellyfish dump is in fasta format of kmers where header is count
# for header, take the count
if line[0]==">":
kmercount += 1
freq = get_freq(line)
# if not a header, then it is sequence
else:
seq = line
# calculate gc content for output
gc = get_gc(seq)
# use try except in case the wrong -u is used or files were merged
try:
m[gc][freq] += 1
except IndexError:
# any values outside of the range of the matrix are thus ignored
maxcount += 1
if args.jellyfish and (args.upper >= freq >= args.lower):
jf.write(">{}\n{}".format(freq, seq) )
jfwroteback += 1
if args.jellyfish:
jf.close()
if not args.list: # write matrix if not in list mode
print >> sys.stderr, "Writing counts", time.asctime()
for i in m:
print >> wayout, args.delimiter.join([str(j) for j in i])
print >> sys.stderr, "Counted %d %s" % (kmercount, kmertag), time.asctime()
if maxcount:
print >> sys.stderr, "%d %s exceeded cutoffs" % (maxcount, kmertag), time.asctime()
if not args.trinity and args.jellyfish:
if jfwroteback:
# only used in kmer counting mode
print >> sys.stderr, "Wrote %d kmers to %s" % (jfwroteback, args.jellyfish), time.asctime()
if __name__ == "__main__":
main(sys.argv[1:],sys.stdout)