-
Notifications
You must be signed in to change notification settings - Fork 8
/
rename-with-partitions.py
68 lines (55 loc) · 2.07 KB
/
rename-with-partitions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#! /usr/bin/env python
import screed
import sys
import gzip
import os
import argparse
def main():
parser = argparse.ArgumentParser()
parser.add_argument('prefix')
parser.add_argument('transcripts_file')
args = parser.parse_args()
prefix = args.prefix
filename = args.transcripts_file
# first pass: count partition sizes
partition_sizes = {}
for n, record in enumerate(screed.open(filename, parse_description=0)):
if n % 10000 == 0:
print '...', n
partition = record.name.split()[-1]
partition_sizes[partition] = partition_sizes.get(partition, 0) + 1
# show top 10 biggest partitions
print '---------------'
print 'partition, size'
for n, (_, size) in enumerate(sorted(partition_sizes.items(),
key=lambda x: -x[1])):
print n, size
if n == 10:
break
print '---------------'
# now, make a sensible header for each sequence that uniquely ids it
partition_sofar = {}
seq_id = 1
new_filename = os.path.basename(filename)
if new_filename.endswith('.gz'):
new_filename = new_filename[:-3]
if new_filename.endswith('.fasta'):
new_filename = new_filename[:-6]
new_filename += '.renamed.fasta.gz'
print 'creating', new_filename
outfp = gzip.open(new_filename, 'wb')
for n, record in enumerate(screed.open(sys.argv[2], parse_description=0)):
if n % 10000 == 0:
print '...writing', n
partition = record.name.split()[-1]
sofar = partition_sofar.get(partition, 0) + 1
partition_sofar[partition] = sofar
partition_size = partition_sizes[partition]
new_name = '%s.id%d.tr%s %d_of_%d_in_tr%s len=%d id=%s tr=%s' % \
(prefix, seq_id, partition, sofar, partition_size, partition, len(record.sequence), seq_id, partition)
outfp.write('>%s\n%s\n' % (new_name, record.sequence))
seq_id += 1
print 'total sequences:', n+1
print 'total transcript families:', len(partition_sizes)
if __name__ == '__main__':
main()