-
Notifications
You must be signed in to change notification settings - Fork 7
/
bin3C.py
executable file
·216 lines (181 loc) · 9.5 KB
/
bin3C.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
#!/usr/bin/env python
from mzd.cluster import *
from mzd.contact_map import *
from mzd.exceptions import ApplicationException
from mzd.io_utils import load_object, save_object
from mzd.utils import *
import logging
import sys
__version__ = '0.1.1'
if __name__ == '__main__':
import argparse
def mk_version():
return 'bin3C v{}'.format(__version__)
def out_name(base, suffix):
return '{}{}'.format(base, suffix)
def ifelse(arg, default):
if arg is None:
return default
else:
return arg
runtime_defaults = {
'min_reflen': 1000,
'min_signal': 5,
'max_image': 4000,
'min_extent': 50000,
'min_mapq': 60,
'strong': 10
}
global_parser = argparse.ArgumentParser(add_help=False)
global_parser.add_argument('-V', '--version', default=False, action='store_true', help='Show the application version')
global_parser.add_argument('-v', '--verbose', default=False, action='store_true', help='Verbose output')
global_parser.add_argument('--clobber', default=False, action='store_true', help='Clobber existing files')
global_parser.add_argument('--log', help='Log file path [OUTDIR/bin3C.log]')
global_parser.add_argument('--max-image', type=int, help='Maximum image size for plots [4000]')
global_parser.add_argument('--min-extent', type=int,
help='Minimum cluster extent used in output [50000]')
global_parser.add_argument('--min-reflen', type=int,
help='Minimum acceptable reference length [1000]')
global_parser.add_argument('--min-signal', type=int, help='Minimum acceptable signal [5]')
parser = argparse.ArgumentParser(description='bin3C: a Hi-C based metagenome deconvolution tool')
subparsers = parser.add_subparsers(title='commands', dest='command', description='Valid commands',
help='choose an analysis stage for further options')
cmd_mkmap = subparsers.add_parser('mkmap', parents=[global_parser],
description='Create a new contact map from assembly sequences and Hi-C bam file.')
cmd_cluster = subparsers.add_parser('cluster', parents=[global_parser],
description='Cluster an existing contact map into genome bins.')
"""
make and save the contact map object
"""
cmd_mkmap.add_argument('--eta', default=False, action='store_true',
help='Pre-count bam alignments to provide an ETA')
cmd_mkmap.add_argument('--bin-size', type=int,
help='Size of bins for windows extent maps [disabled]')
cmd_mkmap.add_argument('--min-insert', type=int,
help='Minimum pair separation [None]')
cmd_mkmap.add_argument('--min-mapq', type=int,
help='Minimum acceptable mapping quality [60]')
cmd_mkmap.add_argument('--strong', type=int,
help='Accepted alignments must being N matches [10]')
cmd_mkmap.add_argument('-e', '--enzyme', metavar='NEB_NAME', required=True, action='append',
help='Case-sensitive NEB enzyme name. Use multiple times for multiple enzymes')
cmd_mkmap.add_argument('FASTA', help='Reference fasta sequence')
cmd_mkmap.add_argument('BAM', help='Input bam file in query order')
cmd_mkmap.add_argument('OUTDIR', help='Output directory')
"""
cluster the map and save results
"""
cmd_cluster.add_argument('-s', '--seed', default=None, help='Random seed')
cmd_cluster.add_argument('--no-report', default=False, action='store_true',
help='Do not generate a cluster report')
cmd_cluster.add_argument('--no-spades', default=False, action='store_true',
help='Assembly was not done using SPAdes')
cmd_cluster.add_argument('--no-plot', default=False, action='store_true',
help='Do not generate a clustered heatmap')
cmd_cluster.add_argument('--no-fasta', default=False, action='store_true',
help='Do not generate cluster FASTA files')
cmd_cluster.add_argument('--only-large', default=False, action='store_true',
help='Only write FASTA for clusters longer than min_extent')
# cmd_cluster.add_argument('--algo', default='infomap', choices=['infomap', 'louvain', 'mcl', 'slm', 'simap'],
# help='Clustering algorithm to apply [infomap]')
cmd_cluster.add_argument('--fasta', default=None,
help='Alternative source FASTA location from that supplied during mkmap')
cmd_cluster.add_argument('MAP', help='Contact map')
cmd_cluster.add_argument('OUTDIR', help='Output directory')
args = parser.parse_args()
if args.version:
print mk_version()
sys.exit(0)
try:
make_dir(args.OUTDIR, args.clobber)
except IOError as e:
print 'Error: {}'.format(e.message)
sys.exit(1)
logging.captureWarnings(True)
logger = logging.getLogger('main')
# root log listens to everything
root = logging.getLogger('')
root.setLevel(logging.DEBUG)
# log message format
formatter = logging.Formatter(fmt='%(levelname)-8s | %(asctime)s | %(name)7s | %(message)s')
# Runtime console listens to INFO by default
ch = logging.StreamHandler()
if args.verbose:
ch.setLevel(logging.DEBUG)
else:
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
root.addHandler(ch)
# File log listens to all levels from root
if args.log is not None:
log_path = args.log
else:
log_path = os.path.join(args.OUTDIR, 'bin3C.log')
fh = logging.FileHandler(log_path, mode='a')
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
root.addHandler(fh)
# Add some environmental details
logger.debug(mk_version())
logger.debug(sys.version.replace('\n', ' '))
logger.debug('Command line: {}'.format(' '.join(sys.argv)))
try:
if args.command == 'mkmap':
# Create a contact map for analysis
cm = ContactMap(args.BAM,
args.enzyme,
args.FASTA,
args.min_insert,
ifelse(args.min_mapq, runtime_defaults['min_mapq']),
min_len=ifelse(args.min_reflen, runtime_defaults['min_reflen']),
min_sig=ifelse(args.min_signal, runtime_defaults['min_signal']),
min_extent=ifelse(args.min_extent, runtime_defaults['min_extent']),
strong=ifelse(args.strong, runtime_defaults['strong']),
bin_size=args.bin_size,
precount=args.eta)
if cm.is_empty():
logger.info('Stopping as the map is empty')
sys.exit(1)
logger.info('Saving contact map instance')
save_object(os.path.join(args.OUTDIR, 'contact_map.p'), cm)
elif args.command == 'cluster':
if not args.seed:
args.seed = make_random_seed()
logger.info('Generated random seed: {}'.format(args.seed))
else:
logger.info("User set random seed: {}".format(args.seed))
# Load a pre-existing serialized contact map
logger.info('Loading existing contact map from: {}'.format(args.MAP))
cm = load_object(args.MAP)
cm.min_extent = ifelse(args.min_extent, runtime_defaults['min_extent'])
# update the mask if the user has specified a threshold, which might be the same as before
if args.min_signal is not None or args.min_reflen is not None:
# pedantically set these and pass to method just in-case of logic oversight
min_reflen = ifelse(args.min_reflen, runtime_defaults['min_reflen'])
min_signal = ifelse(args.min_signal, runtime_defaults['min_signal'])
cm.min_len = min_reflen
cm.min_sig = min_signal
cm.set_primary_acceptance_mask(min_sig=min_signal, min_len=min_reflen, update=True)
# cluster the entire map
clustering = cluster_map(cm, method='infomap', seed=args.seed, work_dir=args.OUTDIR)
# generate report per cluster
cluster_report(cm, clustering, is_spades=not args.no_spades)
# write MCL clustering file
write_mcl(cm, os.path.join(args.OUTDIR, 'clustering.mcl'), clustering)
# serialize full clustering object
save_object(os.path.join(args.OUTDIR, 'clustering.p'), clustering)
if not args.no_report:
# write a tabular report
write_report(os.path.join(args.OUTDIR, 'cluster_report.csv'), clustering)
if not args.no_fasta:
# write per-cluster fasta files, also separate ordered fasta if an ordering exists
write_fasta(cm, args.OUTDIR, clustering, source_fasta=args.fasta, clobber=True,
only_large=args.only_large)
if not args.no_plot:
# the entire clustering
plot_clusters(cm, os.path.join(args.OUTDIR, 'cluster_plot.png'), clustering,
max_image_size=args.max_image, ordered_only=False, simple=False, permute=True)
except ApplicationException as ex:
import sys
logger.error(ex.message)
sys.exit(1)