-
Notifications
You must be signed in to change notification settings - Fork 5
/
fammer.py
executable file
·144 lines (129 loc) · 6.15 KB
/
fammer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env python2.7
"""A toolkit for protein superfamily sequence profile alignment and curation.
Manages a hierarchical directory tree containing sequence sets (FASTA files)
corresponding to each subfamily of a protein superfamily.
"""
from __future__ import absolute_import
import logging
import argparse
from fammerlib.build import cmd_build
from fammerlib.scan import cmd_scan
from fammerlib.add import cmd_add
from fammerlib.refine import cmd_refine
from fammerlib.cluster import cmd_cluster
from fammerlib.update_fasta import cmd_update_fasta
if __name__ == '__main__':
AP = argparse.ArgumentParser(
description=__doc__,
epilog="Contact Eric <[email protected]> for help.")
# Global options
AP.add_argument('-q', '--quiet',
action='store_true',
help="Don't print status messages, only warnings and errors.")
AP_subparsers = AP.add_subparsers(
help='sub-commands (use with -h for more info)')
# Sub-command: build
P_build = AP_subparsers.add_parser('build',
help='Build profiles from a given directory tree.')
P_build.add_argument('basedir',
help='Top of the directory tree where the sequences sets are.')
P_build.add_argument('--align-script', dest='align_script', metavar='FILE',
help='Script to align a FASTA file, writing Clustal format.')
P_build.add_argument('--clean',
action='store_true',
help='Clean auxilliary files after building.')
P_build.add_argument('--no-pdb',
dest='pdb', action='store_false', default=True,
help="Don't use PDB structures when constructing alignments.")
P_build.add_argument('--hmmer',
action='store_true',
help='Build profiles for HMMer.')
P_build.add_argument('--mapgaps',
action='store_true',
help='Build profiles for MAPGAPS.')
P_build.add_argument('--tree',
action='store_true',
help='Write a Newick tree indicating the directory structure.')
P_build.set_defaults(func=cmd_build)
# Sub-command: scan
P_scan = AP_subparsers.add_parser('scan',
help='Scan and classify input sequences using a set of profiles.')
P_scan.add_argument('profile_db',
help='HMM profile name (or basename).')
P_scan.add_argument('target',
help='Sequences to scan, in unaligned FASTA format.')
P_scan.add_argument('-E', '--evalue',
default='0.0001',
help='E-value cutoff for significant search hits.')
P_scan.add_argument('--table',
action='store_true',
help='Print a table of sequence-to-profile assignments.')
P_scan.add_argument('-o', '--seqout',
help='Write a FASTA file of all sequences matching any profile.')
P_scan.add_argument('-O', '--seqsets',
action='store_true',
help='Write FASTA files of the sequences matching each profile.')
P_scan.add_argument('-A', '--align',
action='store_true',
help='Make a FASTA alignment of sequences matching each profile.')
P_scan.add_argument('-n', '--include',
help='Only match profiles whose name contains this string(s).')
P_scan.add_argument('-x', '--exclude',
help='Exclude matches to profiles whose name contains this string.')
P_scan.set_defaults(func=cmd_scan)
# Sub-command: add
P_add = AP_subparsers.add_parser('add',
help="""Scan a target database with the given HMM profile set. Add
hits that meet acceptance thresholds to the profile FASTA files.""")
P_add.add_argument('basedir',
help="""Top of the directory tree where the sequences sets are.
There must also be an HMM profile database at <basedir>_all.hmm.""")
P_add.add_argument('sequence_db',
help='Sequence database to scan for new additions to the profiles.')
P_add.add_argument('-E', '--evalue',
default='1e-20',
help='E-value cutoff for significant search hits.')
P_add.add_argument('-L', '--min-length',
default=0.8, type=float,
help="""Minimum length, as a proportion of the profile length, of
hits to accept into the profiles.""")
# ENH: -u, --unclassified : exclude? put parent-level hits here?
P_add.set_defaults(func=cmd_add)
# Sub-command: refine
P_refine = AP_subparsers.add_parser('refine',
help='Leave-one-out validation of HMM profiles.')
P_refine.add_argument('basedir',
help='Top of the directory tree where the sequences sets are.')
P_refine.add_argument('-n', '--dry-run',
action='store_true',
help='Do a "dry run"; don\'t modify the original sequence sets.')
P_refine.add_argument('-u', '--unclassified',
help="""The set of unclassified/unassigned sequences (FASTA file).
By default, this is <basedir>/<basedir>-Unique.fasta.""")
P_refine.add_argument('-x', '--exclude',
help="Exclude this profile from evaluation.")
P_refine.set_defaults(func=cmd_refine)
# Sub-command: update-fasta
P_update = AP_subparsers.add_parser('update-fasta',
help="""Replace original FASTA sequence sets with the ungapped
sequences from the corresponding alignment (.aln) files, sorted by
decreasing length.""")
P_update.add_argument('basedir',
help='Top of the directory tree where the sequences sets are.')
P_update.set_defaults(func=cmd_update_fasta)
# Sub-command: cluster
P_cluster = AP_subparsers.add_parser('cluster',
help="""Split a sequence set into smaller clusters.""")
P_cluster.add_argument('sequences',
help='Clustal alignment (.aln) or unaligned FASTA (.fasta).')
P_cluster.set_defaults(func=cmd_cluster)
args = AP.parse_args()
# Handle global options here
if args.quiet:
logging.basicConfig(level=logging.WARNING,
format="%(module)s: %(message)s")
else:
logging.basicConfig(level=logging.INFO,
format="%(module)s [@%(lineno)s]: %(message)s")
# Pass the rest along to the sub-command implementation
args.func(args)