diff --git a/TEsorter/app.py b/TEsorter/app.py index b9da106..a2e90e8 100755 --- a/TEsorter/app.py +++ b/TEsorter/app.py @@ -92,6 +92,10 @@ def Args(): parser.add_argument("-prob", "--min-probability", action="store", default=0.5, type=float, help="mininum posterior probability for protein domains in HMMScan output [default=%(default)s]") + parser.add_argument("-score", "--min-score", action="store", + default=0.1, type=float, + help="mininum score for protein domains in HMMScan output [default=%(default)s]") + parser.add_argument("-mask", action="store", type=str, nargs='+', default=None, choices=['soft', 'hard'], help="output masked sequences (soft: masking with lowercase; hard: masking with N) [default=%(default)s]") @@ -215,6 +219,18 @@ def pipeline(args): check_db(db_file) gap = 'X' if args.seq_type == 'prot' else 'N' + kargs = dict(hmmdb = db_file, + db_name = db_name, + prefix = args.prefix, + force_write_hmmscan = args.force_write_hmmscan, + processors = args.processors, + tmpdir = args.tmp_dir, + mincov = args.min_coverage, + maxeval = args.max_evalue, + minprob = args.min_probability, + minscore = args.min_score + ) + if args.genome: logger.info( 'Start identifying pipeline (GENOME mode)' ) #print(open(args.sequence)) @@ -222,16 +238,8 @@ def pipeline(args): seq_type = 'nucl' gff, geneSeq = genomeAnn(genome=args.sequence, window_size=args.win_size, window_ovl=args.win_ovl, - hmmdb = db_file, - db_name = db_name, seqtype = seq_type, - prefix = args.prefix, - force_write_hmmscan = args.force_write_hmmscan, - processors = args.processors, - tmpdir = args.tmp_dir, - mincov = args.min_coverage, - maxeval = args.max_evalue, - minprob = args.min_probability, + **kargs ) mask_gff3(args.sequence, gff, args.prefix, types=args.mask, gap=gap) cleanup(args) @@ -248,16 +256,8 @@ def pipeline(args): seq_type = 'prot' if db_name == 'sine' else args.seq_type gff, geneSeq = LTRlibAnn( ltrlib = args.sequence, - hmmdb = db_file, - db_name = db_name, seqtype = seq_type, - prefix = args.prefix, - force_write_hmmscan = args.force_write_hmmscan, - processors = args.processors, - tmpdir = args.tmp_dir, - mincov = args.min_coverage, - maxeval = args.max_evalue, - minprob = args.min_probability, + **kargs ) mask_gff3(args.sequence, gff, args.prefix, types=args.mask, gap=gap) @@ -936,7 +936,7 @@ def format_gff_id(id): return re.compile(r'[;=\|]').sub("_", id) def hmm2best(inSeqs, inHmmouts, nucl_len=None, prefix=None, db='rexdb', seqtype='nucl', - mincov=20, maxeval=1e-3, minprob=0.6, genome=False): + mincov=20, maxeval=1e-3, minprob=0.6, minscore=0.5, genome=False): if prefix is None: prefix = inSeqs[0] if nucl_len is None and seqtype=='nucl': @@ -950,7 +950,7 @@ def hmm2best(inSeqs, inHmmouts, nucl_len=None, prefix=None, db='rexdb', seqtype= qid, s, e, domain = key else: qid, domain = key - if rc.hmmcov < mincov or rc.evalue > maxeval or rc.acc < minprob: + if rc.hmmcov < mincov or rc.evalue > maxeval or rc.acc < minprob or rc.score < minscore: continue rawid = qid gene,clade = parse_hmmname(rc.tname, db=db) @@ -1150,7 +1150,7 @@ def genomeAnn(genome, tmpdir='./tmp', seqfmt='fasta',window_size=1e6, window_ovl def LTRlibAnn(ltrlib, hmmdb='rexdb', db_name='rexdb', seqtype='nucl', prefix=None, force_write_hmmscan=False, genome=False, processors=4, tmpdir='./tmp', - mincov=20, maxeval=1e-3, minprob=0.5): + mincov=20, maxeval=1e-3, minprob=0.5, minscore=0.5): if prefix is None: prefix = '{}.{}'.format(ltrlib, hmmdb) bin = 'hmmscan' @@ -1166,7 +1166,7 @@ def LTRlibAnn(ltrlib, hmmdb='rexdb', db_name='rexdb', seqtype='nucl', prefix=No processors=processors, bin=bin, seqtype=seqtype, force_write_hmmscan=force_write_hmmscan) logger.info( 'generating gene anntations' ) gff, geneSeq = hmm2best(chunk_files, [domtbl], db=db_name, nucl_len=d_nucl_len, genome=genome, - prefix=prefix, seqtype=seqtype, mincov=mincov, maxeval=maxeval, minprob=minprob) + prefix=prefix, seqtype=seqtype, mincov=mincov, maxeval=maxeval, minprob=minprob, minscore=minscore) return gff, geneSeq def replaceCls(ltrlib, seqtype='nucl', db='rexdb'):