From 1ba402aa55ae83e8d997659c2511437682d3598c Mon Sep 17 00:00:00 2001 From: Jon Palmer Date: Thu, 15 Sep 2016 19:38:30 -0500 Subject: [PATCH] update to v0.3.8 --- bin/funannotate-predict.py | 74 +++++++++++++++++++------------------- funannotate.py | 4 +-- lib/library.py | 8 +++-- 3 files changed, 45 insertions(+), 41 deletions(-) diff --git a/bin/funannotate-predict.py b/bin/funannotate-predict.py index d6b48abe..5abf3bb9 100755 --- a/bin/funannotate-predict.py +++ b/bin/funannotate-predict.py @@ -1,23 +1,23 @@ #!/usr/bin/env python -import sys, os, subprocess, inspect, multiprocessing, shutil, argparse, time, re, platform +import sys, os, subprocess, inspect, shutil, argparse, re from Bio import SeqIO currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) parentdir = os.path.dirname(currentdir) -sys.path.insert(0,parentdir) +sys.path.insert(0, parentdir) import lib.library as lib #setup menu with argparse class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): - def __init__(self,prog): - super(MyFormatter,self).__init__(prog,max_help_position=48) -parser=argparse.ArgumentParser(prog='funannotate-predict.py', usage="%(prog)s [options] -i genome.fasta", - description='''Script that does it all...''', - epilog="""Written by Jon Palmer (2016) nextgenusfs@gmail.com""", + def __init__(self, prog): + super(MyFormatter, self).__init__(prog, max_help_position=48) +parser = argparse.ArgumentParser(prog='funannotate-predict.py', usage="%(prog)s [options] -i genome.fasta", + description = '''Script that does it all.''', + epilog = """Written by Jon Palmer (2016) nextgenusfs@gmail.com""", formatter_class = MyFormatter) -parser.add_argument('-i','--input', required=True, help='Genome in FASTA format') -parser.add_argument('-o','--out', required=True, help='Basename of output files') -parser.add_argument('-s','--species', required=True, help='Species name (e.g. "Aspergillus fumigatus") use quotes if there is a space') +parser.add_argument('-i', '--input', required=True, help='Genome in FASTA format') +parser.add_argument('-o', '--out', required=True, help='Basename of output files') +parser.add_argument('-s', '--species', required=True, help='Species name (e.g. "Aspergillus fumigatus") use quotes if there is a space') parser.add_argument('--isolate', help='Isolate/strain name (e.g. Af293)') parser.add_argument('--header_length', default=16, type=int, help='Max length for fasta headers') parser.add_argument('--name', default="FUN_", help='Shortname for genes, perhaps assigned by NCBI, eg. VC83') @@ -52,7 +52,7 @@ def __init__(self,prog): conflict = ['busco', 'busco_proteins', 'RepeatMasker', 'RepeatModeler', 'genemark', 'EVM_tmp', 'braker'] if args.out in conflict: lib.log.error("%s output folder conflicts with a hard coded tmp folder, please change -o parameter" % args.out) - os._exit(1) + sys.exit(1) #create folder structure if not os.path.exists(args.out): @@ -87,19 +87,18 @@ def __init__(self,prog): blastdb = os.path.join(parentdir,'DB','REPEATS.psq') if not os.path.isfile(blastdb): lib.log.error("funannotate database is not properly configured, please run `./setup.sh` in the %s directory" % parentdir) - os._exit(1) + sys.exit(1) #check buscos, download if necessary if not os.path.isdir(os.path.join(parentdir, 'DB', args.busco_db)): lib.download_buscos(args.busco_db) - #do some checks and balances try: EVM = os.environ["EVM_HOME"] except KeyError: if not args.EVM_HOME: lib.log.error("$EVM_HOME environmental variable not found, Evidence Modeler is not properly configured. You can use the --EVM_HOME argument to specifiy a path at runtime") - os._exit(1) + sys.exit(1) else: EVM = args.EVM_HOME @@ -108,7 +107,7 @@ def __init__(self,prog): except KeyError: if not args.AUGUSTUS_CONFIG_PATH: lib.log.error("$AUGUSTUS_CONFIG_PATH environmental variable not found, Augustus is not properly configured. You can use the --AUGUSTUS_CONFIG_PATH argument to specify a path at runtime.") - os._exit(1) + sys.exit(1) else: AUGUSTUS = args.AUGUSTUS_CONFIG_PATH @@ -119,7 +118,7 @@ def __init__(self,prog): if not lib.which('gmes_petap.pl'): if not args.GENEMARK_PATH: lib.log.error("GeneMark not found and $GENEMARK_PATH environmental variable missing, BRAKER1 is not properly configured. You can use the --GENEMARK_PATH argument to specify a path at runtime.") - os._exit(1) + sys.exit(1) else: GENEMARK_PATH = args.GENEMARK_PATH @@ -130,7 +129,7 @@ def __init__(self,prog): if not lib.which('bamtools'): if not args.BAMTOOLS_PATH: lib.log.error("Bamtools not found and $BAMTOOLS_PATH environmental variable missing, BRAKER1 is not properly configured. You can use the --BAMTOOLS_PATH argument to specify a path at runtime.") - os._exit(1) + sys.exit(1) else: BAMTOOLS_PATH = args.BAMTOOLS_PATH @@ -141,7 +140,7 @@ def __init__(self,prog): AutoAug = os.path.join(AUGUSTUS_BASE, 'scripts', 'autoAug.pl') GeneMark2GFF = os.path.join(parentdir, 'util', 'genemark_gtf2gff3.pl') -programs = ['tblastn', 'exonerate', 'makeblastdb','dustmasker','gag.py','tbl2asn','gmes_petap.pl', 'BuildDatabase', 'RepeatModeler', 'RepeatMasker', GeneMark2GFF, AutoAug, 'bedtools', 'gmap', 'gmap_build', 'blat', 'pslCDnaFilter', 'augustus', 'etraining', 'rmOutToGFF3.pl'] +programs = ['tblastn', 'exonerate', 'makeblastdb', 'dustmasker', 'gag.py', 'tbl2asn', 'gmes_petap.pl', 'BuildDatabase', 'RepeatModeler', 'RepeatMasker', GeneMark2GFF, AutoAug, 'bedtools', 'gmap', 'gmap_build', 'blat', 'pslCDnaFilter', 'augustus', 'etraining', 'rmOutToGFF3.pl'] lib.CheckDependencies(programs) #check augustus species now, so that you don't get through script and then find out it is already in DB @@ -155,10 +154,11 @@ def __init__(self,prog): #check augustus functionality augustuscheck = lib.checkAugustusFunc(AUGUSTUS_BASE) +system_os = lib.systemOS() if args.rna_bam: if augustuscheck[1] == 0: lib.log.error("ERROR: %s is not installed properly for BRAKER1 (check bam2hints compilation)" % augustuscheck[0]) - os._exit(1) + sys.exit(1) if not augspeciescheck: #means training needs to be done if augustuscheck[2] == 0: if 'MacOSX' in system_os: @@ -170,7 +170,7 @@ def __init__(self,prog): else: lib.log.error("ERROR: %s is not installed properly and this version not work with BUSCO, this is a problem with Augustus compliatation, you may need to compile manually on %s." % (augustuscheck[0], system_os)) if not args.pasa_gff: #first training will use pasa, otherwise BUSCO - os._exit(1) + sys.exit(1) else: lib.log.info("Will proceed with PASA models to train Augustus") @@ -204,7 +204,7 @@ def __init__(self,prog): header_test = lib.checkFastaHeaders(args.input, args.header_length) if not header_test: lib.log.error("Fasta headers on your input have more characters than the max (16), reformat headers to continue.") - os._exit(1) + sys.exit(1) #setup augustus parallel command AUGUSTUS_PARALELL = os.path.join(parentdir, 'bin', 'augustus_parallel.py') @@ -248,7 +248,7 @@ def __init__(self,prog): #check for masked genome here if not os.path.isfile(MaskGenome) or lib.getSize(MaskGenome) < 10: lib.log.error("RepeatMasking failed, check log files.") - os._exit(1) + sys.exit(1) #load contig names and sizes into dictionary. ContigSizes = {} @@ -258,7 +258,7 @@ def __init__(self,prog): ContigSizes[rec.id] = len(rec.seq) else: lib.log.error("Error, duplicate contig names, exiting") - os._exit(1) + sys.exit(1) #check for previous files and setup output files Predictions = os.path.join(args.out, 'predict_misc', 'gene_predictions.gff3') @@ -293,7 +293,7 @@ def __init__(self,prog): genesources.append(source) if not genesources: lib.log.error("Maker2 GFF not parsed correctly, no gene models found, exiting.") - os._exit(1) + sys.exit(1) for i in genesources: if i == 'maker': output.write("ABINITIO_PREDICTION\t%s\t1\n" % i) @@ -373,7 +373,7 @@ def __init__(self,prog): #check for protein evidence/format as needed p2g_out = os.path.join(args.out, 'predict_misc', 'exonerate.out') prot_temp = os.path.join(args.out, 'predict_misc', 'proteins.combined.fa') - P2G = os.path.join(parentdir, 'bin','funannotate-p2g.py') + P2G = os.path.join(parentdir, 'bin', 'funannotate-p2g.py') if not args.exonerate_proteins: if args.protein_evidence: if os.path.isfile(prot_temp): @@ -412,7 +412,7 @@ def __init__(self,prog): subprocess.call([ExoConverter, exonerate_out], stdout = output, stderr = FNULL) except OSError: lib.log.error("$EVM_HOME variable is incorrect, please double-check: %s" % EVM) - os._exit(1) + sys.exit(1) Exonerate = os.path.abspath(Exonerate) #now run exonerate2 hints for Augustus exonerate2hints = os.path.join(AUGUSTUS_BASE, 'scripts', 'exonerate2hints.pl') @@ -448,7 +448,7 @@ def __init__(self,prog): GeneMark = os.path.join(args.out, 'predict_misc', 'genemark.evm.gff3') with open(GeneMark, 'w') as output: with open(GeneMarkTemp, 'rU') as input: - lines = input.read().replace("Augustus","GeneMark") + lines = input.read().replace("Augustus", "GeneMark") output.write(lines) if args.augustus_gff: @@ -556,7 +556,7 @@ def __init__(self,prog): GeneMark = os.path.join(args.out, 'predict_misc', 'genemark.evm.gff3') with open(GeneMark, 'w') as output: with open(GeneMarkTemp, 'rU') as input: - lines = input.read().replace("Augustus","GeneMark") + lines = input.read().replace("Augustus", "GeneMark") output.write(lines) else: #have training parameters file, so just run genemark with GeneMarkGFF3 = os.path.join(args.out, 'predict_misc', 'genemark.gff') @@ -586,7 +586,7 @@ def __init__(self,prog): subprocess.call(['perl', Converter, GeneMarkTemp], stdout = output, stderr = FNULL) with open(GeneMark, 'w') as output: with open(GeneMarkTemp2, 'rU') as input: - lines = input.read().replace("Augustus","GeneMark") + lines = input.read().replace("Augustus", "GeneMark") output.write(lines) else: @@ -602,7 +602,7 @@ def __init__(self,prog): GeneMark = os.path.join(args.out, 'predict_misc', 'genemark.evm.gff3') with open(GeneMark, 'w') as output: with open(GeneMarkTemp, 'rU') as input: - lines = input.read().replace("Augustus","GeneMark") + lines = input.read().replace("Augustus", "GeneMark") output.write(lines) if not Augustus: @@ -644,7 +644,7 @@ def __init__(self,prog): lib.log.error("BUSCO training of Augusus failed, check busco logs, exiting") #remove the augustus training config folder shutil.rmtree(os.path.join(AUGUSTUS, 'species', aug_species)) - os._exit(1) + sys.exit(1) #proper training files exist, now run EVM on busco models to get high quality predictions. lib.log.info("BUSCO predictions complete, now formatting for EVM") #move the busco folder now where it should reside @@ -735,12 +735,12 @@ def __init__(self,prog): total = lib.countGFFgenes(EVM_busco) except IOError: lib.log.error("EVM did not run correctly, output file missing") - os._exit(1) + sys.exit(1) #check number of gene models, if 0 then failed, delete output file for re-running if total < 1: lib.log.error("Evidence modeler has failed, exiting") os.remove(EVM_busco) - os._exit(1) + sys.exit(1) else: lib.log.info('{0:,}'.format(total) + ' total gene models from EVM') #move EVM folder to predict folder @@ -787,7 +787,7 @@ def __init__(self,prog): #just double-check that you've gotten here and both Augustus/GeneMark are finished if not any([Augustus, GeneMark]): lib.log.error("Augustus or GeneMark prediction is missing, check log files for errors") - os._exit(1) + sys.exit(1) #GeneMark can fail if you try to pass a single contig, check file length GM_check = lib.line_count(GeneMark) @@ -902,12 +902,12 @@ def __init__(self,prog): total = lib.countGFFgenes(EVM_out) except IOError: lib.log.error("EVM did not run correctly, output file missing") - os._exit(1) + sys.exit(1) #check number of gene models, if 0 then failed, delete output file for re-running if total < 1: lib.log.error("Evidence modeler has failed, exiting") os.remove(EVM_out) - os._exit(1) + sys.exit(1) else: lib.log.info('{0:,}'.format(total) + ' total gene models from EVM') @@ -1037,4 +1037,4 @@ def __init__(self,prog): os.rename('funannotate-EVM.log', os.path.join(args.out, 'logfiles', 'funannotate-EVM.log')) if os.path.isfile('funannotate-p2g.log'): os.rename('funannotate-p2g.log', os.path.join(args.out, 'logfiles', 'funannotate-p2g.log')) -os._exit(1) +sys.exit(1) diff --git a/funannotate.py b/funannotate.py index 80229c6f..7187245d 100755 --- a/funannotate.py +++ b/funannotate.py @@ -31,7 +31,7 @@ def fmtcols(mylist, cols): for i in range(0,num_lines)) return "\n".join(lines) -version = '0.3.7' +version = '0.3.8' default_help = """ Usage: funannotate @@ -95,6 +95,7 @@ def fmtcols(mylist, cols): Arguments: -i, --input Multi-fasta genome file. (Required) -o, --output Sorted by size and relabeled output file. (Required) -b, --base Base name to relabel contigs. Default: scaffold + --minlen Shorter contigs are discarded. Default: 0 Written by Jon Palmer (2016) nextgenusfs@gmail.com """ % (sys.argv[1], version) @@ -316,7 +317,6 @@ def fmtcols(mylist, cols): if len(arguments) > 0: cmd = os.path.join(script_path, 'setup.sh') arguments.insert(0, cmd) - print [cmd, 'dep'] if '--all' in arguments: subprocess.call(cmd, cwd = script_path) elif '--dep' in arguments: diff --git a/lib/library.py b/lib/library.py index 7a129334..f5688764 100644 --- a/lib/library.py +++ b/lib/library.py @@ -842,8 +842,8 @@ def MemoryCheck(): mem = psutil.virtual_memory() RAM = int(mem.total) return round(RAM / 1024000000) - -def SystemInfo(): + +def systemOS(): if sys.platform == 'darwin': system_os = 'MacOSX '+ platform.mac_ver()[0] elif sys.platform == 'linux': @@ -851,6 +851,10 @@ def SystemInfo(): system_os = linux_version[0]+ ' '+linux_version[1] else: system_os = sys.platform + return system_os + +def SystemInfo(): + system_os = systemOS() python_vers = str(sys.version_info[0])+'.'+str(sys.version_info[1])+'.'+str(sys.version_info[2]) log.info("OS: %s, %i cores, ~ %i GB RAM. Python: %s" % (system_os, multiprocessing.cpu_count(), MemoryCheck(), python_vers))