From bfea9f08816f64aafb6589002a1848acfc323cf2 Mon Sep 17 00:00:00 2001 From: Jon Palmer Date: Fri, 1 Jul 2016 10:15:34 -0500 Subject: [PATCH] updates to v0.3.2 --- README.md | 2 +- bin/funannotate-predict.py | 192 +++++++++++++++++++++++++++++++------ docs/ubuntu_install.md | 1 + funannotate.py | 3 +- lib/library.py | 37 ++++--- util/filter_buscos.py | 62 ++++++++++++ util/fix_busco_naming.py | 47 +++++++++ util/funannotate-BUSCO.py | 6 +- 8 files changed, 300 insertions(+), 50 deletions(-) create mode 100755 util/filter_buscos.py create mode 100755 util/fix_busco_naming.py diff --git a/README.md b/README.md index ff90acf6..53667835 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ To see the help menu, simply type `funannotate` in the terminal window. Similar $ funannotate Usage: funannotate -version: 0.1.4 +version: 0.3.2 Description: Funannotate is a genome prediction, annotation, and comparison pipeline. diff --git a/bin/funannotate-predict.py b/bin/funannotate-predict.py index e3f65da8..dbce3427 100755 --- a/bin/funannotate-predict.py +++ b/bin/funannotate-predict.py @@ -39,6 +39,7 @@ def __init__(self,prog): parser.add_argument('--keep_no_stops', action='store_true', help='Keep gene models without valid stop codons') parser.add_argument('--cpus', default=2, type=int, help='Number of CPUs to use') parser.add_argument('--busco_seed_species', default='anidulans', help='Augustus species to use as initial training point for BUSCO') +parser.add_argument('--optimize_augustus', action='store_true', help='Run "long" training of Augustus') parser.add_argument('--busco_db', default='fungi', choices=['fungi', 'metazoa', 'eukaryota', 'arthropoda', 'vertebrata'], help='BUSCO model database') parser.add_argument('--organism', default='fungus', choices=['fungus', 'other'], help='Fungal specific settings') parser.add_argument('--EVM_HOME', help='Path to Evidence Modeler home directory, $EVM_HOME') @@ -144,7 +145,7 @@ def __init__(self,prog): aug_species = args.augustus_species augspeciescheck = lib.CheckAugustusSpecies(aug_species) if augspeciescheck: - lib.log.error("Augustus training set for %s already exists, thus funannotate will use those parameters. If this is not what you want, exit script and provide a unique name for the --augustus_species argument" % (aug_species)) + lib.log.error("Augustus training set for %s already exists, thus funannotate will use those parameters. If you want to re-train, provide a unique name for the --augustus_species argument" % (aug_species)) #check augustus functionality augustuscheck = lib.checkAugustusFunc(AUGUSTUS_BASE) @@ -168,7 +169,7 @@ def __init__(self,prog): lib.log.info("Will proceed with PASA models to train Augustus") #if made it here output Augustus version -lib.log.info("%s detected, version compatible with BRAKER1 and BUSCO" % augustuscheck[0]) +lib.log.info("%s detected, version seems to be compatible with BRAKER1 and BUSCO" % augustuscheck[0]) if args.protein_evidence == 'uniprot.fa': args.protein_evidence = os.path.join(parentdir, 'DB', 'uniprot_sprot.fasta') @@ -206,6 +207,8 @@ def __init__(self,prog): Converter = os.path.join(EVM, 'EvmUtils', 'misc', 'augustus_GFF3_to_EVM_GFF3.pl') ExoConverter = os.path.join(EVM, 'EvmUtils', 'misc', 'exonerate_gff_to_alignment_gff3.pl') Validator = os.path.join(EVM, 'EvmUtils', 'gff3_gene_prediction_file_validator.pl') +Converter2 = os.path.join(EVM, 'EvmUtils', 'misc', 'augustus_GTF_to_EVM_GFF3.pl') +EVM2proteins = os.path.join(EVM, 'EvmUtils', 'gff3_file_to_proteins.pl') #so first thing is to reformat genome fasta only if there is no aligned evidence already sort_out = os.path.join(args.out, 'predict_misc', 'genome.fasta') @@ -440,8 +443,11 @@ def __init__(self,prog): lib.log.info("Training Augustus using PASA data, this may take awhile") GFF2GB = os.path.join(AUGUSTUS_BASE, 'scripts', 'gff2gbSmallDNA.pl') trainingset = os.path.join(args.out, 'predict_misc', 'augustus.pasa.gb') - subprocess.call([GFF2GB, args.pasa_gff, MaskGenome, '1000', trainingset], stderr = FNULL) - lib.trainAugustus(AUGUSTUS_BASE, aug_species, trainingset, MaskGenome, args.out, args.cpus) + subprocess.call([GFF2GB, args.pasa_gff, MaskGenome, '250', trainingset], stderr = FNULL) + if args.optimize_augustus: + lib.trainAugustus(AUGUSTUS_BASE, aug_species, trainingset, MaskGenome, args.out, args.cpus, True) + else: + lib.trainAugustus(AUGUSTUS_BASE, aug_species, trainingset, MaskGenome, args.out, args.cpus, False) #now run whole genome Augustus using trained parameters. lib.log.info("Running Augustus gene prediction") @@ -531,35 +537,163 @@ def __init__(self,prog): else: aug_species = args.augustus_species aug_out = os.path.join(args.out, 'predict_misc', 'augustus.gff3') + busco_log = os.path.join(args.out, 'logfiles', 'busco.log') if not lib.CheckAugustusSpecies(aug_species): - #run BUSCO and then train Augustus with those results + #run BUSCO #define BUSCO and FUNGI models BUSCO = os.path.join(parentdir, 'util', 'funannotate-BUSCO.py') BUSCO_FUNGI = os.path.join(parentdir, 'DB', args.busco_db) - lib.log.info("Running BUSCO to find conserved gene models for training Augustus") - if not os.path.isdir('busco'): - os.makedirs('busco') + runbusco = True + if os.path.isdir(os.path.join(args.out, 'predict_misc', 'busco')): + #check if complete run + if lib.checkannotations(os.path.join(args.out, 'predict_misc', 'busco', 'run_'+aug_species, 'training_set_'+aug_species)): + lib.log.info("BUSCO has already been run, using existing data") + runbusco = False + else: + shutil.rmtree(os.path.join(args.out, 'predict_misc', 'busco')) + if runbusco: + lib.log.info("Running BUSCO to find conserved gene models for training Augustus") + if not os.path.isdir('busco'): + os.makedirs('busco') + else: + shutil.rmtree('busco') #delete if it is there + os.makedirs('busco') #create fresh folder + if lib.CheckAugustusSpecies(args.busco_seed_species): + busco_seed = args.busco_seed_species + else: + busco_seed = 'generic' + with open(busco_log, 'w') as logfile: + subprocess.call([sys.executable, BUSCO, '--genome', MaskGenome, '--lineage', BUSCO_FUNGI, '-o', aug_species, '--cpu', str(args.cpus), '--species', busco_seed], cwd = 'busco', stdout = logfile, stderr = logfile) + #check if BUSCO found models for training, if not error out and exit. + busco_training = os.path.join('busco', 'run_'+aug_species, 'training_set_'+aug_species) + if not lib.checkannotations(busco_training): + lib.log.error("BUSCO training of Augusus failed, check busco logs, exiting") + #remove the augustus training config folder + shutil.rmtree(os.path.join(AUGUSTUS, 'species', aug_species)) + os._exit(1) + #proper training files exist, now run EVM on busco models to get high quality predictions. + lib.log.info("BUSCO predictions complete, now formatting for EVM") + #move the busco folder now where it should reside + if os.path.isdir('busco'): + if os.path.isdir(os.path.join(args.out, 'predict_misc', 'busco')): + shutil.rmtree(os.path.join(args.out, 'predict_misc', 'busco')) + os.rename('busco', os.path.join(args.out, 'predict_misc', 'busco')) + + #open output and pull locations to make bed file + busco_bed = os.path.join(args.out, 'predict_misc', 'buscos.bed') + busco_fulltable = os.path.join(args.out, 'predict_misc', 'busco', 'run_'+aug_species, 'full_table_'+aug_species) + busco_complete = {} + with open(busco_bed, 'w') as bedfile: + with open(busco_fulltable, 'rU') as buscoinput: + for line in buscoinput: + line = line.replace(']', '') + cols = line.split('\t') + if cols[1] == 'Complete': + if not cols[0] in busco_complete: + busco_complete[cols[0]] = cols[2]+':'+cols[3]+'-'+cols[4] + start = int(cols[3]) - 100 + end = int(cols[4]) + 100 + bedfile.write('%s\t%i\t%i\t%s\n' % (cols[2],start,end,cols[0])) + #now get BUSCO GFF models + busco_augustus_tmp = os.path.join(args.out, 'predict_misc', 'busco_augustus.tmp') + with open(busco_augustus_tmp, 'w') as output: + for i in busco_complete: + file = os.path.join(args.out, 'predict_misc', 'busco', 'run_'+aug_species, 'gffs', i+'.gff') + subprocess.call(['perl', Converter2, file], stdout = output) + #finally rename models so they are not redundant + busco_augustus = os.path.join(args.out, 'predict_misc', 'busco_augustus.gff3') + subprocess.call([os.path.join(parentdir, 'util', 'fix_busco_naming.py'), busco_augustus_tmp, busco_fulltable, busco_augustus]) + #now get genemark-es models in this region + busco_genemark = os.path.join(args.out, 'predict_misc', 'busco_genemark.gff3') + with open(busco_genemark, 'w') as output: + subprocess.call(['bedtools', 'intersect', '-a', GeneMark, '-b', busco_bed], stdout = output) + #combine predictions + busco_predictions = os.path.join(args.out, 'predict_misc', 'busco_predictions.gff3') + with open(busco_predictions, 'w') as output: + with open(busco_augustus) as input: + output.write(input.read()) + with open(busco_genemark) as input: + output.write(input.read()) + #get evidence if exists + if Transcripts: + #get transcript alignments in this region + busco_transcripts = os.path.join(args.out, 'predict_misc', 'busco_transcripts.gff3') + with open(busco_transcripts, 'w') as output: + subprocess.call(['bedtools', 'intersect', '-a', Transcripts, '-b', busco_bed], stdout = output) + if Exonerate: + #get protein alignments in this region + busco_proteins = os.path.join(args.out, 'predict_misc', 'busco_proteins.gff3') + with open(busco_proteins, 'w') as output: + subprocess.call(['bedtools', 'intersect', '-a', Exonerate, '-b', busco_bed], stdout = output) + #set Weights file dependent on which data is present. + busco_weights = os.path.join(args.out, 'predict_misc', 'busco_weights.txt') + with open(busco_weights, 'w') as output: + output.write("OTHER_PREDICTION\tAugustus\t2\n") + output.write("ABINITIO_PREDICTION\tGeneMark\t1\n") + if Exonerate: + output.write("PROTEIN\texonerate\t1\n") + if Transcripts: + output.write("TRANSCRIPT\tgenome\t1\n") + #setup EVM run + EVM_out = os.path.join(args.out, 'predict_misc', 'busco.evm.gff3') + EVM_script = os.path.join(parentdir, 'bin', 'funannotate-runEVM.py') + #get absolute paths for everything + Busco_Weights = os.path.abspath(busco_weights) + EVM_out = os.path.abspath(EVM_out) + Busco_Predictions = os.path.abspath(busco_predictions) + #parse entire EVM command to script + if Exonerate and Transcripts: + evm_cmd = [sys.executable, EVM_script, os.path.join(args.out, 'logfiles', 'funannotate-EVM_busco.log'), str(args.cpus), '--genome', MaskGenome, '--gene_predictions', Busco_Predictions, '--protein_alignments', Exonerate, '--transcript_alignments', Transcripts, '--weights', Busco_Weights, '--min_intron_length', str(args.min_intronlen), EVM_out] + elif not Exonerate and Transcripts: + evm_cmd = [sys.executable, EVM_script, os.path.join(args.out, 'logfiles', 'funannotate-EVM_busco.log'),str(args.cpus), '--genome', MaskGenome, '--gene_predictions', Busco_Predictions, '--transcript_alignments', Transcripts, '--weights', Busco_Weights, '--min_intron_length', str(args.min_intronlen), EVM_out] + elif not Transcripts and Exonerate: + evm_cmd = [sys.executable, EVM_script, os.path.join(args.out, 'logfiles', 'funannotate-EVM_busco.log'), str(args.cpus), '--genome', MaskGenome, '--gene_predictions', Busco_Predictions, '--protein_alignments', Exonerate, '--weights', Busco_Weights, '--min_intron_length', str(args.min_intronlen), EVM_out] + elif not any([Transcripts,Exonerate]): + evm_cmd = [sys.executable, EVM_script, os.path.join(args.out, 'logfiles', 'funannotate-EVM_busco.log'), str(args.cpus), '--genome', MaskGenome, '--gene_predictions', Busco_Predictions, '--weights', Busco_Weights, '--min_intron_length', str(args.min_intronlen), EVM_out] + #run EVM + if not os.path.isfile(EVM_out): + subprocess.call(evm_cmd) + try: + total = lib.countGFFgenes(EVM_out) + except IOError: + lib.log.error("EVM did not run correctly, output file missing") + os._exit(1) + #check number of gene models, if 0 then failed, delete output file for re-running + if total < 1: + lib.log.error("Evidence modeler has failed, exiting") + os.remove(EVM_out) + os._exit(1) else: - shutil.rmtree('busco') #delete if it is there - os.makedirs('busco') #create fresh folder - busco_log = os.path.join(args.out, 'logfiles', 'busco.log') - if lib.CheckAugustusSpecies(args.busco_seed_species): - busco_seed = args.busco_seed_species + lib.log.info('{0:,}'.format(total) + ' total gene models from EVM') + #move EVM folder to predict folder + if os.path.isdir('EVM_tmp'): + if os.path.isdir(os.path.join(args.out, 'predict_misc', 'EVM_busco')): + shutil.rmtree(os.path.join(args.out, 'predict_misc', 'EVM_busco')) + os.rename('EVM_tmp', os.path.join(args.out, 'predict_misc', 'EVM_busco')) + #convert to proteins and screen with busco + lib.log.info("Checking BUSCO protein models for accuracy") + evm_proteins = os.path.join(args.out, 'predict_misc', 'busco.evm.proteins.fa') + busco_final = os.path.join(args.out, 'predict_misc', 'busco.final.gff3') + if not busco_final: + with open(evm_proteins, 'w') as output: + subprocess.call([EVM2proteins, EVM_out, MaskGenome], stdout = output) + if not os.path.isdir(os.path.join(args.out, 'predict_misc', 'busco_proteins')): + os.makedirs(os.path.join(args.out, 'predict_misc', 'busco_proteins')) + with open(busco_log, 'a') as logfile: + subprocess.call([sys.executable, BUSCO, '-in', os.path.abspath(evm_proteins), '--lineage', BUSCO_FUNGI, '-a', aug_species, '--cpu', str(args.cpus), '-m', 'OGS', '-f'], cwd = os.path.join(args.out, 'predict_misc', 'busco_proteins'), stdout = logfile, stderr = logfile) + subprocess.call([os.path.join(parentdir, 'util', 'filter_buscos.py'), EVM_out, os.path.join(args.out, 'predict_misc', 'busco_proteins', 'run_'+aug_species, 'full_table_'+aug_species), busco_final], stdout = FNULL, stderr = FNULL) + total = lib.countGFFgenes(busco_final) + lib.log.info('{0:,}'.format(total) + ' gene models validated, using for training Augustus') + + ###Run Augustus training + GFF2GB = os.path.join(AUGUSTUS_BASE, 'scripts', 'gff2gbSmallDNA.pl') + trainingset = os.path.join(args.out, 'predict_misc', 'busco.training.gb') + subprocess.call([GFF2GB, busco_final, MaskGenome, '250', trainingset], stderr = FNULL) + if args.optimize_augustus: + lib.trainAugustus(AUGUSTUS_BASE, aug_species, trainingset, MaskGenome, args.out, args.cpus, True) else: - busco_seed = 'generic' - with open(busco_log, 'w') as logfile: - subprocess.call([sys.executable, BUSCO, '--genome', MaskGenome, '--lineage', BUSCO_FUNGI, '-o', aug_species, '--cpu', str(args.cpus), '--species', busco_seed], cwd = 'busco', stdout = logfile, stderr = logfile) - #check if BUSCO found models for training, if not error out and exit. - busco_training = os.path.join('busco', 'run_'+aug_species, 'training_set_'+aug_species) - if not lib.checkannotations(busco_training): - lib.log.error("BUSCO training of Augusus failed, check busco logs, exiting") - #remove the augustus training config folder - shutil.rmtree(os.path.join(AUGUSTUS, 'species', aug_species)) - os._exit(1) - else: #proper training files exist, now run Augustus training and optimization - lib.log.info("BUSCO predictions complete, now training Augustus") - ###Run training - lib.trainAugustus(AUGUSTUS_BASE, aug_species, busco_training, MaskGenome, args.out, args.cpus) + lib.trainAugustus(AUGUSTUS_BASE, aug_species, trainingset, MaskGenome, args.out, args.cpus, False) + lib.log.info("Running Augustus gene prediction") #now run Augustus multithreaded... if not os.path.isfile(aug_out): @@ -821,10 +955,6 @@ def __init__(self,prog): if os.path.isdir(os.path.join(args.out, 'predict_misc', 'tbl2asn')): shutil.rmtree(os.path.join(args.out, 'predict_misc', 'tbl2asn')) os.rename('tbl2asn', os.path.join(args.out, 'predict_misc', 'tbl2asn')) -if os.path.isdir('busco'): - if os.path.isdir(os.path.join(args.out, 'predict_misc', 'busco')): - shutil.rmtree(os.path.join(args.out, 'predict_misc', 'busco')) - os.rename('busco', os.path.join(args.out, 'predict_misc', 'busco')) if os.path.isfile('funannotate-EVM.log'): os.rename('funannotate-EVM.log', os.path.join(args.out, 'logfiles', 'funannotate-EVM.log')) if os.path.isfile('funannotate-p2g.log'): diff --git a/docs/ubuntu_install.md b/docs/ubuntu_install.md index 0690b1e1..065c9bac 100644 --- a/docs/ubuntu_install.md +++ b/docs/ubuntu_install.md @@ -43,6 +43,7 @@ brew doctor brew tap homebrew/dupes brew tap homebrew/science brew tap nextgenusfs/tap +brew update ``` 4) Install funannotate and dependencies using LinuxBrew diff --git a/funannotate.py b/funannotate.py index 148e0cfe..4176e2c9 100755 --- a/funannotate.py +++ b/funannotate.py @@ -129,7 +129,8 @@ def fmtcols(mylist, cols): --genemark_mod GeneMark ini mod file. --protein_evidence Proteins to map to genome (prot1.fa,prot2.fa,uniprot.fa). Default: uniprot.fa --transcript_evidence mRNA/ESTs to align to genome (trans1.fa,ests.fa,trinity.fa). Default: none - --busco_seed_species Augustus pre-trained species to start BUSCO. Default: generic + --busco_seed_species Augustus pre-trained species to start BUSCO. Default: anidulans + --optimize_augustus Run 'optimze_augustus.pl' to refine training (long runtime) --busco_db BUSCO models. Default: fungi [fungi,vertebrata,metazoa,eukaryota,arthropoda] --organism Fungal-specific options. Default: fungus. [fungus,other] diff --git a/lib/library.py b/lib/library.py index bad0a1bb..f1ff9dea 100644 --- a/lib/library.py +++ b/lib/library.py @@ -1,5 +1,5 @@ from __future__ import division -import os, subprocess, logging, sys, argparse, inspect, csv, time, re, shutil, datetime, glob, platform, multiprocessing +import os, subprocess, logging, sys, argparse, inspect, csv, time, re, shutil, datetime, glob, platform, multiprocessing, itertools from natsort import natsorted import warnings from Bio import SeqIO @@ -74,6 +74,9 @@ def readBlocks(source, pattern): buffer.append( line ) yield buffer +def empty_line_sep(line): + return line=='\n' + def get_parent_dir(directory): return os.path.dirname(directory) @@ -1732,31 +1735,35 @@ def getTrainResults(input): values3 = line.split('|') #get [6] and [7] return (values1[1], values1[2], values2[6], values2[7], values3[6], values3[7]) -def trainAugustus(AUGUSTUS_BASE, train_species, trainingset, genome, outdir, cpus): +def trainAugustus(AUGUSTUS_BASE, train_species, trainingset, genome, outdir, cpus, optimize): RANDOMSPLIT = os.path.join(AUGUSTUS_BASE, 'scripts', 'randomSplit.pl') OPTIMIZE = os.path.join(AUGUSTUS_BASE, 'scripts', 'optimize_augustus.pl') + NEW_SPECIES = os.path.join(AUGUSTUS_BASE, 'scripts', 'new_species.pl') aug_cpus = '--cpus='+str(cpus) species = '--species='+train_species aug_log = os.path.join(outdir, 'logfiles', 'augustus_training.log') trainingdir = 'tmp_opt_'+train_species with open(aug_log, 'w') as logfile: - subprocess.call([RANDOMSPLIT, trainingset, '200']) #split off 100 models for testing purposes - if not CheckAugustusSpecies(train_species): #check if training set exists, if not run etraining - subprocess.call(['etraining', species, trainingset], stderr = logfile, stdout = logfile) + if not CheckAugustusSpecies(train_species): + subprocess.call([NEW_SPECIES, species], stdout = logfile, stderr = logfile) + #run etraining again to only use best models from EVM for training + subprocess.call(['etraining', species, trainingset], stderr = logfile, stdout = logfile) + subprocess.call([RANDOMSPLIT, trainingset, '200']) #split off 200 models for testing purposes with open(os.path.join(outdir, 'predict_misc', 'augustus.initial.training.txt'), 'w') as initialtraining: subprocess.call(['augustus', species, trainingset+'.test'], stdout=initialtraining) train_results = getTrainResults(os.path.join(outdir, 'predict_misc', 'augustus.initial.training.txt')) log.info('Initial training: '+'{0:.2%}'.format(float(train_results[4]))+' genes predicted exactly and '+'{0:.2%}'.format(float(train_results[2]))+' of exons predicted exactly') - #now run optimization - subprocess.call([OPTIMIZE, species, aug_cpus, trainingset], stderr = logfile, stdout = logfile) - #run etraining again - subprocess.call(['etraining', species, trainingset], stderr = logfile, stdout = logfile) - with open(os.path.join(outdir, 'predict_misc', 'augustus.final.training.txt'), 'w') as finaltraining: - subprocess.call(['augustus', species, os.path.join(trainingdir, 'bucket1.gb')], stdout=finaltraining) - train_results = getTrainResults(os.path.join(outdir, 'predict_misc', 'augustus.final.training.txt')) - log.info('Optimized training: '+'{0:.2%}'.format(float(train_results[4]))+' genes predicted exactly and '+'{0:.2%}'.format(float(train_results[2]))+' of exons predicted exactly') - #clean up tmp folder - shutil.rmtree(trainingdir) + if optimize: + #now run optimization + subprocess.call([OPTIMIZE, species, aug_cpus, trainingset], stderr = logfile, stdout = logfile) + #run etraining again + subprocess.call(['etraining', species, trainingset], stderr = logfile, stdout = logfile) + with open(os.path.join(outdir, 'predict_misc', 'augustus.final.training.txt'), 'w') as finaltraining: + subprocess.call(['augustus', species, trainingset+'.test'], stdout=finaltraining) + train_results = getTrainResults(os.path.join(outdir, 'predict_misc', 'augustus.final.training.txt')) + log.info('Optimized training: '+'{0:.2%}'.format(float(train_results[4]))+' genes predicted exactly and '+'{0:.2%}'.format(float(train_results[2]))+' of exons predicted exactly') + #clean up tmp folder + shutil.rmtree(trainingdir) HEADER = ''' diff --git a/util/filter_buscos.py b/util/filter_buscos.py new file mode 100755 index 00000000..c52eb090 --- /dev/null +++ b/util/filter_buscos.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python + +#script to reformat Augustus BUSCO results +import sys, os, itertools + +if len(sys.argv) < 4: + print("Usage: filter_buscos.py busco.evm.gff3 full_table_species busco.final.gff3") + sys.exit(1) + +def group_separator(line): + return line=='\n' + +#parse the busco table into dictionary format +busco_complete = {} +with open(sys.argv[2], 'rU') as buscoinput: + for line in buscoinput: + cols = line.split('\t') + if cols[1] == 'Complete': + ID = cols[2].replace('evm.model.', '') + if not ID in busco_complete: + busco_complete[ID] = (cols[0], cols[3]) + else: + score = busco_complete.get(ID)[1] + if float(cols[3]) > float(score): + busco_complete[ID] = (cols[0], cols[3]) + print ID, 'updated dictionary' + else: + print ID, 'is repeated and score is less' + +#now parse the evm busco file, group them +results = [] +with open(sys.argv[1]) as f: + for key, group in itertools.groupby(f, group_separator): + if not key: + results.append(list(group)) + +#loop through each gene model, lookup the BUSCO name, and then replace the name with counter based and busco model name +''' +scaffold_1 EVM gene 18407 18947 . - . ID=evm.TU.scaffold_1.1;Name=EVM%20prediction%20scaffold_1.1 +scaffold_1 EVM mRNA 18407 18947 . - . ID=evm.model.scaffold_1.1;Parent=evm.TU.scaffold_1.1;Name=EVM%20prediction%20scaffold_1.1 +scaffold_1 EVM exon 18772 18947 . - . ID=evm.model.scaffold_1.1.exon1;Parent=evm.model.scaffold_1.1 +scaffold_1 EVM CDS 18772 18947 . - 0 ID=cds.evm.model.scaffold_1.1;Parent=evm.model.scaffold_1.1 +scaffold_1 EVM exon 18407 18615 . - . ID=evm.model.scaffold_1.1.exon2;Parent=evm.model.scaffold_1.1 +scaffold_1 EVM CDS 18407 18615 . - 1 ID=cds.evm.model.scaffold_1.1;Parent=evm.model.scaffold_1.1 +''' +counter = 0 +with open(sys.argv[3], 'w') as output: + for i in results: + counter += 1 + cols = i[0].split('\t') + ID = cols[8].split(';')[0] + ID = ID.replace('ID=', '') + lookup = ID.replace('evm.TU.', '') + if lookup in busco_complete: + name = busco_complete.get(lookup)[0] + geneID = 'gene'+str(counter) + mrnaID = 'mrna'+str(counter) + newblock = ''.join(i) + newblock = newblock.replace('EVM%20prediction%20'+lookup, name) + newblock = newblock.replace(ID, geneID) + newblock = newblock.replace('evm.model.'+lookup, mrnaID) + output.write(newblock+'\n') diff --git a/util/fix_busco_naming.py b/util/fix_busco_naming.py new file mode 100755 index 00000000..b446eb09 --- /dev/null +++ b/util/fix_busco_naming.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python + +#script to reformat Augustus BUSCO results +import sys, os, itertools + +if len(sys.argv) < 4: + print("Usage: fix_busco_naming.py busco_augustus.gff3 full_table_species busco_augustus.fixed.gff3") + sys.exit(1) + +def group_separator(line): + return line=='\n' + +#parse the busco table into dictionary format +busco_complete = {} +with open(sys.argv[2], 'rU') as buscoinput: + for line in buscoinput: + cols = line.split('\t') + if cols[1] == 'Complete': + if not cols[0] in busco_complete: + busco_complete[cols[0]] = cols[2]+':'+cols[3]+'-'+cols[4] + +#now parse the augustus input file where gene numbers are likely repeated. +results = [] +with open(sys.argv[1]) as f: + for key, group in itertools.groupby(f, group_separator): + if not key: + results.append(list(group)) + +#loop through each gene model, lookup the BUSCO name, and then replace the name with counter based and busco model name +counter = 0 +inverse_busco = {v: k for k, v in busco_complete.items()} +with open(sys.argv[3], 'w') as output: + for i in results: + counter += 1 + cols = i[0].split('\t') + lookup = cols[0]+':'+cols[3]+'-'+cols[4] + if lookup in inverse_busco: + name = inverse_busco.get(lookup) + else: + name = 'unknown_model' + ID = cols[8].split(';')[0] + ID = ID.replace('ID=', '') + newID = 'gene'+str(counter) + newblock = ''.join(i) + newblock = newblock.replace('Augustus%20prediction', name) + newblock = newblock.replace(ID, newID) + output.write(newblock+'\n') diff --git a/util/funannotate-BUSCO.py b/util/funannotate-BUSCO.py index 60ca84d6..8d7593d5 100755 --- a/util/funannotate-BUSCO.py +++ b/util/funannotate-BUSCO.py @@ -1089,7 +1089,7 @@ def measuring (nested): scaff = seq_id.strip(']').split('[')[1].split(':')[0] scaff_start = seq_id.strip(']').split('[')[1].split(':')[1].split('-')[0] scaff_end = seq_id.strip(']').split('[')[1].split(':')[1].split('-')[1] - + scaff_end = scaff_end.replace(']', '') out.write('%s\tComplete\t%s\t%s\t%s\t%s\t%s\n' % (entity, scaff, scaff_start, scaff_end, bit_score, seq_len)) csc[entity] = seq_id @@ -1106,6 +1106,7 @@ def measuring (nested): scaff = seq_id.strip(']').split('[')[1].split(':')[0] scaff_start = seq_id.strip(']').split('[')[1].split(':')[1].split('-')[0] scaff_end = seq_id.strip(']').split('[')[1].split(':')[1].split('-')[1] + scaff_end = scaff_end.replace(']', '') out.write('%s\tDuplicated\t%s\t%s\t%s\t%s\t%s\n' % (entity, scaff, scaff_start, scaff_end, bit_score, seq_len)) @@ -1122,6 +1123,7 @@ def measuring (nested): scaff = seq_id.strip(']').split('[')[1].split(':')[0] scaff_start = seq_id.strip(']').split('[')[1].split(':')[1].split('-')[0] scaff_end = seq_id.strip(']').split('[')[1].split(':')[1].split('-')[1] + scaff_end = scaff_end.replace(']', '') out.write('%s\tFragmented\t%s\t%s\t%s\t%s\t%s\n' % (entity, scaff, scaff_start, scaff_end, bit_score, seq_len)) missing = [] @@ -1227,7 +1229,7 @@ def measuring (nested): gff_file.write(line) gff_file.close() - subprocess.call('$AUGUSTUS_CONFIG_PATH/../scripts/gff2gbSmallDNA.pl %s/gffs/%s.gff %s 1000 %s/gb/%s.raw.gb 1>/dev/null 2>/dev/null' % + subprocess.call('$AUGUSTUS_CONFIG_PATH/../scripts/gff2gbSmallDNA.pl %s/gffs/%s.gff %s 500 %s/gb/%s.raw.gb 1>/dev/null 2>/dev/null' % (mainout, entry, args['genome'], mainout, entry), shell = True) #bacteria clade needs to be flagged as "prokaryotic" NEW_SPECIES = os.path.join(AUGUSTUS_BASE, 'scripts', 'new_species.pl')