add missing files you forgot about

nextgenusfs · Dec 13, 2016 · 0abf4d9 · 0abf4d9
1 parent 0d7cdf5
commit 0abf4d9
Show file tree

Hide file tree

Showing 3 changed files with 2,125 additions and 0 deletions.
diff --git a/lib/codeml.config b/lib/codeml.config
@@ -0,0 +1,31 @@
+seqfile = INPUTFILEHERE
+treefile = INPUTTREEHERE
+outfile = OUTPUTFILEHERE
+noisy = 9 * 0,1,2,3,9: how much rubbish on the screen
+verbose = 1 * 1: detailed output, 0: concise output
+runmode = 0 * 0: user tree; 1: semi-automatic; 2: automatic
+* 3: StepwiseAddition; (4,5):PerturbationNNI; -2: pairwise
+
+seqtype = 1 * 1:codons; 2:AAs; 3:codons-->AAs
+CodonFreq = 2 * 0:1/61 each, 1:F1X4, 2:F3X4, 3:codon table
+clock = 0 * 0: no clock, unrooted tree, 1: clock, rooted tree
+aaDist = 0 * 0:equal, +:geometric; -:linear, {1-5:G1974,Miyata,c,p,v}
+model = 0
+
+NSsites = 0
+* 0:one w; 1:NearlyNeutral; 2:PositiveSelection; 3:discrete;
+* 4:freqs; 5:gamma;6:2gamma;7:beta;8:beta&w;9:beta&gamma;10:3normal
+icode = 0 * 0:standard genetic code; 1:mammalian mt; 2-10:see below
+Mgene = 0 * 0:rates, 1:separate; 2:pi, 3:kappa, 4:all
+
+fix_kappa = 0 * 1: kappa fixed, 0: kappa to be estimated
+kappa = 1 * initial or fixed kappa
+fix_omega = 0 * 1: omega or omega_1 fixed, 0: estimate
+omega = 1 * initial or fixed omega, for codons or codon-based AAs
+ncatG = 10 * # of categories in the dG or AdG models of rates
+
+getSE = 0 * 0: don't want them, 1: want S.E.s of estimates
+RateAncestor = 0 * (0,1,2): rates (alpha>0) or ancestral states (1 or 2)
+Small_Diff = .45e-6
+cleandata = 1 * remove sites with ambiguity data (1:yes, 0:no)?
+fix_blength = 0 * 0: ignore, -1: random, 1: initial, 2: fixed 
diff --git a/util/add2outgroups.py b/util/add2outgroups.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python
+
+import sys, os, subprocess, shutil, argparse, inspect
+from Bio import SeqIO
+currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
+parentdir = os.path.dirname(currentdir)
+sys.path.insert(0, parentdir)
+import lib.library as lib
+
+
+#setup menu with argparse
+class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
+    def __init__(self, prog):
+        super(MyFormatter, self).__init__(prog, max_help_position=48)
+parser = argparse.ArgumentParser(prog='funannotate-predict.py', usage="%(prog)s [options] -i genome.fasta",
+    description = '''Script that adds a proteome to the outgroups.''',
+    epilog = """Written by Jon Palmer (2016) [email protected]""",
+    formatter_class = MyFormatter)
+parser.add_argument('-i', '--input', required=True, help='Proteome in FASTA format')
+parser.add_argument('--species', required=True, help='Species name "binomial in quotes"')
+parser.add_argument('--busco_db', default='dikarya', required=True, help='BUSCO database to use')
+parser.add_argument('--cpus', default=2, type=int, help='Number of CPUs to use')
+args=parser.parse_args()
+
+#get base name
+species = args.species.replace(' ', '_').lower()+'.'+args.busco_db
+OUTGROUPS = os.path.join(parentdir, 'DB', 'outgroups')
+
+#create log file
+log_name = species+'-add2outgroups.log'
+if os.path.isfile(log_name):
+    os.remove(log_name)
+
+#initialize script, log system info and cmd issue at runtime
+lib.setupLogging(log_name)
+FNULL = open(os.devnull, 'w')
+cmd_args = " ".join(sys.argv)+'\n'
+lib.log.debug(cmd_args)
+print "-------------------------------------------------------"
+lib.SystemInfo()
+
+#get version of funannotate
+version = lib.get_version()
+lib.log.info("Running %s" % version)
+
+#check buscos, download if necessary
+if not os.path.isdir(os.path.join(parentdir, 'DB', args.busco_db)):
+    lib.download_buscos(args.busco_db)
+
+ProtCount = lib.countfasta(args.input)
+lib.log.info('{0:,}'.format(ProtCount) + ' protein records loaded')  
+
+#convert to proteins and screen with busco
+lib.log.info("Looking for BUSCO models with %s DB" % args.busco_db)
+BUSCODB = os.path.join(parentdir, 'DB', args.busco_db)
+BUSCO = os.path.join(parentdir, 'util', 'funannotate-BUSCO2.py')
+cmd = [sys.executable, BUSCO, '-i', os.path.abspath(args.input), '-m', 'proteins', '--lineage', BUSCODB, '-o', species, '--cpu', str(args.cpus), '-f']
+lib.runSubprocess(cmd, '.', lib.log)
+
+#check that it ran correctly
+busco_results = os.path.join('run_'+species, 'full_table_'+species+'.tsv')
+if not lib.checkannotations(busco_results):
+    lib.log.error("BUSCO failed, check logfile")
+    sys.exit(1)
+nameChange = {}
+with open(busco_results, 'rU') as input:
+    for line in input:
+        if line.startswith('#'):
+            continue
+        cols = line.split('\t')
+        if cols[1] == 'Complete':
+            if not cols[2] in nameChange:
+                nameChange[cols[2]] = cols[0]
+            else:
+                lib.log.error("Duplicate ID found: %s %s. Removing from results" % (cols[2], cols[0]))
+                del nameChange[cols[2]]
+
+#output counts
+lib.log.info('{0:,}'.format(len(nameChange)) + ' BUSCO models found')  
+
+#index the proteome for parsing
+SeqRecords = SeqIO.index(args.input, 'fasta')
+
+#setup output proteome
+busco_out = os.path.join(OUTGROUPS, species+'_buscos.fa')
+with open(busco_out, 'w') as output:
+    for k,v in nameChange.items():
+        rec = SeqRecords[k]
+        output.write('>%s\n%s\n' % (v, rec.seq))
+lib.log.info("Results written to: %s" % busco_out)
+
+#clean up your mess
+shutil.rmtree('run_'+species)
+shutil.rmtree('tmp')
+
+