Skip to content

Commit

Permalink
A bunch of fixes brought in from 1.0 branch, update to reqs, version …
Browse files Browse the repository at this point in the history
…bump
  • Loading branch information
camillescott committed Nov 23, 2016
1 parent 7a8a7ee commit 86fa03c
Show file tree
Hide file tree
Showing 12 changed files with 92 additions and 83 deletions.
11 changes: 6 additions & 5 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,17 +50,17 @@ Install some packages manually::
export PATH=$HOME/last-658/scripts:$PATH

cd
curl -LO http://busco.ezlab.org/files/BUSCO_v1.1b1.tar.gz
tar -xvzf BUSCO_v1.1b1.tar.gz
chmod +x BUSCO_v1.1b1/*.py
export PATH=$HOME/BUSCO_v1.1b1:$PATH
curl -LO http://busco.ezlab.org/v1/files/BUSCO_v1.22.tar.gz
tar -xvzf BUSCO_v1.22.tar.gz
chmod +x BUSCO_v1.22/*.py
export PATH=$HOME/BUSCO_v1.22:$PATH
cd

To add these to your environment permanently::

echo 'export PATH=$PATH:$HOME/TransDecoder-2.0.1' >> $HOME/.bashrc
echo 'export PATH=$PATH:$HOME/last-658/src' >> $HOME/.bashrc
echo 'export PATH=$PATH:$HOME/BUSCO_v1.1b1' >> $HOME/.bashrc
echo 'export PATH=$HOME/BUSCO_v1.22:$PATH' >> $HOME/.bashrc

Now, install dammit::

Expand Down Expand Up @@ -114,6 +114,7 @@ Known Issues
EMBOSS, which is not searched for. Although the installation instructions cover these
dependencies, users who *cough* don't read the directions *cough* might be confused that a
dependency is marked as installed but still doesn't work.
* dammit 0.3 does not support BUSCO v2. Support for BUSCO v2 is being built into dammit 1.0.


Acknowledgements
Expand Down
20 changes: 16 additions & 4 deletions dammit/.databases.json
Original file line number Diff line number Diff line change
Expand Up @@ -48,25 +48,37 @@
"metazoa": {
"access": "download",
"db_type": "hmm",
"url": "http://busco.ezlab.org/files/metazoa_buscos.tar.gz"
"url": "http://busco.ezlab.org/v1/files/metazoa_buscos.tar.gz"
},

"vertebrata": {
"access": "download",
"db_type": "hmm",
"url": "http://busco.ezlab.org/files/vertebrata_buscos.tar.gz"
"url": "http://busco.ezlab.org/v1/files/vertebrata_buscos.tar.gz"
},

"arthropoda": {
"access": "download",
"db_type": "hmm",
"url": "http://busco.ezlab.org/files/arthropoda_buscos.tar.gz"
"url": "http://busco.ezlab.org/v1/files/arthropoda_buscos.tar.gz"
},

"eukaryota": {
"access": "download",
"db_type": "hmm",
"url": "http://busco.ezlab.org/files/eukaryota_buscos.tar.gz"
"url": "http://busco.ezlab.org/v1/files/eukaryota_buscos.tar.gz"
},

"fungi": {
"access": "download",
"db_type": "hmm",
"url": "http://busco.ezlab.org/v1/files/fungi_buscos.tar.gz"
},

"plantae": {
"access": "download",
"db_type": "hmm",
"url": "http://busco.ezlab.org/v1/files/plant_early_release.tar.gz"
}
}

Expand Down
2 changes: 1 addition & 1 deletion dammit/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.3.1-dev
0.3.1
8 changes: 4 additions & 4 deletions dammit/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def transdecoder_tasks(self):
self.args.evalue,
self.args.n_threads,
common.CONFIG['settings']['hmmer']['hmmscan'],
n_nodes=self.args.n_nodes)
pbs=self.args.sshloginfile)

yield get_remap_hmmer_task(self.transdecoder_pfam_fn,
self.transdecoder_orf_gff3_fn,
Expand All @@ -202,7 +202,7 @@ def cmscan_task(self):
self.args.evalue,
self.args.n_threads,
cmscan_cfg,
n_nodes=self.args.n_nodes)
pbs=self.args.sshloginfile)

def orthodb_task(self):
'''Run LAST to get homologies with OrthoDB. We use LAST here because
Expand All @@ -218,7 +218,7 @@ def orthodb_task(self):
translate=True,
cutoff=self.args.evalue,
n_threads=self.args.n_threads,
n_nodes=self.args.n_nodes)
pbs=self.args.sshloginfile)

def uniref_task(self):

Expand All @@ -231,7 +231,7 @@ def uniref_task(self):
translate=True,
cutoff=self.args.evalue,
n_threads=self.args.n_threads,
n_nodes=self.args.n_nodes)
pbs=self.args.sshloginfile)

def user_crb_tasks(self):
'''Run conditional reciprocal best hits LAST (CRBL) against the
Expand Down
7 changes: 2 additions & 5 deletions dammit/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,11 +175,8 @@ def add_common_args(parser):
help='Optional additional protein databases. '\
' These will be searched with CRB-blast.'
)
annotate_parser.add_argument('--n_nodes',
type=int,
help='EXPERIMENTAL support for Portable'\
' Batch System. `--n-threads` will'\
' be assumed as the ppn parameter.')
annotate_parser.add_argument('--sshloginfile', default=None,
help='Distribute execution across the specified nodes.')


add_common_args(annotate_parser)
Expand Down
13 changes: 9 additions & 4 deletions dammit/dependencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@

from .common import which
from . import common
from .tasks import get_download_and_untar_task


class DependencyHandler(object):
Expand Down Expand Up @@ -118,12 +117,18 @@ def check_blast(logger):


def check_busco(logger):
busco = which('BUSCO_v1.1b1.py')
versions = [(122, 'BUSCO_v1.22.py'),
(111, 'BUSCO_v1.1b1.py')]
for version, exc in versions:
busco = which(exc)
if busco is not None:
break
if busco is None:
return False, 'Not found on $PATH'
else:
logger.debug('BUSCO:' + busco)
return True, os.path.dirname(busco)
if logger:
logger.debug('BUSCO:' + busco)
return True, busco


def check_transdecoder(logger):
Expand Down
17 changes: 15 additions & 2 deletions dammit/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
'''
import csv
import re
import sys
import numpy as np
import pandas as pd
Expand Down Expand Up @@ -244,6 +245,7 @@ def attr_col_func(col):
chunksize=chunksize, header=None,
dtype=dict(gff_cols)):

group.reset_index(drop=True, inplace=True)
# Generate a new DataFrame from the attributes dicts, and merge it in
gtf_df = pd.merge(group,
pd.DataFrame(list(group.attributes)),
Expand Down Expand Up @@ -316,7 +318,8 @@ def busco_to_df(fn_list, dbs=['metazoa', 'vertebrata']):
return df


def hmmscan_to_df_iter(fn, chunksize=10000):
def hmmscan_to_df_iter(fn, chunksize=10000,
query_regex=re.compile(r'(?P<name>Transcript_[0-9]*)')):
'''Iterator over DataFrames of length chunksize from a given
hmmscan result file.
Expand All @@ -327,11 +330,19 @@ def hmmscan_to_df_iter(fn, chunksize=10000):
Args:
fn (str): Path to the hmmscan file.
chunksize (int): Hits per iteration.
query_regex (re): Compiled regex to retrieve transcript name.
Yields:
DataFrame: Pandas DataFrame with the hmmscan hits.
'''


def split_query(item):
q, _, _ = item.rpartition('|')
results = query_regex.search(item).groupdict()
try:
q = results['name']
except KeyError as e:
e.message = 'Header regex should have a "name" field.'
raise
return q

def build_df(data):
Expand Down Expand Up @@ -379,6 +390,8 @@ def cmscan_to_df_iter(fn, chunksize=10000):
def build_df(data):
df = pd.DataFrame(data, columns=[k for k, _ in cmscan_cols])
convert_dtypes(df, dict(cmscan_cols))
sidx = df.seq_from > df.seq_to
df.loc[sidx, 'seq_from'], df.loc[sidx, 'seq_to'] = df.loc[sidx, 'seq_to'], df.loc[sidx, 'seq_from']
df.mdl_from = df.mdl_from - 1
df.seq_from = df.seq_from - 1
return df
Expand Down
71 changes: 26 additions & 45 deletions dammit/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from khmer import HLLCounter, ReadParser

from common import which
from dependencies import check_busco
from . import parsers
from .hits import BestHits
from .fileio import maf
Expand Down Expand Up @@ -46,24 +47,21 @@ def clean_folder(target):
pass


def parallel_fasta(input_filename, n_jobs):
file_size = 'S=`stat -c "%%s" {0}`; B=`expr $S / {1}`;'.format(input_filename,
n_jobs)
exc = which('parallel')
cmd = [file_size, 'cat', input_filename, '|', exc, '--block', '$B',
'--pipe', '--recstart', '">"', '--gnu', '-j', str(n_jobs)]

return ' '.join(cmd)
def parallel_fasta(input_filename, output_filename, command, n_jobs, pbs=None):

def multinode_parallel_fasta(input_filename, ppn, nodes):
file_size = 'S=`stat -c "%%s" {0}`; B=`expr $S / {1}`;'.format(input_filename,
nodes * ppn)
exc = which('parallel')
cmd = [file_size, 'cat', input_filename, '|', exc, '--block', '$B',
'--pipe', '--recstart', '">"', '--gnu', '--jobs', str(ppn),
'--sshloginfile $PBS_NODEFILE', '--workdir $PWD']
cmd = ['cat', input_filename, '|', exc, '--round-robin', '--pipe', '-L', 2,
'-N', 10000, '--gnu']
if pbs is not None:
cmd.extend(['--sshloginfile', pbs, '--workdir $PWD'])
else:
cmd.extend(['-j', n_jobs])
cmd.extend(['-a', input_filename])

return ' '.join(cmd)
if isinstance(command, list):
command = ' '.join(command)
cmd.extend([command, '>', output_filename])
return ' '.join(map(str, cmd))


seq_ext = re.compile(r'(.fasta)|(.fa)|(.fastq)|(.fq)')
Expand Down Expand Up @@ -282,7 +280,7 @@ def get_lastdb_task(db_fn, db_out_prefix, lastdb_cfg, prot=True):

@create_task_object
def get_lastal_task(query, db, out_fn, cfg, translate=False,
cutoff=0.00001, n_threads=1, n_nodes=None):
cutoff=0.00001, n_threads=1, pbs=None):
'''Create a pydoit task to run lastal
Args:
Expand All @@ -308,13 +306,7 @@ def get_lastal_task(query, db, out_fn, cfg, translate=False,
lastal_cmd.append(db)
lastal_cmd = '"{0}"'.format(' '.join(lastal_cmd))

if n_nodes is None:
parallel = parallel_fasta(query, n_threads)
else:
parallel = multinode_parallel_fasta(query, n_threads, n_nodes)

cmd = [parallel, lastal_cmd, '<', query, '>', out_fn]
cmd = ' '.join(cmd)
cmd = parallel_fasta(query, out_fn, lastal_cmd, n_threads, pbs=pbs)

name = 'lastal:{0}'.format(os.path.join(out_fn))

Expand Down Expand Up @@ -380,7 +372,7 @@ def get_busco_task(input_filename, output_name, busco_db_dir, input_type,
name = 'busco:' + os.path.basename(input_filename) + '-' + os.path.basename(busco_db_dir)

assert input_type in ['genome', 'OGS', 'trans']
exc = which('BUSCO_v1.1b1.py')
_, exc = check_busco(None)
# BUSCO chokes on file paths as output names
output_name = os.path.basename(output_name)

Expand Down Expand Up @@ -411,23 +403,19 @@ def get_cmpress_task(db_filename, infernal_cfg):

@create_task_object
def get_cmscan_task(input_filename, output_filename, db_filename,
cutoff, n_threads, infernal_cfg, n_nodes=None):
cutoff, n_threads, infernal_cfg, pbs=None):

name = 'cmscan:' + os.path.basename(input_filename) + '.x.' + \
os.path.basename(db_filename)

exc = which('cmscan')
if n_nodes is None:
parallel_cmd = parallel_fasta(input_filename, n_threads)
else:
parallel_cmd = multinode_parallel_fasta(input_filename, n_threads,
n_nodes)


stat = output_filename + '.cmscan.out'
cmd = [parallel_cmd, exc, '--cpu', '1', '--rfam', '--nohmmonly',
cmd = [exc, '--cpu', '1', '--rfam', '--nohmmonly',
'-E', str(cutoff), '--tblout', '/dev/stdout', '-o', stat,
db_filename, '/dev/stdin', '>', output_filename]
cmd = ' '.join(cmd)
db_filename, '/dev/stdin']
cmd = parallel_fasta(input_filename, output_filename, cmd,
n_threads, pbs=pbs)

return {'name': name,
'title': title_with_actions,
Expand All @@ -454,24 +442,17 @@ def get_hmmpress_task(db_filename, hmmer_cfg):

@create_task_object
def get_hmmscan_task(input_filename, output_filename, db_filename,
cutoff, n_threads, hmmer_cfg, n_nodes=None):
cutoff, n_threads, hmmer_cfg, pbs=None):

name = 'hmmscan:' + os.path.basename(input_filename) + '.x.' + \
os.path.basename(db_filename)

hmmscan_exc = which('hmmscan')

if n_nodes is None:
parallel_cmd = parallel_fasta(input_filename, n_threads)
else:
parallel_cmd = multinode_parallel_fasta(input_filename, n_threads,
n_nodes)

stat = output_filename + '.out'
cmd = [parallel_cmd, hmmscan_exc, '--cpu', '1', '--domtblout', '/dev/stdout',
'-E', str(cutoff), '-o', stat, db_filename, '/dev/stdin',
'>', output_filename]
cmd = ' '.join(cmd)
cmd = [hmmscan_exc, '--cpu', '1', '--domtblout', '/dev/stdout',
'-E', str(cutoff), '-o', stat, db_filename, '/dev/stdin']
cmd = parallel_fasta(input_filename, output_filename, cmd, n_threads, pbs=pbs)

return {'name': name,
'title': title_with_actions,
Expand Down
8 changes: 4 additions & 4 deletions dammit/tests/test-data/20aa-alitest.fa
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
>test1
>Transcript_1
ACDEFGHIKLMNPQRSTVWY
>test2
>Transcript_2
XXXXACDEFGHIMNXXXPQRSTVWY
>test3
>Transcript_3
ACDEFGHILMNXXXXXPQRSTVWYXXXX
>test4
>Transcript_4
XXXACDEFGHIKLMNPQRSTVWYXXX

10 changes: 5 additions & 5 deletions doc/installing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -65,14 +65,14 @@ dammit also runs BUSCO to assess completeness. To install it, run the following
commands::

cd
curl -LO http://busco.ezlab.org/files/BUSCO_v1.1b1.tar.gz
tar -xvzf BUSCO_v1.1b1.tar.gz
chmod +x BUSCO_v1.1b1/*.py
export PATH=$HOME/BUSCO_v1.1b1:$PATH
curl -LO http://busco.ezlab.org/v1/files/BUSCO_v1.22.tar.gz
tar -xvzf BUSCO_v1.22.tar.gz
chmod +x BUSCO_v1.22/*.py
export PATH=$HOME/BUSCO_v1.22:$PATH

...and once again, to install it permanently::

echo 'export PATH=$HOME/BUSCO_v1.1b1:$PATH' >> $HOME/.bashrc
echo 'export PATH=$HOME/BUSCO_v1.22:$PATH' >> $HOME/.bashrc

Python Dependencies
--------------------
Expand Down
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
setuptools>=0.6.35
pandas>=0.17
pandas==0.18.1
khmer>=2.0
doit>=0.29.0
doit==0.29.0
Sphinx>1.3.1
sphinx-rtd-theme>=0.1.9
nose==1.3.4
Expand Down
Loading

0 comments on commit 86fa03c

Please sign in to comment.