Skip to content

Commit

Permalink
Merge pull request #157 from dib-lab/fix_transdecoder_sep
Browse files Browse the repository at this point in the history
Fix transdecoder sep
  • Loading branch information
camillescott authored Dec 10, 2019
2 parents f7e2329 + 388f875 commit c6597d8
Show file tree
Hide file tree
Showing 23 changed files with 3,220 additions and 65 deletions.
5 changes: 4 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@ deps: FORCE
pip install --requirement requirements.txt

install: deps
python setup.py install
python -m pip install . --ignore-installed --no-deps

dev-install: FORCE
python -m pip install . --ignore-installed --no-deps -vv

test: FORCE
py.test -m "not long and not huge"
Expand Down
2 changes: 1 addition & 1 deletion dammit/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.1
1.2
2 changes: 1 addition & 1 deletion dammit/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ def register_transdecoder_tasks(handler, config, databases,
pfam_csv_fn = '{0}.x.pfam-A.csv'.format(input_fn)
handler.register_task('hmmscan:Pfam-A:remap',
get_remap_hmmer_task(handler.files['longest_orfs_pfam'],
path.join(transdecoder_dir, 'longest_orfs.gff3'),
handler.files['longest_orfs'],
pfam_csv_fn),
files={'Pfam-A-csv': pfam_csv_fn})

Expand Down
4 changes: 3 additions & 1 deletion dammit/fileio/gff3.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def get_next():
return get_next

next_ID = id_gen_wrapper()
_ = next_ID()

class GFF3Parser(ChunkParser):

Expand Down Expand Up @@ -150,12 +151,13 @@ def hmmscan_to_gff3(hmmscan_df, tag='', database=''):

# Confirm whether this is the appropriate value to use
gff3_df['score'] = hmmscan_df['domain_i_evalue']
gff3_df['strand'] = ['.'] * len(hmmscan_df)
gff3_df['strand'] = hmmscan_df['strand']
gff3_df['phase'] = ['.'] * len(hmmscan_df)

def build_attr(row):
data = []
data.append('ID=homology:{0}'.format(next_ID()))
data.append('Parent={0}'.format(row.full_feature_name))
data.append('Name={0}'.format(row.target_name))
data.append('Target={0} {1} {2} +'.format(row.target_name,
row.hmm_coord_from+1,
Expand Down
12 changes: 10 additions & 2 deletions dammit/fileio/maf.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,14 @@ def __iter__(self):
key, _, val = token.strip().partition('=')
cur_aln[key] = float(val)

# First sequence info
# First (subject) sequence info
line = guarded_next()
tokens = line.split()
if len(tokens) < 7:
# sometimes lastal adds an extra line break after
# the sequence identifier...
tokens.extend(guarded_next().split())

cur_aln['s_name'] = tokens[1]
cur_aln['s_start'] = int(tokens[2])
cur_aln['s_aln_len'] = int(tokens[3])
Expand All @@ -84,9 +89,12 @@ def __iter__(self):
if self.aln_strings:
cur_aln['s_aln'] = tokens[6]

# First sequence info
# Second (query) sequence info
line = guarded_next()
tokens = line.split()
if len(tokens) < 7:
tokens.extend(guarded_next().split())

cur_aln['q_name'] = tokens[1]
cur_aln['q_start'] = int(tokens[2])
cur_aln['q_aln_len'] = int(tokens[3])
Expand Down
70 changes: 70 additions & 0 deletions dammit/fileio/transdecoder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Copyright (C) 2015-2018 Camille Scott
# All rights reserved.
#
# This software may be modified and distributed under the terms
# of the BSD license. See the LICENSE file for details.

from khmer import ReadParser
import pandas as pd

from dammit.fileio.base import convert_dtypes, ChunkParser


class TransDecoderPepParser(ChunkParser):
    """Chunked parser for TransDecoder.LongOrfs peptide FASTA output.

    Each FASTA record name encodes the ORF's identity, type, length, and
    its coordinates on the source transcript; this parser splits those
    fields out into DataFrame rows.
    """

    # (column name, dtype) pairs, in output order
    columns = [('full_transcript_name', str),
               ('transcript_name', str),
               ('feature_name', str),
               ('src_start', int),
               ('src_end', int),
               ('length', int),
               ('strand', str),
               ('feature_type', str)]

    def __init__(self, filename, **kwargs):
        super(TransDecoderPepParser, self).__init__(filename, **kwargs)

    def __iter__(self):
        """Yield DataFrames of at most ``self.chunksize`` ORF records."""
        rows = []
        seen = 0
        for record in ReadParser(self.filename):
            # Header has five space-separated fields; the genetic-code
            # field is unused here.
            (full_name, ftype, len_field,
             _gen_code, src_coords) = record.name.split(' ')

            # "transcript.p1" -> ("transcript", "p1"); keep the raw name too
            tname, _, fname = full_name.partition('.')
            _, _, coord_field = src_coords.partition(':')
            # coord_field looks like "start-end(+)" — strand is the char
            # just before the closing paren.  TODO(review): confirm format
            # across TransDecoder versions.
            strand_char = coord_field.strip()[-2]
            _, _, len_value = len_field.partition(':')
            lo, _, hi = coord_field.strip('(-+)').partition('-')

            rows.append([full_name, tname, fname, lo, hi,
                         len_value, strand_char, ftype])
            seen += 1
            if len(rows) >= self.chunksize:
                yield self._build_df(rows)
                rows = []

        if seen == 0:
            self.raise_empty()
        if rows:
            yield self._build_df(rows)

    def _build_df(self, rows):
        """Assemble a typed DataFrame and normalize the coordinates."""
        if not rows:
            self.raise_empty()
        names = [name for name, _ in self.columns]
        df = pd.DataFrame(rows, columns=names)
        convert_dtypes(df, dict(self.columns))
        # TransDecoder reports reverse-strand intervals with start > end;
        # swap those so src_start <= src_end always holds.
        flipped = df.src_start > df.src_end
        df.loc[flipped, 'src_start'], df.loc[flipped, 'src_end'] = \
            df.loc[flipped, 'src_end'], df.loc[flipped, 'src_start']
        # shift to a zero-based, half-open start coordinate
        df.src_start = df.src_start - 1
        return df
42 changes: 17 additions & 25 deletions dammit/tasks/hmmer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
from dammit.parallel import parallel_fasta
from dammit.fileio.hmmer import HMMerParser
from dammit.fileio.gff3 import GFF3Parser
from dammit.fileio.transdecoder import TransDecoderPepParser
from dammit.tasks.transdecoder import TransDecoderLongOrfsTask
from dammit.utils import doit_task, which


Expand Down Expand Up @@ -116,23 +118,8 @@ def task(self, db_filename, params=None, task_dep=None):
return task_d


def split_transdecoder_names(hmmer_df):
hmmer_df.rename(columns={'query_name': 'full_query_name'},
inplace=True)

name_df = hmmer_df.full_query_name.str.split('::', expand=True)
name_df.rename(columns={0: 'td_transcript_name',
1: 'query_name',
2: 'td_g',
3: 'td_m'},
inplace=True)
hmmer_df = pd.concat((hmmer_df, name_df), axis=1)

return hmmer_df


@doit_task
def get_remap_hmmer_task(hmmer_filename, remap_gff_filename, output_filename):
def get_remap_hmmer_task(hmmer_filename, pep_fa_filename, output_filename):
'''Given an hmmscan result from the ORFs generated by
`TransDecoder.LongOrfs` and TransDecoder's GFF3, remap the HMMER results
so that they refer to the original nucleotide coordinates rather than the
Expand All @@ -151,27 +138,32 @@ def get_remap_hmmer_task(hmmer_filename, remap_gff_filename, output_filename):
name = 'remap_hmmer:{0}'.format(os.path.basename(hmmer_filename))

def cmd():
gff_df = GFF3Parser(remap_gff_filename).read()
ref_df = TransDecoderPepParser(pep_fa_filename).read()
hmmer_df = HMMerParser(hmmer_filename).read()
hmmer_df = split_transdecoder_names(hmmer_df)

if len(gff_df) > 0 and len(hmmer_df) > 0:
merged_df = pd.merge(hmmer_df, gff_df, left_on='full_query_name', right_on='ID')
if len(ref_df) > 0 and len(hmmer_df) > 0:
merged_df = pd.merge(hmmer_df,
ref_df,
left_on='query_name',
right_on='full_transcript_name')

hmmer_df['env_coord_from'] = (merged_df.start + \
hmmer_df['env_coord_from'] = (merged_df.src_start + \
(3 * merged_df.env_coord_from)).astype(int)
hmmer_df['env_coord_to'] = (merged_df.start + \
hmmer_df['env_coord_to'] = (merged_df.src_start + \
(3 * merged_df.env_coord_to)).astype(int)
hmmer_df['ali_coord_from'] = (merged_df.start + \
hmmer_df['ali_coord_from'] = (merged_df.src_start + \
(3 * merged_df.ali_coord_from)).astype(int)
hmmer_df['ali_coord_to'] = (merged_df.start + \
hmmer_df['ali_coord_to'] = (merged_df.src_start + \
(3 * merged_df.ali_coord_to)).astype(int)
hmmer_df['strand'] = merged_df['strand']
hmmer_df['full_feature_name'] = merged_df['full_transcript_name']
hmmer_df['query_name'] = merged_df['transcript_name']

hmmer_df.to_csv(output_filename, header=True, index=False)

return {'name': name,
'actions': [cmd],
'file_dep': [hmmer_filename, remap_gff_filename],
'file_dep': [hmmer_filename, pep_fa_filename],
'targets': [output_filename],
'clean': [clean_targets]}

13 changes: 13 additions & 0 deletions dammit/tasks/transdecoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
from doit.tools import LongRunning
from doit.task import clean_targets

import sh

from dammit.tasks.utils import clean_folder, DependentTask, InstallationError
from dammit.profile import profile_task
from dammit.utils import which, doit_task
Expand All @@ -21,10 +23,21 @@ def deps(self):
longorfs = which('TransDecoder.LongOrfs')
if longorfs is None:
raise InstallationError('TransDecoder.LongOrfs not found.')
if self.version() < 5:
raise InstallationError('TransDecoder >= 5 required.')
if self.logger:
self.logger.debug('TransDecoder.LongOrfs:' + longorfs)
return longorfs

@staticmethod
def version():
    """Best-effort probe of the installed TransDecoder major version.

    TransDecoder >= 5 accepts ``--version``; older releases exit nonzero
    on that flag, so a failed invocation is reported as version 3.
    Returns an int (3 or 5) suitable for the ``>= 5`` check in ``deps``.
    """
    probe = sh.Command('TransDecoder.LongOrfs')
    try:
        probe('--version')
    except sh.ErrorReturnCode:
        # old TransDecoder: flag unrecognized, command exits nonzero
        return 3
    else:
        return 5

@doit_task
@profile_task
def task(self, input_filename, params=None):
Expand Down
Loading

0 comments on commit c6597d8

Please sign in to comment.