Skip to content

Commit

Permalink
Allow iteration over search_hit in pepxml
Browse files Browse the repository at this point in the history
  • Loading branch information
levitsky committed Sep 26, 2024
1 parent 35cddc6 commit 2d0a874
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 42 deletions.
5 changes: 4 additions & 1 deletion CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
4.7.4a1
4.7.4a2
-------

- Fix call signature for :py:func:`pepxml.read` and make it a full alias to :py:class:`pepxml.PepXML`.
- Disable indexing of pepXML in :py:func:`pepxml.DataFrame`, which fixes creation of dataframes from files
with non-unique spectrum IDs.
- Fix #156.
- Allow iteration over search hits instead of spectrum queries with :py:meth:`pyteomics.pepxml.PepXML.search_hits`.
:py:func:`pyteomics.pepxml.DataFrame` has a new argument `by`, which accepts values "spectrum_query" (default)
and "search_hit" (new).


4.7.3
Expand Down
112 changes: 72 additions & 40 deletions pyteomics/pepxml.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,9 @@


class PepXML(xml.MultiProcessingXML, xml.IndexSavingXML):
"""Parser class for pepXML files."""
"""
Parser class for pepXML files.
"""
file_format = 'pepXML'
_root_element = 'msms_pipeline_analysis'
_default_schema = _schema_defaults._pepxml_schema_defaults
Expand Down Expand Up @@ -195,6 +197,16 @@ def safe_float(s):
info['search_hit'].sort(key=lambda x: x['hit_rank'])
return info

def search_hits(self):
"""
Iterate over search hits rather than spectrum queries.
"""
for sq in self:
sh = sq.pop('search_hit', [])
for item in sh:
item.update(sq)
yield item


def read(*args, **kwargs):
"""Parse `source` and iterate through peptide-spectrum matches.
Expand Down Expand Up @@ -369,6 +381,11 @@ def DataFrame(*args, **kwargs):
**kwargs
Passed to :py:func:`chain`.
by : str, keyword only, optional
Can be :py:const:`"spectrum_query"` (default) or :py:const:`"search_hit"`.
One row in the resulting dataframe corresponds to one element of the given type.
If :py:const:`"spectrum_query"` is set, only the top search hit is shown in the dataframe.
sep : str or None, keyword only, optional
Some values related to PSMs (such as protein information) are variable-length
lists. If `sep` is a :py:class:`str`, they will be packed into single string using
Expand Down Expand Up @@ -403,8 +420,50 @@ def DataFrame(*args, **kwargs):
kwargs = kwargs.copy()
sep = kwargs.pop('sep', None)
pd_kwargs = kwargs.pop('pd_kwargs', {})
def gen_items():
kwargs.setdefault('use_index', False)
kwargs.setdefault('use_index', False)
by = kwargs.pop('by', 'spectrum_query')

def search_hit_info(sh):
info = {}
proteins = sh.pop('proteins')
prot_dict = {}
for p in proteins:
for k in p:
prot_dict[k] = []
for p in proteins:
for k, v in prot_dict.items():
v.append(p.get(k))
if sep is None:
info.update(prot_dict)
else:
for k, v in prot_dict.items():
info[k] = sep.join(str(val) if val is not None else '' for val in v)
info.update(sh.pop('search_score'))
mods = sh.pop('modifications', [])
formatted_mods = ['{0[mass]:.3f}@{0[position]}'.format(x) for x in mods]
if sep is not None:
info['modifications'] = sep.join(formatted_mods)
else:
info['modifications'] = formatted_mods
for k, v in sh.items():
if isinstance(v, (str, int, float)):
info[k] = v
if 'analysis_result' in sh:
for ar in sh['analysis_result']:
if ar['analysis'] == 'peptideprophet':
try:
info.update(ar['peptideprophet_result']['parameter'])
except KeyError:
pass
info['peptideprophet_probability'] = ar['peptideprophet_result']['probability']
info['peptideprophet_ntt_prob'] = ar['peptideprophet_result']['all_ntt_prob']
elif ar['analysis'] == 'interprophet':
info.update(ar['interprophet_result']['parameter'])
info['interprophet_probability'] = ar['interprophet_result']['probability']
info['interprophet_ntt_prob'] = ar['interprophet_result']['all_ntt_prob']
return info

def iter_spectrum_query():
with chain(*args, **kwargs) as f:
for item in f:
info = {}
Expand All @@ -413,44 +472,17 @@ def gen_items():
info[k] = v
if 'search_hit' in item:
sh = item['search_hit'][0]
proteins = sh.pop('proteins')
prot_dict = {}
for p in proteins:
for k in p:
prot_dict[k] = []
for p in proteins:
for k, v in prot_dict.items():
v.append(p.get(k))
if sep is None:
info.update(prot_dict)
else:
for k, v in prot_dict.items():
info[k] = sep.join(str(val) if val is not None else '' for val in v)
info.update(sh.pop('search_score'))
mods = sh.pop('modifications', [])
formatted_mods = ['{0[mass]:.3f}@{0[position]}'.format(x) for x in mods]
if sep is not None:
info['modifications'] = sep.join(formatted_mods)
else:
info['modifications'] = formatted_mods
for k, v in sh.items():
if isinstance(v, (str, int, float)):
info[k] = v
if 'analysis_result' in sh:
for ar in sh['analysis_result']:
if ar['analysis'] == 'peptideprophet':
try:
info.update(ar['peptideprophet_result']['parameter'])
except KeyError:
pass
info['peptideprophet_probability'] = ar['peptideprophet_result']['probability']
info['peptideprophet_ntt_prob'] = ar['peptideprophet_result']['all_ntt_prob']
elif ar['analysis'] == 'interprophet':
info.update(ar['interprophet_result']['parameter'])
info['interprophet_probability'] = ar['interprophet_result']['probability']
info['interprophet_ntt_prob'] = ar['interprophet_result']['all_ntt_prob']
info.update(search_hit_info(sh))
yield info
return pd.DataFrame(gen_items(), **pd_kwargs)

def iter_search_hit():
for source in args:
with PepXML(source, **kwargs) as f:
for sh in f.search_hits():
yield search_hit_info(sh)

items = {'spectrum_query': iter_spectrum_query, 'search_hit': iter_search_hit}[by]
return pd.DataFrame(items(), **pd_kwargs)


def filter_df(*args, **kwargs):
Expand Down
2 changes: 1 addition & 1 deletion pyteomics/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
"""

__version__ = '4.7.4a1'
__version__ = '4.7.4a2'

from collections import namedtuple
import re
Expand Down

0 comments on commit 2d0a874

Please sign in to comment.