diff --git a/CHANGELOG.md b/CHANGELOG.md index f56be1e..6fb73a2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,16 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.6.0] - 2023-10-19 + +### Added + +- `io`: Added new `io.pepxml` reader + +### Fixed + +- Docs: Add ionbot to README.rst, fix order in API docs + ## [0.5.0] - 2023-09-20 ### Added @@ -21,6 +31,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `io.mzid`: Allow inconsistent presence of score in PSMs in a single mzid file ### Changed + - `PSM`: Values of the `rescoring_features` dictionary are now coerced to floats - io: Raise `PSMUtilsIOException` when passed filetype is not known - `io`: Make io reader `read_file` method inheritable (code cleanup) @@ -31,6 +42,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Formatting: Increase max line length to 99 (code formatting) ### Fixed + - `PSMList`: Fix issue where `psm_list["protein_list"]` resulted in a Numpy error due to the inconsistent shape of the lists. - `io.tsv`: Throw more descriptive `PSMUtilsIOException` when handeling tsv errors - `io.msamanda`: Fix support for N/C-terminal modifications @@ -69,15 +81,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [0.3.1] - 2023-06-19 ### Changed + - `io.sage`: Change `spectrum_fdr` to `spectrum_q` (crf. lazear/sage#64). ## [0.3.0] - 2023-06-08 ### Added + - Add reader for [Sage](https://github.com/lazear/sage) PSM files. - `io.mzid`: Add reading/writing of PEP and q-values ### Changed + - `psm`: The default values of `PSM.provenance_data`, `PSM.metadata` and `PSM.rescoring_features` are now `dict()` instead of `None`. - `PSMList`: Also allow Numpy integers for indexing a single PSM - `io.mzid.MzidReader`: Attempt to parse `retention time` or `scan start time` cvParams from both SpectrumIdentificationResult as SpectrumIdentificationItem levels. Note that according to the mzIdentML specification document (v1.1.1) neither cvParams are expected to be present at either level. @@ -88,6 +103,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Filter warnings from `psims.mzmlb` on import, as `mzmlb` is not used ### Fixed + - `psm`: Fix missing qvalue and pep in docstring - `peptidoform`: ProForma mass modifications are now correctly parsed within the `rename_modifications` function. - `io.maxquant.MSMSReader`: Correctly parse empty `Proteins` column to `None` @@ -157,7 +173,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed -- `PSMList`: Truncate __repr__ to first five entries only, avoiding crashing notebook output +- `PSMList`: Truncate `__repr__` to first five entries only, avoiding crashing notebook output - `Peptidoform`: Minor typing fix - `add_fixed_modifications`: Allow input as dict as well as list of tuples - `io`: Fix issue where the `NamedTemporaryFile` for `_supports_write_psm` was seen as invalid Percolator file diff --git a/README.rst b/README.rst index 23a293d..b50299a 100644 --- a/README.rst +++ b/README.rst @@ -89,11 +89,13 @@ Supported file formats ===================================================================================================================== ======================== =============== =============== File format psm_utils tag Read support Write support ===================================================================================================================== ======================== =============== =============== + `ionbot CSV `_ ``ionbot`` ✅ ❌ `OpenMS idXML `_ ``idxml`` ✅ ❌ `MaxQuant msms.txt `_ ``msms`` ✅ ❌ `MS Amanda CSV `_ ``msamanda`` ✅ ❌ `mzIdentML `_ ``mzid`` ✅ ✅ `Peptide Record `_ ``peprec`` ✅ ✅ + `pepXML `_ ``pepxml`` ✅ ❌ `Percolator tab `_ ``percolator`` ✅ ✅ Proteome Discoverer MSF ``proteome_discoverer`` ✅ ❌ `Sage `_ ``sage`` ✅ ❌ diff --git a/docs/source/api/psm_utils.io.rst b/docs/source/api/psm_utils.io.rst index b79563f..aae0b9f 100644 --- a/docs/source/api/psm_utils.io.rst +++ b/docs/source/api/psm_utils.io.rst @@ -6,6 +6,14 @@ psm_utils.io :members: +psm_utils.io.ionbot +########################## + +.. automodule:: psm_utils.io.ionbot + :members: + :inherited-members: + + psm_utils.io.idxml ################## @@ -24,6 +32,7 @@ psm_utils.io.maxquant :inherited-members: + psm_utils.io.msamanda ##################### @@ -51,6 +60,15 @@ psm_utils.io.peptide_record +psm_utils.io.pepxml +########################### + +.. automodule:: psm_utils.io.pepxml + :members: + :inherited-members: + + + psm_utils.io.percolator ####################### @@ -92,10 +110,3 @@ psm_utils.io.xtandem .. automodule:: psm_utils.io.xtandem :members: :inherited-members: - -psm_utils.io.ionbot -########################## - -.. automodule:: psm_utils.io.ionbot - :members: - :inherited-members: \ No newline at end of file diff --git a/example_files/PXD035029-34339_x00530_AH_AH006c_DDA_1_head.pepXML.gz b/example_files/PXD035029-34339_x00530_AH_AH006c_DDA_1_head.pepXML.gz new file mode 100644 index 0000000..b5808a4 Binary files /dev/null and b/example_files/PXD035029-34339_x00530_AH_AH006c_DDA_1_head.pepXML.gz differ diff --git a/psm_utils/__init__.py b/psm_utils/__init__.py index 6f1cb91..c4822a4 100644 --- a/psm_utils/__init__.py +++ b/psm_utils/__init__.py @@ -1,6 +1,6 @@ """Common utilities for parsing and handling PSMs, and search engine results.""" -__version__ = "0.5.0" +__version__ = "0.6.0" from warnings import filterwarnings diff --git a/psm_utils/io/__init__.py b/psm_utils/io/__init__.py index 2fc69cd..6b3fa47 100644 --- a/psm_utils/io/__init__.py +++ b/psm_utils/io/__init__.py @@ -9,16 +9,17 @@ from rich.progress import track import psm_utils.io.idxml as idxml +import psm_utils.io.ionbot as ionbot import psm_utils.io.maxquant as maxquant import psm_utils.io.msamanda as msamanda import psm_utils.io.mzid as mzid import psm_utils.io.peptide_record as peptide_record +import psm_utils.io.pepxml as pepxml import psm_utils.io.percolator as percolator import psm_utils.io.proteome_discoverer as proteome_discoverer import psm_utils.io.sage as sage import psm_utils.io.tsv as tsv import psm_utils.io.xtandem as xtandem -import psm_utils.io.ionbot as ionbot from psm_utils.io._base_classes import WriterBase from psm_utils.io.exceptions import PSMUtilsIOException from psm_utils.psm import PSM @@ -49,6 +50,12 @@ "extension": ".peprec.txt", "filename_pattern": r"(^.*\.peprec(?:\.txt)?$)|(?:^peprec\.txt$)", }, + "pepxml": { + "reader": pepxml.PepXMLReader, + "writer": None, + "extension": ".pepxml", + "filename_pattern": r"^.*\.pepxml$", + }, "percolator": { "reader": percolator.PercolatorTabReader, "writer": percolator.PercolatorTabWriter, diff --git a/psm_utils/io/pepxml.py b/psm_utils/io/pepxml.py new file mode 100644 index 0000000..ce614dc --- /dev/null +++ b/psm_utils/io/pepxml.py @@ -0,0 +1,146 @@ +"""Interface with TPP pepXML PSM files.""" + +from __future__ import annotations + +import logging +from collections import defaultdict +from pathlib import Path +from typing import List, Optional, Union + +from pyteomics import pepxml, proforma + +from psm_utils.io._base_classes import ReaderBase +from psm_utils.peptidoform import Peptidoform +from psm_utils.psm import PSM +from psm_utils.utils import mass_to_mz + +logger = logging.getLogger(__name__) + +STANDARD_SEARCHENGINE_SCORES = [ + "expect", + "EValue", + "Evalue", + "SpecEValue", + "xcorr", # Fallback if no e-value is present + "delta_dot", # SpectraST + "mzFidelity", +] + + +class PepXMLReader(ReaderBase): + def __init__(self, filename: Union[str, Path], *args, score_key: str = None, **kwargs) -> None: + """ + Reader for pepXML PSM files. + + Parameters + ---------- + filename: str, pathlib.Path + Path to PSM file. + score_key: str, optional + Name of the score metric to use as PSM score. If not provided, the score metric is + inferred from a list of known search engine scores. + + """ + super().__init__(filename, *args, **kwargs) + self.score_key = score_key or self._infer_score_name() + + def __iter__(self): + """Iterate over file and return PSMs one-by-one.""" + with pepxml.read(str(self.filename)) as reader: + for spectrum_query in reader: + for search_hit in spectrum_query["search_hit"]: + yield self._parse_psm(spectrum_query, search_hit) + + def _infer_score_name(self) -> str: + """Infer the score from the list of known PSM scores.""" + # Get scores from first PSM + with pepxml.read(str(self.filename)) as reader: + for spectrum_query in reader: + score_keys = spectrum_query["search_hit"][0]["search_score"].keys() + break + + # Infer score name + if not score_keys: + logger.warning("No pepXML scores found.") + return None + else: + for score in STANDARD_SEARCHENGINE_SCORES: # Check for known scores + if score in score_keys: + logger.debug(f"Using known pepXML score `{score}`.") + return score + else: + logger.warning(f"No known pepXML scores found. Defaulting to `{score_keys[0]}`.") + return score_keys[0] # Default to the first one if nothing found + + @staticmethod + def _parse_peptidoform(peptide: str, modifications: List[dict], charge: Optional[int] = None): + """Parse pepXML peptide to :py:class:`~psm_utils.peptidoform.Peptidoform`.""" + modifications_dict = defaultdict(list) + n_term = [] + c_term = [] + for mod in modifications: + mod_tag = proforma.process_tag_tokens(f"{mod['mass']:+}") + if mod["position"] == 0: + n_term.append(mod_tag) + elif mod["position"] == len(peptide) + 1: + c_term.append(mod_tag) + else: + modifications_dict[mod["position"]].append(mod_tag) + + sequence = [(aa, modifications_dict[i] or None) for i, aa in enumerate(peptide)] + properties = { + "n_term": n_term, + "c_term": c_term, + "charge_state": proforma.ChargeState(charge) if charge else None, + "unlocalized_modifications": [], + "labile_modifications": [], + "fixed_modifications": [], + "intervals": [], + "isotopes": [], + "group_ids": [], + } + return Peptidoform(proforma.ProForma(sequence, properties)) + + def _parse_psm(self, spectrum_query: dict, search_hit: dict) -> PSM: + """Parse pepXML PSM to :py:class:`~psm_utils.psm.PSM`.""" + return PSM( + peptidoform=self._parse_peptidoform( + search_hit["peptide"], + search_hit["modifications"], + spectrum_query["assumed_charge"], + ), + spectrum_id=spectrum_query["spectrum"], + run=None, + collection=None, + spectrum=None, + is_decoy=None, + score=search_hit["search_score"][self.score_key], + qvalue=None, + pep=None, + precursor_mz=mass_to_mz( + spectrum_query["precursor_neutral_mass"], spectrum_query["assumed_charge"] + ), + retention_time=spectrum_query["retention_time_sec"], + ion_mobility=spectrum_query["ion_mobility"] + if "ion_mobility" in spectrum_query + else None, + protein_list=[p["protein"] for p in search_hit["proteins"]], + rank=search_hit["hit_rank"], + source=None, + provenance_data={ + "pepxml_index": str(spectrum_query["index"]), + "start_scan": str(spectrum_query["start_scan"]), + "end_scan": str(spectrum_query["end_scan"]), + }, + metadata={ + "num_matched_ions": str(search_hit["num_matched_ions"]), + "tot_num_ions": str(search_hit["tot_num_ions"]), + "num_missed_cleavages": str(search_hit["num_missed_cleavages"]), + }.update( + { + f"search_score_{key.lower()}": str(search_hit["search_score"][key]) + for key in search_hit["search_score"] + } + ), + rescoring_features=None, + ) diff --git a/tests/test_io/test_pepxml.py b/tests/test_io/test_pepxml.py new file mode 100644 index 0000000..1e1d388 --- /dev/null +++ b/tests/test_io/test_pepxml.py @@ -0,0 +1,37 @@ +from psm_utils.peptidoform import Peptidoform +from psm_utils.io.pepxml import PepXMLReader + +class TestPepXMLReader: + def test_parse_peptidoform(self): + test_cases = [ + { + "in": { + "peptide": "ACDEK", + "modifications": [], + "charge": 2, + }, + "out": Peptidoform("ACDEK/2"), + }, + { + "in": { + "peptide": "STEEQNGGGQK", + "modifications": [ + {"position": 0, "mass": 43.017841151532004}, + {"position": 2, "mass": 181.014009}, + ], + "charge": 2, + }, + "out": Peptidoform("[+43.017841151532004]-STE[+181.014009]EQNGGGQK/2"), + }, + { + "in": { + "peptide": "ACDEK", + "modifications": [{"position": 6, "mass": 181.014009}], + "charge": 3, + }, + "out": Peptidoform("ACDEK-[+181.014009]/3"), + }, + ] + + for test_case in test_cases: + assert test_case["out"] == PepXMLReader._parse_peptidoform(**test_case["in"])