diff --git a/CHANGELOG.md b/CHANGELOG.md
index f56be1e..6fb73a2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,16 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [0.6.0] - 2023-10-19
+
+### Added
+
+- `io`: Added new `io.pepxml` reader
+
+### Fixed
+
+- Docs: Add ionbot to README.rst, fix order in API docs
+
## [0.5.0] - 2023-09-20
### Added
@@ -21,6 +31,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `io.mzid`: Allow inconsistent presence of score in PSMs in a single mzid file
### Changed
+
- `PSM`: Values of the `rescoring_features` dictionary are now coerced to floats
- io: Raise `PSMUtilsIOException` when passed filetype is not known
- `io`: Make io reader `read_file` method inheritable (code cleanup)
@@ -31,6 +42,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Formatting: Increase max line length to 99 (code formatting)
### Fixed
+
- `PSMList`: Fix issue where `psm_list["protein_list"]` resulted in a Numpy error due to the inconsistent shape of the lists.
- `io.tsv`: Throw more descriptive `PSMUtilsIOException` when handeling tsv errors
- `io.msamanda`: Fix support for N/C-terminal modifications
@@ -69,15 +81,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [0.3.1] - 2023-06-19
### Changed
+
- `io.sage`: Change `spectrum_fdr` to `spectrum_q` (crf. lazear/sage#64).
## [0.3.0] - 2023-06-08
### Added
+
- Add reader for [Sage](https://github.com/lazear/sage) PSM files.
- `io.mzid`: Add reading/writing of PEP and q-values
### Changed
+
- `psm`: The default values of `PSM.provenance_data`, `PSM.metadata` and `PSM.rescoring_features` are now `dict()` instead of `None`.
- `PSMList`: Also allow Numpy integers for indexing a single PSM
- `io.mzid.MzidReader`: Attempt to parse `retention time` or `scan start time` cvParams from both SpectrumIdentificationResult as SpectrumIdentificationItem levels. Note that according to the mzIdentML specification document (v1.1.1) neither cvParams are expected to be present at either level.
@@ -88,6 +103,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Filter warnings from `psims.mzmlb` on import, as `mzmlb` is not used
### Fixed
+
- `psm`: Fix missing qvalue and pep in docstring
- `peptidoform`: ProForma mass modifications are now correctly parsed within the `rename_modifications` function.
- `io.maxquant.MSMSReader`: Correctly parse empty `Proteins` column to `None`
@@ -157,7 +173,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed
-- `PSMList`: Truncate __repr__ to first five entries only, avoiding crashing notebook output
+- `PSMList`: Truncate `__repr__` to first five entries only, avoiding crashing notebook output
- `Peptidoform`: Minor typing fix
- `add_fixed_modifications`: Allow input as dict as well as list of tuples
- `io`: Fix issue where the `NamedTemporaryFile` for `_supports_write_psm` was seen as invalid Percolator file
diff --git a/README.rst b/README.rst
index 23a293d..b50299a 100644
--- a/README.rst
+++ b/README.rst
@@ -89,11 +89,13 @@ Supported file formats
===================================================================================================================== ======================== =============== ===============
File format psm_utils tag Read support Write support
===================================================================================================================== ======================== =============== ===============
+ `ionbot CSV `_ ``ionbot`` ✅ ❌
`OpenMS idXML `_ ``idxml`` ✅ ❌
`MaxQuant msms.txt `_ ``msms`` ✅ ❌
`MS Amanda CSV `_ ``msamanda`` ✅ ❌
`mzIdentML `_ ``mzid`` ✅ ✅
`Peptide Record `_ ``peprec`` ✅ ✅
+ `pepXML `_ ``pepxml`` ✅ ❌
`Percolator tab `_ ``percolator`` ✅ ✅
Proteome Discoverer MSF ``proteome_discoverer`` ✅ ❌
`Sage `_ ``sage`` ✅ ❌
diff --git a/docs/source/api/psm_utils.io.rst b/docs/source/api/psm_utils.io.rst
index b79563f..aae0b9f 100644
--- a/docs/source/api/psm_utils.io.rst
+++ b/docs/source/api/psm_utils.io.rst
@@ -6,6 +6,14 @@ psm_utils.io
:members:
+psm_utils.io.ionbot
+##########################
+
+.. automodule:: psm_utils.io.ionbot
+ :members:
+ :inherited-members:
+
+
psm_utils.io.idxml
##################
@@ -24,6 +32,7 @@ psm_utils.io.maxquant
:inherited-members:
+
psm_utils.io.msamanda
#####################
@@ -51,6 +60,15 @@ psm_utils.io.peptide_record
+psm_utils.io.pepxml
+###########################
+
+.. automodule:: psm_utils.io.pepxml
+ :members:
+ :inherited-members:
+
+
+
psm_utils.io.percolator
#######################
@@ -92,10 +110,3 @@ psm_utils.io.xtandem
.. automodule:: psm_utils.io.xtandem
:members:
:inherited-members:
-
-psm_utils.io.ionbot
-##########################
-
-.. automodule:: psm_utils.io.ionbot
- :members:
- :inherited-members:
\ No newline at end of file
diff --git a/example_files/PXD035029-34339_x00530_AH_AH006c_DDA_1_head.pepXML.gz b/example_files/PXD035029-34339_x00530_AH_AH006c_DDA_1_head.pepXML.gz
new file mode 100644
index 0000000..b5808a4
Binary files /dev/null and b/example_files/PXD035029-34339_x00530_AH_AH006c_DDA_1_head.pepXML.gz differ
diff --git a/psm_utils/__init__.py b/psm_utils/__init__.py
index 6f1cb91..c4822a4 100644
--- a/psm_utils/__init__.py
+++ b/psm_utils/__init__.py
@@ -1,6 +1,6 @@
"""Common utilities for parsing and handling PSMs, and search engine results."""
-__version__ = "0.5.0"
+__version__ = "0.6.0"
from warnings import filterwarnings
diff --git a/psm_utils/io/__init__.py b/psm_utils/io/__init__.py
index 2fc69cd..6b3fa47 100644
--- a/psm_utils/io/__init__.py
+++ b/psm_utils/io/__init__.py
@@ -9,16 +9,17 @@
from rich.progress import track
import psm_utils.io.idxml as idxml
+import psm_utils.io.ionbot as ionbot
import psm_utils.io.maxquant as maxquant
import psm_utils.io.msamanda as msamanda
import psm_utils.io.mzid as mzid
import psm_utils.io.peptide_record as peptide_record
+import psm_utils.io.pepxml as pepxml
import psm_utils.io.percolator as percolator
import psm_utils.io.proteome_discoverer as proteome_discoverer
import psm_utils.io.sage as sage
import psm_utils.io.tsv as tsv
import psm_utils.io.xtandem as xtandem
-import psm_utils.io.ionbot as ionbot
from psm_utils.io._base_classes import WriterBase
from psm_utils.io.exceptions import PSMUtilsIOException
from psm_utils.psm import PSM
@@ -49,6 +50,12 @@
"extension": ".peprec.txt",
"filename_pattern": r"(^.*\.peprec(?:\.txt)?$)|(?:^peprec\.txt$)",
},
+ "pepxml": {
+ "reader": pepxml.PepXMLReader,
+ "writer": None,
+ "extension": ".pepxml",
+ "filename_pattern": r"^.*\.pepxml$",
+ },
"percolator": {
"reader": percolator.PercolatorTabReader,
"writer": percolator.PercolatorTabWriter,
diff --git a/psm_utils/io/pepxml.py b/psm_utils/io/pepxml.py
new file mode 100644
index 0000000..ce614dc
--- /dev/null
+++ b/psm_utils/io/pepxml.py
@@ -0,0 +1,146 @@
+"""Interface with TPP pepXML PSM files."""
+
+from __future__ import annotations
+
+import logging
+from collections import defaultdict
+from pathlib import Path
+from typing import List, Optional, Union
+
+from pyteomics import pepxml, proforma
+
+from psm_utils.io._base_classes import ReaderBase
+from psm_utils.peptidoform import Peptidoform
+from psm_utils.psm import PSM
+from psm_utils.utils import mass_to_mz
+
+logger = logging.getLogger(__name__)
+
+STANDARD_SEARCHENGINE_SCORES = [
+ "expect",
+ "EValue",
+ "Evalue",
+ "SpecEValue",
+ "xcorr", # Fallback if no e-value is present
+ "delta_dot", # SpectraST
+ "mzFidelity",
+]
+
+
+class PepXMLReader(ReaderBase):
+ def __init__(self, filename: Union[str, Path], *args, score_key: str = None, **kwargs) -> None:
+ """
+ Reader for pepXML PSM files.
+
+ Parameters
+ ----------
+ filename: str, pathlib.Path
+ Path to PSM file.
+ score_key: str, optional
+ Name of the score metric to use as PSM score. If not provided, the score metric is
+ inferred from a list of known search engine scores.
+
+ """
+ super().__init__(filename, *args, **kwargs)
+ self.score_key = score_key or self._infer_score_name()
+
+ def __iter__(self):
+ """Iterate over file and return PSMs one-by-one."""
+ with pepxml.read(str(self.filename)) as reader:
+ for spectrum_query in reader:
+ for search_hit in spectrum_query["search_hit"]:
+ yield self._parse_psm(spectrum_query, search_hit)
+
+ def _infer_score_name(self) -> str:
+ """Infer the score from the list of known PSM scores."""
+ # Get scores from first PSM
+ with pepxml.read(str(self.filename)) as reader:
+ for spectrum_query in reader:
+ score_keys = spectrum_query["search_hit"][0]["search_score"].keys()
+ break
+
+ # Infer score name
+ if not score_keys:
+ logger.warning("No pepXML scores found.")
+ return None
+ else:
+ for score in STANDARD_SEARCHENGINE_SCORES: # Check for known scores
+ if score in score_keys:
+ logger.debug(f"Using known pepXML score `{score}`.")
+ return score
+ else:
+ logger.warning(f"No known pepXML scores found. Defaulting to `{score_keys[0]}`.")
+ return score_keys[0] # Default to the first one if nothing found
+
+ @staticmethod
+ def _parse_peptidoform(peptide: str, modifications: List[dict], charge: Optional[int] = None):
+ """Parse pepXML peptide to :py:class:`~psm_utils.peptidoform.Peptidoform`."""
+ modifications_dict = defaultdict(list)
+ n_term = []
+ c_term = []
+ for mod in modifications:
+ mod_tag = proforma.process_tag_tokens(f"{mod['mass']:+}")
+ if mod["position"] == 0:
+ n_term.append(mod_tag)
+ elif mod["position"] == len(peptide) + 1:
+ c_term.append(mod_tag)
+ else:
+ modifications_dict[mod["position"]].append(mod_tag)
+
+ sequence = [(aa, modifications_dict[i] or None) for i, aa in enumerate(peptide)]
+ properties = {
+ "n_term": n_term,
+ "c_term": c_term,
+ "charge_state": proforma.ChargeState(charge) if charge else None,
+ "unlocalized_modifications": [],
+ "labile_modifications": [],
+ "fixed_modifications": [],
+ "intervals": [],
+ "isotopes": [],
+ "group_ids": [],
+ }
+ return Peptidoform(proforma.ProForma(sequence, properties))
+
+ def _parse_psm(self, spectrum_query: dict, search_hit: dict) -> PSM:
+ """Parse pepXML PSM to :py:class:`~psm_utils.psm.PSM`."""
+ return PSM(
+ peptidoform=self._parse_peptidoform(
+ search_hit["peptide"],
+ search_hit["modifications"],
+ spectrum_query["assumed_charge"],
+ ),
+ spectrum_id=spectrum_query["spectrum"],
+ run=None,
+ collection=None,
+ spectrum=None,
+ is_decoy=None,
+ score=search_hit["search_score"][self.score_key],
+ qvalue=None,
+ pep=None,
+ precursor_mz=mass_to_mz(
+ spectrum_query["precursor_neutral_mass"], spectrum_query["assumed_charge"]
+ ),
+ retention_time=spectrum_query["retention_time_sec"],
+ ion_mobility=spectrum_query["ion_mobility"]
+ if "ion_mobility" in spectrum_query
+ else None,
+ protein_list=[p["protein"] for p in search_hit["proteins"]],
+ rank=search_hit["hit_rank"],
+ source=None,
+ provenance_data={
+ "pepxml_index": str(spectrum_query["index"]),
+ "start_scan": str(spectrum_query["start_scan"]),
+ "end_scan": str(spectrum_query["end_scan"]),
+ },
+ metadata={
+ "num_matched_ions": str(search_hit["num_matched_ions"]),
+ "tot_num_ions": str(search_hit["tot_num_ions"]),
+ "num_missed_cleavages": str(search_hit["num_missed_cleavages"]),
+ }.update(
+ {
+ f"search_score_{key.lower()}": str(search_hit["search_score"][key])
+ for key in search_hit["search_score"]
+ }
+ ),
+ rescoring_features=None,
+ )
diff --git a/tests/test_io/test_pepxml.py b/tests/test_io/test_pepxml.py
new file mode 100644
index 0000000..1e1d388
--- /dev/null
+++ b/tests/test_io/test_pepxml.py
@@ -0,0 +1,37 @@
+from psm_utils.peptidoform import Peptidoform
+from psm_utils.io.pepxml import PepXMLReader
+
+class TestPepXMLReader:
+ def test_parse_peptidoform(self):
+ test_cases = [
+ {
+ "in": {
+ "peptide": "ACDEK",
+ "modifications": [],
+ "charge": 2,
+ },
+ "out": Peptidoform("ACDEK/2"),
+ },
+ {
+ "in": {
+ "peptide": "STEEQNGGGQK",
+ "modifications": [
+ {"position": 0, "mass": 43.017841151532004},
+ {"position": 2, "mass": 181.014009},
+ ],
+ "charge": 2,
+ },
+ "out": Peptidoform("[+43.017841151532004]-STE[+181.014009]EQNGGGQK/2"),
+ },
+ {
+ "in": {
+ "peptide": "ACDEK",
+ "modifications": [{"position": 6, "mass": 181.014009}],
+ "charge": 3,
+ },
+ "out": Peptidoform("ACDEK-[+181.014009]/3"),
+ },
+ ]
+
+ for test_case in test_cases:
+ assert test_case["out"] == PepXMLReader._parse_peptidoform(**test_case["in"])