Skip to content

Commit

Permalink
Merge pull request #43 from compomics/ionbot_parser
Browse files Browse the repository at this point in the history
Add ionbot reader; read ion mobility from mzid files
  • Loading branch information
RalfG authored Sep 15, 2023
2 parents e2cb5d4 + 0b96576 commit 5f3939d
Show file tree
Hide file tree
Showing 7 changed files with 350 additions and 26 deletions.
7 changes: 7 additions & 0 deletions docs/source/api/psm_utils.io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,10 @@ psm_utils.io.xtandem
.. automodule:: psm_utils.io.xtandem
:members:
:inherited-members:

psm_utils.io.ionbot
##########################

.. automodule:: psm_utils.io.ionbot
:members:
:inherited-members:
7 changes: 7 additions & 0 deletions psm_utils/io/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import psm_utils.io.sage as sage
import psm_utils.io.tsv as tsv
import psm_utils.io.xtandem as xtandem
import psm_utils.io.ionbot as ionbot
from psm_utils.io._base_classes import WriterBase
from psm_utils.io.exceptions import PSMUtilsIOException
from psm_utils.psm import PSM
Expand Down Expand Up @@ -84,6 +85,12 @@
"extension": ".tsv",
"filename_pattern": r"^.*(?:_|\.).sage.tsv$",
},
"ionbot": {
"reader": ionbot.IonbotReader,
"writer": None,
"extension": "ionbot.first.csv",
"filename_pattern": r"^ionbot.first.csv$",
},
}
READERS = {k: v["reader"] for k, v in FILETYPES.items() if v["reader"]}
WRITERS = {k: v["writer"] for k, v in FILETYPES.items() if v["writer"]}
Expand Down
142 changes: 142 additions & 0 deletions psm_utils/io/ionbot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
"""
Interface with ionbot PSM files.
Currently only supports the ionbot.first.csv files.
"""

from __future__ import annotations

import csv
import re
from pathlib import Path
from typing import Dict, Iterable, Union

from psm_utils.io._base_classes import ReaderBase
from psm_utils.io.exceptions import PSMUtilsIOException
from psm_utils.peptidoform import Peptidoform
from psm_utils.psm import PSM
from psm_utils.psm_list import PSMList

# Columns that IonbotReader treats as "already consumed": when a row is parsed, every
# column that is NOT in this list is copied (as a string) into the ``metadata`` of the PSM.
# NOTE(review): the reader looks up "matched_peptide" and "m/z", which are not listed here
# (so both also end up in metadata), while "database_peptide" is listed but never read in
# this module — confirm whether that is intended.
REQUIRED_COLUMNS = [
"database_peptide",
"modifications",
"charge",
"spectrum_title",
"spectrum_file",
"proteins",
"observed_retention_time",
"database",
"psm_score",
"q-value",
"PEP",
]


class IonbotReader(ReaderBase):
    """Reader for ``ionbot.first.csv`` PSM files."""

    # Splits an ionbot modification label into an optional leading ``[...]`` group (``U``),
    # the modification name (``mod``), and an optional trailing ``[...]`` group (``AA``).
    _MODIFICATION_PATTERN = re.compile(r"^(?P<U>\[\S*?\])?(?P<mod>.*?)(?P<AA>\[\S*?\])?$")

    def __init__(
        self,
        filename: str | Path,
        *args,
        **kwargs,
    ) -> None:
        """
        Reader for ``ionbot.first.csv`` PSM files.

        Parameters
        ----------
        filename: str, pathlib.Path
            Path to PSM file.

        Examples
        --------
        IonbotReader supports iteration:

        >>> from psm_utils.io.ionbot import IonbotReader
        >>> for psm in IonbotReader("ionbot.first.csv"):
        ...     print(psm.peptidoform.proforma)
        ACDEK
        AC[Carbamidomethyl]DEFGR
        [Acetyl]-AC[Carbamidomethyl]DEFGHIK

        Or a full file can be read at once into a :py:class:`psm_utils.psm_list.PSMList`
        object:

        >>> ionbot_reader = IonbotReader("ionbot.first.csv")
        >>> psm_list = ionbot_reader.read_file()

        """
        super().__init__(filename, *args, **kwargs)
        self.filename = filename

    def __iter__(self) -> Iterable[PSM]:
        """Iterate over file and return PSMs one-by-one."""
        # ``newline=""`` is what the csv module requires of file objects it reads, so that
        # line endings (including those inside quoted fields) are handled by the csv parser.
        with open(self.filename, "rt", newline="") as open_file:
            reader = csv.DictReader(open_file, delimiter=",")
            for row in reader:
                yield self._get_peptide_spectrum_match(row)

    def read_file(self) -> PSMList:
        """Read full PSM file into a PSMList object."""
        return PSMList(psm_list=list(self))

    def _get_peptide_spectrum_match(self, psm_dict: Dict[str, str | float]) -> PSM:
        """
        Convert a single row of the ionbot CSV file into a PSM.

        Parameters
        ----------
        psm_dict
            One row of the file, keyed by column name.

        Returns
        -------
        PSM
            PSM built from the row. ``is_decoy`` is ``True`` for database ``"D"``, ``False``
            for ``"T"``, and ``None`` for any other value. Every column that is not listed in
            ``REQUIRED_COLUMNS`` is stored as a string in ``metadata``.

        Raises
        ------
        KeyError
            If a column that is looked up by name is missing from the row.
        ValueError
            If a numeric column cannot be converted to ``float``.

        """
        return PSM(
            peptidoform=self._parse_peptidoform(
                psm_dict["matched_peptide"],
                psm_dict["modifications"],
                psm_dict["charge"],
            ),
            spectrum_id=psm_dict["spectrum_title"],
            run=psm_dict["spectrum_file"],
            # "D" -> decoy, "T" -> target, anything else -> unknown (None)
            is_decoy={"D": True, "T": False}.get(psm_dict["database"]),
            score=float(psm_dict["psm_score"]),
            precursor_mz=float(psm_dict["m/z"]),
            retention_time=float(psm_dict["observed_retention_time"]),
            protein_list=psm_dict["proteins"].split("||"),
            source="ionbot",
            qvalue=float(psm_dict["q-value"]),
            pep=float(psm_dict["PEP"]),
            provenance_data=({"ionbot_filename": str(self.filename)}),
            metadata={
                col: str(value) for col, value in psm_dict.items() if col not in REQUIRED_COLUMNS
            },
        )

    @staticmethod
    def _parse_peptidoform(
        peptide: str, modifications: str, charge: Union[str, int]
    ) -> Peptidoform:
        """
        Parse peptide, modifications, and charge to Peptidoform.

        Parameters
        ----------
        peptide
            Peptide sequence without modifications.
        modifications
            ``|``-separated string of alternating positions and labels, i.e.
            ``position|label|position|label``. Position ``0`` addresses the N-terminus,
            ``1`` the first residue, and ``len(peptide) + 1`` the C-terminus. An empty
            string means no modifications.
        charge
            Precursor charge, appended to the ProForma string as ``/{charge}``.

        """
        # One slot per residue, plus an (initially empty) slot for each terminus
        residues = [""] + list(peptide) + [""]

        # Fields alternate between position and label; pair them up
        fields = modifications.split("|")
        for position, label in zip(fields[::2], fields[1::2]):
            mod_match = IonbotReader._MODIFICATION_PATTERN.search(label)
            if mod_match.group("U"):
                # Leading bracketed group: use its content (brackets stripped) with "U:" prefix
                parsed_label = "U:" + mod_match.group("U")[1:-1]
            else:
                parsed_label = mod_match.group("mod")
            residues[int(position)] += f"[{parsed_label}]"

        # Terminal modifications are separated from the sequence by a hyphen
        residues[0] = residues[0] + "-" if residues[0] else ""
        residues[-1] = "-" + residues[-1] if residues[-1] else ""
        proforma_seq = "".join(residues)

        # Add charge state
        proforma_seq += f"/{charge}"

        return Peptidoform(proforma_seq)


class InvalidIonbotModificationError(PSMUtilsIOException):
    """Invalid ionbot modification."""
62 changes: 37 additions & 25 deletions psm_utils/io/mzid.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,31 +132,23 @@ def __init__(self, filename: str | Path, *args, score_key: str = None, **kwargs)
"""
super().__init__(filename, *args, **kwargs)

self.score_key = score_key

self._non_metadata_keys = None
self._non_metadata_keys = ["ContactRole", "passThreshold"]
self._score_key = None
self._rt_key = None
self._spectrum_rt_key = None
self._qvalue_key = None
self._pep_key = None
self._im_key = None

self._source = self._infer_source()

def __iter__(self):
"""Iterate over file and return PSMs one-by-one."""
with mzid.read(str(self.filename)) as reader:
for spectrum in reader:
# Check if RT is encoded in spectrum metadata
if "retention time" in spectrum:
self._spectrum_rt_key = "retention time"
elif "scan start time" in spectrum:
self._spectrum_rt_key = "scan start time"
else:
self._spectrum_rt_key = None
# Parse PSM non-metadata keys, rt key, and score key
self._get_non_metadata_keys(spectrum["SpectrumIdentificationItem"][0].keys())
break
# Parse spectrum metadata
self._get_toplevel_non_metadata_keys(reader[0].keys())
# Parse PSM non-metadata keys, rt key and score key
self._get_non_metadata_keys(reader[0]["SpectrumIdentificationItem"][0].keys())

for spectrum in reader:
# Parse spectrum metadata
Expand All @@ -166,11 +158,12 @@ def __iter__(self):
)
run = Path(spectrum["location"]).stem if "location" in spectrum else None
rt = float(spectrum[self._spectrum_rt_key]) if self._spectrum_rt_key else None
ion_mobility = float(spectrum[self._im_key]) if self._im_key else None

# Parse PSMs from spectrum
for entry in spectrum["SpectrumIdentificationItem"]:
yield self._get_peptide_spectrum_match(
spectrum_id, spectrum_title, run, rt, entry
spectrum_id, spectrum_title, run, rt, ion_mobility, entry
)

@staticmethod
Expand Down Expand Up @@ -235,6 +228,7 @@ def _get_peptide_spectrum_match(
spectrum_title: Union[str, None],
run: Union[str, None],
rt: Union[float, None],
ion_mobility: Union[float, None],
spectrum_identification_item: dict[str, str | float | list],
) -> PSM:
"""Parse single mzid entry to :py:class:`~psm_utils.peptidoform.Peptidoform`."""
Expand Down Expand Up @@ -275,11 +269,12 @@ def _get_peptide_spectrum_match(
spectrum_id=psm_spectrum_id,
run=run,
is_decoy=is_decoy,
score=score,
score=sii[self._score_key] if self._score_key else None,
qvalue=sii[self._qvalue_key] if self._qvalue_key else None,
pep=sii[self._pep_key] if self._pep_key else None,
precursor_mz=precursor_mz,
retention_time=rt,
ion_mobility=ion_mobility,
protein_list=protein_list,
rank=sii["rank"] if "rank" in sii else None,
source=self._source,
Expand All @@ -289,7 +284,7 @@ def _get_peptide_spectrum_match(
return psm

def _get_non_metadata_keys(self, keys: list):
"""Gather all the keys that should not be written to metadata"""
"""Gather all the keys at PSM-level that should not be written to metadata."""
# All keys required to create PSM object
default_keys = [
"chargeState",
Expand All @@ -300,9 +295,13 @@ def _get_non_metadata_keys(self, keys: list):
"Modification",
]
# Get the score key and add to default keys
if not self.score_key:
self.score_key = self._infer_score_name(keys)
default_keys.append(self.score_key)
self._score_key = self._infer_score_name(keys)
if self._score_key:
default_keys.append(self._score_key)
else:
logger.warning(
"No known score metric found in mzIdentML file. Scores will be set to None."
)

# Get the q-value key and add to default keys
self._qvalue_key = self._infer_qvalue_name(keys)
Expand All @@ -322,18 +321,30 @@ def _get_non_metadata_keys(self, keys: list):
break

# Keys that are not necessary for metadata
self._non_metadata_keys = ["ContactRole", "passThreshold"]
self._non_metadata_keys.extend(default_keys)

def _get_toplevel_non_metadata_keys(self, keys: list):
    """Find the spectrum-level RT and ion mobility keys and exclude them from metadata."""
    rt_candidates = ("retention time", "scan start time")
    im_candidates = ("inverse reduced ion mobility",)

    # Retention time: the first candidate present among the spectrum keys wins
    rt_key = next((name for name in rt_candidates if name in keys), None)
    if rt_key is not None:
        self._spectrum_rt_key = rt_key
        self._non_metadata_keys.append(rt_key)

    # Ion mobility: same first-match rule
    mobility_key = next((name for name in im_candidates if name in keys), None)
    if mobility_key is not None:
        self._im_key = mobility_key
        self._non_metadata_keys.append(mobility_key)

@staticmethod
def _infer_score_name(keys) -> str:
"""Infer the score from the list of known PSM scores."""

for score in STANDARD_SEARCHENGINE_SCORES:
if score in keys:
return score
else:
raise UnknownMzidScore("No known score metric found in mzIdentML file.")

@staticmethod
def _infer_qvalue_name(keys) -> Union[str, None]:
Expand Down Expand Up @@ -407,6 +418,7 @@ def write_psm(self, psm: PSM):
------
NotImplementedError
MzidWriter currently does not support write_psm.
"""
raise NotImplementedError("MzidWriter currently does not support write_psm.")

Expand Down
6 changes: 6 additions & 0 deletions psm_utils/psm.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,12 @@ def __getitem__(self, item) -> any:
def __setitem__(self, item, value: any) -> None:
setattr(self, item, value)

@property
def precursor_mz_error(self) -> float:
    """Observed minus theoretical precursor m/z, in Da."""
    expected_mz = self.peptidoform.theoretical_mz
    observed_mz = self.precursor_mz
    return observed_mz - expected_mz

def get_precursor_charge(self) -> int:
"""Precursor charge, as embedded in :py:attr:`PSM.peptidoform`."""
return self.peptidoform.precursor_charge
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -70,4 +70,4 @@ target-version = ['py37']

[tool.ruff]
line-length = 99
target-version = 'py37'
target-version = "py37"
Loading

0 comments on commit 5f3939d

Please sign in to comment.