From a2e4b6acad3e32756e8755d8087d72df17cd6895 Mon Sep 17 00:00:00 2001 From: RalfG Date: Mon, 25 Mar 2024 17:10:34 +0100 Subject: [PATCH] Implement final to do's - terminal modifications - is_decoy field - use measured ion mobility - use tims_score --- psm_utils/io/timscore.py | 27 ++++++++++++++++++--------- tests/test_io/test_timscore.py | 7 +++++-- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/psm_utils/io/timscore.py b/psm_utils/io/timscore.py index 7fe8021..e92157d 100644 --- a/psm_utils/io/timscore.py +++ b/psm_utils/io/timscore.py @@ -1,6 +1,7 @@ -"""Reader for Proteome Discoverer MSF PSM files.""" +"""Reader for TIMScore Parquet files.""" import logging +import re from pathlib import Path from typing import Union @@ -9,6 +10,7 @@ from psm_utils import PSM from psm_utils.io._base_classes import ReaderBase +from psm_utils.peptidoform import format_number_as_string logger = logging.getLogger(__name__) @@ -32,6 +34,7 @@ def __init__( """ super().__init__(filename, *args, **kwargs) + self._decoy_pattern = re.compile(r"^Reverse_") self.data = pd.read_parquet(self.filename) @@ -47,11 +50,11 @@ def __iter__(self): entry.stripped_peptide, entry.ptms, entry.ptm_locations, entry.precursor_charge ), spectrum_id=entry.ms2_id, - is_decoy=None, # TODO: Parse from protein? - score=entry.tims_score, # TODO: Correct score? + is_decoy=all(self._decoy_pattern.match(p) for p in entry.locus_name), + score=entry.tims_score, precursor_mz=entry.precursor_mz, retention_time=entry.rt, - ion_mobility=entry.corrected_ook0, + ion_mobility=entry.ook0, protein_list=list(entry.locus_name), rank=entry.rank, source="TIMScore", @@ -63,7 +66,7 @@ def __iter__(self): metadata={ "leading_aa": str(entry.leading_aa), "trailing_aa": str(entry.trailing_aa), - "ook0": str(entry.ook0), + "corrected_ook0": str(entry.corrected_ook0), }, rescoring_features={ "x_corr_score": float(entry.x_corr_score), @@ -81,9 +84,15 @@ def _parse_peptidoform( stripped_peptide: str, ptms: np.ndarray, ptm_locations: np.ndarray, precursor_charge: int ) -> str: """Parse peptide sequence and modifications to ProForma.""" - # TODO: How are terminal modifications handled? peptidoform = list(stripped_peptide) + n_term = "" + c_term = "" for ptm, ptm_location in zip(ptms, ptm_locations): - peptidoform[ptm_location] = f"{peptidoform[ptm_location]}[{ptm}]" - peptidoform.append(f"/{precursor_charge}") - return "".join(peptidoform) + ptm = format_number_as_string(ptm) + if ptm_location == -1: + n_term = f"[{ptm}]-" + elif ptm_location == len(peptidoform): + c_term = f"-[{ptm}]" + else: + peptidoform[ptm_location] = f"{peptidoform[ptm_location]}[{ptm}]" + return f"{n_term}{''.join(peptidoform)}{c_term}/{precursor_charge}" diff --git a/tests/test_io/test_timscore.py b/tests/test_io/test_timscore.py index 58a8f2b..09e7bb1 100644 --- a/tests/test_io/test_timscore.py +++ b/tests/test_io/test_timscore.py @@ -6,8 +6,11 @@ def test_parse_peptidoform(): test_cases = [ ("ACDMEK", np.array([]), np.array([]), 2, "ACDMEK/2"), - ("ACDMEK", np.array([15.99]), np.array([3]), 2, "ACDM[15.99]EK/2"), - ("ACDMEK", np.array([57.02, 15.99]), np.array([1, 3]), 2, "AC[57.02]DM[15.99]EK/2"), + ("ACDMEK", np.array([15.99]), np.array([3]), 2, "ACDM[+15.99]EK/2"), + ("ACDMEK", np.array([57.02, 15.99]), np.array([1, 3]), 2, "AC[+57.02]DM[+15.99]EK/2"), + ("ACDMEK", np.array([42.01]), np.array([-1]), 2, "[+42.01]-ACDMEK/2"), + ("ACDMEK", np.array([-0.98]), np.array([6]), 2, "ACDMEK-[-0.98]/2"), + ("ACDMEK", np.array([42.01, -0.98]), np.array([-1, 6]), 2, "[+42.01]-ACDMEK-[-0.98]/2"), ] for peptide, ptms, ptm_locations, precursor_charge, expected in test_cases: