diff --git a/README.rst b/README.rst index 77b61e6..89bda42 100644 --- a/README.rst +++ b/README.rst @@ -99,6 +99,7 @@ Supported file formats `Percolator tab `_ ``percolator`` ✅ ✅ Proteome Discoverer MSF ``proteome_discoverer`` ✅ ❌ `Sage `_ ``sage`` ✅ ❌ + TIMScore Parquet ``timscore`` ✅ ❌ `TSV `_ ``tsv`` ✅ ✅ `X!Tandem XML `_ ``xtandem`` ✅ ❌ ===================================================================================================================== ======================== =============== =============== diff --git a/docs/source/api/psm_utils.io.rst b/docs/source/api/psm_utils.io.rst index aae0b9f..92afe78 100644 --- a/docs/source/api/psm_utils.io.rst +++ b/docs/source/api/psm_utils.io.rst @@ -104,6 +104,14 @@ psm_utils.io.tsv +psm_utils.io.timscore +################################ +.. automodule:: psm_utils.io.timscore + :members: + :inherited-members: + + + psm_utils.io.xtandem #################### diff --git a/example_files/1min_snippit_timscore.candidates.parquet b/example_files/1min_snippit_timscore.candidates.parquet new file mode 100644 index 0000000..abe0c15 Binary files /dev/null and b/example_files/1min_snippit_timscore.candidates.parquet differ diff --git a/psm_utils/io/__init__.py b/psm_utils/io/__init__.py index a528f61..8700250 100644 --- a/psm_utils/io/__init__.py +++ b/psm_utils/io/__init__.py @@ -18,6 +18,7 @@ import psm_utils.io.percolator as percolator import psm_utils.io.proteome_discoverer as proteome_discoverer import psm_utils.io.sage as sage +import psm_utils.io.timscore as timscore import psm_utils.io.tsv as tsv import psm_utils.io.xtandem as xtandem from psm_utils.io._base_classes import WriterBase @@ -68,6 +69,12 @@ "extension": ".msf", "filename_pattern": r"^.*\.msf$", }, + "timscore": { + "reader": timscore.TIMScoreReader, + "writer": None, + "extension": ".parquet", + "filename_pattern": r"^.*\.candidates\.parquet$", + }, "tsv": { "reader": tsv.TSVReader, "writer": tsv.TSVWriter, diff --git a/psm_utils/io/timscore.py 
b/psm_utils/io/timscore.py new file mode 100644 index 0000000..e92157d --- /dev/null +++ b/psm_utils/io/timscore.py @@ -0,0 +1,98 @@ +"""Reader for TIMScore Parquet files.""" + +import logging +import re +from pathlib import Path +from typing import Union + +import numpy as np +import pandas as pd + +from psm_utils import PSM +from psm_utils.io._base_classes import ReaderBase +from psm_utils.peptidoform import format_number_as_string + +logger = logging.getLogger(__name__) + + +class TIMScoreReader(ReaderBase): + """Reader for TIMScore Parquet files.""" + + def __init__( + self, + filename: Union[str, Path], + *args, + **kwargs, + ) -> None: + """ + Reader for TIMScore Parquet files. + + Parameters + ---------- + filename: str, pathlib.Path + Path to TIMScore Parquet file. + + """ + super().__init__(filename, *args, **kwargs) + self._decoy_pattern = re.compile(r"^Reverse_") + + self.data = pd.read_parquet(self.filename) + + def __len__(self): + """Return number of PSMs in file.""" + return len(self.data) + + def __iter__(self): + """Iterate over file and return PSMs one-by-one.""" + for entry in self.data.itertuples(): + yield PSM( + peptidoform=_parse_peptidoform( + entry.stripped_peptide, entry.ptms, entry.ptm_locations, entry.precursor_charge + ), + spectrum_id=entry.ms2_id, + is_decoy=all(self._decoy_pattern.match(p) for p in entry.locus_name), + score=entry.tims_score, + precursor_mz=entry.precursor_mz, + retention_time=entry.rt, + ion_mobility=entry.ook0, + protein_list=list(entry.locus_name), + rank=entry.rank, + source="TIMScore", + provenance_data={ + "candidate_id": str(entry.candidate_id), + "ms2_id": str(entry.ms2_id), + "parent_id": str(entry.parent_id), + }, + metadata={ + "leading_aa": str(entry.leading_aa), + "trailing_aa": str(entry.trailing_aa), + "corrected_ook0": str(entry.corrected_ook0), + }, + rescoring_features={ + "x_corr_score": float(entry.x_corr_score), + "delta_cn_score": float(entry.delta_cn_score), + "ppm_error": float(entry.ppm_error), 
"number_matched_ions": float(entry.number_matched_ions), + "number_expected_ions": float(entry.number_expected_ions), + "ion_proportion": float(entry.ion_proportion), + "spectrum_total_ion_intensity": float(entry.spectrum_total_ion_intensity), + }, + ) + + +def _parse_peptidoform( + stripped_peptide: str, ptms: np.ndarray, ptm_locations: np.ndarray, precursor_charge: int +) -> str: + """Parse peptide sequence and modifications to ProForma.""" + peptidoform = list(stripped_peptide) + n_term = "" + c_term = "" + for ptm, ptm_location in zip(ptms, ptm_locations): + ptm = format_number_as_string(ptm) + if ptm_location == -1: + n_term = f"[{ptm}]-" + elif ptm_location == len(peptidoform): + c_term = f"-[{ptm}]" + else: + peptidoform[ptm_location] = f"{peptidoform[ptm_location]}[{ptm}]" + return f"{n_term}{''.join(peptidoform)}{c_term}/{precursor_charge}" diff --git a/tests/test_io/test_timscore.py b/tests/test_io/test_timscore.py new file mode 100644 index 0000000..09e7bb1 --- /dev/null +++ b/tests/test_io/test_timscore.py @@ -0,0 +1,17 @@ +import numpy as np + +from psm_utils.io.timscore import _parse_peptidoform + + +def test_parse_peptidoform(): + test_cases = [ + ("ACDMEK", np.array([]), np.array([]), 2, "ACDMEK/2"), + ("ACDMEK", np.array([15.99]), np.array([3]), 2, "ACDM[+15.99]EK/2"), + ("ACDMEK", np.array([57.02, 15.99]), np.array([1, 3]), 2, "AC[+57.02]DM[+15.99]EK/2"), + ("ACDMEK", np.array([42.01]), np.array([-1]), 2, "[+42.01]-ACDMEK/2"), + ("ACDMEK", np.array([-0.98]), np.array([6]), 2, "ACDMEK-[-0.98]/2"), + ("ACDMEK", np.array([42.01, -0.98]), np.array([-1, 6]), 2, "[+42.01]-ACDMEK-[-0.98]/2"), + ] + + for peptide, ptms, ptm_locations, precursor_charge, expected in test_cases: + assert _parse_peptidoform(peptide, ptms, ptm_locations, precursor_charge) == expected