Skip to content

Commit

Permalink
Merge pull request #70 from compomics/add-prolucid-support
Browse files Browse the repository at this point in the history
Add TIMScore Parquet support
  • Loading branch information
RalfG authored Mar 27, 2024
2 parents 15b4d18 + 99b0b66 commit 9f33fc8
Show file tree
Hide file tree
Showing 6 changed files with 131 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ Supported file formats
`Percolator tab <https://github.com/percolator/percolator/wiki/Interface>`_ ``percolator`` ✅ ✅
Proteome Discoverer MSF ``proteome_discoverer`` ✅ ❌
`Sage <https://github.com/lazear/sage/blob/v0.12.0/DOCS.md#interpreting-sage-output>`_ ``sage`` ✅ ❌
TIMScore Parquet ``timscore`` ✅ ❌
`TSV <https://psm-utils.readthedocs.io/en/stable/api/psm_utils.io/#module-psm_utils.io.tsv>`_ ``tsv`` ✅ ✅
`X!Tandem XML <https://www.thegpm.org/tandem/>`_ ``xtandem`` ✅ ❌
===================================================================================================================== ======================== =============== ===============
Expand Down
8 changes: 8 additions & 0 deletions docs/source/api/psm_utils.io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,14 @@ psm_utils.io.tsv



psm_utils.io.timscore
################################
.. automodule:: psm_utils.io.timscore
:members:
:inherited-members:



psm_utils.io.xtandem
####################

Expand Down
Binary file not shown.
7 changes: 7 additions & 0 deletions psm_utils/io/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import psm_utils.io.percolator as percolator
import psm_utils.io.proteome_discoverer as proteome_discoverer
import psm_utils.io.sage as sage
import psm_utils.io.timscore as timscore
import psm_utils.io.tsv as tsv
import psm_utils.io.xtandem as xtandem
from psm_utils.io._base_classes import WriterBase
Expand Down Expand Up @@ -68,6 +69,12 @@
"extension": ".msf",
"filename_pattern": r"^.*\.msf$",
},
"timscore": {
"reader": timscore.TIMScoreReader,
"writer": None,
"extension": ".parquet",
"filename_pattern": r"^.*\.candidates\.parquet$",
},
"tsv": {
"reader": tsv.TSVReader,
"writer": tsv.TSVWriter,
Expand Down
98 changes: 98 additions & 0 deletions psm_utils/io/timscore.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
"""Reader for TIMScore Parquet files."""

import logging
import re
from pathlib import Path
from typing import Union

import numpy as np
import pandas as pd

from psm_utils import PSM
from psm_utils.io._base_classes import ReaderBase
from psm_utils.peptidoform import format_number_as_string

logger = logging.getLogger(__name__)


class TIMScoreReader(ReaderBase):
    """Reader that converts the rows of a TIMScore Parquet file into PSMs."""

    # Columns copied into ``PSM.provenance_data`` after conversion to ``str``.
    _PROVENANCE_COLUMNS = ("candidate_id", "ms2_id", "parent_id")
    # Columns copied into ``PSM.metadata`` after conversion to ``str``.
    _METADATA_COLUMNS = ("leading_aa", "trailing_aa", "corrected_ook0")
    # Columns copied into ``PSM.rescoring_features`` after conversion to ``float``.
    _RESCORING_FEATURE_COLUMNS = (
        "x_corr_score",
        "delta_cn_score",
        "ppm_error",
        "number_matched_ions",
        "number_expected_ions",
        "ion_proportion",
        "spectrum_total_ion_intensity",
    )

    def __init__(
        self,
        filename: Union[str, Path],
        *args,
        **kwargs,
    ) -> None:
        """
        Set up the reader and load the Parquet table into memory.

        Parameters
        ----------
        filename: str, pathlib.Path
            Path to TIMScore Parquet file.
        *args, **kwargs
            Forwarded unchanged to ``ReaderBase.__init__``.

        """
        super().__init__(filename, *args, **kwargs)
        # Protein names starting with ``Reverse_`` are treated as decoys.
        self._decoy_pattern = re.compile(r"^Reverse_")

        # The full table is read up front; ``self.filename`` is presumably set by
        # ``ReaderBase.__init__`` (not visible here).
        self.data = pd.read_parquet(self.filename)

    def __len__(self):
        """Return the number of rows (PSMs) in the loaded table."""
        return len(self.data)

    def __iter__(self):
        """Yield one PSM per row of the loaded table, in table order."""
        for entry in self.data.itertuples():
            yield self._entry_to_psm(entry)

    def _entry_to_psm(self, entry) -> PSM:
        """Build a single PSM from one row tuple produced by ``DataFrame.itertuples``."""
        fields = {
            "peptidoform": _parse_peptidoform(
                entry.stripped_peptide,
                entry.ptms,
                entry.ptm_locations,
                entry.precursor_charge,
            ),
            "spectrum_id": entry.ms2_id,
            # A PSM is a decoy only if none of its proteins lacks the decoy prefix.
            # NOTE: this evaluates to True when ``locus_name`` is empty.
            "is_decoy": not any(
                self._decoy_pattern.match(protein) is None for protein in entry.locus_name
            ),
            "score": entry.tims_score,
            "precursor_mz": entry.precursor_mz,
            "retention_time": entry.rt,
            "ion_mobility": entry.ook0,
            "protein_list": list(entry.locus_name),
            "rank": entry.rank,
            "source": "TIMScore",
            "provenance_data": {
                column: str(getattr(entry, column)) for column in self._PROVENANCE_COLUMNS
            },
            "metadata": {
                column: str(getattr(entry, column)) for column in self._METADATA_COLUMNS
            },
            "rescoring_features": {
                column: float(getattr(entry, column))
                for column in self._RESCORING_FEATURE_COLUMNS
            },
        }
        return PSM(**fields)


def _parse_peptidoform(
    stripped_peptide: str, ptms: np.ndarray, ptm_locations: np.ndarray, precursor_charge: int
) -> str:
    """
    Combine a bare sequence, its PTMs, and the charge into a ProForma-style string.

    Parameters
    ----------
    stripped_peptide: str
        Peptide sequence without modifications.
    ptms: numpy.ndarray
        Modification values, each rendered through ``format_number_as_string``.
    ptm_locations: numpy.ndarray
        Location of each entry in ``ptms``, paired by position. ``-1`` places the
        modification on the N-terminus, ``len(stripped_peptide)`` on the C-terminus;
        any other value is used as an index into the sequence.
    precursor_charge: int
        Charge appended after the final ``/``.

    Returns
    -------
    str
        Sequence with ``[...]`` modification tags and a ``/charge`` suffix.

    Notes
    -----
    If several entries target the same terminus, only the last one is kept. Several
    entries on the same residue are appended one after another.

    """
    residues = list(stripped_peptide)
    c_term_location = len(residues)
    prefix, suffix = "", ""

    for modification, location in zip(ptms, ptm_locations):
        tag = f"[{format_number_as_string(modification)}]"
        if location == -1:
            prefix = tag + "-"
            continue
        if location == c_term_location:
            suffix = "-" + tag
            continue
        # Residue-level modification: attach the tag directly after the amino acid.
        residues[location] += tag

    return prefix + "".join(residues) + suffix + f"/{precursor_charge}"
17 changes: 17 additions & 0 deletions tests/test_io/test_timscore.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import numpy as np

from psm_utils.io.timscore import _parse_peptidoform


def test_parse_peptidoform():
    """Check the strings built by ``_parse_peptidoform`` for representative inputs."""
    # Columns: sequence, PTM values, PTM locations, precursor charge, expected output.
    # Location -1 addresses the N-terminus and len(sequence) the C-terminus.
    scenarios = [
        ("ACDMEK", np.array([]), np.array([]), 2, "ACDMEK/2"),
        ("ACDMEK", np.array([15.99]), np.array([3]), 2, "ACDM[+15.99]EK/2"),
        ("ACDMEK", np.array([57.02, 15.99]), np.array([1, 3]), 2, "AC[+57.02]DM[+15.99]EK/2"),
        ("ACDMEK", np.array([42.01]), np.array([-1]), 2, "[+42.01]-ACDMEK/2"),
        ("ACDMEK", np.array([-0.98]), np.array([6]), 2, "ACDMEK-[-0.98]/2"),
        ("ACDMEK", np.array([42.01, -0.98]), np.array([-1, 6]), 2, "[+42.01]-ACDMEK-[-0.98]/2"),
    ]

    for sequence, modifications, locations, charge, proforma in scenarios:
        observed = _parse_peptidoform(sequence, modifications, locations, charge)
        assert observed == proforma

0 comments on commit 9f33fc8

Please sign in to comment.