Skip to content

Commit

Permalink
Add ProLuCID parquet support
Browse files Browse the repository at this point in the history
  • Loading branch information
RalfG committed Mar 9, 2024
1 parent 56ed71d commit a0dc262
Show file tree
Hide file tree
Showing 6 changed files with 119 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ Supported file formats
`Percolator tab <https://github.com/percolator/percolator/wiki/Interface>`_ ``percolator`` ✅ ✅
Proteome Discoverer MSF ``proteome_discoverer`` ✅ ❌
`Sage <https://github.com/lazear/sage/blob/v0.12.0/DOCS.md#interpreting-sage-output>`_ ``sage`` ✅ ❌
TIMScore Parquet ``timscore`` ✅ ❌
`TSV <https://psm-utils.readthedocs.io/en/stable/api/psm_utils.io/#module-psm_utils.io.tsv>`_ ``tsv`` ✅ ✅
`X!Tandem XML <https://www.thegpm.org/tandem/>`_ ``xtandem`` ✅ ❌
===================================================================================================================== ======================== =============== ===============
Expand Down
8 changes: 8 additions & 0 deletions docs/source/api/psm_utils.io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,14 @@ psm_utils.io.tsv



psm_utils.io.timscore
#####################
.. automodule:: psm_utils.io.timscore
:members:
:inherited-members:



psm_utils.io.xtandem
####################

Expand Down
Binary file not shown.
7 changes: 7 additions & 0 deletions psm_utils/io/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import psm_utils.io.percolator as percolator
import psm_utils.io.proteome_discoverer as proteome_discoverer
import psm_utils.io.sage as sage
import psm_utils.io.timscore as timscore
import psm_utils.io.tsv as tsv
import psm_utils.io.xtandem as xtandem
from psm_utils.io._base_classes import WriterBase
Expand Down Expand Up @@ -68,6 +69,12 @@
"extension": ".msf",
"filename_pattern": r"^.*\.msf$",
},
"timscore": {
"reader": timscore.TIMScoreReader,
"writer": None,
"extension": ".parquet",
"filename_pattern": r"^.*\.candidates\.parquet$",
},
"tsv": {
"reader": tsv.TSVReader,
"writer": tsv.TSVWriter,
Expand Down
89 changes: 89 additions & 0 deletions psm_utils/io/timscore.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
"""Reader for TIMScore Parquet files."""

import logging
from pathlib import Path
from typing import Union

import numpy as np
import pandas as pd

from psm_utils import PSM
from psm_utils.io._base_classes import ReaderBase

logger = logging.getLogger(__name__)


class TIMScoreReader(ReaderBase):
    """Read PSMs from a TIMScore Parquet file."""

    def __init__(
        self,
        filename: Union[str, Path],
        *args,
        **kwargs,
    ) -> None:
        """
        Set up the reader and load the TIMScore Parquet file.

        Parameters
        ----------
        filename: str, pathlib.Path
            Path to TIMScore Parquet file.

        """
        super().__init__(filename, *args, **kwargs)

        # The complete table is read here; ``__len__`` and ``__iter__`` operate on it.
        self.data = pd.read_parquet(self.filename)

    def __len__(self):
        """Return the number of rows (one PSM per row) in the loaded table."""
        return len(self.data)

    def __iter__(self):
        """Walk over the loaded table and yield one PSM per row."""
        # Columns copied as strings into ``provenance_data`` and ``metadata``,
        # and as floats into ``rescoring_features``.
        provenance_columns = ("candidate_id", "ms2_id", "parent_id")
        metadata_columns = ("leading_aa", "trailing_aa", "ook0")
        feature_columns = (
            "x_corr_score",
            "delta_cn_score",
            "ppm_error",
            "number_matched_ions",
            "number_expected_ions",
            "ion_proportion",
            "spectrum_total_ion_intensity",
        )

        for row in self.data.itertuples():
            proforma = _parse_peptidoform(
                row.stripped_peptide, row.ptms, row.ptm_locations, row.precursor_charge
            )
            yield PSM(
                peptidoform=proforma,
                spectrum_id=row.ms2_id,
                is_decoy=None,  # TODO: Parse from protein?
                score=row.tims_score,  # TODO: Correct score?
                precursor_mz=row.precursor_mz,
                retention_time=row.rt,
                ion_mobility=row.corrected_ook0,
                # NOTE(review): assumes ``locus_name`` is a sequence of protein names;
                # a plain string would be split into characters here - confirm.
                protein_list=list(row.locus_name),
                rank=row.rank,
                source="TIMScore",
                provenance_data={name: str(getattr(row, name)) for name in provenance_columns},
                metadata={name: str(getattr(row, name)) for name in metadata_columns},
                rescoring_features={name: float(getattr(row, name)) for name in feature_columns},
            )


def _parse_peptidoform(
stripped_peptide: str, ptms: np.ndarray, ptm_locations: np.ndarray, precursor_charge: int
) -> str:
"""Parse peptide sequence and modifications to ProForma."""
# TODO: How are terminal modifications handled?
peptidoform = list(stripped_peptide)
for ptm, ptm_location in zip(ptms, ptm_locations):
peptidoform[ptm_location] = f"{peptidoform[ptm_location]}[{ptm}]"
peptidoform.append(f"/{precursor_charge}")
return "".join(peptidoform)
14 changes: 14 additions & 0 deletions tests/test_io/test_timscore.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import numpy as np

from psm_utils.io.timscore import _parse_peptidoform


def test_parse_peptidoform():
    """Check the strings built for zero, one, and two modifications."""
    peptide, charge = "ACDMEK", 2
    # Expected output -> (modification labels, modification positions).
    expected_by_input = {
        "ACDMEK/2": ([], []),
        "ACDM[15.99]EK/2": ([15.99], [3]),
        "AC[57.02]DM[15.99]EK/2": ([57.02, 15.99], [1, 3]),
    }

    for expected, (labels, positions) in expected_by_input.items():
        observed = _parse_peptidoform(peptide, np.array(labels), np.array(positions), charge)
        assert observed == expected

0 comments on commit a0dc262

Please sign in to comment.