Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Read and write idXML files using pyopenms #52

Merged
merged 23 commits into from
Oct 25, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 73 additions & 27 deletions psm_utils/io/idxml.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from psm_utils.io._base_classes import ReaderBase, WriterBase
from psm_utils.psm import PSM
from psm_utils.psm_list import PSMList
from psm_utils.peptidoform import Peptidoform

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -93,6 +94,7 @@ def __init__(self, filename: Union[Path, str], *args, **kwargs) -> None:
"""
super().__init__(filename, *args, **kwargs)
self.protein_ids, self.peptide_ids = self._parse_idxml()
self.user_params_metadata = self._get_userparams_metadata(self.peptide_ids[0].getHits()[0])
self.rescoring_features = self._get_rescoring_features(self.peptide_ids[0].getHits()[0])

def __iter__(self) -> Iterable[PSM]:
Expand Down Expand Up @@ -162,6 +164,12 @@ def _parse_psm(
peptidoform = self._parse_peptidoform(
peptide_hit.getSequence().toString(), peptide_hit.getCharge()
)
# This is needed to calculate a qvalue before rescoring the PSMList
peptide_id_metadata = {
"idxml:score_type": str(peptide_id.getScoreType()),
"idxml:higher_score_better": str(peptide_id.isHigherScoreBetter()),
"idxml:significance_threshold": str(peptide_id.getSignificanceThreshold())}
peptide_hit_metadata = {key: peptide_hit.getMetaValue(key) for key in self.user_params_metadata}
return PSM(
peptidoform=peptidoform,
spectrum_id=peptide_id.getMetaValue("spectrum_reference"),
Expand All @@ -179,17 +187,11 @@ def _parse_psm(
# Storing proforma notation of peptidoform and UNIMOD peptide sequence for mapping back
# to original sequence in writer
provenance_data={str(peptidoform): peptide_hit.getSequence().toString()},
# TODO: Can we recover any all UserParams and save them as metadata?
# This is needed to calculate a qvalue before rescoring the PSMList
metadata={
"idxml:score_type": str(peptide_id.getScoreType()),
"idxml:higher_score_better": str(peptide_id.isHigherScoreBetter()),
"idxml:significance_threshold": str(peptide_id.getSignificanceThreshold()),
},
# Store metadata of PeptideIdentification and PeptideHit objects
metadata=peptide_id_metadata | peptide_hit_metadata,
jonasscheid marked this conversation as resolved.
Show resolved Hide resolved
rescoring_features={
key: float(peptide_hit.getMetaValue(key))
for key in self.rescoring_features
if self._is_float(peptide_hit.getMetaValue(key))
},
)

Expand Down Expand Up @@ -218,14 +220,20 @@ def _get_run(

return run

@staticmethod
def _get_rescoring_features(peptide_hit: oms.PeptideHit) -> List[str]:
"""Get list of rescoring features from idXML."""
def _get_userparams_metadata(self, peptide_hit: oms.PeptideHit) -> List[str]:
"""Get list of string type UserParams attached to each PeptideHit."""
# Fill the key list with all the keys from the PeptideHit
# Empty list is required for the Cython wrapper to work correctly
keys = []
peptide_hit.getKeys(keys)
keys = [key.decode() for key in keys if key.decode() in RESCORING_FEATURE_LIST]
keys = [key.decode() for key in keys if not self._is_float(peptide_hit.getMetaValue(key.decode()))]
return keys

def _get_rescoring_features(self, peptide_hit: oms.PeptideHit) -> List[str]:
"""Get list of rescoring features in UserParams attached to each PeptideHit."""
keys = []
peptide_hit.getKeys(keys)
keys = [key.decode() for key in keys if self._is_float(peptide_hit.getMetaValue(key.decode())) and key.decode() in RESCORING_FEATURE_LIST]
return keys

@staticmethod
Expand Down Expand Up @@ -265,15 +273,37 @@ def __init__(
filename
Path to PSM file.
protein_ids
# TODO Add description
Optional :py:class:`~pyopenms.ProteinIdentification` object to be written to the idXML file.
peptide_ids
# TODO Add description
Optional :py:class:`~pyopenms.PeptideIdentification` object to be written to the idXML file.

Notes
-----
- Unlike other psm_utils.io writer classes, :py:class:`IdXMLWriter` does not support writing
a single PSM to a file with the :py:meth:`write_psm` method. Only writing a full PSMList
to a file at once with the :py:meth:`write_file` method is currently supported.
- If `protein_ids` and `peptide_ids` are provided, each :py:class:`~pyopenms.PeptideIdentification`
object in the list `peptide_ids` will be updated with new `rescoring_features` from the PSMList.
Otherwise, new pyopenms objects will be created, filled with information of PSMList and written to the idXML file.

Examples
jonasscheid marked this conversation as resolved.
Show resolved Hide resolved
--------
- Example with `pyopenms` objects:

>>> from psm_utils.io.idxml import IdXMLReader, IdXMLWriter
>>> reader = IdXMLReader("psm_utils/tests/test_data/test_in.idXML"")
jonasscheid marked this conversation as resolved.
Show resolved Hide resolved
>>> psm_list = reader.read_file()
>>> for psm in psm_list:
... psm.rescoring_features = psm.rescoring_features | {"feature": 1}
>>> writer = IdXMLWriter("psm_utils/tests/test_data//test_out.idXML", reader.protein_ids, reader.peptide_ids)
>>> writer.write_file(psm_list)

- Example without `pyopenms` objects:

>>> from psm_utils.psm_list import PSMList
>>> psm_list = PSMList(psm_list=[PSM(peptidoform="ACDK", spectrum_id=1, score=140.2, retention_time=600.2)])
>>> writer = IdXMLWriter("psm_utils/tests/test_data//test_out.idXML")
>>> writer.write_file(psm_list)

"""
super().__init__(filename, *args, **kwargs)
Expand Down Expand Up @@ -370,9 +400,15 @@ def _update_peptide_hit(self, peptide_hit: oms.PeptideHit, psm: PSM) -> None:
Inplace update of :py:class:`~pyopenms.PeptideHit` with novel predicted features
information from :py:class:`~psm_utils.psm.PSM`.
"""
peptide_hit.setScore(psm.score)
peptide_hit.setRank(psm.rank - 1) # 1-based to 0-based
# TODO: Should q-value and pep also be updated?
if psm.score is not None:
peptide_hit.setScore(psm.score)
if psm.rank is not None:
peptide_hit.setRank(psm.rank - 1) # 1-based to 0-based
if psm.qvalue is not None:
peptide_hit.setMetaValue("q-value", psm.qvalue)
if psm.pep is not None:
peptide_hit.setMetaValue("PEP", psm.pep)

for feature, value in psm.rescoring_features.items():
if feature not in RESCORING_FEATURE_LIST:
peptide_hit.setMetaValue(feature, value)
Expand Down Expand Up @@ -401,6 +437,9 @@ def _create_new_ids(self, psm_dict: dict) -> None:
peptide_id = oms.PeptideIdentification()
peptide_id.setMetaValue("spectrum_reference", spectrum_id)
peptide_id.setMetaValue("id_merge_index", msrun_reference.index(str(run)))
jonasscheid marked this conversation as resolved.
Show resolved Hide resolved
# TODO: Question: Is there a way to infer the score type from the PSM?
jonasscheid marked this conversation as resolved.
Show resolved Hide resolved
if psms[0].score is not None:
peptide_id.setScoreType("search_engine_score")
if psms[0].precursor_mz is not None:
peptide_id.setMZ(psms[0].precursor_mz)
if psms[0].retention_time is not None:
Expand All @@ -412,7 +451,7 @@ def _create_new_ids(self, psm_dict: dict) -> None:
peptide_hit = oms.PeptideHit()
peptide_hit.setSequence(
oms.AASequence.fromString(
self._convert_proforma_to_unimod(str(psm.peptidoform))
self._convert_proforma_to_unimod(psm.peptidoform)
)
)
peptide_hit.setCharge(psm.peptidoform.precursor_charge)
Expand All @@ -422,8 +461,10 @@ def _create_new_ids(self, psm_dict: dict) -> None:
if psm.is_decoy is None
else ("decoy" if psm.is_decoy else "target"),
)
if psm.score is not None:
peptide_hit.setScore(psm.score)
if psm.qvalue is not None:
peptide_hit.setMetaValue("q-value", psm.qvalue)
if psm.pep is not None:
peptide_hit.setMetaValue("PEP", psm.pep)
if psm.rank is not None:
peptide_hit.setRank(psm.rank - 1) # 1-based to 0-based
self._add_meta_values_from_dict(peptide_hit, psm.metadata)
Expand Down Expand Up @@ -459,16 +500,21 @@ def _create_new_ids(self, psm_dict: dict) -> None:
self.peptide_ids,
)

def _convert_proforma_to_unimod(self, sequence: str) -> str:
def _convert_proforma_to_unimod(self, peptidoform: Peptidoform) -> str:
"""Convert a peptidoform sequence in proforma notation to UNIMOD notation."""
# Replace square brackets with parentheses
sequence = sequence.replace("[", "(").replace("]", ")")
sequence = str(peptidoform).split('/')[0]

# Replace square brackets around modifications with parentheses
sequence = re.sub(r"\[([^\]]+)\]", r"(\1)", sequence)

# Check for N-terminal and C-terminal modifications
if sequence[:2] == "((":
sequence = sequence.replace("((", ".[", 1).replace(")-", ".", 1)
if sequence[-2:] == "))":
sequence = sequence[::-1].replace("))", ".]", 1).replace("-(", ".", 1)[::-1]
if sequence.startswith("["):
sequence = re.sub(r"^\[([^\]]+)\]-", r"(\1)", sequence)
if sequence.endswith("]"):
sequence = re.sub(r"-\[([^\]]+)\]$", r"-(\1)", sequence)

# Remove dashes for N-terminal and C-terminal modifications
sequence = sequence.replace(")-", ")").replace("-(", "(")

return sequence

Expand Down
5 changes: 3 additions & 2 deletions tests/test_data/test_out_sage.idXML
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@
</ProteinHit>
<UserParam type="stringList" name="spectra_data" value="[LQSRPAAPPAPGPGQLTLR]"/>
</ProteinIdentification>
<PeptideIdentification score_type="" higher_score_better="true" significance_threshold="0.0" MZ="643.03499169873669" RT="108.285399999999996" spectrum_reference="controllerType=0 controllerNumber=1 scan=30069" >
<PeptideHit score="1.2944585" sequence="LQSRPAAPPAPGPGQLTLR" charge="3" >
<PeptideIdentification score_type="search_engine_score" higher_score_better="true" significance_threshold="0.0" MZ="643.03499169873669" RT="108.285399999999996" spectrum_reference="controllerType=0 controllerNumber=1 scan=30069" >
<PeptideHit score="0.0" sequence="LQSRPAAPPAPGPGQLTLR" charge="3" protein_refs="PH_0" >
<UserParam type="string" name="target_decoy" value="target"/>
<UserParam type="float" name="isotope_error" value="0.0"/>
<UserParam type="float" name="q-value" value="1.0"/>
<UserParam type="string" name="sage_filename" value="./tests/test_data/results.sage.tsv"/>
<UserParam type="float" name="expmass" value="1926.081500000000005"/>
<UserParam type="float" name="calcmass" value="1926.079999999999927"/>
Expand Down
41 changes: 4 additions & 37 deletions tests/test_io/test_idxml.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,40 +27,7 @@ def test__parse_peptidoform(self):
assert IdXMLReader._parse_peptidoform(*test_in) == expected_out

def test__parse_psm(self):
test_psm = PSM(
peptidoform=Peptidoform("LVPIWKK/2"),
spectrum_id="controllerType=0 controllerNumber=1 scan=294",
run="HepG2_rep3_small",
collection=None,
spectrum=None,
is_decoy=True,
score=1.0099999904632568,
qvalue=None,
pep=None,
precursor_mz=442.29595853189994,
retention_time=849.0,
ion_mobility=None,
protein_list=["DECOY_sp|Q5TH74|STPG1_HUMAN"],
rank=1,
source="idXML",
provenance_data={"LVPIWKK/2": "LVPIWKK"},
metadata={
"idxml:score_type": "expect",
"idxml:higher_score_better": "False",
"idxml:significance_threshold": "0.0",
},
rescoring_features={
"MS:1002252": 0.693,
"COMET:deltaCn": 1.0,
"MS:1002255": 35.9,
"COMET:deltaLCn": 0.0,
"COMET:lnExpect": 0.009950330853168092,
"COMET:lnNumSP": 3.555348061489414,
"COMET:lnRankSP": 0.0,
"COMET:IonFrac": 0.25,
},
)

test_psm = PSM(peptidoform=Peptidoform('LVPIWKK/2'), spectrum_id='controllerType=0 controllerNumber=1 scan=294', run='HepG2_rep3_small', collection=None, spectrum=None, is_decoy=True, score=1.0099999904632568, qvalue=None, pep=None, precursor_mz=442.29595853189994, retention_time=849.0, ion_mobility=None, protein_list=['DECOY_sp|Q5TH74|STPG1_HUMAN'], rank=1, source='idXML', provenance_data={'LVPIWKK/2': 'LVPIWKK'}, metadata={'idxml:score_type': 'expect', 'idxml:higher_score_better': 'False', 'idxml:significance_threshold': '0.0', 'target_decoy': 'decoy', 'protein_references': 'unique'}, rescoring_features={'MS:1002252': 0.693, 'COMET:deltaCn': 1.0, 'MS:1002255': 35.9, 'COMET:deltaLCn': 0.0, 'COMET:lnExpect': 0.009950330853168092, 'COMET:lnNumSP': 3.555348061489414, 'COMET:lnRankSP': 0.0, 'COMET:IonFrac': 0.25})
reader = IdXMLReader("./tests/test_data/test_in.idXML")
psm = reader._parse_psm(
reader.protein_ids, reader.peptide_ids[0], reader.peptide_ids[0].getHits()[0]
Expand All @@ -70,7 +37,7 @@ def test__parse_psm(self):
def test__get_run(self):
expected_output = "HepG2_rep3_small"
reader = IdXMLReader("./tests/test_data/test_in.idXML")
assert IdXMLReader._get_run(reader.protein_ids, reader.peptide_ids[0]) == expected_output
assert reader._get_run(reader.protein_ids, reader.peptide_ids[0]) == expected_output

def test__get_rescoring_features(self):
expected_output = [
Expand All @@ -85,7 +52,7 @@ def test__get_rescoring_features(self):
]
reader = IdXMLReader("./tests/test_data/test_in.idXML")
assert (
IdXMLReader._get_rescoring_features(reader.peptide_ids[0].getHits()[0])
reader._get_rescoring_features(reader.peptide_ids[0].getHits()[0])
== expected_output
)

Expand All @@ -103,7 +70,7 @@ def test_write_file_with_pyopenms_objects(self):
assert sha == expected_sha

def test_write_file_without_pyopenms_objects(self):
expected_sha = "a24e157579a4e9da71900931c6b5578a264ea7830135ede480b41e6550944e5e"
expected_sha = "b81addaf8ef1f5cb5007f14a914bee508c54d59f34f8857a5770d3db9aa2c15b"
reader = SageReader("./tests/test_data/results.sage.tsv")
psm_list = reader.read_file()
writer = IdXMLWriter("./tests/test_data/test_out_sage.idXML")
Expand Down