From 833b016f26a70525ee57184c74e1854997e19995 Mon Sep 17 00:00:00 2001 From: julianu Date: Fri, 3 May 2024 18:36:10 +0200 Subject: [PATCH 01/10] small fix for complete protein names in XTandem --- psm_utils/io/xtandem.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/psm_utils/io/xtandem.py b/psm_utils/io/xtandem.py index 90e2a60..ef030d6 100644 --- a/psm_utils/io/xtandem.py +++ b/psm_utils/io/xtandem.py @@ -164,7 +164,7 @@ def _parse_entry(self, entry, run: str) -> PSM: precursor_mz=entry["mh"] - mass.nist_mass["H"][0][0], retention_time=entry["rt"], run=run, - protein_list=[protein["label"] for protein in entry["protein"]], + protein_list=[protein["note"] for protein in entry["protein"]], source="X!Tandem", provenance_data={ "xtandem_filename": str(self.filename), From 1fbf0ae42c7c1925e7b865f7aabad08704598b99 Mon Sep 17 00:00:00 2001 From: julianu Date: Fri, 3 May 2024 18:38:26 +0200 Subject: [PATCH 02/10] all proteins seem to be parsed by now --- psm_utils/io/xtandem.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/psm_utils/io/xtandem.py b/psm_utils/io/xtandem.py index ef030d6..8ac1856 100644 --- a/psm_utils/io/xtandem.py +++ b/psm_utils/io/xtandem.py @@ -35,10 +35,6 @@ .. code-block:: [+39,99545] - -* Although X!Tandem XML allows multiple peptide/protein identifications per entry, only - the first peptide/protein per entry is parsed. 
- """ From 1af95ae3689dbb539e4f35fce95967a677d3fbc5 Mon Sep 17 00:00:00 2001 From: julianu Date: Tue, 7 May 2024 16:27:45 +0200 Subject: [PATCH 03/10] allow parsing of multiple PSMs per spectrum correctly --- psm_utils/io/xtandem.py | 66 ++++++++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 28 deletions(-) diff --git a/psm_utils/io/xtandem.py b/psm_utils/io/xtandem.py index 8ac1856..5305f6c 100644 --- a/psm_utils/io/xtandem.py +++ b/psm_utils/io/xtandem.py @@ -112,8 +112,8 @@ def __iter__(self): with tandem.read(str(self.filename)) as reader: run = self._parse_run(self.filename) for entry in reader: - psm = self._parse_entry(entry, run) - yield psm + for psm in self._parse_entry(entry, run): + yield psm @staticmethod def _parse_peptidoform(peptide_entry, charge): @@ -147,32 +147,42 @@ def _parse_peptidoform(peptide_entry, charge): return Peptidoform(proforma_seq) - def _parse_entry(self, entry, run: str) -> PSM: - """Parse X!Tandem XML entry to :py:class:`~psm_utils.psm.PSM`.""" - peptide_entry = entry["protein"][0]["peptide"] - psm = PSM( - peptidoform=self._parse_peptidoform(peptide_entry, entry["z"]), - spectrum_id=entry["support"]["fragment ion mass spectrum"]["note"], - is_decoy=entry["protein"][0]["label"].startswith(self.decoy_prefix), - score=-np.log(peptide_entry[self.score_key]) - if self.score_key == "expect" - else peptide_entry[self.score_key], - precursor_mz=entry["mh"] - mass.nist_mass["H"][0][0], - retention_time=entry["rt"], - run=run, - protein_list=[protein["note"] for protein in entry["protein"]], - source="X!Tandem", - provenance_data={ - "xtandem_filename": str(self.filename), - "xtandem_id": str(entry["id"]), - }, - metadata={ - "xtandem_hyperscore": str(peptide_entry["hyperscore"]), - "xtandem_delta": str(peptide_entry["delta"]), - "xtandem_nextscore": str(peptide_entry["nextscore"]), - }, - ) - return psm + def _parse_entry(self, entry, run: str) -> list: + """Parse X!Tandem XML entry to a list of 
:py:class:`~psm_utils.psm.PSM`.""" + pepform_to_psms = dict() + + for protein_entry in entry["protein"]: + peptide_entry = protein_entry["peptide"] + peptidoform = self._parse_peptidoform(peptide_entry, entry["z"]) + + if peptidoform not in pepform_to_psms.keys(): + psm = PSM( + peptidoform = self._parse_peptidoform(peptide_entry, entry["z"]), + spectrum_id = entry["support"]["fragment ion mass spectrum"]["note"], + is_decoy = protein_entry["label"].startswith(self.decoy_prefix), + score = -np.log(peptide_entry[self.score_key]) + if self.score_key == "expect" + else peptide_entry[self.score_key], + precursor_mz = entry["mh"] - mass.nist_mass["H"][0][0], + retention_time = entry["rt"], + run = run, + protein_list = [ protein_entry["note"] ], + source = "X!Tandem", + provenance_data = { + "xtandem_filename": str(self.filename), + "xtandem_id": str(entry["id"]), + }, + metadata = { + "xtandem_hyperscore": str(peptide_entry["hyperscore"]), + "xtandem_delta": str(peptide_entry["delta"]), + "xtandem_nextscore": str(peptide_entry["nextscore"]), + }, + ) + pepform_to_psms[peptidoform] = psm + else: + pepform_to_psms[peptidoform].protein_list.append(protein_entry["note"]) + + return list(pepform_to_psms.values()) def _parse_run(self, filepath): """Parse X!Tandem XML run to :py:class:`~psm_utils.psm.PSM`.""" From bf508727fae79ba452c080c5344de47c9922df41 Mon Sep 17 00:00:00 2001 From: julianu Date: Tue, 7 May 2024 18:04:41 +0200 Subject: [PATCH 04/10] fixing m/z calculation --- psm_utils/io/xtandem.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/psm_utils/io/xtandem.py b/psm_utils/io/xtandem.py index 5305f6c..6b4358b 100644 --- a/psm_utils/io/xtandem.py +++ b/psm_utils/io/xtandem.py @@ -163,7 +163,7 @@ def _parse_entry(self, entry, run: str) -> list: score = -np.log(peptide_entry[self.score_key]) if self.score_key == "expect" else peptide_entry[self.score_key], - precursor_mz = entry["mh"] - mass.nist_mass["H"][0][0], + precursor_mz = entry["mh"] / 
entry["z"], retention_time = entry["rt"], run = run, protein_list = [ protein_entry["note"] ], From 2195aa2d17532cd569cc212f40c011aa16191cc3 Mon Sep 17 00:00:00 2001 From: julianu Date: Tue, 21 May 2024 17:37:26 +0200 Subject: [PATCH 05/10] removed unused import of pyteomics.mass --- psm_utils/io/xtandem.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/psm_utils/io/xtandem.py b/psm_utils/io/xtandem.py index 6b4358b..2b1a99e 100644 --- a/psm_utils/io/xtandem.py +++ b/psm_utils/io/xtandem.py @@ -47,7 +47,7 @@ from typing import Union import numpy as np -from pyteomics import mass, tandem +from pyteomics import tandem from psm_utils.exceptions import PSMUtilsException from psm_utils.io._base_classes import ReaderBase From a8f5b15bf310c38d95e8bd11be3ab0d93e73f18c Mon Sep 17 00:00:00 2001 From: Kevin Velghe Date: Tue, 9 Jul 2024 19:08:45 +0200 Subject: [PATCH 06/10] don't use keys to test if in dict + reformat --- psm_utils/io/xtandem.py | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/psm_utils/io/xtandem.py b/psm_utils/io/xtandem.py index 2b1a99e..43c9966 100644 --- a/psm_utils/io/xtandem.py +++ b/psm_utils/io/xtandem.py @@ -37,7 +37,6 @@ [+39,99545] """ - from __future__ import annotations import logging @@ -155,24 +154,26 @@ def _parse_entry(self, entry, run: str) -> list: peptide_entry = protein_entry["peptide"] peptidoform = self._parse_peptidoform(peptide_entry, entry["z"]) - if peptidoform not in pepform_to_psms.keys(): + if peptidoform not in pepform_to_psms: psm = PSM( - peptidoform = self._parse_peptidoform(peptide_entry, entry["z"]), - spectrum_id = entry["support"]["fragment ion mass spectrum"]["note"], - is_decoy = protein_entry["label"].startswith(self.decoy_prefix), - score = -np.log(peptide_entry[self.score_key]) - if self.score_key == "expect" - else peptide_entry[self.score_key], - precursor_mz = entry["mh"] / entry["z"], - retention_time = entry["rt"], - run = run, - 
protein_list = [ protein_entry["note"] ], - source = "X!Tandem", - provenance_data = { + peptidoform=self._parse_peptidoform(peptide_entry, entry["z"]), + spectrum_id=entry["support"]["fragment ion mass spectrum"]["note"], + is_decoy=protein_entry["label"].startswith(self.decoy_prefix), + score=( + -np.log(peptide_entry[self.score_key]) + if self.score_key == "expect" + else peptide_entry[self.score_key] + ), + precursor_mz=entry["mh"] / entry["z"], + retention_time=entry["rt"], + run=run, + protein_list=[protein_entry["note"]], + source="X!Tandem", + provenance_data={ "xtandem_filename": str(self.filename), "xtandem_id": str(entry["id"]), }, - metadata = { + metadata={ "xtandem_hyperscore": str(peptide_entry["hyperscore"]), "xtandem_delta": str(peptide_entry["delta"]), "xtandem_nextscore": str(peptide_entry["nextscore"]), @@ -181,7 +182,7 @@ def _parse_entry(self, entry, run: str) -> list: pepform_to_psms[peptidoform] = psm else: pepform_to_psms[peptidoform].protein_list.append(protein_entry["note"]) - + return list(pepform_to_psms.values()) def _parse_run(self, filepath): From 3912910709f6b8ec5611568ee83110a78e7fde68 Mon Sep 17 00:00:00 2001 From: Kevin Velghe Date: Tue, 9 Jul 2024 19:20:14 +0200 Subject: [PATCH 07/10] require numpy < 2 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index edaf8b6..5994f20 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ requires-python = ">=3.7" dependencies = [ "click", "lxml", - "numpy", + "numpy < 2", # NOTE: openms currently doesn't support numpy 2.0, but doesn't have a version requirement "pandas", "psims", "pyarrow", From 408efd0e102fb982786df0a29f456d52d92270e3 Mon Sep 17 00:00:00 2001 From: Kevin Velghe Date: Wed, 10 Jul 2024 12:47:02 +0200 Subject: [PATCH 08/10] update test file --- tests/test_data/test_out_sage.idXML | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tests/test_data/test_out_sage.idXML 
b/tests/test_data/test_out_sage.idXML index 450ef56..060d91d 100644 --- a/tests/test_data/test_out_sage.idXML +++ b/tests/test_data/test_out_sage.idXML @@ -20,22 +20,21 @@ - - - + + + - - + + - + - - + From 5d01b6ff757a69bb18700211dca2a0f9583040b3 Mon Sep 17 00:00:00 2001 From: Kevin Velghe Date: Wed, 10 Jul 2024 13:26:24 +0200 Subject: [PATCH 09/10] update checksum for pyarrow 16.1.0 --- tests/test_io/test_parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_io/test_parquet.py b/tests/test_io/test_parquet.py index 20bb0b0..7d08cfe 100644 --- a/tests/test_io/test_parquet.py +++ b/tests/test_io/test_parquet.py @@ -41,7 +41,7 @@ def compute_checksum(filename): class TestParquetWriter: - expected_checksum = "cf3f2e9f073be58612ce81f240da9f4109e1c76eea25f1b7881e09c0a8fdee16" + expected_checksum = "be6da8d891bd63b85fab5bb11d6a113f8df6ce9f7fd3d1cc429804dc41d972af" def test_write_psm(self): with ParquetWriter("test.pq") as writer: From 4dc1c9595fc4db9355b95769477c079c39150e32 Mon Sep 17 00:00:00 2001 From: Kevin Velghe Date: Wed, 10 Jul 2024 14:07:50 +0200 Subject: [PATCH 10/10] add xtandem test case --- tests/test_data/test.t.xml | 208 ++++++++++++++++++++++++++++++++++ tests/test_io/test_xtandem.py | 61 +++++++++- 2 files changed, 266 insertions(+), 3 deletions(-) create mode 100644 tests/test_data/test.t.xml diff --git a/tests/test_data/test.t.xml b/tests/test_data/test.t.xml new file mode 100644 index 0000000..8041699 --- /dev/null +++ b/tests/test_data/test.t.xml @@ -0,0 +1,208 @@ + + + + + +tr|Q8U2N0|Q8U2N0_PYRFU Uncharacterized protein OS=Pyrococcus furiosus (strain ATCC 43587 / DSM 3638 / JCM 8422 / Vc1) OX=186497 GN=PF0803 PE=4 SV=1 + + + MKREDLLWTL IGLSLLYSYL SNNLSGVLFG VVLFSYIVQA RRGFNPDFDV + KVDIPERFEE GITGEVVVGV VNRGSEGFLE VEVSGEDVEG DKRRVFLRKG + ESVVKVKVKP LAKGEMELKF KIRFEDRAGL YYEEEERSFR IQVLPSVDSI + REAMEEERRV RLKEAYKKGR IGVESLEIYG LREYLPGDDV RRIDWKASAR + IGKIIVKEFL RESEGDVYIV LDASREMRKR VRKSKIDYAS TLALYLATLI + VREGRRVGLI 
IFWDEDFKVV KPGRELEKIR EAIRFRPVRG LMSFKGEISL + RVRGFLKLFP RKRRSIADAL LSLRESSHLI LISDLMSNTP LLYRAIAMAK + KKHRIVILSP NPVLFYSGEL DEETLRFLYR KYKEREKVIR RFNSLVPTLD + LGPSDYREVL EVLG + + + + + +DECOY_tr|Q8TZM9_REVERSED|Q8TZM9_PYRFU Aldose reductase OS=Pyrococcus furiosus (strain ATCC 43587 / DSM 3638 / JCM 8422 / Vc1) OX=186497 GN=PF1960 PE=4 SV=1 + + + VCRRAMERDE ESLRWGMAGF NEKLHEKNSA KPIAVVNEEW ILYNLAVQAA + TKGYKEGIKA LCENRALTGK ELPTYAMLAI GERKMYDLLG TTEPWRDKVS + YKVQNAVIEY KRMVEQSRQL LELNFNSVGI YRIVGEDVLD ELAHLTEEIK + KFDDVPWHLL YLDIYTGLRK ASARAAKKAE EYGFHTPWVK SVIFIDEREF + EKIAEGVIEE AHGAGYFEAT DILNMGLELG YRIAEISEKD RSYDPTERGG + IGWTGMGIAT VKDDGIRKLD NFANVRKM + + + + + + +4.38931 +-0.283181 + + +0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + + + + +549 549 549 485 450 375 301 209 149 88 43 15 6 2 1 0 + + + + + + +0 1 2 3 4 5 6 7 8 9 10 11 12 + + + + +541 541 541 477 440 365 285 191 116 61 24 1 0 + + + + + + +0 1 2 3 4 5 6 + + + + +258 247 39 4 0 1 0 + + + + + + +0 1 2 3 4 5 + + + + +176 277 81 14 1 0 + + + + + + +635.328491210938_3990.64720000002 RTINSECONDS=3990.64720000002 + +1903.97 +3 + + +169.133 174.055 175.087 183.149 191.082 197.129 198.132 211.144 235.108 246.182 260.107 263.103 285.088 295.189 298.176 310.213 330.708 342.185 349.151 360.155 369.214 386.24 399.207 407.265 425.186 442.248 459.224 465.223 470.244 482.74 +520.35 559.288 572.309 589.375 607.382 626.401 636.326 645.358 657.839 666.844 678.421 688.331 695.446 723.386 756.392 774.401 821.447 1122.54 1332.68 1334.68 + + + + +13 10 3 13 13 51 5 18 100 9 5 26 4 6 4 40 11 11 3 8 4 6 6 6 18 5 14 8 21 5 +11 7 14 10 12 8 33 46 5 30 9 4 5 11 3 37 16 10 26 7 + + + + + + /home/compomics/Programs/tandem-linux-17-02-01-4/bin/default_input_protein.xml + /home/compomics/extra_disk/rescore-pyro-tandem/data/taxonomy_pyro.xml + 30 + yes + + 0.1 + testing 1 2 3 + no + yes + /home/compomics/extra_disk/rescore-pyro-tandem/tandem/pyro-pyro.xml + yes + yes + yes + all + + yes + protein + yes + tandem-style.xsl + 0.0 + 0.0 + 
+17.002735 + +1.007825 + [RK]|{P} + no + + pyro + yes + 0.1 + + no + + + + + + yes + 20 + yes + no + 57.022@C + 15.994@M + + no + yes + no + no + no + 2 + 4 + no + yes + no + 100.0 + monoisotopic + 0.02 + Daltons + 4 + 150.0 + 500.0 + 15 + 5 + 5 + ppm + yes + /home/compomics/Documents/spectrum_files/pyrococcus/Velos005137.mgf + 1000 + 16 + 50 + + + yes + 3 + no + yes + + + /home/compomics/extra_disk/rescore-pyro-tandem/db/pyro_crap_td.fasta + no description + 0 + 0 + 10876794 + 4322 + 15365 + 2019:07:15:21:28:29 + X! Tandem Alanine (2017.2.1.4) + 212 424 483 534 601 636 644 675 571 611 571 466 473 431 363 355 339 282 289 233 + 844 + 5666 + 52 + 0 + 0 + 0 + 1266 + 29.24 + 0.0019 + 0.22 + 0.0018 + + diff --git a/tests/test_io/test_xtandem.py b/tests/test_io/test_xtandem.py index 75cbdce..024e056 100644 --- a/tests/test_io/test_xtandem.py +++ b/tests/test_io/test_xtandem.py @@ -1,12 +1,14 @@ """Tests for psm_utils.io.xtandem.""" from psm_utils.io.xtandem import XTandemReader +from psm_utils.peptidoform import Peptidoform +from psm_utils.psm import PSM +from psm_utils.psm_list import PSMList class TestXTandemReader: - reader = XTandemReader("path") - def test__parse_peptidoform(self): + reader = XTandemReader("path") test_cases = [ { "test_in": ({"start": 556, "seq": "KMDYPPKR"}, 2), @@ -40,5 +42,58 @@ def test__parse_peptidoform(self): ] for case in test_cases: - test_out = self.reader._parse_peptidoform(*case["test_in"]).proforma + test_out = reader._parse_peptidoform(*case["test_in"]).proforma assert test_out == case["expected_out"] + + def test_reader(self): + test_cases = PSMList( + psm_list=[ + PSM( + peptidoform=Peptidoform('RGFNPDFDVKVDIPER/3'), + spectrum_id='635.328491210938_3990.64720000002 RTINSECONDS=3990.64720000002', + run='Velos005137', + is_decoy=False, + score=-4.3694478524670215, + precursor_mz=634.656974, + retention_time=3990.64720000002, + protein_list=[ + 'tr|Q8U2N0|Q8U2N0_PYRFU Uncharacterized protein OS=Pyrococcus furiosus (strain ATCC 43587 / 
DSM 3638 / JCM 8422 / Vc1) OX=186497 GN=PF0803 PE=4 SV=1' + ], + source='X!Tandem', + provenance_data={ + 'xtandem_filename': 'tests/test_data/test.t.xml', + 'xtandem_id': '10487' + }, + metadata={ + 'xtandem_hyperscore': '8.8', + 'xtandem_delta': '0.0048', + 'xtandem_nextscore': '8.0' + }, + ), + PSM( + peptidoform=Peptidoform('GGIGWTGMGIATVKDDGIR/3'), + spectrum_id='635.328491210938_3990.64720000002 RTINSECONDS=3990.64720000002', + run='Velos005137', + is_decoy=True, + score=-4.3694478524670215, + precursor_mz=634.656974, + retention_time=3990.64720000002, + protein_list=[ + 'DECOY_tr|Q8TZM9_REVERSED|Q8TZM9_PYRFU Aldose reductase OS=Pyrococcus furiosus (strain ATCC 43587 / DSM 3638 / JCM 8422 / Vc1) OX=186497 GN=PF1960 PE=4 SV=1' + ], + source='X!Tandem', + provenance_data={ + 'xtandem_filename': 'tests/test_data/test.t.xml', + 'xtandem_id': '10487' + }, + metadata={ + 'xtandem_hyperscore': '8.8', + 'xtandem_delta': '0.0015', + 'xtandem_nextscore': '8.0' + }, + ) + ] + ) + + psms = XTandemReader("./tests/test_data/test.t.xml").read_file() + assert psms == test_cases