Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix parsing PSMs and complete protein names in XTandem #83

Merged
merged 10 commits into from
Jul 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 41 additions & 34 deletions psm_utils/io/xtandem.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,8 @@
.. code-block::

[+39,99545]

* Although X!Tandem XML allows multiple peptide/protein identifications per entry, only
the first peptide/protein per entry is parsed.

"""


from __future__ import annotations

import logging
Expand All @@ -51,7 +46,7 @@
from typing import Union

import numpy as np
from pyteomics import mass, tandem
from pyteomics import tandem

from psm_utils.exceptions import PSMUtilsException
from psm_utils.io._base_classes import ReaderBase
Expand Down Expand Up @@ -116,8 +111,8 @@ def __iter__(self):
with tandem.read(str(self.filename)) as reader:
run = self._parse_run(self.filename)
for entry in reader:
psm = self._parse_entry(entry, run)
yield psm
for psm in self._parse_entry(entry, run):
yield psm

@staticmethod
def _parse_peptidoform(peptide_entry, charge):
Expand Down Expand Up @@ -151,32 +146,44 @@ def _parse_peptidoform(peptide_entry, charge):

return Peptidoform(proforma_seq)

def _parse_entry(self, entry, run: str) -> PSM:
"""Parse X!Tandem XML entry to :py:class:`~psm_utils.psm.PSM`."""
peptide_entry = entry["protein"][0]["peptide"]
psm = PSM(
peptidoform=self._parse_peptidoform(peptide_entry, entry["z"]),
spectrum_id=entry["support"]["fragment ion mass spectrum"]["note"],
is_decoy=entry["protein"][0]["label"].startswith(self.decoy_prefix),
score=-np.log(peptide_entry[self.score_key])
if self.score_key == "expect"
else peptide_entry[self.score_key],
precursor_mz=entry["mh"] - mass.nist_mass["H"][0][0],
retention_time=entry["rt"],
run=run,
protein_list=[protein["label"] for protein in entry["protein"]],
source="X!Tandem",
provenance_data={
"xtandem_filename": str(self.filename),
"xtandem_id": str(entry["id"]),
},
metadata={
"xtandem_hyperscore": str(peptide_entry["hyperscore"]),
"xtandem_delta": str(peptide_entry["delta"]),
"xtandem_nextscore": str(peptide_entry["nextscore"]),
},
)
return psm
def _parse_entry(self, entry, run: str) -> list:
"""Parse X!Tandem XML entry to a list of :py:class:`~psm_utils.psm.PSM`."""
pepform_to_psms = dict()

for protein_entry in entry["protein"]:
peptide_entry = protein_entry["peptide"]
peptidoform = self._parse_peptidoform(peptide_entry, entry["z"])

if peptidoform not in pepform_to_psms:
psm = PSM(
peptidoform=self._parse_peptidoform(peptide_entry, entry["z"]),
spectrum_id=entry["support"]["fragment ion mass spectrum"]["note"],
is_decoy=protein_entry["label"].startswith(self.decoy_prefix),
score=(
-np.log(peptide_entry[self.score_key])
if self.score_key == "expect"
else peptide_entry[self.score_key]
),
precursor_mz=entry["mh"] / entry["z"],
retention_time=entry["rt"],
run=run,
protein_list=[protein_entry["note"]],
source="X!Tandem",
provenance_data={
"xtandem_filename": str(self.filename),
"xtandem_id": str(entry["id"]),
},
metadata={
"xtandem_hyperscore": str(peptide_entry["hyperscore"]),
"xtandem_delta": str(peptide_entry["delta"]),
"xtandem_nextscore": str(peptide_entry["nextscore"]),
},
)
pepform_to_psms[peptidoform] = psm
else:
pepform_to_psms[peptidoform].protein_list.append(protein_entry["note"])

return list(pepform_to_psms.values())

def _parse_run(self, filepath):
"""Parse X!Tandem XML run to :py:class:`~psm_utils.psm.PSM`."""
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ requires-python = ">=3.7"
dependencies = [
"click",
"lxml",
"numpy",
"numpy < 2", # NOTE: openms currenly doesn't support numpy 2.0, but doesn't have a version requirement
"pandas",
"psims",
"pyarrow",
Expand Down
208 changes: 208 additions & 0 deletions tests/test_data/test.t.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="tandem-style.xsl"?>
<bioml xmlns:GAML="http://www.bioml.com/gaml/" label="models from '/home/compomics/Documents/spectrum_files/pyrococcus/Velos005137.mgf'">
<group id="10487" mh="1903.970922" z="3" rt="3990.64720000002" expect="7.9e+01" label="tr|Q8U2N0|Q8U2N0_PYRFU Uncharacterized protein OS=Pyrococcus furiosus (strain ATCC..." type="model" sumI="5.86" maxI="97703.7" fI="977.037" act="0" >
<protein expect="0.0" id="10487.1" uid="781" label="tr|Q8U2N0|Q8U2N0_PYRFU Uncharacterized protein OS=Pyrococcus furiosus (strain ATCC..." sumI="5.86" >
<note label="description">tr|Q8U2N0|Q8U2N0_PYRFU Uncharacterized protein OS=Pyrococcus furiosus (strain ATCC 43587 / DSM 3638 / JCM 8422 / Vc1) OX=186497 GN=PF0803 PE=4 SV=1</note>
<file type="peptide" URL="/home/compomics/extra_disk/rescore-pyro-tandem/db/pyro_crap_td.fasta"/>
<peptide start="1" end="414">
MKREDLLWTL IGLSLLYSYL SNNLSGVLFG VVLFSYIVQA RRGFNPDFDV
KVDIPERFEE GITGEVVVGV VNRGSEGFLE VEVSGEDVEG DKRRVFLRKG
ESVVKVKVKP LAKGEMELKF KIRFEDRAGL YYEEEERSFR IQVLPSVDSI
REAMEEERRV RLKEAYKKGR IGVESLEIYG LREYLPGDDV RRIDWKASAR
IGKIIVKEFL RESEGDVYIV LDASREMRKR VRKSKIDYAS TLALYLATLI
VREGRRVGLI IFWDEDFKVV KPGRELEKIR EAIRFRPVRG LMSFKGEISL
RVRGFLKLFP RKRRSIADAL LSLRESSHLI LISDLMSNTP LLYRAIAMAK
KKHRIVILSP NPVLFYSGEL DEETLRFLYR KYKEREKVIR RFNSLVPTLD
LGPSDYREVL EVLG
<domain id="10487.1.1" start="42" end="57" expect="7.9e+01" mh="1903.9661" delta="0.0048" hyperscore="8.8" nextscore="8.0" y_score="7.3" y_ions="3" b_score="4.6" b_ions="1" pre="VQAR" post="FEEG" seq="RGFNPDFDVKVDIPER" missed_cleavages="2">
</domain>
</peptide>
</protein>
<protein expect="0.0" id="10487.2" uid="3776" label="DECOY_tr|Q8TZM9_REVERSED|Q8TZM9_PYRFU Aldose reductase OS=Pyrococcus furiosus (strain..." sumI="5.86" >
<note label="description">DECOY_tr|Q8TZM9_REVERSED|Q8TZM9_PYRFU Aldose reductase OS=Pyrococcus furiosus (strain ATCC 43587 / DSM 3638 / JCM 8422 / Vc1) OX=186497 GN=PF1960 PE=4 SV=1</note>
<file type="peptide" URL="/home/compomics/extra_disk/rescore-pyro-tandem/db/pyro_crap_td.fasta"/>
<peptide start="1" end="278">
VCRRAMERDE ESLRWGMAGF NEKLHEKNSA KPIAVVNEEW ILYNLAVQAA
TKGYKEGIKA LCENRALTGK ELPTYAMLAI GERKMYDLLG TTEPWRDKVS
YKVQNAVIEY KRMVEQSRQL LELNFNSVGI YRIVGEDVLD ELAHLTEEIK
KFDDVPWHLL YLDIYTGLRK ASARAAKKAE EYGFHTPWVK SVIFIDEREF
EKIAEGVIEE AHGAGYFEAT DILNMGLELG YRIAEISEKD RSYDPTERGG
IGWTGMGIAT VKDDGIRKLD NFANVRKM
<domain id="10487.2.1" start="249" end="267" expect="7.9e+01" mh="1903.9695" delta="0.0015" hyperscore="8.8" nextscore="8.0" y_score="7.3" y_ions="3" b_score="4.6" b_ions="1" pre="PTER" post="KLDN" seq="GGIGWTGMGIATVKDDGIR" missed_cleavages="1">
</domain>
</peptide>
</protein>
<group label="supporting data" type="support">
<GAML:trace label="10487.hyper" type="hyperscore expectation function">
<GAML:attribute type="a0">4.38931</GAML:attribute>
<GAML:attribute type="a1">-0.283181</GAML:attribute>
<GAML:Xdata label="10487.hyper" units="score">
<GAML:values byteorder="INTEL" format="ASCII" numvalues="16">
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
</GAML:values>
</GAML:Xdata>
<GAML:Ydata label="10487.hyper" units="counts">
<GAML:values byteorder="INTEL" format="ASCII" numvalues="16">
549 549 549 485 450 375 301 209 149 88 43 15 6 2 1 0
</GAML:values>
</GAML:Ydata>
</GAML:trace>
<GAML:trace label="10487.convolute" type="convolution survival function">
<GAML:Xdata label="10487.convolute" units="score">
<GAML:values byteorder="INTEL" format="ASCII" numvalues="13">
0 1 2 3 4 5 6 7 8 9 10 11 12
</GAML:values>
</GAML:Xdata>
<GAML:Ydata label="10487.convolute" units="counts">
<GAML:values byteorder="INTEL" format="ASCII" numvalues="13">
541 541 541 477 440 365 285 191 116 61 24 1 0
</GAML:values>
</GAML:Ydata>
</GAML:trace>
<GAML:trace label="10487.b" type="b ion histogram">
<GAML:Xdata label="10487.b" units="number of ions">
<GAML:values byteorder="INTEL" format="ASCII" numvalues="7">
0 1 2 3 4 5 6
</GAML:values>
</GAML:Xdata>
<GAML:Ydata label="10487.b" units="counts">
<GAML:values byteorder="INTEL" format="ASCII" numvalues="7">
258 247 39 4 0 1 0
</GAML:values>
</GAML:Ydata>
</GAML:trace>
<GAML:trace label="10487.y" type="y ion histogram">
<GAML:Xdata label="10487.y" units="number of ions">
<GAML:values byteorder="INTEL" format="ASCII" numvalues="6">
0 1 2 3 4 5
</GAML:values>
</GAML:Xdata>
<GAML:Ydata label="10487.y" units="counts">
<GAML:values byteorder="INTEL" format="ASCII" numvalues="6">
176 277 81 14 1 0
</GAML:values>
</GAML:Ydata>
</GAML:trace>

</group>
<group type="support" label="fragment ion mass spectrum">
<note label="Description">635.328491210938_3990.64720000002 RTINSECONDS=3990.64720000002 </note>
<GAML:trace id="10487" label="10487.spectrum" type="tandem mass spectrum">
<GAML:attribute type="M+H">1903.97</GAML:attribute>
<GAML:attribute type="charge">3</GAML:attribute>
<GAML:Xdata label="10487.spectrum" units="MASSTOCHARGERATIO">
<GAML:values byteorder="INTEL" format="ASCII" numvalues="50">
169.133 174.055 175.087 183.149 191.082 197.129 198.132 211.144 235.108 246.182 260.107 263.103 285.088 295.189 298.176 310.213 330.708 342.185 349.151 360.155 369.214 386.24 399.207 407.265 425.186 442.248 459.224 465.223 470.244 482.74
520.35 559.288 572.309 589.375 607.382 626.401 636.326 645.358 657.839 666.844 678.421 688.331 695.446 723.386 756.392 774.401 821.447 1122.54 1332.68 1334.68
</GAML:values>
</GAML:Xdata>
<GAML:Ydata label="10487.spectrum" units="UNKNOWN">
<GAML:values byteorder="INTEL" format="ASCII" numvalues="50">
13 10 3 13 13 51 5 18 100 9 5 26 4 6 4 40 11 11 3 8 4 6 6 6 18 5 14 8 21 5
11 7 14 10 12 8 33 46 5 30 9 4 5 11 3 37 16 10 26 7
</GAML:values>
</GAML:Ydata>
</GAML:trace>
</group></group>
<group label="input parameters" type="parameters">
<note type="input" label="list path, default parameters">/home/compomics/Programs/tandem-linux-17-02-01-4/bin/default_input_protein.xml</note>
<note type="input" label="list path, taxonomy information">/home/compomics/extra_disk/rescore-pyro-tandem/data/taxonomy_pyro.xml</note>
<note type="input" label="output, histogram column width">30</note>
<note type="input" label="output, histograms">yes</note>
<note type="input" label="output, log path"></note>
<note type="input" label="output, maximum valid expectation value">0.1</note>
<note type="input" label="output, message">testing 1 2 3</note>
<note type="input" label="output, one sequence copy">no</note>
<note type="input" label="output, parameters">yes</note>
<note type="input" label="output, path">/home/compomics/extra_disk/rescore-pyro-tandem/tandem/pyro-pyro.xml</note>
<note type="input" label="output, path hashing">yes</note>
<note type="input" label="output, performance">yes</note>
<note type="input" label="output, proteins">yes</note>
<note type="input" label="output, results">all</note>
<note type="input" label="output, sequence path"></note>
<note type="input" label="output, sequences">yes</note>
<note type="input" label="output, sort results by">protein</note>
<note type="input" label="output, spectra">yes</note>
<note type="input" label="output, xsl path">tandem-style.xsl</note>
<note type="input" label="protein, C-terminal residue modification mass">0.0</note>
<note type="input" label="protein, N-terminal residue modification mass">0.0</note>
<note type="input" label="protein, cleavage C-terminal mass change">+17.002735</note>
<note type="input" label="protein, cleavage N-terminal mass change">+1.007825</note>
<note type="input" label="protein, cleavage site">[RK]|{P}</note>
<note type="input" label="protein, homolog management">no</note>
<note type="input" label="protein, modified residue mass file"></note>
<note type="input" label="protein, taxon">pyro</note>
<note type="input" label="refine">yes</note>
<note type="input" label="refine, maximum valid expectation value">0.1</note>
<note type="input" label="refine, modification mass"></note>
<note type="input" label="refine, point mutations">no</note>
<note type="input" label="refine, potential C-terminus modifications"></note>
<note type="input" label="refine, potential N-terminus modifications"></note>
<note type="input" label="refine, potential modification mass"></note>
<note type="input" label="refine, potential modification motif"></note>
<note type="input" label="refine, sequence path"></note>
<note type="input" label="refine, spectrum synthesis">yes</note>
<note type="input" label="refine, tic percent">20</note>
<note type="input" label="refine, unanticipated cleavage">yes</note>
<note type="input" label="refine, use potential modifications for full refinement">no</note>
<note type="input" label="residue, modification mass">57.022@C</note>
<note type="input" label="residue, potential modification mass">15.994@M</note>
<note type="input" label="residue, potential modification motif"></note>
<note type="input" label="scoring, a ions">no</note>
<note type="input" label="scoring, b ions">yes</note>
<note type="input" label="scoring, c ions">no</note>
<note type="input" label="scoring, cyclic permutation">no</note>
<note type="input" label="scoring, include reverse">no</note>
<note type="input" label="scoring, maximum missed cleavage sites">2</note>
<note type="input" label="scoring, minimum ion count">4</note>
<note type="input" label="scoring, x ions">no</note>
<note type="input" label="scoring, y ions">yes</note>
<note type="input" label="scoring, z ions">no</note>
<note type="input" label="spectrum, dynamic range">100.0</note>
<note type="input" label="spectrum, fragment mass type">monoisotopic</note>
<note type="input" label="spectrum, fragment monoisotopic mass error">0.02</note>
<note type="input" label="spectrum, fragment monoisotopic mass error units">Daltons</note>
<note type="input" label="spectrum, maximum parent charge">4</note>
<note type="input" label="spectrum, minimum fragment mz">150.0</note>
<note type="input" label="spectrum, minimum parent m+h">500.0</note>
<note type="input" label="spectrum, minimum peaks">15</note>
<note type="input" label="spectrum, parent monoisotopic mass error minus">5</note>
<note type="input" label="spectrum, parent monoisotopic mass error plus">5</note>
<note type="input" label="spectrum, parent monoisotopic mass error units">ppm</note>
<note type="input" label="spectrum, parent monoisotopic mass isotope error">yes</note>
<note type="input" label="spectrum, path">/home/compomics/Documents/spectrum_files/pyrococcus/Velos005137.mgf</note>
<note type="input" label="spectrum, sequence batch size">1000</note>
<note type="input" label="spectrum, threads">16</note>
<note type="input" label="spectrum, total peaks">50</note>
</group>
<group label="unused input parameters" type="parameters">
<note type="input" label="protein, use minimal annotations">yes</note>
<note type="input" label="refine, maximum missed cleavage sites">3</note>
<note type="input" label="scoring, pluggable scoring">no</note>
<note type="input" label="spectrum, use noise suppression">yes</note>
</group>
<group label="performance parameters" type="parameters">
<note label="list path, sequence source #1">/home/compomics/extra_disk/rescore-pyro-tandem/db/pyro_crap_td.fasta</note>
<note label="list path, sequence source description #1">no description</note>
<note label="modelling, duplicate peptide ids">0</note>
<note label="modelling, duplicate proteins">0</note>
<note label="modelling, total peptides used">10876794</note>
<note label="modelling, total proteins used">4322</note>
<note label="modelling, total spectra used">15365</note>
<note label="process, start time">2019:07:15:21:28:29</note>
<note label="process, version">X! Tandem Alanine (2017.2.1.4)</note>
<note label="quality values">212 424 483 534 601 636 644 675 571 611 571 466 473 431 363 355 339 282 289 233</note>
<note label="refining, # input models">844</note>
<note label="refining, # input spectra">5666</note>
<note label="refining, # partial cleavage">52</note>
<note label="refining, # point mutations">0</note>
<note label="refining, # potential C-terminii">0</note>
<note label="refining, # potential N-terminii">0</note>
<note label="refining, # unanticipated cleavage">1266</note>
<note label="timing, initial modelling total (sec)">29.24</note>
<note label="timing, initial modelling/spectrum (sec)">0.0019</note>
<note label="timing, load sequence models (sec)">0.22</note>
<note label="timing, refinement/spectrum (sec)">0.0018</note>
</group>
</bioml>
15 changes: 7 additions & 8 deletions tests/test_data/test_out_sage.idXML
Original file line number Diff line number Diff line change
Expand Up @@ -20,22 +20,21 @@
<UserParam type="float" name="peptide_len" value="19.0"/>
<UserParam type="float" name="missed_cleavages" value="0.0"/>
<UserParam type="float" name="precursor_ppm" value="0.8239083"/>
<UserParam type="float" name="fragment_ppm" value="0.5347518"/>
<UserParam type="float" name="hyperscore" value="71.788444602553838"/>
<UserParam type="float" name="delta_next" value="71.788444602553838"/>
<UserParam type="float" name="fragment_ppm" value="0.503857"/>
<UserParam type="float" name="hyperscore" value="72.265915738060158"/>
<UserParam type="float" name="delta_next" value="72.265915738060158"/>
<UserParam type="float" name="delta_best" value="0.0"/>
<UserParam type="float" name="delta_rt_model" value="0.0"/>
<UserParam type="float" name="aligned_rt" value="0.0"/>
<UserParam type="float" name="delta_rt_model" value="0.993444"/>
<UserParam type="float" name="aligned_rt" value="0.993444"/>
<UserParam type="float" name="predicted_rt" value="0.0"/>
<UserParam type="float" name="matched_peaks" value="22.0"/>
<UserParam type="float" name="longest_b" value="9.0"/>
<UserParam type="float" name="longest_y" value="12.0"/>
<UserParam type="float" name="longest_y_pct" value="0.6315789"/>
<UserParam type="float" name="matched_intensity_pct" value="50.784999999999997"/>
<UserParam type="float" name="matched_intensity_pct" value="64.770966000000001"/>
<UserParam type="float" name="scored_candidates" value="1.0"/>
<UserParam type="float" name="poisson" value="-1.956281191108343"/>
<UserParam type="float" name="ms1_intensity" value="3.0614618e08"/>
<UserParam type="float" name="ms2_intensity" value="5.6930696e07"/>
<UserParam type="float" name="ms2_intensity" value="7.260917e07"/>
</PeptideHit>
<UserParam type="int" name="id_merge_index" value="0"/>
</PeptideIdentification>
Expand Down
2 changes: 1 addition & 1 deletion tests/test_io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def compute_checksum(filename):


class TestParquetWriter:
expected_checksum = "cf3f2e9f073be58612ce81f240da9f4109e1c76eea25f1b7881e09c0a8fdee16"
expected_checksum = "be6da8d891bd63b85fab5bb11d6a113f8df6ce9f7fd3d1cc429804dc41d972af"

def test_write_psm(self):
with ParquetWriter("test.pq") as writer:
Expand Down
Loading
Loading