From d352100ab63c1060b864e3c15b8f2d0aa0b2363e Mon Sep 17 00:00:00 2001 From: rodvrees Date: Tue, 25 Jun 2024 14:24:52 +0200 Subject: [PATCH 01/13] DIANN io --- .gitignore | 4 ++ psm_utils/io/__init__.py | 7 +++ psm_utils/io/diann.py | 118 ++++++++++++++++++++++++++++++++++++++ psm_utils/io/msfragger.py | 0 4 files changed, 129 insertions(+) create mode 100644 psm_utils/io/diann.py create mode 100644 psm_utils/io/msfragger.py diff --git a/.gitignore b/.gitignore index 0432116..91b6a52 100644 --- a/.gitignore +++ b/.gitignore @@ -132,3 +132,7 @@ dmypy.json # Pyre type checker .pyre/ .vscode/settings.json + +# Specific to Branch +example_files/DIANN_example.tsv +test.ipynb diff --git a/psm_utils/io/__init__.py b/psm_utils/io/__init__.py index 79f09ac..ae0b321 100644 --- a/psm_utils/io/__init__.py +++ b/psm_utils/io/__init__.py @@ -22,6 +22,7 @@ import psm_utils.io.sage as sage import psm_utils.io.tsv as tsv import psm_utils.io.xtandem as xtandem +import psm_utils.io.diann as diann from psm_utils.io._base_classes import WriterBase from psm_utils.io.exceptions import PSMUtilsIOException from psm_utils.psm import PSM @@ -106,6 +107,12 @@ "extension": ".parquet", "filename_pattern": r"^.*(?:_|\.).sage.parquet$", }, + "diann": { + "reader": diann.DIANNReader, + "writer": None, + "extension": ".tsv", + "filename_pattern": r"^.*\.tsv$", + }, "parquet": { # List after proteoscape and sage to avoid extension matching conflicts "reader": parquet.ParquetReader, "writer": parquet.ParquetWriter, diff --git a/psm_utils/io/diann.py b/psm_utils/io/diann.py new file mode 100644 index 0000000..b1b9c1d --- /dev/null +++ b/psm_utils/io/diann.py @@ -0,0 +1,118 @@ +""" +Reader for PSM files from DIA-NN + +Reads the '.tsv' file as defined on the `DIA-NN documentation page `_. 
+""" + +from __future__ import annotations + +import csv +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Iterable, Optional +import re + +import pyarrow.parquet as pq +from pyteomics import mass + +from psm_utils.io._base_classes import ReaderBase +from psm_utils.io._utils import set_csv_field_size_limit +from psm_utils.psm import PSM +from psm_utils.psm_list import PSMList + +set_csv_field_size_limit() + +class DIANNReader(ReaderBase, ABC): + def __init__( + self, filename, score_column: str = "CScore", *args, **kwargs + ) -> None: + """ + Reader for DIA-NN '.tsv' file. + + Parameters + ---------- + filename : str or Path + Path to PSM file. + score_column: str, optional + Name of the column that holds the primary PSM score. Default is + ``CScore``. + + """ + super().__init__(filename, *args, **kwargs) + self.filename = filename + self.score_column = score_column + + def __iter__(self) -> Iterable[PSM]: + """Iterate over file and return PSMs one-by-one.""" + with open(self.filename) as msms_in: + reader = csv.DictReader(msms_in, delimiter="\t") + for row in reader: + yield self._get_peptide_spectrum_match(row) + + def _get_peptide_spectrum_match(self, psm_dict) -> PSM: + """Parse a single PSM from a DIA-NN PSM file.""" + rescoring_features = {} + for ft in RESCORING_FEATURES: + try: + rescoring_features[ft] = psm_dict[ft] + except KeyError: + continue + + return PSM( + peptidoform=self._parse_peptidoform( + psm_dict["Modified.Sequence"], + psm_dict["Precursor.Charge"]), + spectrum_id='NA', # DIA-NN does not output spectrum ID + run=psm_dict["Run"], + is_decoy=False, + qvalue=psm_dict["Q.Value"], + pep=float(psm_dict["PEP"]), + score=float(psm_dict[self.score_column]), + retention_time=float(psm_dict["RT"]), + ion_mobility=float(psm_dict["IM"]), + protein_list=psm_dict["Protein.Names"].split(";"), + source="diann", + rank=None, # Leave out? 
+ provenance_data=({"diann_filename": str(self.filename)}), + rescoring_features=rescoring_features, + metadata={}, + ) + + @staticmethod + def _parse_peptidoform(peptide: str, charge: Optional[str]) -> str: + if charge: + peptide += f"/{int(float(charge))}" + pattern = r"\(UniMod:(\d+)\)" + replacement = r"[UNIMOD:\1]" + peptide = re.sub(pattern, replacement, peptide) + # If [UNIMOD:n] occurs before the first amino acid, a hyphen is added before the first amino acid + if peptide[0] == "[": + # Hyphen after the closing bracket + peptide = peptide.replace("]", "]-", 1) + return peptide + + def _parse_precursor_mz(): + return NotImplementedError("Method not implemented yet. DIA-NN does not yet output precursor m/z.") + + def from_dataframe(cls, dataframe) -> PSMList: + """Create a PSMList from a DIA-NN Pandas DataFrame.""" + return PSMList( + ptm_list=[ + cls._get_peptide_spectrum_match(cls(""), entry) + for entry in dataframe.to_dict(orient="records") + ] + ) + + +# TODO: Check +RESCORING_FEATURES = [ + "CScore", + "RT", + "Predicted.RT", + "iRT", + "Predicted.iRT", + "Ms1.Profile.Corr", + "Ms1.Area", + "IM", + "iIM" +] diff --git a/psm_utils/io/msfragger.py b/psm_utils/io/msfragger.py new file mode 100644 index 0000000..e69de29 From 4b96137c0d923d7ba74a54670602ffc03593c954 Mon Sep 17 00:00:00 2001 From: rodvrees Date: Tue, 25 Jun 2024 16:51:08 +0200 Subject: [PATCH 02/13] fragpipe reader --- .gitignore | 1 + psm_utils/io/__init__.py | 10 ++- psm_utils/io/diann.py | 14 ++-- psm_utils/io/fragpipe.py | 133 ++++++++++++++++++++++++++++++++++++++ psm_utils/io/msfragger.py | 0 5 files changed, 149 insertions(+), 9 deletions(-) create mode 100644 psm_utils/io/fragpipe.py delete mode 100644 psm_utils/io/msfragger.py diff --git a/.gitignore b/.gitignore index 91b6a52..e1307d8 100644 --- a/.gitignore +++ b/.gitignore @@ -135,4 +135,5 @@ dmypy.json # Specific to Branch example_files/DIANN_example.tsv +example_files/MSFragger_example_psm.tsv test.ipynb diff --git 
a/psm_utils/io/__init__.py b/psm_utils/io/__init__.py index ae0b321..60407c9 100644 --- a/psm_utils/io/__init__.py +++ b/psm_utils/io/__init__.py @@ -23,6 +23,7 @@ import psm_utils.io.tsv as tsv import psm_utils.io.xtandem as xtandem import psm_utils.io.diann as diann +import psm_utils.io.fragpipe as fragpipe from psm_utils.io._base_classes import WriterBase from psm_utils.io.exceptions import PSMUtilsIOException from psm_utils.psm import PSM @@ -107,12 +108,19 @@ "extension": ".parquet", "filename_pattern": r"^.*(?:_|\.).sage.parquet$", }, - "diann": { + "fragpipe": { + "reader": fragpipe.FragpipeReader, + "writer": None, + "extension": ".tsv", + "filename_pattern": r"^.*psm\.tsv$", + }, + "diann": { # List after fragpipe to avoid extension matching conflicts #TODO: fix tsv conflict "reader": diann.DIANNReader, "writer": None, "extension": ".tsv", "filename_pattern": r"^.*\.tsv$", }, + "parquet": { # List after proteoscape and sage to avoid extension matching conflicts "reader": parquet.ParquetReader, "writer": parquet.ParquetWriter, diff --git a/psm_utils/io/diann.py b/psm_utils/io/diann.py index b1b9c1d..468a944 100644 --- a/psm_utils/io/diann.py +++ b/psm_utils/io/diann.py @@ -7,14 +7,10 @@ from __future__ import annotations import csv -from abc import ABC, abstractmethod -from pathlib import Path +from abc import ABC from typing import Iterable, Optional import re -import pyarrow.parquet as pq -from pyteomics import mass - from psm_utils.io._base_classes import ReaderBase from psm_utils.io._utils import set_csv_field_size_limit from psm_utils.psm import PSM @@ -24,7 +20,7 @@ class DIANNReader(ReaderBase, ABC): def __init__( - self, filename, score_column: str = "CScore", *args, **kwargs + self, filename, score_column: str = "CScore", *args, **kwargs ) -> None: """ Reader for DIA-NN '.tsv' file. 
@@ -72,7 +68,7 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM: ion_mobility=float(psm_dict["IM"]), protein_list=psm_dict["Protein.Names"].split(";"), source="diann", - rank=None, # Leave out? + rank=1, # Leave out? provenance_data=({"diann_filename": str(self.filename)}), rescoring_features=rescoring_features, metadata={}, @@ -91,9 +87,11 @@ def _parse_peptidoform(peptide: str, charge: Optional[str]) -> str: peptide = peptide.replace("]", "]-", 1) return peptide + @staticmethod def _parse_precursor_mz(): - return NotImplementedError("Method not implemented yet. DIA-NN does not yet output precursor m/z.") + return NotImplementedError("Method not implemented yet. DIA-NN does not yet output precursor m/z, but might in the future.") + @staticmethod def from_dataframe(cls, dataframe) -> PSMList: """Create a PSMList from a DIA-NN Pandas DataFrame.""" return PSMList( diff --git a/psm_utils/io/fragpipe.py b/psm_utils/io/fragpipe.py new file mode 100644 index 0000000..cb5fcf6 --- /dev/null +++ b/psm_utils/io/fragpipe.py @@ -0,0 +1,133 @@ +""" +Reader for PSM files from the Fragpipe platform. + +Reads the Philosopher ``psm.tsv`` file as defined on the +`Fragpipe documentation page `_. + +""" + +from __future__ import annotations + +import csv +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Iterable, Optional + +import pyarrow.parquet as pq +from pyteomics import mass + +from psm_utils.io._base_classes import ReaderBase +from psm_utils.io._utils import set_csv_field_size_limit +from psm_utils.psm import PSM +from psm_utils.psm_list import PSMList + +set_csv_field_size_limit() + +class FragpipeReader(ReaderBase, ABC): + def __init__( + self, filename, score_column: str = "Hyperscore", mz_column: str = "Observed M/Z", *args, **kwargs + ) -> None: + """ + Reader for MSFragger ``psm.tsv`` file. + + Parameters + ---------- + filename : str or Path + Path to PSM file. 
+ score_column: str, optional + Name of the column that holds the primary PSM score. Default is + ``Hyperscore``. + + """ + super().__init__(filename, *args, **kwargs) + self.filename = filename + self.score_column = score_column + self.mz_column = mz_column + + def __iter__(self) -> Iterable[PSM]: + """Iterate over file and return PSMs one-by-one.""" + with open(self.filename) as msms_in: + reader = csv.DictReader(msms_in, delimiter="\t") + for row in reader: + yield self._get_peptide_spectrum_match(row) + + def _get_peptide_spectrum_match(self, psm_dict) -> PSM: + """Parse a single PSM from a MSFragger PSM file.""" + rescoring_features = {} + for ft in RESCORING_FEATURES: + try: + rescoring_features[ft] = psm_dict[ft] + except KeyError: + continue + + return PSM( + peptidoform=self._parse_peptidoform( + psm_dict["Modified Peptide"], + psm_dict['Peptide'], + psm_dict["Charge"]), + spectrum_id=self._parse_spectrum_id(psm_dict['Spectrum']), + run=Path(psm_dict["Spectrum File"]).stem, + is_decoy=False, + qvalue=None, # Q-value is not outputted by Philosopher + pep= 1 - float(psm_dict["Probability"]), # PeptideProphet Probability, not explicitely stated if this is the inverse of PEP + score=psm_dict[self.score_column], + precursor_mz=psm_dict[self.mz_column], # Allows use of both calibrated and uncalibrated Observed M/Z?+ + retention_time=float(psm_dict["Retention"]), + ion_mobility=float(psm_dict["Ion Mobility"]) if "Ion Mobility" in psm_dict else None, + protein_list=self._parse_protein_list(psm_dict["Protein"], + psm_dict["Mapped Proteins"]), + source="fragpipe", + rank=1, + rescoring_features=rescoring_features, + metadata={} + ) + + @staticmethod + def _parse_peptidoform(mod_peptide: str, peptide: str, charge: Optional[str]) -> str: + if mod_peptide: + peptide = mod_peptide + if charge: + peptide += f"/{int(float(charge))}" + if peptide.startswith('n'): + peptide = peptide[1:] + # A hyphen needs to be added after the N-terminal modification, thus after the ] + 
peptide = peptide.replace(']', ']-', 1) + return peptide + + @staticmethod + def _parse_spectrum_id(spectrum: str) -> str: + return spectrum.split(".")[1] + + @staticmethod + def _parse_protein_list(razor_protein: str, mapped_proteins) -> list[str]: + if mapped_proteins: + mapped_proteins_list = mapped_proteins.split(", ") + return [razor_protein] + mapped_proteins_list + else: + return [razor_protein] + + @staticmethod + def from_dataframe(cls, dataframe) -> PSMList: + """Create a PSMList from a pandas DataFrame.""" + return PSMList( + ptm_list=[ + cls._get_peptide_spectrum_match(cls(""), entry) + for entry in dataframe.to_dict(orient="records") + ] + ) + + +# TODO: check +RESCORING_FEATURES = [ + "Peptide Length", + "Retention", + "Observed Mass", + "Observed M/Z", + "Calculated Peptide Mass", + "Calculated M/Z", + "Delta Mass", + "Hyperscore", + "Number of Missed Cleavages", + "Intensity" +] + diff --git a/psm_utils/io/msfragger.py b/psm_utils/io/msfragger.py deleted file mode 100644 index e69de29..0000000 From 6bd341182328bc7d9188b6dd143d6cde933f789a Mon Sep 17 00:00:00 2001 From: rodvrees Date: Wed, 26 Jun 2024 10:34:28 +0200 Subject: [PATCH 03/13] alphadia reader --- .gitignore | 1 + psm_utils/io/__init__.py | 10 +++- psm_utils/io/alphadia.py | 118 +++++++++++++++++++++++++++++++++++++++ psm_utils/io/diann.py | 2 +- psm_utils/io/fragpipe.py | 10 ++-- 5 files changed, 133 insertions(+), 8 deletions(-) create mode 100644 psm_utils/io/alphadia.py diff --git a/.gitignore b/.gitignore index e1307d8..8669c02 100644 --- a/.gitignore +++ b/.gitignore @@ -136,4 +136,5 @@ dmypy.json # Specific to Branch example_files/DIANN_example.tsv example_files/MSFragger_example_psm.tsv +example_files/AlphaDIA_example.tsv test.ipynb diff --git a/psm_utils/io/__init__.py b/psm_utils/io/__init__.py index 60407c9..e345708 100644 --- a/psm_utils/io/__init__.py +++ b/psm_utils/io/__init__.py @@ -24,6 +24,7 @@ import psm_utils.io.xtandem as xtandem import psm_utils.io.diann as diann 
import psm_utils.io.fragpipe as fragpipe +import psm_utils.io.alphadia as alphadia from psm_utils.io._base_classes import WriterBase from psm_utils.io.exceptions import PSMUtilsIOException from psm_utils.psm import PSM @@ -114,13 +115,20 @@ "extension": ".tsv", "filename_pattern": r"^.*psm\.tsv$", }, - "diann": { # List after fragpipe to avoid extension matching conflicts #TODO: fix tsv conflict + "alphadia": { + "reader": alphadia.AlphaDIAReader, + "writer": None, + "extension": ".tsv", + "filename_pattern": r"^.*precursor\.tsv$", + }, + "diann": { # List after fragpipe and alphadia to avoid extension matching conflicts #TODO: fix tsv conflict "reader": diann.DIANNReader, "writer": None, "extension": ".tsv", "filename_pattern": r"^.*\.tsv$", }, + "parquet": { # List after proteoscape and sage to avoid extension matching conflicts "reader": parquet.ParquetReader, "writer": parquet.ParquetWriter, diff --git a/psm_utils/io/alphadia.py b/psm_utils/io/alphadia.py new file mode 100644 index 0000000..88af545 --- /dev/null +++ b/psm_utils/io/alphadia.py @@ -0,0 +1,118 @@ +""" +Reader for PSM files from the AlphaDIA search engine. + +Reads the AlphaDIA ``precursor.tsv`` file as defined on the +`TODO: NOT YET A LINK`_. + +""" + +from __future__ import annotations + +import csv +from abc import ABC +from typing import Iterable, Optional + +from psm_utils.io._base_classes import ReaderBase +from psm_utils.io._utils import set_csv_field_size_limit +from psm_utils.psm import PSM +from psm_utils.psm_list import PSMList + +set_csv_field_size_limit() + + +class AlphaDIAReader(ReaderBase, ABC): + def __init__(self, filename, score_column: str = "score", *args, **kwargs): + """ + Reader for AlphaDIA ``precursor.tsv`` file. + + Parameters + ---------- + filename : str or Path + Path to PSM file. + score_column: str, optional + Name of the column that holds the primary PSM score. Default is + ``score``. 
+ + """ + super().__init__(filename, *args, **kwargs) + self.filename = filename + self.score_column = score_column + + def __iter__(self) -> Iterable[PSM]: + """Iterate over file and return PSMs one-by-one.""" + with open(self.filename) as msms_in: + reader = csv.DictReader(msms_in, delimiter="\t") + for row in reader: + yield self._get_peptide_spectrum_match(row) + + def _get_peptide_spectrum_match(self, psm_dict) -> PSM: + """Parse a single PSM from a AlphaDIA PSM file.""" + rescoring_features = {} + for ft in RESCORING_FEATURES: + try: + rescoring_features[ft] = psm_dict[ft] + except KeyError: + continue + + return PSM( + peptidoform=self._parse_peptidoform( + psm_dict["sequence"], psm_dict["mods"], psm_dict["mod_sites"], psm_dict["charge"] + ), + spectrum_id=psm_dict["frame_start"], # TODO: needs to be checked + run=psm_dict["run"], + spectrum=psm_dict["frame_start"], # TODO: needs to be checked + is_decoy=bool(int(psm_dict["decoy"])), + score=psm_dict[self.score_column], + qvalue=psm_dict["qval"], + pep=psm_dict[ + "proba" + ], # TODO: needs to be checked, assumption because if it is 1-proba than it's really bad + precursor_mz=psm_dict["mz_observed"], + retention_time=psm_dict["rt_observed"], + ion_mobility=psm_dict["mobility_observed"], + protein_list=psm_dict["proteins"].split(";"), + rank=psm_dict["rank"], + source="alphadia", + provenance_data=({"alphadia_filename": str(self.filename)}), + metadata={}, + rescoring_features=rescoring_features, + ) + + @staticmethod + def _parse_peptidoform(sequence: str, mods: str, mod_sites, charge: Optional[str]) -> str: + if mods: + mods = mods.split(";") + mod_sites = mod_sites.split(";") + for mod, site in reversed(sorted(zip(mods, mod_sites), key=lambda x: int(x[1]))): + if int(site) == 0: + sequence = ( + sequence[: int(site)] + f"[{mod.split('@')[0]}]-" + sequence[int(site) :] + ) + else: + sequence = ( + sequence[: int(site)] + f"[{mod.split('@')[0]}]" + sequence[int(site) :] + ) + if charge: + sequence += 
f"/{int(float(charge))}" + return sequence + + @classmethod + def from_dataframe(cls, dataframe) -> PSMList: + """Create a PSMList from a AlphaDIA Pandas DataFrame.""" + return PSMList( + psm_list=[ + cls._get_peptide_spectrum_match(cls(""), entry) + for entry in dataframe.to_dict(orient="records") + ] + ) + + +# TODO: check +RESCORING_FEATURES = [ + "rt_observed", + "mobility_observed", + "mz_observed", + "score", + "charge", + "delta_rt", +] diff --git a/psm_utils/io/diann.py b/psm_utils/io/diann.py index 468a944..bc79670 100644 --- a/psm_utils/io/diann.py +++ b/psm_utils/io/diann.py @@ -91,7 +91,7 @@ def _parse_peptidoform(peptide: str, charge: Optional[str]) -> str: def _parse_precursor_mz(): return NotImplementedError("Method not implemented yet. DIA-NN does not yet output precursor m/z, but might in the future.") - @staticmethod + @classmethod def from_dataframe(cls, dataframe) -> PSMList: """Create a PSMList from a DIA-NN Pandas DataFrame.""" return PSMList( diff --git a/psm_utils/io/fragpipe.py b/psm_utils/io/fragpipe.py index cb5fcf6..71efc37 100644 --- a/psm_utils/io/fragpipe.py +++ b/psm_utils/io/fragpipe.py @@ -9,13 +9,10 @@ from __future__ import annotations import csv -from abc import ABC, abstractmethod +from abc import ABC from pathlib import Path from typing import Iterable, Optional -import pyarrow.parquet as pq -from pyteomics import mass - from psm_utils.io._base_classes import ReaderBase from psm_utils.io._utils import set_csv_field_size_limit from psm_utils.psm import PSM @@ -65,7 +62,7 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM: psm_dict["Modified Peptide"], psm_dict['Peptide'], psm_dict["Charge"]), - spectrum_id=self._parse_spectrum_id(psm_dict['Spectrum']), + spectrum_id=self._parse_spectrum_id(psm_dict['Spectrum']), #TODO: needs to be checked run=Path(psm_dict["Spectrum File"]).stem, is_decoy=False, qvalue=None, # Q-value is not outputted by Philosopher @@ -78,6 +75,7 @@ def _get_peptide_spectrum_match(self, psm_dict) -> 
PSM: psm_dict["Mapped Proteins"]), source="fragpipe", rank=1, + provenance_data=({"fragpipe_filename": str(self.filename)}), rescoring_features=rescoring_features, metadata={} ) @@ -106,7 +104,7 @@ def _parse_protein_list(razor_protein: str, mapped_proteins) -> list[str]: else: return [razor_protein] - @staticmethod + @classmethod def from_dataframe(cls, dataframe) -> PSMList: """Create a PSMList from a pandas DataFrame.""" return PSMList( From 36ec0472dcd883858f107c340da1a841b11201b9 Mon Sep 17 00:00:00 2001 From: rodvrees Date: Wed, 26 Jun 2024 22:10:57 +0200 Subject: [PATCH 04/13] diann change protein column --- psm_utils/io/diann.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/psm_utils/io/diann.py b/psm_utils/io/diann.py index bc79670..60c48e0 100644 --- a/psm_utils/io/diann.py +++ b/psm_utils/io/diann.py @@ -18,10 +18,9 @@ set_csv_field_size_limit() + class DIANNReader(ReaderBase, ABC): - def __init__( - self, filename, score_column: str = "CScore", *args, **kwargs - ) -> None: + def __init__(self, filename, score_column: str = "CScore", *args, **kwargs) -> None: """ Reader for DIA-NN '.tsv' file. @@ -56,9 +55,9 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM: return PSM( peptidoform=self._parse_peptidoform( - psm_dict["Modified.Sequence"], - psm_dict["Precursor.Charge"]), - spectrum_id='NA', # DIA-NN does not output spectrum ID + psm_dict["Modified.Sequence"], psm_dict["Precursor.Charge"] + ), + spectrum_id="NA", # DIA-NN does not output spectrum ID run=psm_dict["Run"], is_decoy=False, qvalue=psm_dict["Q.Value"], @@ -66,9 +65,9 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM: score=float(psm_dict[self.score_column]), retention_time=float(psm_dict["RT"]), ion_mobility=float(psm_dict["IM"]), - protein_list=psm_dict["Protein.Names"].split(";"), + protein_list=psm_dict["Protein.Ids"].split(";"), source="diann", - rank=1, # Leave out? + rank=1, # Leave out? 
provenance_data=({"diann_filename": str(self.filename)}), rescoring_features=rescoring_features, metadata={}, @@ -89,7 +88,9 @@ def _parse_peptidoform(peptide: str, charge: Optional[str]) -> str: @staticmethod def _parse_precursor_mz(): - return NotImplementedError("Method not implemented yet. DIA-NN does not yet output precursor m/z, but might in the future.") + return NotImplementedError( + "Method not implemented yet. DIA-NN does not yet output precursor m/z, but might in the future." + ) @classmethod def from_dataframe(cls, dataframe) -> PSMList: @@ -112,5 +113,5 @@ def from_dataframe(cls, dataframe) -> PSMList: "Ms1.Profile.Corr", "Ms1.Area", "IM", - "iIM" + "iIM", ] From b228fb601c461918a26cae3665582f8a876e00a5 Mon Sep 17 00:00:00 2001 From: rodvrees Date: Thu, 27 Jun 2024 14:22:03 +0200 Subject: [PATCH 05/13] unit tests for diann, alphadia and fragpipe readers --- psm_utils/io/alphadia.py | 2 +- psm_utils/io/diann.py | 2 ++ psm_utils/io/fragpipe.py | 55 ++++++++++++++++++++++--------- tests/test_data/test_alphadia.tsv | 2 ++ tests/test_data/test_diann.tsv | 2 ++ tests/test_data/test_fragpipe.tsv | 2 ++ tests/test_io/test_alphadia.py | 39 ++++++++++++++++++++++ tests/test_io/test_diann.py | 44 +++++++++++++++++++++++++ tests/test_io/test_fragpipe.py | 42 +++++++++++++++++++++++ 9 files changed, 173 insertions(+), 17 deletions(-) create mode 100644 tests/test_data/test_alphadia.tsv create mode 100644 tests/test_data/test_diann.tsv create mode 100644 tests/test_data/test_fragpipe.tsv create mode 100644 tests/test_io/test_alphadia.py create mode 100644 tests/test_io/test_diann.py create mode 100644 tests/test_io/test_fragpipe.py diff --git a/psm_utils/io/alphadia.py b/psm_utils/io/alphadia.py index 88af545..c2c3fff 100644 --- a/psm_utils/io/alphadia.py +++ b/psm_utils/io/alphadia.py @@ -71,7 +71,7 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM: retention_time=psm_dict["rt_observed"], ion_mobility=psm_dict["mobility_observed"], 
protein_list=psm_dict["proteins"].split(";"), - rank=psm_dict["rank"], + rank=int(psm_dict["rank"]) + 1, # AlphaDIA ranks are 0-based source="alphadia", provenance_data=({"alphadia_filename": str(self.filename)}), metadata={}, diff --git a/psm_utils/io/diann.py b/psm_utils/io/diann.py index 60c48e0..a60cb7b 100644 --- a/psm_utils/io/diann.py +++ b/psm_utils/io/diann.py @@ -114,4 +114,6 @@ def from_dataframe(cls, dataframe) -> PSMList: "Ms1.Area", "IM", "iIM", + "Predicted.IM", + "Predicted.iIM", ] diff --git a/psm_utils/io/fragpipe.py b/psm_utils/io/fragpipe.py index 71efc37..50d6bd4 100644 --- a/psm_utils/io/fragpipe.py +++ b/psm_utils/io/fragpipe.py @@ -20,9 +20,15 @@ set_csv_field_size_limit() + class FragpipeReader(ReaderBase, ABC): def __init__( - self, filename, score_column: str = "Hyperscore", mz_column: str = "Observed M/Z", *args, **kwargs + self, + filename, + score_column: str = "Hyperscore", + mz_column: str = "Observed M/Z", + *args, + **kwargs, ) -> None: """ Reader for MSFragger ``psm.tsv`` file. @@ -34,6 +40,9 @@ def __init__( score_column: str, optional Name of the column that holds the primary PSM score. Default is ``Hyperscore``. + mz_column: str, optional + Name of the column that holds the precursor m/z. Default is + ``Observed M/Z``. 
""" super().__init__(filename, *args, **kwargs) @@ -59,25 +68,31 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM: return PSM( peptidoform=self._parse_peptidoform( - psm_dict["Modified Peptide"], - psm_dict['Peptide'], - psm_dict["Charge"]), - spectrum_id=self._parse_spectrum_id(psm_dict['Spectrum']), #TODO: needs to be checked - run=Path(psm_dict["Spectrum File"]).stem, + psm_dict["Modified Peptide"], psm_dict["Peptide"], psm_dict["Charge"] + ), + spectrum_id=self._parse_spectrum_id(psm_dict["Spectrum"]), # TODO: needs to be checked + run=self._parse_run(psm_dict["Spectrum File"]), is_decoy=False, - qvalue=None, # Q-value is not outputted by Philosopher - pep= 1 - float(psm_dict["Probability"]), # PeptideProphet Probability, not explicitely stated if this is the inverse of PEP + qvalue=None, # Q-value is not outputted by Philosopher + pep=1 + - float( + psm_dict["Probability"] + ), # PeptideProphet Probability, not explicitely stated if this is the inverse of PEP + # But I'm assuming it is score=psm_dict[self.score_column], - precursor_mz=psm_dict[self.mz_column], # Allows use of both calibrated and uncalibrated Observed M/Z?+ + precursor_mz=psm_dict[ + self.mz_column + ], # Allows use of both calibrated and uncalibrated Observed M/Z? 
retention_time=float(psm_dict["Retention"]), ion_mobility=float(psm_dict["Ion Mobility"]) if "Ion Mobility" in psm_dict else None, - protein_list=self._parse_protein_list(psm_dict["Protein"], - psm_dict["Mapped Proteins"]), + protein_list=self._parse_protein_list( + psm_dict["Protein"], psm_dict["Mapped Proteins"] + ), source="fragpipe", rank=1, provenance_data=({"fragpipe_filename": str(self.filename)}), rescoring_features=rescoring_features, - metadata={} + metadata={}, ) @staticmethod @@ -86,10 +101,10 @@ def _parse_peptidoform(mod_peptide: str, peptide: str, charge: Optional[str]) -> peptide = mod_peptide if charge: peptide += f"/{int(float(charge))}" - if peptide.startswith('n'): + if peptide.startswith("n"): peptide = peptide[1:] # A hyphen needs to be added after the N-terminal modification, thus after the ] - peptide = peptide.replace(']', ']-', 1) + peptide = peptide.replace("]", "]-", 1) return peptide @staticmethod @@ -104,6 +119,16 @@ def _parse_protein_list(razor_protein: str, mapped_proteins) -> list[str]: else: return [razor_protein] + # Dependent on the fragpipe workflow used the run name can be different, but in most cases + # something like 'interact-.pep.xml' is used + @staticmethod + def _parse_run(spectrum_file: str) -> str: + if (spectrum_file.endswith(".pep.xml")) and (spectrum_file.startswith("interact-")): + spectrum_file = spectrum_file.replace("interact-", "") + return Path(Path(spectrum_file).stem).stem + else: + return Path(spectrum_file).stem + @classmethod def from_dataframe(cls, dataframe) -> PSMList: """Create a PSMList from a pandas DataFrame.""" @@ -126,6 +151,4 @@ def from_dataframe(cls, dataframe) -> PSMList: "Delta Mass", "Hyperscore", "Number of Missed Cleavages", - "Intensity" ] - diff --git a/tests/test_data/test_alphadia.tsv b/tests/test_data/test_alphadia.tsv new file mode 100644 index 0000000..4928e16 --- /dev/null +++ b/tests/test_data/test_alphadia.tsv @@ -0,0 +1,2 @@ +base_width_mobility base_width_rt rt_observed 
mobility_observed mono_ms1_intensity top_ms1_intensity sum_ms1_intensity weighted_ms1_intensity weighted_mass_deviation weighted_mass_error mz_observed mono_ms1_height top_ms1_height sum_ms1_height weighted_ms1_height isotope_intensity_correlation isotope_height_correlation n_observations intensity_correlation height_correlation intensity_fraction height_fraction intensity_fraction_weighted height_fraction_weighted mean_observation_score sum_b_ion_intensity sum_y_ion_intensity diff_b_y_ion_intensity f_masked fragment_scan_correlation template_scan_correlation fragment_frame_correlation top3_frame_correlation template_frame_correlation top3_b_ion_correlation n_b_ions top3_y_ion_correlation n_y_ions cycle_fwhm mobility_fwhm delta_frame_peak top_3_ms2_mass_error mean_ms2_mass_error n_overlapping mean_overlapping_intensity mean_overlapping_mass_error precursor_idx rank scan_center score frame_start scan_stop elution_group_idx frame_center scan_start frame_stop flat_frag_start_idx decoy i_1 mz_library mod_sites charge flat_frag_stop_idx proteins genes channel i_0 sequence i_2 i_3 mobility_library mods rt_calibrated rt_library delta_rt n_K n_R n_P _decoy proba qval _candidate_idx valid candidate_idx run mod_seq_hash mod_seq_charge_hash pg_master pg pg_qval intensity +0.000000 75.606934 3111.141602 0.000001 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 648.794128 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.000000 0.808025 0.969734 1.000000 1.000000 1.000000 1.000000 0.000000 11.815787 13.400094 -1.584307 1.000000 0.000000 0.000000 0.792649 0.992209 0.000000 0.923456 2.000000 0.919347 10.000000 18.168669 0.000000 0.333333 -1.579069 -1.260026 4.000000 1441328.000000 -1.948657 12789812 0 0 170.287918 79426 1 6406557 80332 0 81389 65332105 0 0.305868 648.794128 5 2 65332117 P06733 ENOA_HUMAN 0 0.475448 LMIEMDGTENK 0.158954 0.059730 0.944761 Oxidation@M 3150.670410 2546.791260 -39.528809 1 0 0 0.000000 0.000005 0.000000 12789812 True 12789812 
LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_03 5071272541180990939 5071272541180990941 P06733 P06733 0.000000 20915252511.701401 diff --git a/tests/test_data/test_diann.tsv b/tests/test_data/test_diann.tsv new file mode 100644 index 0000000..fba1e43 --- /dev/null +++ b/tests/test_data/test_diann.tsv @@ -0,0 +1,2 @@ +File.Name Run Protein.Group Protein.Ids Protein.Names Genes PG.Quantity PG.Normalised PG.MaxLFQ Genes.Quantity Genes.Normalised Genes.MaxLFQ Genes.MaxLFQ.Unique Modified.Sequence Stripped.Sequence Precursor.Id Precursor.Charge Q.Value PEP Global.Q.Value Protein.Q.Value PG.Q.Value Global.PG.Q.Value GG.Q.Value Translated.Q.Value Proteotypic Precursor.Quantity Precursor.Normalised Precursor.Translated Translated.Quality Ms1.Translated Quantity.Quality RT RT.Start RT.Stop iRT Predicted.RT Predicted.iRT First.Protein.Description Lib.Q.Value Lib.PG.Q.Value Ms1.Profile.Corr Ms1.Area Evidence Spectrum.Similarity Averagine Mass.Evidence CScore Decoy.Evidence Decoy.CScore Fragment.Quant.Raw Fragment.Quant.Corrected Fragment.Correlations MS2.Scan PTM.Informative PTM.Specific PTM.Localising PTM.Q.Value PTM.Site.Confidence Lib.PTM.Site.Confidence IM iIM Predicted.IM Predicted.iIM +/data/Orbi_Yeast/LFQ_Orbitrap_AIF_Yeast_03.mzML LFQ_Orbitrap_AIF_Yeast_03 P38156 P38156 MAL31_YEAST MAL31 672704 689275 689274 672704 689275 689274 689274 AAAAEINVKDPKEDLETSVVDEGR AAAAEINVKDPKEDLETSVVDEGR AAAAEINVKDPKEDLETSVVDEGR4 4 0.000548193 0.0104343 1 1 1 1 1 0 1 413970 424167 413970 849940 0.904051 75.2574 75.0156 75.5001 33.9222 75.2713 33.8999 Maltose permease MAL31 1 1 0.347567 849940 1.52391 0.455898 0.0388433 0 0.995107 0.705793 0.213383 274506;139464;0;0;0;0;486380;0;70361;370465;36455.3;0; 274506;139464;0;0;0;0;486380;0;70361;370465;36455.3;0; 0.995393;0.724264;0;0;0;0;0.949297;0;0.169817;0.59338;0.481298;0; 116903 0 0 0 0 0 0 0 0 0 0 diff --git a/tests/test_data/test_fragpipe.tsv b/tests/test_data/test_fragpipe.tsv new file mode 100644 index 0000000..847583d --- /dev/null +++ 
b/tests/test_data/test_fragpipe.tsv @@ -0,0 +1,2 @@ +Spectrum Spectrum File Peptide Modified Peptide Extended Peptide Prev AA Next AA Peptide Length Charge Retention Observed Mass Calibrated Observed Mass Observed M/Z Calibrated Observed M/Z Calculated Peptide Mass Calculated M/Z Delta Mass SpectralSim RTScore Expectation Hyperscore Nextscore Probability Number of Enzymatic Termini Number of Missed Cleavages Protein Start Protein End Intensity Assigned Modifications Observed Modifications Purity Is Unique Protein Protein ID Entry Name Gene Protein Description Mapped Genes Mapped Proteins +LFQ_Orbitrap_AIF_Yeast_01_Q1.00001.00001.2 interact-LFQ_Orbitrap_AIF_Yeast_01_Q1.pep.xml TGAPNNGQYGADNGNPNGER NQQNNQER.TGAPNNGQYGADNGNPNGER.GIFSTIVG R G 20 2 2432.1640 2001.8539 2001.8527 1001.9342 1001.9336 2001.8524 1001.9335 0.0002 0.9925 5.8094 0.00000000000011 57.2940 0.0000 1.0000 2 0 24 43 0.0000 0.00 true sp|P40159|YNU8_YEAST P40159 YNU8_YEAST YNL208W Uncharacterized protein YNL208W diff --git a/tests/test_io/test_alphadia.py b/tests/test_io/test_alphadia.py new file mode 100644 index 0000000..64e38ae --- /dev/null +++ b/tests/test_io/test_alphadia.py @@ -0,0 +1,39 @@ +"""Tests for psm_utils.io.alphadia.""" + +from psm_utils.io.alphadia import AlphaDIAReader +from psm_utils.psm import PSM + +test_psm = PSM( + peptidoform="LMIEM[Oxidation]DGTENK/2", + spectrum_id="79426", + run="LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_03", + collection=None, + spectrum="79426", + is_decoy=False, + score=170.287918, + qvalue=0.000000, + pep=0.000005, + precursor_mz=648.794128, + retention_time=3111.141602, + ion_mobility=0.000001, + protein_list=["P06733"], + rank=1, + source="alphadia", + metadata={}, + rescoring_features={ + "rt_observed": 3111.141602, + "mobility_observed": 0.000001, + "mz_observed": 648.794128, + "score": 170.287918, + "charge": 2, + "delta_rt": -39.528809, + }, +) + + +class TestAlphaDIAReader: + def test_iter(self): + with 
AlphaDIAReader("./tests/test_data/test_alphadia.tsv") as reader: + for psm in reader: + psm.provenance_data = {} + assert psm == test_psm diff --git a/tests/test_io/test_diann.py b/tests/test_io/test_diann.py new file mode 100644 index 0000000..543db4c --- /dev/null +++ b/tests/test_io/test_diann.py @@ -0,0 +1,44 @@ +"""Tests for psm_utils.io.diann.""" + +from psm_utils.io.diann import DIANNReader +from psm_utils.psm import PSM + +test_psm = PSM( + peptidoform="AAAAEINVKDPKEDLETSVVDEGR/4", + spectrum_id="NA", + run="LFQ_Orbitrap_AIF_Yeast_03", + collection=None, + spectrum=None, + is_decoy=False, + score=0.995107, + qvalue=0.000548193, + pep=0.0104343, + precursor_mz=None, + retention_time=75.2574, + ion_mobility=0, + protein_list=["P38156"], + rank=1, + source="diann", + metadata={}, + rescoring_features={ + "CScore": 0.995107, + "RT": 75.2574, + "Predicted.RT": 75.2713, + "iRT": 33.9222, + "Predicted.iRT": 33.8999, + "Ms1.Profile.Corr": 0.347567, + "Ms1.Area": 849940, + "IM": 0, + "iIM": 0, + "Predicted.IM": 0, + "Predicted.iIM": 0, + }, +) + + +class TestDIANNReader: + def test_iter(self): + with DIANNReader("./tests/test_data/test_diann.tsv") as reader: + for psm in reader: + psm.provenance_data = {} + assert psm == test_psm diff --git a/tests/test_io/test_fragpipe.py b/tests/test_io/test_fragpipe.py new file mode 100644 index 0000000..4c1b067 --- /dev/null +++ b/tests/test_io/test_fragpipe.py @@ -0,0 +1,42 @@ +"""Tests for psm_utils.io.fragpipe.""" + +from psm_utils.io.fragpipe import FragpipeReader +from psm_utils.psm import PSM + +test_psm = PSM( + peptidoform="TGAPNNGQYGADNGNPNGER/2", + spectrum_id="00001", + run="LFQ_Orbitrap_AIF_Yeast_01_Q1", + collection=None, + spectrum=None, + is_decoy=False, + score=57.2940, + qvalue=None, + pep=1 - 1.0000, + precursor_mz=1001.9342, + retention_time=2432.1640, + ion_mobility=None, + protein_list=["sp|P40159|YNU8_YEAST"], + rank=1, + source="fragpipe", + metadata={}, + rescoring_features={ + "Peptide Length": 20, + 
"Retention": 2432.1640, + "Observed Mass": 2001.8539, + "Observed M/Z": 1001.9342, + "Calculated Peptide Mass": 2001.8524, + "Calculated M/Z": 1001.9335, + "Delta Mass": 0.0002, + "Hyperscore": 57.2940, + "Number of Missed Cleavages": 0, + }, +) + + +class TestFragpipeReader: + def test_iter(self): + with FragpipeReader("./tests/test_data/test_fragpipe.tsv") as reader: + for psm in reader: + psm.provenance_data = {} + assert psm == test_psm From 479b0d557487acd24f7795881631e482c8ca806e Mon Sep 17 00:00:00 2001 From: rodvrees Date: Fri, 28 Jun 2024 10:03:20 +0200 Subject: [PATCH 06/13] diann qval column variable --- psm_utils/io/diann.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/psm_utils/io/diann.py b/psm_utils/io/diann.py index a60cb7b..52e4b1a 100644 --- a/psm_utils/io/diann.py +++ b/psm_utils/io/diann.py @@ -20,7 +20,9 @@ class DIANNReader(ReaderBase, ABC): - def __init__(self, filename, score_column: str = "CScore", *args, **kwargs) -> None: + def __init__( + self, filename, score_column: str = "CScore", qval_column="Q.Value", *args, **kwargs + ) -> None: """ Reader for DIA-NN '.tsv' file. 
@@ -36,6 +38,7 @@ def __init__(self, filename, score_column: str = "CScore", *args, **kwargs) -> N super().__init__(filename, *args, **kwargs) self.filename = filename self.score_column = score_column + self.qval_column = qval_column def __iter__(self) -> Iterable[PSM]: """Iterate over file and return PSMs one-by-one.""" @@ -60,7 +63,9 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM: spectrum_id="NA", # DIA-NN does not output spectrum ID run=psm_dict["Run"], is_decoy=False, - qvalue=psm_dict["Q.Value"], + qvalue=psm_dict[ + self.qval_column + ], # DIA-NN puts out q-value on both run and library level pep=float(psm_dict["PEP"]), score=float(psm_dict[self.score_column]), retention_time=float(psm_dict["RT"]), From 8d7f914f4876a7b692f67292cd23e5d833db7cb7 Mon Sep 17 00:00:00 2001 From: rodvrees Date: Thu, 31 Oct 2024 15:39:59 +0100 Subject: [PATCH 07/13] DIANN scan --- .gitignore | 3 ++- psm_utils/io/diann.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 8669c02..854c67a 100644 --- a/.gitignore +++ b/.gitignore @@ -133,8 +133,9 @@ dmypy.json .pyre/ .vscode/settings.json -# Specific to Branch +# Specific to Branch, temp example_files/DIANN_example.tsv example_files/MSFragger_example_psm.tsv example_files/AlphaDIA_example.tsv test.ipynb +example_files/evidence.txt diff --git a/psm_utils/io/diann.py b/psm_utils/io/diann.py index 52e4b1a..265d816 100644 --- a/psm_utils/io/diann.py +++ b/psm_utils/io/diann.py @@ -60,7 +60,7 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM: peptidoform=self._parse_peptidoform( psm_dict["Modified.Sequence"], psm_dict["Precursor.Charge"] ), - spectrum_id="NA", # DIA-NN does not output spectrum ID + spectrum_id=psm_dict["MS2.Scan"], run=psm_dict["Run"], is_decoy=False, qvalue=psm_dict[ From d12120a013c02ebc4b171020b450ec1d4e673f3b Mon Sep 17 00:00:00 2001 From: rodvrees Date: Thu, 31 Oct 2024 15:43:29 +0100 Subject: [PATCH 08/13] cleanup .gitignore --- .gitignore | 7 
------- 1 file changed, 7 deletions(-) diff --git a/.gitignore b/.gitignore index 854c67a..0432116 100644 --- a/.gitignore +++ b/.gitignore @@ -132,10 +132,3 @@ dmypy.json # Pyre type checker .pyre/ .vscode/settings.json - -# Specific to Branch, temp -example_files/DIANN_example.tsv -example_files/MSFragger_example_psm.tsv -example_files/AlphaDIA_example.tsv -test.ipynb -example_files/evidence.txt From 2a7b56e1efb2f69e2212ba49eba6f11e22d4d03d Mon Sep 17 00:00:00 2001 From: rodvrees Date: Thu, 31 Oct 2024 15:46:35 +0100 Subject: [PATCH 09/13] formatting --- psm_utils/io/diann.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/psm_utils/io/diann.py b/psm_utils/io/diann.py index 265d816..60da432 100644 --- a/psm_utils/io/diann.py +++ b/psm_utils/io/diann.py @@ -60,7 +60,7 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM: peptidoform=self._parse_peptidoform( psm_dict["Modified.Sequence"], psm_dict["Precursor.Charge"] ), - spectrum_id=psm_dict["MS2.Scan"], + spectrum_id=psm_dict["MS2.Scan"], run=psm_dict["Run"], is_decoy=False, qvalue=psm_dict[ From 19c5bbc1808510e21a41f705d959e9b6dbfc6cfa Mon Sep 17 00:00:00 2001 From: rodvrees Date: Thu, 31 Oct 2024 15:50:09 +0100 Subject: [PATCH 10/13] fix diann test case --- tests/test_io/test_diann.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_io/test_diann.py b/tests/test_io/test_diann.py index 543db4c..2ce86ca 100644 --- a/tests/test_io/test_diann.py +++ b/tests/test_io/test_diann.py @@ -5,7 +5,7 @@ test_psm = PSM( peptidoform="AAAAEINVKDPKEDLETSVVDEGR/4", - spectrum_id="NA", + spectrum_id="116903", run="LFQ_Orbitrap_AIF_Yeast_03", collection=None, spectrum=None, From a9cac756b63087a1acf08682a09be2b664fcbe7c Mon Sep 17 00:00:00 2001 From: RalfG Date: Wed, 6 Nov 2024 21:41:01 +0100 Subject: [PATCH 11/13] Review changes - Add support for C-term modifications - Fixed to-dos - Formatting and structure - Add docs --- docs/source/api/psm_utils.io.rst | 24 ++++++ 
example_files/alphadia.precursors.tsv | 4 + example_files/fragpipe.psm.tsv | 6 ++ psm_utils/io/__init__.py | 30 ++++---- psm_utils/io/alphadia.py | 70 ++++++++--------- psm_utils/io/diann.py | 81 ++++++++++---------- psm_utils/io/fragpipe.py | 106 +++++++++++++------------- tests/test_io/test_alphadia.py | 82 +++++++++++++++++++- tests/test_io/test_diann.py | 19 ++++- tests/test_io/test_fragpipe.py | 34 +++++++-- 10 files changed, 298 insertions(+), 158 deletions(-) create mode 100644 example_files/alphadia.precursors.tsv create mode 100644 example_files/fragpipe.psm.tsv diff --git a/docs/source/api/psm_utils.io.rst b/docs/source/api/psm_utils.io.rst index d858997..1de803f 100644 --- a/docs/source/api/psm_utils.io.rst +++ b/docs/source/api/psm_utils.io.rst @@ -7,6 +7,30 @@ psm_utils.io +psm_utils.io.alphadia +##################### + +.. automodule:: psm_utils.io.alphadia + :members: + :inherited-members: + + +psm_utils.io.diann +################## + +.. automodule:: psm_utils.io.diann + :members: + :inherited-members: + + +psm_utils.io.fragpipe +##################### + +..
automodule:: psm_utils.io.fragpipe + :members: + :inherited-members: + + psm_utils.io.idxml ################## diff --git a/example_files/alphadia.precursors.tsv b/example_files/alphadia.precursors.tsv new file mode 100644 index 0000000..2954564 --- /dev/null +++ b/example_files/alphadia.precursors.tsv @@ -0,0 +1,4 @@ +base_width_mobility base_width_rt rt_observed mobility_observed mono_ms1_intensity top_ms1_intensity sum_ms1_intensity weighted_ms1_intensity weighted_mass_deviation weighted_mass_error mz_observed mono_ms1_height top_ms1_height sum_ms1_height weighted_ms1_height isotope_intensity_correlation isotope_height_correlation n_observations intensity_correlation height_correlation intensity_fraction height_fraction intensity_fraction_weighted height_fraction_weighted mean_observation_score sum_b_ion_intensity sum_y_ion_intensity diff_b_y_ion_intensity f_masked fragment_scan_correlation template_scan_correlation fragment_frame_correlation top3_frame_correlation template_frame_correlation top3_b_ion_correlation n_b_ions top3_y_ion_correlation n_y_ions cycle_fwhm mobility_fwhm delta_frame_peak top_3_ms2_mass_error mean_ms2_mass_error n_overlapping mean_overlapping_intensity mean_overlapping_mass_error precursor_idx rank frame_center scan_center score elution_group_idx frame_start scan_stop frame_stop scan_start proteins rt_calibrated flat_frag_start_idx charge mods decoy sequence mz_library channel genes i_0 flat_frag_stop_idx i_2 i_1 i_3 mobility_library rt_library mod_sites delta_rt n_K n_R n_P _decoy proba qval _candidate_idx valid candidate_idx run mod_seq_hash mod_seq_charge_hash pg_master pg pg_qval intensity +0.000000 40.673340 2800.518555 0.000001 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 894.337830 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.000000 0.968887 0.845673 1.000000 1.000000 1.000000 1.000000 0.000000 0.000000 15.266385 -15.266385 1.000000 0.000000 0.000000 0.929785 0.975279 0.000000 0.000000 0.000000 0.948546 
12.000000 14.244627 0.000000 -0.500000 0.132713 -0.218829 0.000000 0.000000 0.000000 10447876 0 72329 0 136.160126 5238821 71876 1 72933 0 P18899 2347.609131 59818105 3 0 SSYGSSSNDDSYGSSNNDDSYGSSNK 894.337830 0 DDR48_YEAST 0.273118 59818117 0.249391 0.348172 0.129319 0.948457 1399.216187 452.909424 1 0 0 0.000000 0.000000 0.000000 10447876 True 10447876 LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_01 8562405370847133435 8562405370847133438 P18899 P18899 0.000000 190103852.035206 +0.000000 40.745483 1647.208252 0.000001 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 986.440491 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.000000 0.991654 0.992141 1.000000 1.000000 1.000000 1.000000 0.000000 0.000000 14.408463 -14.408463 1.000000 0.000000 0.000000 0.738752 0.974915 0.000000 0.000000 0.000000 0.880488 12.000000 9.885651 0.000000 0.000000 -0.391579 -0.698411 0.000000 0.000000 0.000000 8793636 0 42431 0 122.278320 4411698 41978 1 43035 0 Q9ULU4 1670.462402 49907897 2 0 SSQGSSSSTQSAPSETASASK 986.440491 0 PKCB1_HUMAN 0.380560 49907909 0.190793 0.352861 0.075786 1.158085 387.834503 -23.254150 1 0 1 0.000000 0.000000 0.000000 8793636 True 8793636 LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_01 5824087303549386971 5824087303549386973 Q9ULU4 Q9ULU4 0.000000 195496849.073322 +0.000000 52.349121 2678.317139 0.000001 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 905.432312 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.000000 0.986449 0.931379 1.000000 1.000000 1.000000 1.000000 0.000000 0.000000 16.636572 -16.636572 1.000000 0.000000 0.000000 0.978579 0.996334 0.000000 0.000000 0.000000 0.988605 12.000000 13.867673 0.000000 0.000000 -0.432777 0.780247 0.000000 0.000000 0.000000 7132549 0 69158 0 152.012512 3581144 68554 1 69913 0 O60763 2646.791260 39980635 2 0 SSQTSGTNEQSSAIVSAR 905.432312 0 USO1_HUMAN 0.404900 39980647 0.177361 0.352328 0.065410 1.110423 1774.035034 31.525879 0 1 0 0.000000 0.000000 0.000000 7132549 True 7132549 
LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_01 14912031975374993231 14912031975374993233 O60763 O60763 0.000000 406414129.849395 diff --git a/example_files/fragpipe.psm.tsv b/example_files/fragpipe.psm.tsv new file mode 100644 index 0000000..4ffd36e --- /dev/null +++ b/example_files/fragpipe.psm.tsv @@ -0,0 +1,6 @@ +Spectrum Spectrum File Peptide Modified Peptide Extended Peptide Prev AA Next AA Peptide Length Charge Retention Observed Mass Calibrated Observed Mass Observed M/Z Calibrated Observed M/Z Calculated Peptide Mass Calculated M/Z Delta Mass Expectation Hyperscore Nextscore Probability Number of Enzymatic Termini Number of Missed Cleavages Protein Start Protein End Intensity Assigned Modifications Observed Modifications Purity Is Unique Protein Protein ID Entry Name Gene Protein Description Mapped Genes Mapped Proteins +LFQ_Orbitrap_AIF_Human_01.100000.100000.0 D:\test\interact-LFQ_Orbitrap_AIF_Human_01_rank1.pep.xml SEDCFILDHGK SEDCFILDHGK PFAQGAIK.SEDCFILDHGK.DGKIFVWK K D 11 3 3813.8638 1319.5804 1319.5807 440.8674 440.8675 1319.5815 440.8678 -0.0008 0.01264961000000 19.3701 15.5657 0.9968 2 0 328 338 0.0000 4C(57.0214) 0.00 false sp|GELS_HUMAN| GELS_HUMAN GSN sp|P06396|GELS_HUMAN +LFQ_Orbitrap_AIF_Human_01.100002.100002.0 D:\test\interact-LFQ_Orbitrap_AIF_Human_01_rank1.pep.xml FLLEAGADQEHK KGHIEMVR.FLLEAGADQEHK.TDEMHTAI R T 12 3 3813.9346 1356.6663 1356.6665 453.2294 453.2294 1356.6672 453.2297 -0.0007 0.01950739000000 18.9370 14.3831 0.9985 2 0 419 430 0.0000 0.00 false sp|O75179|ANR17_HUMAN O75179 ANR17_HUMAN ANKRD17 Ankyrin repeat domain-containing protein 17 ANKHD1 sp|Q8IWZ3|ANKH1_HUMAN +LFQ_Orbitrap_AIF_Human_01.100004.100004.0 D:\test\interact-LFQ_Orbitrap_AIF_Human_01_rank1.pep.xml REELSNVLAAMR REELSNVLAAM[147]R THIRAKRK.REELSNVLAAMR.KAAAKKD K K 12 3 3814.0050 1403.7197 1403.7198 468.9138 468.9139 1403.7190 468.9136 0.0008 0.00008879724000 24.3292 15.9192 0.9998 2 1 87 98 0.0000 11M(15.9949) 0.00 true sp|Q9Y3U8|RL36_HUMAN Q9Y3U8 RL36_HUMAN RPL36 
Large ribosomal subunit protein eL36 +LFQ_Orbitrap_AIF_Human_01.100040.100040.0 D:\test\interact-LFQ_Orbitrap_AIF_Human_01_rank1.pep.xml LHISPSNMTNQNTPEYMEK LHISPSNM[147]TNQNTPEYMEKc[17] EYFGPDFK.LHISPSNMTNQNTPEYMEK.IKQRIFEN K I 19 3 3815.4023 2248.0256 2248.0251 750.3491 750.3490 2248.0254 750.3491 -0.0002 0.00194418200000 21.2429 21.2429 0.7143 2 0 344 362 0.0000 8M(15.9949), C-term(-0.9840) 0.00 true sp|Q92769|HDAC2_HUMAN Q92769 HDAC2_HUMAN HDAC2 Histone deacetylase 2 +LFQ_Orbitrap_AIF_Human_01.101373.101373.0 D:\test\interact-LFQ_Orbitrap_AIF_Human_01_rank1.pep.xml ANIAVQR n[43]ANIAVQR .ANIAVQR.IKREFKEV M I 7 2 3866.1475 812.4501 812.4503 407.2323 407.2324 812.4505 407.2325 -0.0002 0.11090580000000 17.1991 14.1196 0.9898 2 0 2 8 0.0000 N-term(42.0106) 0.00 true sp|P61086|UBE2K_HUMAN P61086 UBE2K_HUMAN UBE2K Ubiquitin-conjugating enzyme E2 K diff --git a/psm_utils/io/__init__.py b/psm_utils/io/__init__.py index e345708..0a4307e 100644 --- a/psm_utils/io/__init__.py +++ b/psm_utils/io/__init__.py @@ -8,6 +8,9 @@ from rich.progress import track +import psm_utils.io.alphadia as alphadia +import psm_utils.io.diann as diann +import psm_utils.io.fragpipe as fragpipe import psm_utils.io.idxml as idxml import psm_utils.io.ionbot as ionbot import psm_utils.io.maxquant as maxquant @@ -22,9 +25,6 @@ import psm_utils.io.sage as sage import psm_utils.io.tsv as tsv import psm_utils.io.xtandem as xtandem -import psm_utils.io.diann as diann -import psm_utils.io.fragpipe as fragpipe -import psm_utils.io.alphadia as alphadia from psm_utils.io._base_classes import WriterBase from psm_utils.io.exceptions import PSMUtilsIOException from psm_utils.psm import PSM @@ -110,32 +110,30 @@ "filename_pattern": r"^.*(?:_|\.).sage.parquet$", }, "fragpipe": { - "reader": fragpipe.FragpipeReader, + "reader": fragpipe.FragPipeReader, "writer": None, "extension": ".tsv", - "filename_pattern": r"^.*psm\.tsv$", + "filename_pattern": r"^.*(?:_|\.)?psm\.tsv$", }, "alphadia": { - "reader": 
alphadia.AlphaDIAReader, - "writer": None, - "extension": ".tsv", - "filename_pattern": r"^.*precursor\.tsv$", + "reader": alphadia.AlphaDIAReader, + "writer": None, + "extension": ".tsv", + "filename_pattern": r"^.*(?:_|\.)?precursors\.tsv$", }, - "diann": { # List after fragpipe and alphadia to avoid extension matching conflicts #TODO: fix tsv conflict - "reader": diann.DIANNReader, + "diann": { + "reader": diann.DIANNTSVReader, "writer": None, "extension": ".tsv", - "filename_pattern": r"^.*\.tsv$", + "filename_pattern": r"^.*(?:_|\.)?diann\.tsv$", }, - - - "parquet": { # List after proteoscape and sage to avoid extension matching conflicts + "parquet": { # List after more specific Parquet patterns to avoid matching conflicts "reader": parquet.ParquetReader, "writer": parquet.ParquetWriter, "extension": ".parquet", "filename_pattern": r"^.*\.parquet$", }, - "tsv": { # List after sage to avoid extension matching conflicts + "tsv": { # List after more specific TSV patterns to avoid matching conflicts "reader": tsv.TSVReader, "writer": tsv.TSVWriter, "extension": ".tsv", diff --git a/psm_utils/io/alphadia.py b/psm_utils/io/alphadia.py index c2c3fff..8f6e1b8 100644 --- a/psm_utils/io/alphadia.py +++ b/psm_utils/io/alphadia.py @@ -1,10 +1,4 @@ -""" -Reader for PSM files from the AlphaDIA search engine. - -Reads the AlphaDIA ``precursor.tsv`` file as defined on the -`TODO: NOT YET A LINK`_. - -""" +"""Reader for PSM files from the AlphaDIA search engine.""" from __future__ import annotations @@ -19,9 +13,18 @@ set_csv_field_size_limit() +# TODO: check +RESCORING_FEATURES = [ + "rt_observed", + "mobility_observed", + "mz_observed", + "charge", + "delta_rt", +] + class AlphaDIAReader(ReaderBase, ABC): - def __init__(self, filename, score_column: str = "score", *args, **kwargs): + def __init__(self, filename, *args, **kwargs): """ Reader for AlphaDIA ``precursor.tsv`` file. 
@@ -29,14 +32,10 @@ def __init__(self, filename, score_column: str = "score", *args, **kwargs): ---------- filename : str or Path Path to PSM file. - score_column: str, optional - Name of the column that holds the primary PSM score. Default is - ``score``. """ super().__init__(filename, *args, **kwargs) self.filename = filename - self.score_column = score_column def __iter__(self) -> Iterable[PSM]: """Iterate over file and return PSMs one-by-one.""" @@ -62,17 +61,15 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM: run=psm_dict["run"], spectrum=psm_dict["frame_start"], # TODO: needs to be checked is_decoy=bool(int(psm_dict["decoy"])), - score=psm_dict[self.score_column], + score=psm_dict["score"], qvalue=psm_dict["qval"], - pep=psm_dict[ - "proba" - ], # TODO: needs to be checked, assumption because if it is 1-proba than it's really bad + pep=psm_dict["proba"], precursor_mz=psm_dict["mz_observed"], retention_time=psm_dict["rt_observed"], ion_mobility=psm_dict["mobility_observed"], protein_list=psm_dict["proteins"].split(";"), rank=int(psm_dict["rank"]) + 1, # AlphaDIA ranks are 0-based - source="alphadia", + source="AlphaDIA", provenance_data=({"alphadia_filename": str(self.filename)}), metadata={}, rescoring_features=rescoring_features, @@ -80,20 +77,28 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM: @staticmethod def _parse_peptidoform(sequence: str, mods: str, mod_sites, charge: Optional[str]) -> str: + """Parse a peptidoform from a AlphaDIA PSM file.""" + # Parse modifications if mods: - mods = mods.split(";") - mod_sites = mod_sites.split(";") - for mod, site in reversed(sorted(zip(mods, mod_sites), key=lambda x: int(x[1]))): - if int(site) == 0: - sequence = ( - sequence[: int(site)] + f"[{mod.split('@')[0]}]-" + sequence[int(site) :] - ) + sequence_list = [""] + list(sequence) + [""] # N-term, sequence, C-term + for mod, site in zip(mods.split(";"), mod_sites.split(";")): + site = int(site) + name = mod.split("@")[0] + # N-terminal 
modification + if site == 0: + sequence_list[0] = f"[{name}]-" + # C-terminal modification + elif site == -1: + sequence_list[-1] = f"-[{name}]" + # Sequence modification else: - sequence = ( - sequence[: int(site)] + f"[{mod.split('@')[0]}]" + sequence[int(site) :] - ) + sequence_list[site] = f"{sequence_list[site]}[{name}]" + sequence = "".join(sequence_list) + + # Add charge if charge: sequence += f"/{int(float(charge))}" + return sequence @classmethod @@ -105,14 +110,3 @@ def from_dataframe(cls, dataframe) -> PSMList: for entry in dataframe.to_dict(orient="records") ] ) - - -# TODO: check -RESCORING_FEATURES = [ - "rt_observed", - "mobility_observed", - "mz_observed", - "score", - "charge", - "delta_rt", -] diff --git a/psm_utils/io/diann.py b/psm_utils/io/diann.py index 60da432..1d93183 100644 --- a/psm_utils/io/diann.py +++ b/psm_utils/io/diann.py @@ -1,15 +1,24 @@ """ Reader for PSM files from DIA-NN -Reads the '.tsv' file as defined on the `DIA-NN documentation page `_. +Reads the '.tsv' file as defined on the +`DIA-NN documentation page `_. + +Notes +----- + +- DIA-NN calculates q-values at both the run and library level. The run-level q-value is used as + the PSM q-value. +- DIA-NN currently does not return precursor m/z values. +- DIA-NN currently does not support C-terminal modifications in its searches. 
+ """ from __future__ import annotations import csv -from abc import ABC -from typing import Iterable, Optional import re +from typing import Iterable, Optional from psm_utils.io._base_classes import ReaderBase from psm_utils.io._utils import set_csv_field_size_limit @@ -18,11 +27,22 @@ set_csv_field_size_limit() +RESCORING_FEATURES = [ + "RT", + "Predicted.RT", + "iRT", + "Predicted.iRT", + "Ms1.Profile.Corr", + "Ms1.Area", + "IM", + "iIM", + "Predicted.IM", + "Predicted.iIM", +] + -class DIANNReader(ReaderBase, ABC): - def __init__( - self, filename, score_column: str = "CScore", qval_column="Q.Value", *args, **kwargs - ) -> None: +class DIANNTSVReader(ReaderBase): + def __init__(self, filename, *args, **kwargs) -> None: """ Reader for DIA-NN '.tsv' file. @@ -30,15 +50,10 @@ def __init__( ---------- filename : str or Path Path to PSM file. - score_column: str, optional - Name of the column that holds the primary PSM score. Default is - ``CScore``. """ super().__init__(filename, *args, **kwargs) self.filename = filename - self.score_column = score_column - self.qval_column = qval_column def __iter__(self) -> Iterable[PSM]: """Iterate over file and return PSMs one-by-one.""" @@ -63,16 +78,15 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM: spectrum_id=psm_dict["MS2.Scan"], run=psm_dict["Run"], is_decoy=False, - qvalue=psm_dict[ - self.qval_column - ], # DIA-NN puts out q-value on both run and library level + qvalue=psm_dict["Q.Value"], pep=float(psm_dict["PEP"]), - score=float(psm_dict[self.score_column]), + score=float(psm_dict["CScore"]), + precursor_mz=None, # Not returned by DIA-NN :( retention_time=float(psm_dict["RT"]), ion_mobility=float(psm_dict["IM"]), protein_list=psm_dict["Protein.Ids"].split(";"), source="diann", - rank=1, # Leave out? 
+ rank=None, provenance_data=({"diann_filename": str(self.filename)}), rescoring_features=rescoring_features, metadata={}, @@ -80,22 +94,25 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM: @staticmethod def _parse_peptidoform(peptide: str, charge: Optional[str]) -> str: + # Add charge if charge: peptide += f"/{int(float(charge))}" + + # Replace parentheses with square brackets and capitalize UniMod prefix pattern = r"\(UniMod:(\d+)\)" replacement = r"[UNIMOD:\1]" peptide = re.sub(pattern, replacement, peptide) - # If [UNIMOD:n] occurs before the first amino acid, a hyphen is added before the first amino acid + + # Add hyphen for N-terminal modifications + # If [UNIMOD:n] occurs before the first amino acid, a hyphen is added before the first + # amino acid if peptide[0] == "[": # Hyphen after the closing bracket peptide = peptide.replace("]", "]-", 1) - return peptide - @staticmethod - def _parse_precursor_mz(): - return NotImplementedError( - "Method not implemented yet. DIA-NN does not yet output precursor m/z, but might in the future." - ) + # C-terminal modifications are currently not supported in DIA-NN + + return peptide @classmethod def from_dataframe(cls, dataframe) -> PSMList: @@ -106,19 +123,3 @@ def from_dataframe(cls, dataframe) -> PSMList: for entry in dataframe.to_dict(orient="records") ] ) - - -# TODO: Check -RESCORING_FEATURES = [ - "CScore", - "RT", - "Predicted.RT", - "iRT", - "Predicted.iRT", - "Ms1.Profile.Corr", - "Ms1.Area", - "IM", - "iIM", - "Predicted.IM", - "Predicted.iIM", -] diff --git a/psm_utils/io/fragpipe.py b/psm_utils/io/fragpipe.py index 50d6bd4..fc07395 100644 --- a/psm_utils/io/fragpipe.py +++ b/psm_utils/io/fragpipe.py @@ -4,6 +4,11 @@ Reads the Philosopher ``psm.tsv`` file as defined on the `Fragpipe documentation page `_. +Notes +----- + +- Decoy PSMs and q-values are not returned by FragPipe. 
+ """ from __future__ import annotations @@ -20,13 +25,23 @@ set_csv_field_size_limit() +RESCORING_FEATURES = [ + "Peptide Length", + "Retention", + "Observed Mass", + "Observed M/Z", + "Calculated Peptide Mass", + "Calculated M/Z", + "Delta Mass", + "Number of Missed Cleavages", +] + -class FragpipeReader(ReaderBase, ABC): +class FragPipeReader(ReaderBase, ABC): def __init__( self, filename, - score_column: str = "Hyperscore", - mz_column: str = "Observed M/Z", + use_calibrated_mz: bool = True, *args, **kwargs, ) -> None: @@ -35,20 +50,18 @@ def __init__( Parameters ---------- - filename : str or Path + filename Path to PSM file. - score_column: str, optional - Name of the column that holds the primary PSM score. Default is - ``Hyperscore``. - mz_column: str, optional - Name of the column that holds the precursor m/z. Default is - ``Observed M/Z``. + use_calibrated_mz + Whether to use ``Calibrated Observed M/Z`` (true) or non-calibrated ``Observed m/z`` + (false), by default True. """ super().__init__(filename, *args, **kwargs) self.filename = filename - self.score_column = score_column - self.mz_column = mz_column + self.use_calibrated_mz = use_calibrated_mz + + self._mz_key = "Calibrated Observed M/Z" if use_calibrated_mz else "Observed M/Z" def __iter__(self) -> Iterable[PSM]: """Iterate over file and return PSMs one-by-one.""" @@ -58,38 +71,27 @@ def __iter__(self) -> Iterable[PSM]: yield self._get_peptide_spectrum_match(row) def _get_peptide_spectrum_match(self, psm_dict) -> PSM: - """Parse a single PSM from a MSFragger PSM file.""" - rescoring_features = {} - for ft in RESCORING_FEATURES: - try: - rescoring_features[ft] = psm_dict[ft] - except KeyError: - continue + """Parse a single PSM from a FragPipe PSM file.""" + rescoring_features = {ft: psm_dict[ft] for ft in RESCORING_FEATURES if ft in psm_dict} return PSM( peptidoform=self._parse_peptidoform( psm_dict["Modified Peptide"], psm_dict["Peptide"], psm_dict["Charge"] ), - 
spectrum_id=self._parse_spectrum_id(psm_dict["Spectrum"]), # TODO: needs to be checked + spectrum_id=self._parse_spectrum_id(psm_dict["Spectrum"]), run=self._parse_run(psm_dict["Spectrum File"]), is_decoy=False, - qvalue=None, # Q-value is not outputted by Philosopher - pep=1 - - float( - psm_dict["Probability"] - ), # PeptideProphet Probability, not explicitely stated if this is the inverse of PEP - # But I'm assuming it is - score=psm_dict[self.score_column], - precursor_mz=psm_dict[ - self.mz_column - ], # Allows use of both calibrated and uncalibrated Observed M/Z? + # Assuming this is 1 - PEP, as described in the PeptideProphet paper + # (https://doi.org/10.1186/1471-2105-13-S16-S1) + pep=1 - float(psm_dict["Probability"]), + score=psm_dict["Hyperscore"], + precursor_mz=psm_dict[self._mz_key], retention_time=float(psm_dict["Retention"]), ion_mobility=float(psm_dict["Ion Mobility"]) if "Ion Mobility" in psm_dict else None, protein_list=self._parse_protein_list( psm_dict["Protein"], psm_dict["Mapped Proteins"] ), - source="fragpipe", - rank=1, + source="FragPipe", provenance_data=({"fragpipe_filename": str(self.filename)}), rescoring_features=rescoring_features, metadata={}, @@ -97,32 +99,44 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM: @staticmethod def _parse_peptidoform(mod_peptide: str, peptide: str, charge: Optional[str]) -> str: + """Parse the peptidoform from the modified peptide, peptide, and charge columns.""" if mod_peptide: peptide = mod_peptide + # N-terminal modification + if peptide.startswith("n"): + peptide = peptide[1:] + # A hyphen needs to be added after the N-terminal modification, thus after the ] + peptide = peptide.replace("]", "]-", 1) + # C-terminal modification + if peptide.endswith("]"): + if "c[" in peptide: + peptide = peptide.replace("c[", "-[", 1) if charge: peptide += f"/{int(float(charge))}" - if peptide.startswith("n"): - peptide = peptide[1:] - # A hyphen needs to be added after the N-terminal modification, thus 
after the ] - peptide = peptide.replace("]", "]-", 1) return peptide @staticmethod def _parse_spectrum_id(spectrum: str) -> str: - return spectrum.split(".")[1] + """Extract scan number from spectrum ID: ``(file name).(scan #).(scan #).(charge).``""" + try: + return spectrum.split(".")[-2] + except IndexError: + return spectrum @staticmethod def _parse_protein_list(razor_protein: str, mapped_proteins) -> list[str]: + """Combine razor protein and mapped proteins into a single list.""" if mapped_proteins: mapped_proteins_list = mapped_proteins.split(", ") return [razor_protein] + mapped_proteins_list else: return [razor_protein] - # Dependent on the fragpipe workflow used the run name can be different, but in most cases - # something like 'interact-.pep.xml' is used @staticmethod def _parse_run(spectrum_file: str) -> str: + """Extract run name from spectrum file.""" + # Depending on the FragPipe workflow used, the run name can be different. In most cases + # something like 'interact-.pep.xml' is used if (spectrum_file.endswith(".pep.xml")) and (spectrum_file.startswith("interact-")): spectrum_file = spectrum_file.replace("interact-", "") return Path(Path(spectrum_file).stem).stem @@ -138,17 +152,3 @@ def from_dataframe(cls, dataframe) -> PSMList: for entry in dataframe.to_dict(orient="records") ] ) - - -# TODO: check -RESCORING_FEATURES = [ - "Peptide Length", - "Retention", - "Observed Mass", - "Observed M/Z", - "Calculated Peptide Mass", - "Calculated M/Z", - "Delta Mass", - "Hyperscore", - "Number of Missed Cleavages", -] diff --git a/tests/test_io/test_alphadia.py b/tests/test_io/test_alphadia.py index 64e38ae..cb3e77a 100644 --- a/tests/test_io/test_alphadia.py +++ b/tests/test_io/test_alphadia.py @@ -18,13 +18,12 @@ ion_mobility=0.000001, protein_list=["P06733"], rank=1, - source="alphadia", + source="AlphaDIA", metadata={}, rescoring_features={ "rt_observed": 3111.141602, "mobility_observed": 0.000001, "mz_observed": 648.794128, - "score": 170.287918, 
"charge": 2, "delta_rt": -39.528809, }, @@ -37,3 +36,82 @@ def test_iter(self): for psm in reader: psm.provenance_data = {} assert psm == test_psm + + def test__parse_peptidoform(self): + test_cases = [ + { + "sequence": "DNTTSGCGSDLQSATGTAR", + "mods": "Carbamidomethyl@C", + "mod_sites": "7", + "charge": 2, + "expected": "DNTTSGC[Carbamidomethyl]GSDLQSATGTAR/2", + }, + { + "sequence": "STCTEGEIACSADGK", + "mods": "Carbamidomethyl@C;Carbamidomethyl@C", + "mod_sites": "3;10", + "charge": 2, + "expected": "STC[Carbamidomethyl]TEGEIAC[Carbamidomethyl]SADGK/2", + }, + { + "sequence": "MLGETCADCGTILLQDK", + "mods": "Oxidation@M;Carbamidomethyl@C;Carbamidomethyl@C", + "mod_sites": "1;6;9", + "charge": 2, + "expected": "M[Oxidation]LGETC[Carbamidomethyl]ADC[Carbamidomethyl]GTILLQDK/2", + }, + { + "sequence": "VGLIGSCTNSSYEDMSR", + "mods": "Oxidation@M;Carbamidomethyl@C", + "mod_sites": "15;7", + "charge": 2, + "expected": "VGLIGSC[Carbamidomethyl]TNSSYEDM[Oxidation]SR/2", + }, + { + "sequence": "STATTTVTTSDQASHPTK", + "mods": "Acetyl@Protein_N-term", + "mod_sites": "0", + "charge": 2, + "expected": "[Acetyl]-STATTTVTTSDQASHPTK/2", + }, + { + "sequence": "MEPGPDGPAASGPAAIR", + "mods": "Acetyl@Protein_N-term;Oxidation@M", + "mod_sites": "0;1", + "charge": 2, + "expected": "[Acetyl]-M[Oxidation]EPGPDGPAASGPAAIR/2", + }, + { + "sequence": "AEPQPPSGGLTDEAALSCCSDADPSTK", + "mods": "Acetyl@Protein_N-term;Carbamidomethyl@C;Carbamidomethyl@C", + "mod_sites": "0;18;19", + "charge": 3, + "expected": "[Acetyl]-AEPQPPSGGLTDEAALSC[Carbamidomethyl]C[Carbamidomethyl]SDADPSTK/3", + }, + { + "sequence": "EPLISAPYLTTTKMSAPATLDAACIFCK", + "mods": "Acetyl@Protein_N-term;Oxidation@M;Carbamidomethyl@C;Carbamidomethyl@C", + "mod_sites": "0;14;24;27", + "charge": 4, + "expected": "[Acetyl]-EPLISAPYLTTTKM[Oxidation]SAPATLDAAC[Carbamidomethyl]IFC[Carbamidomethyl]K/4", + }, + { + "sequence": "GDIDANAFQHK", + "mods": "Amidated@Any_C-term", + "mod_sites": "-1", + "charge": 2, + "expected": 
"GDIDANAFQHK-[Amidated]/2", + }, + { + "sequence": "MNNPAMTIKGEQAK", + "mods": "Acetyl@Protein_N-term;Oxidation@M;Oxidation@M;Amidated@Any_C-term", + "mod_sites": "0;1;6;-1", + "charge": 4, + "expected": "[Acetyl]-M[Oxidation]NNPAM[Oxidation]TIKGEQAK-[Amidated]/4", + }, + ] + + for test_case in test_cases: + assert AlphaDIAReader._parse_peptidoform( + test_case["sequence"], test_case["mods"], test_case["mod_sites"], test_case["charge"] + ) == test_case["expected"] diff --git a/tests/test_io/test_diann.py b/tests/test_io/test_diann.py index 2ce86ca..12d6036 100644 --- a/tests/test_io/test_diann.py +++ b/tests/test_io/test_diann.py @@ -1,6 +1,6 @@ """Tests for psm_utils.io.diann.""" -from psm_utils.io.diann import DIANNReader +from psm_utils.io.diann import DIANNTSVReader from psm_utils.psm import PSM test_psm = PSM( @@ -17,7 +17,7 @@ retention_time=75.2574, ion_mobility=0, protein_list=["P38156"], - rank=1, + rank=None, source="diann", metadata={}, rescoring_features={ @@ -36,9 +36,20 @@ ) -class TestDIANNReader: +class TestDIANNTSVReader: def test_iter(self): - with DIANNReader("./tests/test_data/test_diann.tsv") as reader: + with DIANNTSVReader("./tests/test_data/test_diann.tsv") as reader: for psm in reader: psm.provenance_data = {} assert psm == test_psm + + def test__parse_peptidoform(self): + test_cases = [ + (("ACDE", "4"), "ACDE/4"), + (("AC(UniMod:1)DE", "4"), "AC[UNIMOD:1]DE/4"), + (("(UniMod:4)ACDE", "4"), "[UNIMOD:4]-ACDE/4"), + ] + + reader = DIANNTSVReader("./tests/test_data/test_diann.tsv") + for (peptide, charge), expected in test_cases: + assert reader._parse_peptidoform(peptide, charge) == expected diff --git a/tests/test_io/test_fragpipe.py b/tests/test_io/test_fragpipe.py index 4c1b067..e92f45e 100644 --- a/tests/test_io/test_fragpipe.py +++ b/tests/test_io/test_fragpipe.py @@ -1,6 +1,6 @@ """Tests for psm_utils.io.fragpipe.""" -from psm_utils.io.fragpipe import FragpipeReader +from psm_utils.io.fragpipe import FragPipeReader from psm_utils.psm 
import PSM test_psm = PSM( @@ -13,12 +13,12 @@ score=57.2940, qvalue=None, pep=1 - 1.0000, - precursor_mz=1001.9342, + precursor_mz=1001.9336, retention_time=2432.1640, ion_mobility=None, protein_list=["sp|P40159|YNU8_YEAST"], - rank=1, - source="fragpipe", + rank=None, + source="FragPipe", metadata={}, rescoring_features={ "Peptide Length": 20, @@ -36,7 +36,31 @@ class TestFragpipeReader: def test_iter(self): - with FragpipeReader("./tests/test_data/test_fragpipe.tsv") as reader: + with FragPipeReader("./tests/test_data/test_fragpipe.tsv") as reader: for psm in reader: psm.provenance_data = {} assert psm == test_psm + + def test__parse_peptidoform(self): + test_cases = [ + (("LHM[147]TNQNMEKc[17]", "LHMTNQNMEK", "3"), "LHM[147]TNQNMEK-[17]/3"), + (("n[43]ANIAVQR", "ANIAVQR", "2"), "[43]-ANIAVQR/2"), + ((None, "IPAVTYPK", "2"), "IPAVTYPK/2"), + (("", "IPAVTYPK", "2"), "IPAVTYPK/2"), + (("", "IPAVTYPK", 2), "IPAVTYPK/2"), + ] + + reader = FragPipeReader("./tests/test_data/test_fragpipe.tsv") + for (peptide, modified_peptide, charge), expected in test_cases: + assert reader._parse_peptidoform(peptide, modified_peptide, charge) == expected + + def test__parse_spectrum_id(self): + test_cases = [ + ("LFQ_Orbitrap_AIF_Human_01.101124.101124.0", "101124"), + ("LFQ.Orbitrap.AIF.Human.01.101124.101124.0", "101124"), + ("101124", "101124"), + ] + + reader = FragPipeReader("./tests/test_data/test_fragpipe.tsv") + for spectrum, expected in test_cases: + assert reader._parse_spectrum_id(spectrum) == expected From 00c2714122f9ddd8b86b98cf0bf1f3754a0c4a3b Mon Sep 17 00:00:00 2001 From: RalfG Date: Wed, 6 Nov 2024 21:50:17 +0100 Subject: [PATCH 12/13] Add new readers to readme --- README.rst | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 257ee33..b21e247 100644 --- a/README.rst +++ b/README.rst @@ -89,6 +89,9 @@ Supported file formats 
===================================================================================================================== ======================== =============== =============== File format psm_utils tag Read support Write support ===================================================================================================================== ======================== =============== =============== + `AlphaDIA precursors TSV `_ ``alphadia`` ✅ ❌ + `DIA-NN TSV `_ ``diann`` ✅ ❌ + `FragPipe PSM TSV `_ ``fragpipe`` ✅ ❌ `ionbot CSV `_ ``ionbot`` ✅ ❌ `OpenMS idXML `_ ``idxml`` ✅ ✅ `MaxQuant msms.txt `_ ``msms`` ✅ ❌ @@ -98,10 +101,10 @@ Supported file formats `Peptide Record `_ ``peprec`` ✅ ✅ `pepXML `_ ``pepxml`` ✅ ❌ `Percolator tab `_ ``percolator`` ✅ ✅ - Proteome Discoverer MSF ``proteome_discoverer`` ✅ ❌ + `Proteome Discoverer MSF <#>`_ ``proteome_discoverer`` ✅ ❌ `Sage Parquet `_ ``sage_parquet`` ✅ ❌ `Sage TSV `_ ``sage_tsv`` ✅ ❌ - ProteoScape Parquet ``proteoscape`` ✅ ❌ + `ProteoScape Parquet <#>`_ ``proteoscape`` ✅ ❌ `TSV `_ ``tsv`` ✅ ✅ `X!Tandem XML `_ ``xtandem`` ✅ ❌ ===================================================================================================================== ======================== =============== =============== From 029ba11db5fc180ed5e10679a0d6903fb2b5e1a1 Mon Sep 17 00:00:00 2001 From: RalfG Date: Wed, 6 Nov 2024 22:00:57 +0100 Subject: [PATCH 13/13] Fix tests --- tests/test_io/test_diann.py | 1 - tests/test_io/test_fragpipe.py | 1 - 2 files changed, 2 deletions(-) diff --git a/tests/test_io/test_diann.py b/tests/test_io/test_diann.py index 12d6036..29884b8 100644 --- a/tests/test_io/test_diann.py +++ b/tests/test_io/test_diann.py @@ -21,7 +21,6 @@ source="diann", metadata={}, rescoring_features={ - "CScore": 0.995107, "RT": 75.2574, "Predicted.RT": 75.2713, "iRT": 33.9222, diff --git a/tests/test_io/test_fragpipe.py b/tests/test_io/test_fragpipe.py index e92f45e..9020c41 100644 --- a/tests/test_io/test_fragpipe.py +++ 
b/tests/test_io/test_fragpipe.py @@ -28,7 +28,6 @@ "Calculated Peptide Mass": 2001.8524, "Calculated M/Z": 1001.9335, "Delta Mass": 0.0002, - "Hyperscore": 57.2940, "Number of Missed Cleavages": 0, }, )