From d352100ab63c1060b864e3c15b8f2d0aa0b2363e Mon Sep 17 00:00:00 2001
From: rodvrees <robbe.devreese@hotmail.com>
Date: Tue, 25 Jun 2024 14:24:52 +0200
Subject: [PATCH 01/13] DIANN io

---
 .gitignore                |   4 ++
 psm_utils/io/__init__.py  |   7 +++
 psm_utils/io/diann.py     | 118 ++++++++++++++++++++++++++++++++++++++
 psm_utils/io/msfragger.py |   0
 4 files changed, 129 insertions(+)
 create mode 100644 psm_utils/io/diann.py
 create mode 100644 psm_utils/io/msfragger.py

diff --git a/.gitignore b/.gitignore
index 0432116..91b6a52 100644
--- a/.gitignore
+++ b/.gitignore
@@ -132,3 +132,7 @@ dmypy.json
 # Pyre type checker
 .pyre/
 .vscode/settings.json
+
+# Specific to Branch
+example_files/DIANN_example.tsv
+test.ipynb
diff --git a/psm_utils/io/__init__.py b/psm_utils/io/__init__.py
index 79f09ac..ae0b321 100644
--- a/psm_utils/io/__init__.py
+++ b/psm_utils/io/__init__.py
@@ -22,6 +22,7 @@
 import psm_utils.io.sage as sage
 import psm_utils.io.tsv as tsv
 import psm_utils.io.xtandem as xtandem
+import psm_utils.io.diann as diann
 from psm_utils.io._base_classes import WriterBase
 from psm_utils.io.exceptions import PSMUtilsIOException
 from psm_utils.psm import PSM
@@ -106,6 +107,12 @@
         "extension": ".parquet",
         "filename_pattern": r"^.*(?:_|\.).sage.parquet$",
     },
+    "diann": {
+        "reader": diann.DIANNReader,
+        "writer": None,
+        "extension": ".tsv",
+        "filename_pattern": r"^.*\.tsv$",
+    },
     "parquet": {  # List after proteoscape and sage to avoid extension matching conflicts
         "reader": parquet.ParquetReader,
         "writer": parquet.ParquetWriter,
diff --git a/psm_utils/io/diann.py b/psm_utils/io/diann.py
new file mode 100644
index 0000000..b1b9c1d
--- /dev/null
+++ b/psm_utils/io/diann.py
@@ -0,0 +1,118 @@
+"""
+Reader for PSM files from DIA-NN
+
+Reads the '.tsv' file as defined on the `DIA-NN documentation page <https://github.com/vdemichev/DiaNN>`_.
+"""
+
+from __future__ import annotations
+
+import csv
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Iterable, Optional
+import re
+
+import pyarrow.parquet as pq
+from pyteomics import mass
+
+from psm_utils.io._base_classes import ReaderBase
+from psm_utils.io._utils import set_csv_field_size_limit
+from psm_utils.psm import PSM
+from psm_utils.psm_list import PSMList
+
+set_csv_field_size_limit()
+
+class DIANNReader(ReaderBase, ABC):
+    def __init__(
+    self, filename, score_column: str = "CScore", *args, **kwargs
+    ) -> None:
+        """
+        Reader for DIA-NN '.tsv' file.
+
+        Parameters
+        ----------
+        filename : str or Path
+            Path to PSM file.
+        score_column: str, optional
+            Name of the column that holds the primary PSM score. Default is
+            ``CScore``.
+
+        """
+        super().__init__(filename, *args, **kwargs)
+        self.filename = filename
+        self.score_column = score_column
+
+    def __iter__(self) -> Iterable[PSM]:
+        """Iterate over file and return PSMs one-by-one."""
+        with open(self.filename) as msms_in:
+            reader = csv.DictReader(msms_in, delimiter="\t")
+            for row in reader:
+                yield self._get_peptide_spectrum_match(row)
+
+    def _get_peptide_spectrum_match(self, psm_dict) -> PSM:
+        """Parse a single PSM from a DIA-NN PSM file."""
+        rescoring_features = {}
+        for ft in RESCORING_FEATURES:
+            try:
+                rescoring_features[ft] = psm_dict[ft]
+            except KeyError:
+                continue
+
+        return PSM(
+            peptidoform=self._parse_peptidoform(
+                psm_dict["Modified.Sequence"],
+                psm_dict["Precursor.Charge"]),
+            spectrum_id='NA', # DIA-NN does not output spectrum ID
+            run=psm_dict["Run"],
+            is_decoy=False,
+            qvalue=psm_dict["Q.Value"],
+            pep=float(psm_dict["PEP"]),
+            score=float(psm_dict[self.score_column]),
+            retention_time=float(psm_dict["RT"]),
+            ion_mobility=float(psm_dict["IM"]),
+            protein_list=psm_dict["Protein.Names"].split(";"),
+            source="diann",
+            rank=None, # Leave out?
+            provenance_data=({"diann_filename": str(self.filename)}),
+            rescoring_features=rescoring_features,
+            metadata={},
+        )
+
+    @staticmethod
+    def _parse_peptidoform(peptide: str, charge: Optional[str]) -> str:
+        if charge:
+            peptide += f"/{int(float(charge))}"
+        pattern = r"\(UniMod:(\d+)\)"
+        replacement = r"[UNIMOD:\1]"
+        peptide = re.sub(pattern, replacement, peptide)
+        # If [UNIMOD:n] occurs before the first amino acid, a hyphen is added before the first amino acid
+        if peptide[0] == "[":
+            # Hyphen after the closing bracket
+            peptide = peptide.replace("]", "]-", 1)
+        return peptide
+
+    def _parse_precursor_mz():
+        return NotImplementedError("Method not implemented yet. DIA-NN does not yet output precursor m/z.")
+
+    def from_dataframe(cls, dataframe) -> PSMList:
+        """Create a PSMList from a DIA-NN Pandas DataFrame."""
+        return PSMList(
+            psm_list=[
+                cls._get_peptide_spectrum_match(cls(""), entry)
+                for entry in dataframe.to_dict(orient="records")
+            ]
+        )
+
+
+# TODO: Check
+RESCORING_FEATURES = [
+    "CScore",
+    "RT",
+    "Predicted.RT",
+    "iRT",
+    "Predicted.iRT",
+    "Ms1.Profile.Corr",
+    "Ms1.Area",
+    "IM",
+    "iIM"
+]
diff --git a/psm_utils/io/msfragger.py b/psm_utils/io/msfragger.py
new file mode 100644
index 0000000..e69de29

From 4b96137c0d923d7ba74a54670602ffc03593c954 Mon Sep 17 00:00:00 2001
From: rodvrees <robbe.devreese@hotmail.com>
Date: Tue, 25 Jun 2024 16:51:08 +0200
Subject: [PATCH 02/13] fragpipe reader

---
 .gitignore                |   1 +
 psm_utils/io/__init__.py  |  10 ++-
 psm_utils/io/diann.py     |  14 ++--
 psm_utils/io/fragpipe.py  | 133 ++++++++++++++++++++++++++++++++++++++
 psm_utils/io/msfragger.py |   0
 5 files changed, 149 insertions(+), 9 deletions(-)
 create mode 100644 psm_utils/io/fragpipe.py
 delete mode 100644 psm_utils/io/msfragger.py

diff --git a/.gitignore b/.gitignore
index 91b6a52..e1307d8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -135,4 +135,5 @@ dmypy.json
 
 # Specific to Branch
 example_files/DIANN_example.tsv
+example_files/MSFragger_example_psm.tsv
 test.ipynb
diff --git a/psm_utils/io/__init__.py b/psm_utils/io/__init__.py
index ae0b321..60407c9 100644
--- a/psm_utils/io/__init__.py
+++ b/psm_utils/io/__init__.py
@@ -23,6 +23,7 @@
 import psm_utils.io.tsv as tsv
 import psm_utils.io.xtandem as xtandem
 import psm_utils.io.diann as diann
+import psm_utils.io.fragpipe as fragpipe
 from psm_utils.io._base_classes import WriterBase
 from psm_utils.io.exceptions import PSMUtilsIOException
 from psm_utils.psm import PSM
@@ -107,12 +108,19 @@
         "extension": ".parquet",
         "filename_pattern": r"^.*(?:_|\.).sage.parquet$",
     },
-    "diann": {
+    "fragpipe": {
+        "reader": fragpipe.FragpipeReader,
+        "writer": None,
+        "extension": ".tsv",
+        "filename_pattern": r"^.*psm\.tsv$",
+    },
+    "diann": { # List after fragpipe to avoid extension matching conflicts #TODO: fix tsv conflict
         "reader": diann.DIANNReader,
         "writer": None,
         "extension": ".tsv",
         "filename_pattern": r"^.*\.tsv$",
     },
+
     "parquet": {  # List after proteoscape and sage to avoid extension matching conflicts
         "reader": parquet.ParquetReader,
         "writer": parquet.ParquetWriter,
diff --git a/psm_utils/io/diann.py b/psm_utils/io/diann.py
index b1b9c1d..468a944 100644
--- a/psm_utils/io/diann.py
+++ b/psm_utils/io/diann.py
@@ -7,14 +7,10 @@
 from __future__ import annotations
 
 import csv
-from abc import ABC, abstractmethod
-from pathlib import Path
+from abc import ABC
 from typing import Iterable, Optional
 import re
 
-import pyarrow.parquet as pq
-from pyteomics import mass
-
 from psm_utils.io._base_classes import ReaderBase
 from psm_utils.io._utils import set_csv_field_size_limit
 from psm_utils.psm import PSM
@@ -24,7 +20,7 @@
 
 class DIANNReader(ReaderBase, ABC):
     def __init__(
-    self, filename, score_column: str = "CScore", *args, **kwargs
+        self, filename, score_column: str = "CScore", *args, **kwargs
     ) -> None:
         """
         Reader for DIA-NN '.tsv' file.
@@ -72,7 +68,7 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM:
             ion_mobility=float(psm_dict["IM"]),
             protein_list=psm_dict["Protein.Names"].split(";"),
             source="diann",
-            rank=None, # Leave out?
+            rank=1, # Leave out?
             provenance_data=({"diann_filename": str(self.filename)}),
             rescoring_features=rescoring_features,
             metadata={},
@@ -91,9 +87,11 @@ def _parse_peptidoform(peptide: str, charge: Optional[str]) -> str:
             peptide = peptide.replace("]", "]-", 1)
         return peptide
 
+    @staticmethod
     def _parse_precursor_mz():
-        return NotImplementedError("Method not implemented yet. DIA-NN does not yet output precursor m/z.")
+        return NotImplementedError("Method not implemented yet. DIA-NN does not yet output precursor m/z, but might in the future.")
 
+    @staticmethod
     def from_dataframe(cls, dataframe) -> PSMList:
         """Create a PSMList from a DIA-NN Pandas DataFrame."""
         return PSMList(
diff --git a/psm_utils/io/fragpipe.py b/psm_utils/io/fragpipe.py
new file mode 100644
index 0000000..cb5fcf6
--- /dev/null
+++ b/psm_utils/io/fragpipe.py
@@ -0,0 +1,133 @@
+"""
+Reader for PSM files from the Fragpipe platform.
+
+Reads the Philosopher ``psm.tsv`` file as defined on the
+`Fragpipe documentation page <https://fragpipe.nesvilab.org/docs/tutorial_fragpipe_outputs.html>`_.
+
+"""
+
+from __future__ import annotations
+
+import csv
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Iterable, Optional
+
+import pyarrow.parquet as pq
+from pyteomics import mass
+
+from psm_utils.io._base_classes import ReaderBase
+from psm_utils.io._utils import set_csv_field_size_limit
+from psm_utils.psm import PSM
+from psm_utils.psm_list import PSMList
+
+set_csv_field_size_limit()
+
+class FragpipeReader(ReaderBase, ABC):
+    def __init__(
+        self, filename, score_column: str = "Hyperscore", mz_column: str = "Observed M/Z", *args, **kwargs
+    ) -> None:
+        """
+        Reader for MSFragger ``psm.tsv`` file.
+
+        Parameters
+        ----------
+        filename : str or Path
+            Path to PSM file.
+        score_column: str, optional
+            Name of the column that holds the primary PSM score. Default is
+            ``Hyperscore``.
+
+        """
+        super().__init__(filename, *args, **kwargs)
+        self.filename = filename
+        self.score_column = score_column
+        self.mz_column = mz_column
+
+    def __iter__(self) -> Iterable[PSM]:
+        """Iterate over file and return PSMs one-by-one."""
+        with open(self.filename) as msms_in:
+            reader = csv.DictReader(msms_in, delimiter="\t")
+            for row in reader:
+                yield self._get_peptide_spectrum_match(row)
+
+    def _get_peptide_spectrum_match(self, psm_dict) -> PSM:
+        """Parse a single PSM from a MSFragger PSM file."""
+        rescoring_features = {}
+        for ft in RESCORING_FEATURES:
+            try:
+                rescoring_features[ft] = psm_dict[ft]
+            except KeyError:
+                continue
+
+        return PSM(
+            peptidoform=self._parse_peptidoform(
+                psm_dict["Modified Peptide"],
+                psm_dict['Peptide'],
+                psm_dict["Charge"]),
+            spectrum_id=self._parse_spectrum_id(psm_dict['Spectrum']),
+            run=Path(psm_dict["Spectrum File"]).stem,
+            is_decoy=False,
+            qvalue=None, # Q-value is not outputted by Philosopher
+            pep= 1 - float(psm_dict["Probability"]), # PeptideProphet Probability, not explicitely stated if this is the inverse of PEP
+            score=psm_dict[self.score_column],
+            precursor_mz=psm_dict[self.mz_column], # Allows use of both calibrated and uncalibrated Observed M/Z?+
+            retention_time=float(psm_dict["Retention"]),
+            ion_mobility=float(psm_dict["Ion Mobility"]) if "Ion Mobility" in psm_dict else None,
+            protein_list=self._parse_protein_list(psm_dict["Protein"],
+                                                  psm_dict["Mapped Proteins"]),
+            source="fragpipe",
+            rank=1,
+            rescoring_features=rescoring_features,
+            metadata={}
+        )
+
+    @staticmethod
+    def _parse_peptidoform(mod_peptide: str, peptide: str, charge: Optional[str]) -> str:
+        if mod_peptide:
+            peptide = mod_peptide
+        if charge:
+            peptide += f"/{int(float(charge))}"
+        if peptide.startswith('n'):
+            peptide = peptide[1:]
+            # A hyphen needs to be added after the N-terminal modification, thus after the ]
+            peptide = peptide.replace(']', ']-', 1)
+        return peptide
+
+    @staticmethod
+    def _parse_spectrum_id(spectrum: str) -> str:
+        return spectrum.split(".")[1]
+
+    @staticmethod
+    def _parse_protein_list(razor_protein: str, mapped_proteins) -> list[str]:
+        if mapped_proteins:
+            mapped_proteins_list = mapped_proteins.split(", ")
+            return [razor_protein] + mapped_proteins_list
+        else:
+            return [razor_protein]
+
+    @staticmethod
+    def from_dataframe(cls, dataframe) -> PSMList:
+        """Create a PSMList from a pandas DataFrame."""
+        return PSMList(
+            psm_list=[
+                cls._get_peptide_spectrum_match(cls(""), entry)
+                for entry in dataframe.to_dict(orient="records")
+            ]
+        )
+
+
+# TODO: check
+RESCORING_FEATURES = [
+    "Peptide Length",
+    "Retention",
+    "Observed Mass",
+    "Observed M/Z",
+    "Calculated Peptide Mass",
+    "Calculated M/Z",
+    "Delta Mass",
+    "Hyperscore",
+    "Number of Missed Cleavages",
+    "Intensity"
+]
+
diff --git a/psm_utils/io/msfragger.py b/psm_utils/io/msfragger.py
deleted file mode 100644
index e69de29..0000000

From 6bd341182328bc7d9188b6dd143d6cde933f789a Mon Sep 17 00:00:00 2001
From: rodvrees <robbe.devreese@hotmail.com>
Date: Wed, 26 Jun 2024 10:34:28 +0200
Subject: [PATCH 03/13] alphadia reader

---
 .gitignore               |   1 +
 psm_utils/io/__init__.py |  10 +++-
 psm_utils/io/alphadia.py | 118 +++++++++++++++++++++++++++++++++++++++
 psm_utils/io/diann.py    |   2 +-
 psm_utils/io/fragpipe.py |  10 ++--
 5 files changed, 133 insertions(+), 8 deletions(-)
 create mode 100644 psm_utils/io/alphadia.py

diff --git a/.gitignore b/.gitignore
index e1307d8..8669c02 100644
--- a/.gitignore
+++ b/.gitignore
@@ -136,4 +136,5 @@ dmypy.json
 # Specific to Branch
 example_files/DIANN_example.tsv
 example_files/MSFragger_example_psm.tsv
+example_files/AlphaDIA_example.tsv
 test.ipynb
diff --git a/psm_utils/io/__init__.py b/psm_utils/io/__init__.py
index 60407c9..e345708 100644
--- a/psm_utils/io/__init__.py
+++ b/psm_utils/io/__init__.py
@@ -24,6 +24,7 @@
 import psm_utils.io.xtandem as xtandem
 import psm_utils.io.diann as diann
 import psm_utils.io.fragpipe as fragpipe
+import psm_utils.io.alphadia as alphadia
 from psm_utils.io._base_classes import WriterBase
 from psm_utils.io.exceptions import PSMUtilsIOException
 from psm_utils.psm import PSM
@@ -114,13 +115,20 @@
         "extension": ".tsv",
         "filename_pattern": r"^.*psm\.tsv$",
     },
-    "diann": { # List after fragpipe to avoid extension matching conflicts #TODO: fix tsv conflict
+    "alphadia": {
+    "reader": alphadia.AlphaDIAReader,
+    "writer": None,
+    "extension": ".tsv",
+    "filename_pattern": r"^.*precursor\.tsv$",
+    },
+    "diann": { # List after fragpipe and alphadia to avoid extension matching conflicts #TODO: fix tsv conflict
         "reader": diann.DIANNReader,
         "writer": None,
         "extension": ".tsv",
         "filename_pattern": r"^.*\.tsv$",
     },
 
+
     "parquet": {  # List after proteoscape and sage to avoid extension matching conflicts
         "reader": parquet.ParquetReader,
         "writer": parquet.ParquetWriter,
diff --git a/psm_utils/io/alphadia.py b/psm_utils/io/alphadia.py
new file mode 100644
index 0000000..88af545
--- /dev/null
+++ b/psm_utils/io/alphadia.py
@@ -0,0 +1,118 @@
+"""
+Reader for PSM files from the AlphaDIA search engine.
+
+Reads the AlphaDIA ``precursor.tsv`` file as defined on the
+`TODO: NOT YET A LINK`_.
+
+"""
+
+from __future__ import annotations
+
+import csv
+from abc import ABC
+from typing import Iterable, Optional
+
+from psm_utils.io._base_classes import ReaderBase
+from psm_utils.io._utils import set_csv_field_size_limit
+from psm_utils.psm import PSM
+from psm_utils.psm_list import PSMList
+
+set_csv_field_size_limit()
+
+
+class AlphaDIAReader(ReaderBase, ABC):
+    def __init__(self, filename, score_column: str = "score", *args, **kwargs):
+        """
+        Reader for AlphaDIA ``precursor.tsv`` file.
+
+        Parameters
+        ----------
+        filename : str or Path
+            Path to PSM file.
+        score_column: str, optional
+            Name of the column that holds the primary PSM score. Default is
+            ``score``.
+
+        """
+        super().__init__(filename, *args, **kwargs)
+        self.filename = filename
+        self.score_column = score_column
+
+    def __iter__(self) -> Iterable[PSM]:
+        """Iterate over file and return PSMs one-by-one."""
+        with open(self.filename) as msms_in:
+            reader = csv.DictReader(msms_in, delimiter="\t")
+            for row in reader:
+                yield self._get_peptide_spectrum_match(row)
+
+    def _get_peptide_spectrum_match(self, psm_dict) -> PSM:
+        """Parse a single PSM from a AlphaDIA PSM file."""
+        rescoring_features = {}
+        for ft in RESCORING_FEATURES:
+            try:
+                rescoring_features[ft] = psm_dict[ft]
+            except KeyError:
+                continue
+
+        return PSM(
+            peptidoform=self._parse_peptidoform(
+                psm_dict["sequence"], psm_dict["mods"], psm_dict["mod_sites"], psm_dict["charge"]
+            ),
+            spectrum_id=psm_dict["frame_start"],  # TODO: needs to be checked
+            run=psm_dict["run"],
+            spectrum=psm_dict["frame_start"],  # TODO: needs to be checked
+            is_decoy=bool(int(psm_dict["decoy"])),
+            score=psm_dict[self.score_column],
+            qvalue=psm_dict["qval"],
+            pep=psm_dict[
+                "proba"
+            ],  # TODO: needs to be checked, assumption because if it is 1-proba than it's really bad
+            precursor_mz=psm_dict["mz_observed"],
+            retention_time=psm_dict["rt_observed"],
+            ion_mobility=psm_dict["mobility_observed"],
+            protein_list=psm_dict["proteins"].split(";"),
+            rank=psm_dict["rank"],
+            source="alphadia",
+            provenance_data=({"alphadia_filename": str(self.filename)}),
+            metadata={},
+            rescoring_features=rescoring_features,
+        )
+
+    @staticmethod
+    def _parse_peptidoform(sequence: str, mods: str, mod_sites, charge: Optional[str]) -> str:
+        if mods:
+            mods = mods.split(";")
+            mod_sites = mod_sites.split(";")
+            for mod, site in reversed(sorted(zip(mods, mod_sites), key=lambda x: int(x[1]))):
+                if int(site) == 0:
+                    sequence = (
+                        sequence[: int(site)] + f"[{mod.split('@')[0]}]-" + sequence[int(site) :]
+                    )
+                else:
+                    sequence = (
+                        sequence[: int(site)] + f"[{mod.split('@')[0]}]" + sequence[int(site) :]
+                    )
+        if charge:
+            sequence += f"/{int(float(charge))}"
+        return sequence
+
+    @classmethod
+    def from_dataframe(cls, dataframe) -> PSMList:
+        """Create a PSMList from a AlphaDIA Pandas DataFrame."""
+        return PSMList(
+            psm_list=[
+                cls._get_peptide_spectrum_match(cls(""), entry)
+                for entry in dataframe.to_dict(orient="records")
+            ]
+        )
+
+
+# TODO: check
+RESCORING_FEATURES = [
+    "rt_observed",
+    "mobility_observed",
+    "mz_observed",
+    "score",
+    "charge",
+    "delta_rt",
+]
diff --git a/psm_utils/io/diann.py b/psm_utils/io/diann.py
index 468a944..bc79670 100644
--- a/psm_utils/io/diann.py
+++ b/psm_utils/io/diann.py
@@ -91,7 +91,7 @@ def _parse_peptidoform(peptide: str, charge: Optional[str]) -> str:
     def _parse_precursor_mz():
         return NotImplementedError("Method not implemented yet. DIA-NN does not yet output precursor m/z, but might in the future.")
 
-    @staticmethod
+    @classmethod
     def from_dataframe(cls, dataframe) -> PSMList:
         """Create a PSMList from a DIA-NN Pandas DataFrame."""
         return PSMList(
diff --git a/psm_utils/io/fragpipe.py b/psm_utils/io/fragpipe.py
index cb5fcf6..71efc37 100644
--- a/psm_utils/io/fragpipe.py
+++ b/psm_utils/io/fragpipe.py
@@ -9,13 +9,10 @@
 from __future__ import annotations
 
 import csv
-from abc import ABC, abstractmethod
+from abc import ABC
 from pathlib import Path
 from typing import Iterable, Optional
 
-import pyarrow.parquet as pq
-from pyteomics import mass
-
 from psm_utils.io._base_classes import ReaderBase
 from psm_utils.io._utils import set_csv_field_size_limit
 from psm_utils.psm import PSM
@@ -65,7 +62,7 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM:
                 psm_dict["Modified Peptide"],
                 psm_dict['Peptide'],
                 psm_dict["Charge"]),
-            spectrum_id=self._parse_spectrum_id(psm_dict['Spectrum']),
+            spectrum_id=self._parse_spectrum_id(psm_dict['Spectrum']), #TODO: needs to be checked
             run=Path(psm_dict["Spectrum File"]).stem,
             is_decoy=False,
             qvalue=None, # Q-value is not outputted by Philosopher
@@ -78,6 +75,7 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM:
                                                   psm_dict["Mapped Proteins"]),
             source="fragpipe",
             rank=1,
+            provenance_data=({"fragpipe_filename": str(self.filename)}),
             rescoring_features=rescoring_features,
             metadata={}
         )
@@ -106,7 +104,7 @@ def _parse_protein_list(razor_protein: str, mapped_proteins) -> list[str]:
         else:
             return [razor_protein]
 
-    @staticmethod
+    @classmethod
     def from_dataframe(cls, dataframe) -> PSMList:
         """Create a PSMList from a pandas DataFrame."""
         return PSMList(

From 36ec0472dcd883858f107c340da1a841b11201b9 Mon Sep 17 00:00:00 2001
From: rodvrees <robbe.devreese@hotmail.com>
Date: Wed, 26 Jun 2024 22:10:57 +0200
Subject: [PATCH 04/13] diann change protein column

---
 psm_utils/io/diann.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/psm_utils/io/diann.py b/psm_utils/io/diann.py
index bc79670..60c48e0 100644
--- a/psm_utils/io/diann.py
+++ b/psm_utils/io/diann.py
@@ -18,10 +18,9 @@
 
 set_csv_field_size_limit()
 
+
 class DIANNReader(ReaderBase, ABC):
-    def __init__(
-        self, filename, score_column: str = "CScore", *args, **kwargs
-    ) -> None:
+    def __init__(self, filename, score_column: str = "CScore", *args, **kwargs) -> None:
         """
         Reader for DIA-NN '.tsv' file.
 
@@ -56,9 +55,9 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM:
 
         return PSM(
             peptidoform=self._parse_peptidoform(
-                psm_dict["Modified.Sequence"],
-                psm_dict["Precursor.Charge"]),
-            spectrum_id='NA', # DIA-NN does not output spectrum ID
+                psm_dict["Modified.Sequence"], psm_dict["Precursor.Charge"]
+            ),
+            spectrum_id="NA",  # DIA-NN does not output spectrum ID
             run=psm_dict["Run"],
             is_decoy=False,
             qvalue=psm_dict["Q.Value"],
@@ -66,9 +65,9 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM:
             score=float(psm_dict[self.score_column]),
             retention_time=float(psm_dict["RT"]),
             ion_mobility=float(psm_dict["IM"]),
-            protein_list=psm_dict["Protein.Names"].split(";"),
+            protein_list=psm_dict["Protein.Ids"].split(";"),
             source="diann",
-            rank=1, # Leave out?
+            rank=1,  # Leave out?
             provenance_data=({"diann_filename": str(self.filename)}),
             rescoring_features=rescoring_features,
             metadata={},
@@ -89,7 +88,9 @@ def _parse_peptidoform(peptide: str, charge: Optional[str]) -> str:
 
     @staticmethod
     def _parse_precursor_mz():
-        return NotImplementedError("Method not implemented yet. DIA-NN does not yet output precursor m/z, but might in the future.")
+        raise NotImplementedError(
+            "Method not implemented yet. DIA-NN does not yet output precursor m/z, but might in the future."
+        )
 
     @classmethod
     def from_dataframe(cls, dataframe) -> PSMList:
@@ -112,5 +113,5 @@ def from_dataframe(cls, dataframe) -> PSMList:
     "Ms1.Profile.Corr",
     "Ms1.Area",
     "IM",
-    "iIM"
+    "iIM",
 ]

From b228fb601c461918a26cae3665582f8a876e00a5 Mon Sep 17 00:00:00 2001
From: rodvrees <robbe.devreese@hotmail.com>
Date: Thu, 27 Jun 2024 14:22:03 +0200
Subject: [PATCH 05/13] unit tests for diann, alphadia and fragpipe readers

---
 psm_utils/io/alphadia.py          |  2 +-
 psm_utils/io/diann.py             |  2 ++
 psm_utils/io/fragpipe.py          | 55 ++++++++++++++++++++++---------
 tests/test_data/test_alphadia.tsv |  2 ++
 tests/test_data/test_diann.tsv    |  2 ++
 tests/test_data/test_fragpipe.tsv |  2 ++
 tests/test_io/test_alphadia.py    | 39 ++++++++++++++++++++++
 tests/test_io/test_diann.py       | 44 +++++++++++++++++++++++++
 tests/test_io/test_fragpipe.py    | 42 +++++++++++++++++++++++
 9 files changed, 173 insertions(+), 17 deletions(-)
 create mode 100644 tests/test_data/test_alphadia.tsv
 create mode 100644 tests/test_data/test_diann.tsv
 create mode 100644 tests/test_data/test_fragpipe.tsv
 create mode 100644 tests/test_io/test_alphadia.py
 create mode 100644 tests/test_io/test_diann.py
 create mode 100644 tests/test_io/test_fragpipe.py

diff --git a/psm_utils/io/alphadia.py b/psm_utils/io/alphadia.py
index 88af545..c2c3fff 100644
--- a/psm_utils/io/alphadia.py
+++ b/psm_utils/io/alphadia.py
@@ -71,7 +71,7 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM:
             retention_time=psm_dict["rt_observed"],
             ion_mobility=psm_dict["mobility_observed"],
             protein_list=psm_dict["proteins"].split(";"),
-            rank=psm_dict["rank"],
+            rank=int(psm_dict["rank"]) + 1,  # AlphaDIA ranks are 0-based
             source="alphadia",
             provenance_data=({"alphadia_filename": str(self.filename)}),
             metadata={},
diff --git a/psm_utils/io/diann.py b/psm_utils/io/diann.py
index 60c48e0..a60cb7b 100644
--- a/psm_utils/io/diann.py
+++ b/psm_utils/io/diann.py
@@ -114,4 +114,6 @@ def from_dataframe(cls, dataframe) -> PSMList:
     "Ms1.Area",
     "IM",
     "iIM",
+    "Predicted.IM",
+    "Predicted.iIM",
 ]
diff --git a/psm_utils/io/fragpipe.py b/psm_utils/io/fragpipe.py
index 71efc37..50d6bd4 100644
--- a/psm_utils/io/fragpipe.py
+++ b/psm_utils/io/fragpipe.py
@@ -20,9 +20,15 @@
 
 set_csv_field_size_limit()
 
+
 class FragpipeReader(ReaderBase, ABC):
     def __init__(
-        self, filename, score_column: str = "Hyperscore", mz_column: str = "Observed M/Z", *args, **kwargs
+        self,
+        filename,
+        score_column: str = "Hyperscore",
+        mz_column: str = "Observed M/Z",
+        *args,
+        **kwargs,
     ) -> None:
         """
         Reader for MSFragger ``psm.tsv`` file.
@@ -34,6 +40,9 @@ def __init__(
         score_column: str, optional
             Name of the column that holds the primary PSM score. Default is
             ``Hyperscore``.
+        mz_column: str, optional
+            Name of the column that holds the precursor m/z. Default is
+            ``Observed M/Z``.
 
         """
         super().__init__(filename, *args, **kwargs)
@@ -59,25 +68,31 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM:
 
         return PSM(
             peptidoform=self._parse_peptidoform(
-                psm_dict["Modified Peptide"],
-                psm_dict['Peptide'],
-                psm_dict["Charge"]),
-            spectrum_id=self._parse_spectrum_id(psm_dict['Spectrum']), #TODO: needs to be checked
-            run=Path(psm_dict["Spectrum File"]).stem,
+                psm_dict["Modified Peptide"], psm_dict["Peptide"], psm_dict["Charge"]
+            ),
+            spectrum_id=self._parse_spectrum_id(psm_dict["Spectrum"]),  # TODO: needs to be checked
+            run=self._parse_run(psm_dict["Spectrum File"]),
             is_decoy=False,
-            qvalue=None, # Q-value is not outputted by Philosopher
-            pep= 1 - float(psm_dict["Probability"]), # PeptideProphet Probability, not explicitely stated if this is the inverse of PEP
+            qvalue=None,  # Q-value is not outputted by Philosopher
+            pep=1
+            - float(
+                psm_dict["Probability"]
+            ),  # PeptideProphet Probability, not explicitly stated if this is the inverse of PEP
+            # But I'm assuming it is
             score=psm_dict[self.score_column],
-            precursor_mz=psm_dict[self.mz_column], # Allows use of both calibrated and uncalibrated Observed M/Z?+
+            precursor_mz=psm_dict[
+                self.mz_column
+            ],  # Allows use of both calibrated and uncalibrated Observed M/Z?
             retention_time=float(psm_dict["Retention"]),
             ion_mobility=float(psm_dict["Ion Mobility"]) if "Ion Mobility" in psm_dict else None,
-            protein_list=self._parse_protein_list(psm_dict["Protein"],
-                                                  psm_dict["Mapped Proteins"]),
+            protein_list=self._parse_protein_list(
+                psm_dict["Protein"], psm_dict["Mapped Proteins"]
+            ),
             source="fragpipe",
             rank=1,
             provenance_data=({"fragpipe_filename": str(self.filename)}),
             rescoring_features=rescoring_features,
-            metadata={}
+            metadata={},
         )
 
     @staticmethod
@@ -86,10 +101,10 @@ def _parse_peptidoform(mod_peptide: str, peptide: str, charge: Optional[str]) ->
             peptide = mod_peptide
         if charge:
             peptide += f"/{int(float(charge))}"
-        if peptide.startswith('n'):
+        if peptide.startswith("n"):
             peptide = peptide[1:]
             # A hyphen needs to be added after the N-terminal modification, thus after the ]
-            peptide = peptide.replace(']', ']-', 1)
+            peptide = peptide.replace("]", "]-", 1)
         return peptide
 
     @staticmethod
@@ -104,6 +119,16 @@ def _parse_protein_list(razor_protein: str, mapped_proteins) -> list[str]:
         else:
             return [razor_protein]
 
+    # Dependent on the fragpipe workflow used the run name can be different, but in most cases
+    # something like 'interact-<run_name>.pep.xml' is used
+    @staticmethod
+    def _parse_run(spectrum_file: str) -> str:
+        if (spectrum_file.endswith(".pep.xml")) and (spectrum_file.startswith("interact-")):
+            spectrum_file = spectrum_file.replace("interact-", "")
+            return Path(Path(spectrum_file).stem).stem
+        else:
+            return Path(spectrum_file).stem
+
     @classmethod
     def from_dataframe(cls, dataframe) -> PSMList:
         """Create a PSMList from a pandas DataFrame."""
@@ -126,6 +151,4 @@ def from_dataframe(cls, dataframe) -> PSMList:
     "Delta Mass",
     "Hyperscore",
     "Number of Missed Cleavages",
-    "Intensity"
 ]
-
diff --git a/tests/test_data/test_alphadia.tsv b/tests/test_data/test_alphadia.tsv
new file mode 100644
index 0000000..4928e16
--- /dev/null
+++ b/tests/test_data/test_alphadia.tsv
@@ -0,0 +1,2 @@
+base_width_mobility	base_width_rt	rt_observed	mobility_observed	mono_ms1_intensity	top_ms1_intensity	sum_ms1_intensity	weighted_ms1_intensity	weighted_mass_deviation	weighted_mass_error	mz_observed	mono_ms1_height	top_ms1_height	sum_ms1_height	weighted_ms1_height	isotope_intensity_correlation	isotope_height_correlation	n_observations	intensity_correlation	height_correlation	intensity_fraction	height_fraction	intensity_fraction_weighted	height_fraction_weighted	mean_observation_score	sum_b_ion_intensity	sum_y_ion_intensity	diff_b_y_ion_intensity	f_masked	fragment_scan_correlation	template_scan_correlation	fragment_frame_correlation	top3_frame_correlation	template_frame_correlation	top3_b_ion_correlation	n_b_ions	top3_y_ion_correlation	n_y_ions	cycle_fwhm	mobility_fwhm	delta_frame_peak	top_3_ms2_mass_error	mean_ms2_mass_error	n_overlapping	mean_overlapping_intensity	mean_overlapping_mass_error	precursor_idx	rank	scan_center	score	frame_start	scan_stop	elution_group_idx	frame_center	scan_start	frame_stop	flat_frag_start_idx	decoy	i_1	mz_library	mod_sites	charge	flat_frag_stop_idx	proteins	genes	channel	i_0	sequence	i_2	i_3	mobility_library	mods	rt_calibrated	rt_library	delta_rt	n_K	n_R	n_P	_decoy	proba	qval	_candidate_idx	valid	candidate_idx	run	mod_seq_hash	mod_seq_charge_hash	pg_master	pg	pg_qval	intensity
+0.000000	75.606934	3111.141602	0.000001	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	648.794128	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	3.000000	0.808025	0.969734	1.000000	1.000000	1.000000	1.000000	0.000000	11.815787	13.400094	-1.584307	1.000000	0.000000	0.000000	0.792649	0.992209	0.000000	0.923456	2.000000	0.919347	10.000000	18.168669	0.000000	0.333333	-1.579069	-1.260026	4.000000	1441328.000000	-1.948657	12789812	0	0	170.287918	79426	1	6406557	80332	0	81389	65332105	0	0.305868	648.794128	5	2	65332117	P06733	ENOA_HUMAN	0	0.475448	LMIEMDGTENK	0.158954	0.059730	0.944761	Oxidation@M	3150.670410	2546.791260	-39.528809	1	0	0	0.000000	0.000005	0.000000	12789812	True	12789812	LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_03	5071272541180990939	5071272541180990941	P06733	P06733	0.000000	20915252511.701401
diff --git a/tests/test_data/test_diann.tsv b/tests/test_data/test_diann.tsv
new file mode 100644
index 0000000..fba1e43
--- /dev/null
+++ b/tests/test_data/test_diann.tsv
@@ -0,0 +1,2 @@
+File.Name	Run	Protein.Group	Protein.Ids	Protein.Names	Genes	PG.Quantity	PG.Normalised	PG.MaxLFQ	Genes.Quantity	Genes.Normalised	Genes.MaxLFQ	Genes.MaxLFQ.Unique	Modified.Sequence	Stripped.Sequence	Precursor.Id	Precursor.Charge	Q.Value	PEP	Global.Q.Value	Protein.Q.Value	PG.Q.Value	Global.PG.Q.Value	GG.Q.Value	Translated.Q.Value	Proteotypic	Precursor.Quantity	Precursor.Normalised	Precursor.Translated	Translated.Quality	Ms1.Translated	Quantity.Quality	RT	RT.Start	RT.Stop	iRT	Predicted.RT	Predicted.iRT	First.Protein.Description	Lib.Q.Value	Lib.PG.Q.Value	Ms1.Profile.Corr	Ms1.Area	Evidence	Spectrum.Similarity	Averagine	Mass.Evidence	CScore	Decoy.Evidence	Decoy.CScore	Fragment.Quant.Raw	Fragment.Quant.Corrected	Fragment.Correlations	MS2.Scan	PTM.Informative	PTM.Specific	PTM.Localising	PTM.Q.Value	PTM.Site.Confidence	Lib.PTM.Site.Confidence	IM	iIM	Predicted.IM	Predicted.iIM
+/data/Orbi_Yeast/LFQ_Orbitrap_AIF_Yeast_03.mzML	LFQ_Orbitrap_AIF_Yeast_03	P38156	P38156	MAL31_YEAST	MAL31	672704	689275	689274	672704	689275	689274	689274	AAAAEINVKDPKEDLETSVVDEGR	AAAAEINVKDPKEDLETSVVDEGR	AAAAEINVKDPKEDLETSVVDEGR4	4	0.000548193	0.0104343	1	1	1	1	1	0	1	413970	424167	413970		849940	0.904051	75.2574	75.0156	75.5001	33.9222	75.2713	33.8999	Maltose permease MAL31	1	1	0.347567	849940	1.52391	0.455898	0.0388433	0	0.995107	0.705793	0.213383	274506;139464;0;0;0;0;486380;0;70361;370465;36455.3;0;	274506;139464;0;0;0;0;486380;0;70361;370465;36455.3;0;	0.995393;0.724264;0;0;0;0;0.949297;0;0.169817;0.59338;0.481298;0;	116903	0	0	0	0	0	0	0	0	0	0
diff --git a/tests/test_data/test_fragpipe.tsv b/tests/test_data/test_fragpipe.tsv
new file mode 100644
index 0000000..847583d
--- /dev/null
+++ b/tests/test_data/test_fragpipe.tsv
@@ -0,0 +1,2 @@
+Spectrum	Spectrum File	Peptide	Modified Peptide	Extended Peptide	Prev AA	Next AA	Peptide Length	Charge	Retention	Observed Mass	Calibrated Observed Mass	Observed M/Z	Calibrated Observed M/Z	Calculated Peptide Mass	Calculated M/Z	Delta Mass	SpectralSim	RTScore	Expectation	Hyperscore	Nextscore	Probability	Number of Enzymatic Termini	Number of Missed Cleavages	Protein Start	Protein End	Intensity	Assigned Modifications	Observed Modifications	Purity	Is Unique	Protein	Protein ID	Entry Name	Gene	Protein Description	Mapped Genes	Mapped Proteins
+LFQ_Orbitrap_AIF_Yeast_01_Q1.00001.00001.2	interact-LFQ_Orbitrap_AIF_Yeast_01_Q1.pep.xml	TGAPNNGQYGADNGNPNGER		NQQNNQER.TGAPNNGQYGADNGNPNGER.GIFSTIVG	R	G	20	2	2432.1640	2001.8539	2001.8527	1001.9342	1001.9336	2001.8524	1001.9335	0.0002	0.9925	5.8094	0.00000000000011	57.2940	0.0000	1.0000	2	0	24	43	0.0000			0.00	true	sp|P40159|YNU8_YEAST	P40159	YNU8_YEAST	YNL208W	Uncharacterized protein YNL208W
diff --git a/tests/test_io/test_alphadia.py b/tests/test_io/test_alphadia.py
new file mode 100644
index 0000000..64e38ae
--- /dev/null
+++ b/tests/test_io/test_alphadia.py
@@ -0,0 +1,39 @@
+"""Tests for psm_utils.io.alphadia."""
+
+from psm_utils.io.alphadia import AlphaDIAReader
+from psm_utils.psm import PSM
+
+test_psm = PSM(
+    peptidoform="LMIEM[Oxidation]DGTENK/2",
+    spectrum_id="79426",
+    run="LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_03",
+    collection=None,
+    spectrum="79426",
+    is_decoy=False,
+    score=170.287918,
+    qvalue=0.000000,
+    pep=0.000005,
+    precursor_mz=648.794128,
+    retention_time=3111.141602,
+    ion_mobility=0.000001,
+    protein_list=["P06733"],
+    rank=1,
+    source="alphadia",
+    metadata={},
+    rescoring_features={
+        "rt_observed": 3111.141602,
+        "mobility_observed": 0.000001,
+        "mz_observed": 648.794128,
+        "score": 170.287918,
+        "charge": 2,
+        "delta_rt": -39.528809,
+    },
+)
+
+
+class TestAlphaDIAReader:
+    def test_iter(self):
+        with AlphaDIAReader("./tests/test_data/test_alphadia.tsv") as reader:
+            for psm in reader:
+                psm.provenance_data = {}
+                assert psm == test_psm
diff --git a/tests/test_io/test_diann.py b/tests/test_io/test_diann.py
new file mode 100644
index 0000000..543db4c
--- /dev/null
+++ b/tests/test_io/test_diann.py
@@ -0,0 +1,44 @@
+"""Tests for psm_utils.io.diann."""
+
+from psm_utils.io.diann import DIANNReader
+from psm_utils.psm import PSM
+
+test_psm = PSM(
+    peptidoform="AAAAEINVKDPKEDLETSVVDEGR/4",
+    spectrum_id="NA",
+    run="LFQ_Orbitrap_AIF_Yeast_03",
+    collection=None,
+    spectrum=None,
+    is_decoy=False,
+    score=0.995107,
+    qvalue=0.000548193,
+    pep=0.0104343,
+    precursor_mz=None,
+    retention_time=75.2574,
+    ion_mobility=0,
+    protein_list=["P38156"],
+    rank=1,
+    source="diann",
+    metadata={},
+    rescoring_features={
+        "CScore": 0.995107,
+        "RT": 75.2574,
+        "Predicted.RT": 75.2713,
+        "iRT": 33.9222,
+        "Predicted.iRT": 33.8999,
+        "Ms1.Profile.Corr": 0.347567,
+        "Ms1.Area": 849940,
+        "IM": 0,
+        "iIM": 0,
+        "Predicted.IM": 0,
+        "Predicted.iIM": 0,
+    },
+)
+
+
+class TestDIANNReader:
+    def test_iter(self):
+        with DIANNReader("./tests/test_data/test_diann.tsv") as reader:
+            for psm in reader:
+                psm.provenance_data = {}
+                assert psm == test_psm
diff --git a/tests/test_io/test_fragpipe.py b/tests/test_io/test_fragpipe.py
new file mode 100644
index 0000000..4c1b067
--- /dev/null
+++ b/tests/test_io/test_fragpipe.py
@@ -0,0 +1,42 @@
+"""Tests for psm_utils.io.fragpipe."""
+
+from psm_utils.io.fragpipe import FragpipeReader
+from psm_utils.psm import PSM
+
+test_psm = PSM(
+    peptidoform="TGAPNNGQYGADNGNPNGER/2",
+    spectrum_id="00001",
+    run="LFQ_Orbitrap_AIF_Yeast_01_Q1",
+    collection=None,
+    spectrum=None,
+    is_decoy=False,
+    score=57.2940,
+    qvalue=None,
+    pep=1 - 1.0000,
+    precursor_mz=1001.9342,
+    retention_time=2432.1640,
+    ion_mobility=None,
+    protein_list=["sp|P40159|YNU8_YEAST"],
+    rank=1,
+    source="fragpipe",
+    metadata={},
+    rescoring_features={
+        "Peptide Length": 20,
+        "Retention": 2432.1640,
+        "Observed Mass": 2001.8539,
+        "Observed M/Z": 1001.9342,
+        "Calculated Peptide Mass": 2001.8524,
+        "Calculated M/Z": 1001.9335,
+        "Delta Mass": 0.0002,
+        "Hyperscore": 57.2940,
+        "Number of Missed Cleavages": 0,
+    },
+)
+
+
+class TestFragpipeReader:
+    def test_iter(self):
+        with FragpipeReader("./tests/test_data/test_fragpipe.tsv") as reader:
+            for psm in reader:
+                psm.provenance_data = {}
+                assert psm == test_psm

From 479b0d557487acd24f7795881631e482c8ca806e Mon Sep 17 00:00:00 2001
From: rodvrees <robbe.devreese@hotmail.com>
Date: Fri, 28 Jun 2024 10:03:20 +0200
Subject: [PATCH 06/13] diann qval column variable

---
 psm_utils/io/diann.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/psm_utils/io/diann.py b/psm_utils/io/diann.py
index a60cb7b..52e4b1a 100644
--- a/psm_utils/io/diann.py
+++ b/psm_utils/io/diann.py
@@ -20,7 +20,9 @@
 
 
 class DIANNReader(ReaderBase, ABC):
-    def __init__(self, filename, score_column: str = "CScore", *args, **kwargs) -> None:
+    def __init__(
+        self, filename, score_column: str = "CScore", qval_column="Q.Value", *args, **kwargs
+    ) -> None:
         """
         Reader for DIA-NN '.tsv' file.
 
@@ -36,6 +38,7 @@ def __init__(self, filename, score_column: str = "CScore", *args, **kwargs) -> N
         super().__init__(filename, *args, **kwargs)
         self.filename = filename
         self.score_column = score_column
+        self.qval_column = qval_column
 
     def __iter__(self) -> Iterable[PSM]:
         """Iterate over file and return PSMs one-by-one."""
@@ -60,7 +63,9 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM:
             spectrum_id="NA",  # DIA-NN does not output spectrum ID
             run=psm_dict["Run"],
             is_decoy=False,
-            qvalue=psm_dict["Q.Value"],
+            qvalue=psm_dict[
+                self.qval_column
+            ],  # DIA-NN puts out q-value on both run and library level
             pep=float(psm_dict["PEP"]),
             score=float(psm_dict[self.score_column]),
             retention_time=float(psm_dict["RT"]),

From 8d7f914f4876a7b692f67292cd23e5d833db7cb7 Mon Sep 17 00:00:00 2001
From: rodvrees <robbe.devreese@hotmail.com>
Date: Thu, 31 Oct 2024 15:39:59 +0100
Subject: [PATCH 07/13] DIANN scan

---
 .gitignore            | 3 ++-
 psm_utils/io/diann.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index 8669c02..854c67a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -133,8 +133,9 @@ dmypy.json
 .pyre/
 .vscode/settings.json
 
-# Specific to Branch
+# Specific to Branch, temp
 example_files/DIANN_example.tsv
 example_files/MSFragger_example_psm.tsv
 example_files/AlphaDIA_example.tsv
 test.ipynb
+example_files/evidence.txt
diff --git a/psm_utils/io/diann.py b/psm_utils/io/diann.py
index 52e4b1a..265d816 100644
--- a/psm_utils/io/diann.py
+++ b/psm_utils/io/diann.py
@@ -60,7 +60,7 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM:
             peptidoform=self._parse_peptidoform(
                 psm_dict["Modified.Sequence"], psm_dict["Precursor.Charge"]
             ),
-            spectrum_id="NA",  # DIA-NN does not output spectrum ID
+            spectrum_id=psm_dict["MS2.Scan"], 
             run=psm_dict["Run"],
             is_decoy=False,
             qvalue=psm_dict[

From d12120a013c02ebc4b171020b450ec1d4e673f3b Mon Sep 17 00:00:00 2001
From: rodvrees <robbe.devreese@hotmail.com>
Date: Thu, 31 Oct 2024 15:43:29 +0100
Subject: [PATCH 08/13] cleanup .gitignore

---
 .gitignore | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/.gitignore b/.gitignore
index 854c67a..0432116 100644
--- a/.gitignore
+++ b/.gitignore
@@ -132,10 +132,3 @@ dmypy.json
 # Pyre type checker
 .pyre/
 .vscode/settings.json
-
-# Specific to Branch, temp
-example_files/DIANN_example.tsv
-example_files/MSFragger_example_psm.tsv
-example_files/AlphaDIA_example.tsv
-test.ipynb
-example_files/evidence.txt

From 2a7b56e1efb2f69e2212ba49eba6f11e22d4d03d Mon Sep 17 00:00:00 2001
From: rodvrees <robbe.devreese@hotmail.com>
Date: Thu, 31 Oct 2024 15:46:35 +0100
Subject: [PATCH 09/13] formatting

---
 psm_utils/io/diann.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/psm_utils/io/diann.py b/psm_utils/io/diann.py
index 265d816..60da432 100644
--- a/psm_utils/io/diann.py
+++ b/psm_utils/io/diann.py
@@ -60,7 +60,7 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM:
             peptidoform=self._parse_peptidoform(
                 psm_dict["Modified.Sequence"], psm_dict["Precursor.Charge"]
             ),
-            spectrum_id=psm_dict["MS2.Scan"], 
+            spectrum_id=psm_dict["MS2.Scan"],
             run=psm_dict["Run"],
             is_decoy=False,
             qvalue=psm_dict[

From 19c5bbc1808510e21a41f705d959e9b6dbfc6cfa Mon Sep 17 00:00:00 2001
From: rodvrees <robbe.devreese@hotmail.com>
Date: Thu, 31 Oct 2024 15:50:09 +0100
Subject: [PATCH 10/13] fix diann test case

---
 tests/test_io/test_diann.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_io/test_diann.py b/tests/test_io/test_diann.py
index 543db4c..2ce86ca 100644
--- a/tests/test_io/test_diann.py
+++ b/tests/test_io/test_diann.py
@@ -5,7 +5,7 @@
 
 test_psm = PSM(
     peptidoform="AAAAEINVKDPKEDLETSVVDEGR/4",
-    spectrum_id="NA",
+    spectrum_id="116903",
     run="LFQ_Orbitrap_AIF_Yeast_03",
     collection=None,
     spectrum=None,

From a9cac756b63087a1acf08682a09be2b664fcbe7c Mon Sep 17 00:00:00 2001
From: RalfG <ralfg@hotmail.be>
Date: Wed, 6 Nov 2024 21:41:01 +0100
Subject: [PATCH 11/13] Review changes

- Add support for C-term modifications
- Fixed to-dos
- Formatting and structure
- Add docs
---
 docs/source/api/psm_utils.io.rst      |  24 ++++++
 example_files/alphadia.precursors.tsv |   4 +
 example_files/fragpipe.psm.tsv        |   6 ++
 psm_utils/io/__init__.py              |  30 ++++----
 psm_utils/io/alphadia.py              |  70 ++++++++---------
 psm_utils/io/diann.py                 |  81 ++++++++++----------
 psm_utils/io/fragpipe.py              | 106 +++++++++++++-------------
 tests/test_io/test_alphadia.py        |  82 +++++++++++++++++++-
 tests/test_io/test_diann.py           |  19 ++++-
 tests/test_io/test_fragpipe.py        |  34 +++++++--
 10 files changed, 298 insertions(+), 158 deletions(-)
 create mode 100644 example_files/alphadia.precursors.tsv
 create mode 100644 example_files/fragpipe.psm.tsv

diff --git a/docs/source/api/psm_utils.io.rst b/docs/source/api/psm_utils.io.rst
index d858997..1de803f 100644
--- a/docs/source/api/psm_utils.io.rst
+++ b/docs/source/api/psm_utils.io.rst
@@ -7,6 +7,30 @@ psm_utils.io
 
 
 
+psm_utils.io.alphadia
+#####################
+
+.. automodule:: psm_utils.io.alphadia
+   :members:
+   :inherited-members:
+
+
+psm_utils.io.diann
+##################
+
+.. automodule:: psm_utils.io.diann
+   :members:
+   :inherited-members:
+
+
+psm_utils.io.fragpipe
+#####################
+
+.. automodule:: psm_utils.io.fragpipe
+   :members:
+   :inherited-members:
+
+
 psm_utils.io.idxml
 ##################
 
diff --git a/example_files/alphadia.precursors.tsv b/example_files/alphadia.precursors.tsv
new file mode 100644
index 0000000..2954564
--- /dev/null
+++ b/example_files/alphadia.precursors.tsv
@@ -0,0 +1,4 @@
+base_width_mobility	base_width_rt	rt_observed	mobility_observed	mono_ms1_intensity	top_ms1_intensity	sum_ms1_intensity	weighted_ms1_intensity	weighted_mass_deviation	weighted_mass_error	mz_observed	mono_ms1_height	top_ms1_height	sum_ms1_height	weighted_ms1_height	isotope_intensity_correlation	isotope_height_correlation	n_observations	intensity_correlation	height_correlation	intensity_fraction	height_fraction	intensity_fraction_weighted	height_fraction_weighted	mean_observation_score	sum_b_ion_intensity	sum_y_ion_intensity	diff_b_y_ion_intensity	f_masked	fragment_scan_correlation	template_scan_correlation	fragment_frame_correlation	top3_frame_correlation	template_frame_correlation	top3_b_ion_correlation	n_b_ions	top3_y_ion_correlation	n_y_ions	cycle_fwhm	mobility_fwhm	delta_frame_peak	top_3_ms2_mass_error	mean_ms2_mass_error	n_overlapping	mean_overlapping_intensity	mean_overlapping_mass_error	precursor_idx	rank	frame_center	scan_center	score	elution_group_idx	frame_start	scan_stop	frame_stop	scan_start	proteins	rt_calibrated	flat_frag_start_idx	charge	mods	decoy	sequence	mz_library	channel	genes	i_0	flat_frag_stop_idx	i_2	i_1	i_3	mobility_library	rt_library	mod_sites	delta_rt	n_K	n_R	n_P	_decoy	proba	qval	_candidate_idx	valid	candidate_idx	run	mod_seq_hash	mod_seq_charge_hash	pg_master	pg	pg_qval	intensity
+0.000000	40.673340	2800.518555	0.000001	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	894.337830	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	2.000000	0.968887	0.845673	1.000000	1.000000	1.000000	1.000000	0.000000	0.000000	15.266385	-15.266385	1.000000	0.000000	0.000000	0.929785	0.975279	0.000000	0.000000	0.000000	0.948546	12.000000	14.244627	0.000000	-0.500000	0.132713	-0.218829	0.000000	0.000000	0.000000	10447876	0	72329	0	136.160126	5238821	71876	1	72933	0	P18899	2347.609131	59818105	3		0	SSYGSSSNDDSYGSSNNDDSYGSSNK	894.337830	0	DDR48_YEAST	0.273118	59818117	0.249391	0.348172	0.129319	0.948457	1399.216187		452.909424	1	0	0	0.000000	0.000000	0.000000	10447876	True	10447876	LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_01	8562405370847133435	8562405370847133438	P18899	P18899	0.000000	190103852.035206
+0.000000	40.745483	1647.208252	0.000001	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	986.440491	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	2.000000	0.991654	0.992141	1.000000	1.000000	1.000000	1.000000	0.000000	0.000000	14.408463	-14.408463	1.000000	0.000000	0.000000	0.738752	0.974915	0.000000	0.000000	0.000000	0.880488	12.000000	9.885651	0.000000	0.000000	-0.391579	-0.698411	0.000000	0.000000	0.000000	8793636	0	42431	0	122.278320	4411698	41978	1	43035	0	Q9ULU4	1670.462402	49907897	2		0	SSQGSSSSTQSAPSETASASK	986.440491	0	PKCB1_HUMAN	0.380560	49907909	0.190793	0.352861	0.075786	1.158085	387.834503		-23.254150	1	0	1	0.000000	0.000000	0.000000	8793636	True	8793636	LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_01	5824087303549386971	5824087303549386973	Q9ULU4	Q9ULU4	0.000000	195496849.073322
+0.000000	52.349121	2678.317139	0.000001	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	905.432312	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	2.000000	0.986449	0.931379	1.000000	1.000000	1.000000	1.000000	0.000000	0.000000	16.636572	-16.636572	1.000000	0.000000	0.000000	0.978579	0.996334	0.000000	0.000000	0.000000	0.988605	12.000000	13.867673	0.000000	0.000000	-0.432777	0.780247	0.000000	0.000000	0.000000	7132549	0	69158	0	152.012512	3581144	68554	1	69913	0	O60763	2646.791260	39980635	2		0	SSQTSGTNEQSSAIVSAR	905.432312	0	USO1_HUMAN	0.404900	39980647	0.177361	0.352328	0.065410	1.110423	1774.035034		31.525879	0	1	0	0.000000	0.000000	0.000000	7132549	True	7132549	LFQ_Orbitrap_AIF_Condition_A_Sample_Alpha_01	14912031975374993231	14912031975374993233	O60763	O60763	0.000000	406414129.849395
diff --git a/example_files/fragpipe.psm.tsv b/example_files/fragpipe.psm.tsv
new file mode 100644
index 0000000..4ffd36e
--- /dev/null
+++ b/example_files/fragpipe.psm.tsv
@@ -0,0 +1,6 @@
+Spectrum	Spectrum File	Peptide	Modified Peptide	Extended Peptide	Prev AA	Next AA	Peptide Length	Charge	Retention	Observed Mass	Calibrated Observed Mass	Observed M/Z	Calibrated Observed M/Z	Calculated Peptide Mass	Calculated M/Z	Delta Mass	Expectation	Hyperscore	Nextscore	Probability	Number of Enzymatic Termini	Number of Missed Cleavages	Protein Start	Protein End	Intensity	Assigned Modifications	Observed Modifications	Purity	Is Unique	Protein	Protein ID	Entry Name	Gene	Protein Description	Mapped Genes	Mapped Proteins
+LFQ_Orbitrap_AIF_Human_01.100000.100000.0	D:\test\interact-LFQ_Orbitrap_AIF_Human_01_rank1.pep.xml	SEDCFILDHGK	SEDCFILDHGK	PFAQGAIK.SEDCFILDHGK.DGKIFVWK	K	D	11	3	3813.8638	1319.5804	1319.5807	440.8674	440.8675	1319.5815	440.8678	-0.0008	0.01264961000000	19.3701	15.5657	0.9968	2	0	328	338	0.0000	4C(57.0214)		0.00	false	sp|GELS_HUMAN|	GELS_HUMAN				GSN	sp|P06396|GELS_HUMAN
+LFQ_Orbitrap_AIF_Human_01.100002.100002.0	D:\test\interact-LFQ_Orbitrap_AIF_Human_01_rank1.pep.xml	FLLEAGADQEHK		KGHIEMVR.FLLEAGADQEHK.TDEMHTAI	R	T	12	3	3813.9346	1356.6663	1356.6665	453.2294	453.2294	1356.6672	453.2297	-0.0007	0.01950739000000	18.9370	14.3831	0.9985	2	0	419	430	0.0000			0.00	false	sp|O75179|ANR17_HUMAN	O75179	ANR17_HUMAN	ANKRD17	Ankyrin repeat domain-containing protein 17	ANKHD1	sp|Q8IWZ3|ANKH1_HUMAN
+LFQ_Orbitrap_AIF_Human_01.100004.100004.0	D:\test\interact-LFQ_Orbitrap_AIF_Human_01_rank1.pep.xml	REELSNVLAAMR	REELSNVLAAM[147]R	THIRAKRK.REELSNVLAAMR.KAAAKKD	K	K	12	3	3814.0050	1403.7197	1403.7198	468.9138	468.9139	1403.7190	468.9136	0.0008	0.00008879724000	24.3292	15.9192	0.9998	2	1	87	98	0.0000	11M(15.9949)		0.00	true	sp|Q9Y3U8|RL36_HUMAN	Q9Y3U8	RL36_HUMAN	RPL36	Large ribosomal subunit protein eL36
+LFQ_Orbitrap_AIF_Human_01.100040.100040.0	D:\test\interact-LFQ_Orbitrap_AIF_Human_01_rank1.pep.xml	LHISPSNMTNQNTPEYMEK	LHISPSNM[147]TNQNTPEYMEKc[17]	EYFGPDFK.LHISPSNMTNQNTPEYMEK.IKQRIFEN	K	I	19	3	3815.4023	2248.0256	2248.0251	750.3491	750.3490	2248.0254	750.3491	-0.0002	0.00194418200000	21.2429	21.2429	0.7143	2	0	344	362	0.0000	8M(15.9949), C-term(-0.9840)		0.00	true	sp|Q92769|HDAC2_HUMAN	Q92769	HDAC2_HUMAN	HDAC2	Histone deacetylase 2
+LFQ_Orbitrap_AIF_Human_01.101373.101373.0	D:\test\interact-LFQ_Orbitrap_AIF_Human_01_rank1.pep.xml	ANIAVQR	n[43]ANIAVQR	.ANIAVQR.IKREFKEV	M	I	7	2	3866.1475	812.4501	812.4503	407.2323	407.2324	812.4505	407.2325	-0.0002	0.11090580000000	17.1991	14.1196	0.9898	2	0	2	8	0.0000	N-term(42.0106)		0.00	true	sp|P61086|UBE2K_HUMAN	P61086	UBE2K_HUMAN	UBE2K	Ubiquitin-conjugating enzyme E2 K
diff --git a/psm_utils/io/__init__.py b/psm_utils/io/__init__.py
index e345708..0a4307e 100644
--- a/psm_utils/io/__init__.py
+++ b/psm_utils/io/__init__.py
@@ -8,6 +8,9 @@
 
 from rich.progress import track
 
+import psm_utils.io.alphadia as alphadia
+import psm_utils.io.diann as diann
+import psm_utils.io.fragpipe as fragpipe
 import psm_utils.io.idxml as idxml
 import psm_utils.io.ionbot as ionbot
 import psm_utils.io.maxquant as maxquant
@@ -22,9 +25,6 @@
 import psm_utils.io.sage as sage
 import psm_utils.io.tsv as tsv
 import psm_utils.io.xtandem as xtandem
-import psm_utils.io.diann as diann
-import psm_utils.io.fragpipe as fragpipe
-import psm_utils.io.alphadia as alphadia
 from psm_utils.io._base_classes import WriterBase
 from psm_utils.io.exceptions import PSMUtilsIOException
 from psm_utils.psm import PSM
@@ -110,32 +110,30 @@
         "filename_pattern": r"^.*(?:_|\.).sage.parquet$",
     },
     "fragpipe": {
-        "reader": fragpipe.FragpipeReader,
+        "reader": fragpipe.FragPipeReader,
         "writer": None,
         "extension": ".tsv",
-        "filename_pattern": r"^.*psm\.tsv$",
+        "filename_pattern": r"^.*(?:_|\.)?psm\.tsv$",
     },
     "alphadia": {
-    "reader": alphadia.AlphaDIAReader,
-    "writer": None,
-    "extension": ".tsv",
-    "filename_pattern": r"^.*precursor\.tsv$",
+        "reader": alphadia.AlphaDIAReader,
+        "writer": None,
+        "extension": ".tsv",
+        "filename_pattern": r"^.*(?:_|\.)?precursors\.tsv$",
     },
-    "diann": { # List after fragpipe and alphadia to avoid extension matching conflicts #TODO: fix tsv conflict
-        "reader": diann.DIANNReader,
+    "diann": {
+        "reader": diann.DIANNTSVReader,
         "writer": None,
         "extension": ".tsv",
-        "filename_pattern": r"^.*\.tsv$",
+        "filename_pattern": r"^.*(?:_|\.)?diann\.tsv$",
     },
-
-
-    "parquet": {  # List after proteoscape and sage to avoid extension matching conflicts
+    "parquet": {  # List after more specific Parquet patterns to avoid matching conflicts
         "reader": parquet.ParquetReader,
         "writer": parquet.ParquetWriter,
         "extension": ".parquet",
         "filename_pattern": r"^.*\.parquet$",
     },
-    "tsv": {  # List after sage to avoid extension matching conflicts
+    "tsv": {  # List after more specific TSV patterns to avoid matching conflicts
         "reader": tsv.TSVReader,
         "writer": tsv.TSVWriter,
         "extension": ".tsv",
diff --git a/psm_utils/io/alphadia.py b/psm_utils/io/alphadia.py
index c2c3fff..8f6e1b8 100644
--- a/psm_utils/io/alphadia.py
+++ b/psm_utils/io/alphadia.py
@@ -1,10 +1,4 @@
-"""
-Reader for PSM files from the AlphaDIA search engine.
-
-Reads the AlphaDIA ``precursor.tsv`` file as defined on the
-`TODO: NOT YET A LINK`_.
-
-"""
+"""Reader for PSM files from the AlphaDIA search engine."""
 
 from __future__ import annotations
 
@@ -19,9 +13,18 @@
 
 set_csv_field_size_limit()
 
+# TODO: check
+RESCORING_FEATURES = [
+    "rt_observed",
+    "mobility_observed",
+    "mz_observed",
+    "charge",
+    "delta_rt",
+]
+
 
 class AlphaDIAReader(ReaderBase, ABC):
-    def __init__(self, filename, score_column: str = "score", *args, **kwargs):
+    def __init__(self, filename, *args, **kwargs):
         """
         Reader for AlphaDIA ``precursor.tsv`` file.
 
@@ -29,14 +32,10 @@ def __init__(self, filename, score_column: str = "score", *args, **kwargs):
         ----------
         filename : str or Path
             Path to PSM file.
-        score_column: str, optional
-            Name of the column that holds the primary PSM score. Default is
-            ``score``.
 
         """
         super().__init__(filename, *args, **kwargs)
         self.filename = filename
-        self.score_column = score_column
 
     def __iter__(self) -> Iterable[PSM]:
         """Iterate over file and return PSMs one-by-one."""
@@ -62,17 +61,15 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM:
             run=psm_dict["run"],
             spectrum=psm_dict["frame_start"],  # TODO: needs to be checked
             is_decoy=bool(int(psm_dict["decoy"])),
-            score=psm_dict[self.score_column],
+            score=psm_dict["score"],
             qvalue=psm_dict["qval"],
-            pep=psm_dict[
-                "proba"
-            ],  # TODO: needs to be checked, assumption because if it is 1-proba than it's really bad
+            pep=psm_dict["proba"],
             precursor_mz=psm_dict["mz_observed"],
             retention_time=psm_dict["rt_observed"],
             ion_mobility=psm_dict["mobility_observed"],
             protein_list=psm_dict["proteins"].split(";"),
             rank=int(psm_dict["rank"]) + 1,  # AlphaDIA ranks are 0-based
-            source="alphadia",
+            source="AlphaDIA",
             provenance_data=({"alphadia_filename": str(self.filename)}),
             metadata={},
             rescoring_features=rescoring_features,
@@ -80,20 +77,28 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM:
 
     @staticmethod
     def _parse_peptidoform(sequence: str, mods: str, mod_sites, charge: Optional[str]) -> str:
+        """Parse a peptidoform from a AlphaDIA PSM file."""
+        # Parse modifications
         if mods:
-            mods = mods.split(";")
-            mod_sites = mod_sites.split(";")
-            for mod, site in reversed(sorted(zip(mods, mod_sites), key=lambda x: int(x[1]))):
-                if int(site) == 0:
-                    sequence = (
-                        sequence[: int(site)] + f"[{mod.split('@')[0]}]-" + sequence[int(site) :]
-                    )
+            sequence_list = [""] + list(sequence) + [""]  # N-term, sequence, C-term
+            for mod, site in zip(mods.split(";"), mod_sites.split(";")):
+                site = int(site)
+                name = mod.split("@")[0]
+                # N-terminal modification
+                if site == 0:
+                    sequence_list[0] = f"[{name}]-"
+                # C-terminal modification
+                elif site == -1:
+                    sequence_list[-1] = f"-[{name}]"
+                # Sequence modification
                 else:
-                    sequence = (
-                        sequence[: int(site)] + f"[{mod.split('@')[0]}]" + sequence[int(site) :]
-                    )
+                    sequence_list[site] = f"{sequence_list[site]}[{name}]"
+            sequence = "".join(sequence_list)
+
+        # Add charge
         if charge:
             sequence += f"/{int(float(charge))}"
+
         return sequence
 
     @classmethod
@@ -105,14 +110,3 @@ def from_dataframe(cls, dataframe) -> PSMList:
                 for entry in dataframe.to_dict(orient="records")
             ]
         )
-
-
-# TODO: check
-RESCORING_FEATURES = [
-    "rt_observed",
-    "mobility_observed",
-    "mz_observed",
-    "score",
-    "charge",
-    "delta_rt",
-]
diff --git a/psm_utils/io/diann.py b/psm_utils/io/diann.py
index 60da432..1d93183 100644
--- a/psm_utils/io/diann.py
+++ b/psm_utils/io/diann.py
@@ -1,15 +1,24 @@
 """
 Reader for PSM files from DIA-NN
 
-Reads the '.tsv' file as defined on the `DIA-NN documentation page <https://github.com/vdemichev/DiaNN>`_.
+Reads the '.tsv' file as defined on the
+`DIA-NN documentation page <https://github.com/vdemichev/DiaNN/tree/1.8.1?tab=readme-ov-file#main-output-reference>`_.
+
+Notes
+-----
+
+- DIA-NN calculates q-values at both the run and library level. The run-level q-value is used as
+  the PSM q-value.
+- DIA-NN currently does not return precursor m/z values.
+- DIA-NN currently does not support C-terminal modifications in its searches.
+
 """
 
 from __future__ import annotations
 
 import csv
-from abc import ABC
-from typing import Iterable, Optional
 import re
+from typing import Iterable, Optional
 
 from psm_utils.io._base_classes import ReaderBase
 from psm_utils.io._utils import set_csv_field_size_limit
@@ -18,11 +27,22 @@
 
 set_csv_field_size_limit()
 
+RESCORING_FEATURES = [
+    "RT",
+    "Predicted.RT",
+    "iRT",
+    "Predicted.iRT",
+    "Ms1.Profile.Corr",
+    "Ms1.Area",
+    "IM",
+    "iIM",
+    "Predicted.IM",
+    "Predicted.iIM",
+]
+
 
-class DIANNReader(ReaderBase, ABC):
-    def __init__(
-        self, filename, score_column: str = "CScore", qval_column="Q.Value", *args, **kwargs
-    ) -> None:
+class DIANNTSVReader(ReaderBase):
+    def __init__(self, filename, *args, **kwargs) -> None:
         """
         Reader for DIA-NN '.tsv' file.
 
@@ -30,15 +50,10 @@ def __init__(
         ----------
         filename : str or Path
             Path to PSM file.
-        score_column: str, optional
-            Name of the column that holds the primary PSM score. Default is
-            ``CScore``.
 
         """
         super().__init__(filename, *args, **kwargs)
         self.filename = filename
-        self.score_column = score_column
-        self.qval_column = qval_column
 
     def __iter__(self) -> Iterable[PSM]:
         """Iterate over file and return PSMs one-by-one."""
@@ -63,16 +78,15 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM:
             spectrum_id=psm_dict["MS2.Scan"],
             run=psm_dict["Run"],
             is_decoy=False,
-            qvalue=psm_dict[
-                self.qval_column
-            ],  # DIA-NN puts out q-value on both run and library level
+            qvalue=psm_dict["Q.Value"],
             pep=float(psm_dict["PEP"]),
-            score=float(psm_dict[self.score_column]),
+            score=float(psm_dict["CScore"]),
+            precursor_mz=None,  # Not returned by DIA-NN :(
             retention_time=float(psm_dict["RT"]),
             ion_mobility=float(psm_dict["IM"]),
             protein_list=psm_dict["Protein.Ids"].split(";"),
             source="diann",
-            rank=1,  # Leave out?
+            rank=None,
             provenance_data=({"diann_filename": str(self.filename)}),
             rescoring_features=rescoring_features,
             metadata={},
@@ -80,22 +94,25 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM:
 
     @staticmethod
     def _parse_peptidoform(peptide: str, charge: Optional[str]) -> str:
+        # Add charge
         if charge:
             peptide += f"/{int(float(charge))}"
+
+        # Replace parentheses with square brackets and capitalize UniMod prefix
         pattern = r"\(UniMod:(\d+)\)"
         replacement = r"[UNIMOD:\1]"
         peptide = re.sub(pattern, replacement, peptide)
-        # If [UNIMOD:n] occurs before the first amino acid, a hyphen is added before the first amino acid
+
+        # Add hyphen for N-terminal modifications
+        # If [UNIMOD:n] occurs before the first amino acid, a hyphen is added before the first
+        # amino acid
         if peptide[0] == "[":
             # Hyphen after the closing bracket
             peptide = peptide.replace("]", "]-", 1)
-        return peptide
 
-    @staticmethod
-    def _parse_precursor_mz():
-        return NotImplementedError(
-            "Method not implemented yet. DIA-NN does not yet output precursor m/z, but might in the future."
-        )
+        # C-terminal modifications are currently not supported in DIA-NN
+
+        return peptide
 
     @classmethod
     def from_dataframe(cls, dataframe) -> PSMList:
@@ -106,19 +123,3 @@ def from_dataframe(cls, dataframe) -> PSMList:
                 for entry in dataframe.to_dict(orient="records")
             ]
         )
-
-
-# TODO: Check
-RESCORING_FEATURES = [
-    "CScore",
-    "RT",
-    "Predicted.RT",
-    "iRT",
-    "Predicted.iRT",
-    "Ms1.Profile.Corr",
-    "Ms1.Area",
-    "IM",
-    "iIM",
-    "Predicted.IM",
-    "Predicted.iIM",
-]
diff --git a/psm_utils/io/fragpipe.py b/psm_utils/io/fragpipe.py
index 50d6bd4..fc07395 100644
--- a/psm_utils/io/fragpipe.py
+++ b/psm_utils/io/fragpipe.py
@@ -4,6 +4,11 @@
 Reads the Philosopher ``psm.tsv`` file as defined on the
 `Fragpipe documentation page <https://fragpipe.nesvilab.org/docs/tutorial_fragpipe_outputs.html>`_.
 
+Notes
+-----
+
+- Decoy PSMs and q-values are not returned by FragPipe.
+
 """
 
 from __future__ import annotations
@@ -20,13 +25,23 @@
 
 set_csv_field_size_limit()
 
+RESCORING_FEATURES = [
+    "Peptide Length",
+    "Retention",
+    "Observed Mass",
+    "Observed M/Z",
+    "Calculated Peptide Mass",
+    "Calculated M/Z",
+    "Delta Mass",
+    "Number of Missed Cleavages",
+]
+
 
-class FragpipeReader(ReaderBase, ABC):
+class FragPipeReader(ReaderBase, ABC):
     def __init__(
         self,
         filename,
-        score_column: str = "Hyperscore",
-        mz_column: str = "Observed M/Z",
+        use_calibrated_mz: bool = True,
         *args,
         **kwargs,
     ) -> None:
@@ -35,20 +50,18 @@ def __init__(
 
         Parameters
         ----------
-        filename : str or Path
+        filename
             Path to PSM file.
-        score_column: str, optional
-            Name of the column that holds the primary PSM score. Default is
-            ``Hyperscore``.
-        mz_column: str, optional
-            Name of the column that holds the precursor m/z. Default is
-            ``Observed M/Z``.
+        use_calibrated_mz
+            Whether to use ``Calibrated Observed M/Z`` (True) or non-calibrated ``Observed M/Z``
+            (False), by default True.
 
         """
         super().__init__(filename, *args, **kwargs)
         self.filename = filename
-        self.score_column = score_column
-        self.mz_column = mz_column
+        self.use_calibrated_mz = use_calibrated_mz
+
+        self._mz_key = "Calibrated Observed M/Z" if use_calibrated_mz else "Observed M/Z"
 
     def __iter__(self) -> Iterable[PSM]:
         """Iterate over file and return PSMs one-by-one."""
@@ -58,38 +71,27 @@ def __iter__(self) -> Iterable[PSM]:
                 yield self._get_peptide_spectrum_match(row)
 
     def _get_peptide_spectrum_match(self, psm_dict) -> PSM:
-        """Parse a single PSM from a MSFragger PSM file."""
-        rescoring_features = {}
-        for ft in RESCORING_FEATURES:
-            try:
-                rescoring_features[ft] = psm_dict[ft]
-            except KeyError:
-                continue
+        """Parse a single PSM from a FragPipe PSM file."""
+        rescoring_features = {ft: psm_dict[ft] for ft in RESCORING_FEATURES if ft in psm_dict}
 
         return PSM(
             peptidoform=self._parse_peptidoform(
                 psm_dict["Modified Peptide"], psm_dict["Peptide"], psm_dict["Charge"]
             ),
-            spectrum_id=self._parse_spectrum_id(psm_dict["Spectrum"]),  # TODO: needs to be checked
+            spectrum_id=self._parse_spectrum_id(psm_dict["Spectrum"]),
             run=self._parse_run(psm_dict["Spectrum File"]),
             is_decoy=False,
-            qvalue=None,  # Q-value is not outputted by Philosopher
-            pep=1
-            - float(
-                psm_dict["Probability"]
-            ),  # PeptideProphet Probability, not explicitely stated if this is the inverse of PEP
-            # But I'm assuming it is
-            score=psm_dict[self.score_column],
-            precursor_mz=psm_dict[
-                self.mz_column
-            ],  # Allows use of both calibrated and uncalibrated Observed M/Z?
+            # Assuming this is 1 - PEP, as described in the PeptideProphet paper
+            # (https://doi.org/10.1186/1471-2105-13-S16-S1)
+            pep=1 - float(psm_dict["Probability"]),
+            score=psm_dict["Hyperscore"],
+            precursor_mz=psm_dict[self._mz_key],
             retention_time=float(psm_dict["Retention"]),
             ion_mobility=float(psm_dict["Ion Mobility"]) if "Ion Mobility" in psm_dict else None,
             protein_list=self._parse_protein_list(
                 psm_dict["Protein"], psm_dict["Mapped Proteins"]
             ),
-            source="fragpipe",
-            rank=1,
+            source="FragPipe",
             provenance_data=({"fragpipe_filename": str(self.filename)}),
             rescoring_features=rescoring_features,
             metadata={},
@@ -97,32 +99,44 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM:
 
     @staticmethod
     def _parse_peptidoform(mod_peptide: str, peptide: str, charge: Optional[str]) -> str:
+        """Parse the peptidoform from the modified peptide, peptide, and charge columns."""
         if mod_peptide:
             peptide = mod_peptide
+            # N-terminal modification
+            if peptide.startswith("n"):
+                peptide = peptide[1:]
+                # A hyphen needs to be added after the N-terminal modification, thus after the ]
+                peptide = peptide.replace("]", "]-", 1)
+            # C-terminal modification
+            if peptide.endswith("]"):
+                if "c[" in peptide:
+                    peptide = peptide.replace("c[", "-[", 1)
         if charge:
             peptide += f"/{int(float(charge))}"
-        if peptide.startswith("n"):
-            peptide = peptide[1:]
-            # A hyphen needs to be added after the N-terminal modification, thus after the ]
-            peptide = peptide.replace("]", "]-", 1)
         return peptide
 
     @staticmethod
     def _parse_spectrum_id(spectrum: str) -> str:
-        return spectrum.split(".")[1]
+        """Extract scan number from spectrum ID: ``(file name).(scan #).(scan #).(charge).``"""
+        try:
+            return spectrum.split(".")[-2]
+        except IndexError:
+            return spectrum
 
     @staticmethod
     def _parse_protein_list(razor_protein: str, mapped_proteins) -> list[str]:
+        """Combine razor protein and mapped proteins into a single list."""
         if mapped_proteins:
             mapped_proteins_list = mapped_proteins.split(", ")
             return [razor_protein] + mapped_proteins_list
         else:
             return [razor_protein]
 
-    # Dependent on the fragpipe workflow used the run name can be different, but in most cases
-    # something like 'interact-<run_name>.pep.xml' is used
     @staticmethod
     def _parse_run(spectrum_file: str) -> str:
+        """Extract run name from spectrum file."""
+        # Depending on the FragPipe workflow used, the run name can be different. In most cases
+        # something like 'interact-<run_name>.pep.xml' is used
         if (spectrum_file.endswith(".pep.xml")) and (spectrum_file.startswith("interact-")):
             spectrum_file = spectrum_file.replace("interact-", "")
             return Path(Path(spectrum_file).stem).stem
@@ -138,17 +152,3 @@ def from_dataframe(cls, dataframe) -> PSMList:
                 for entry in dataframe.to_dict(orient="records")
             ]
         )
-
-
-# TODO: check
-RESCORING_FEATURES = [
-    "Peptide Length",
-    "Retention",
-    "Observed Mass",
-    "Observed M/Z",
-    "Calculated Peptide Mass",
-    "Calculated M/Z",
-    "Delta Mass",
-    "Hyperscore",
-    "Number of Missed Cleavages",
-]
diff --git a/tests/test_io/test_alphadia.py b/tests/test_io/test_alphadia.py
index 64e38ae..cb3e77a 100644
--- a/tests/test_io/test_alphadia.py
+++ b/tests/test_io/test_alphadia.py
@@ -18,13 +18,12 @@
     ion_mobility=0.000001,
     protein_list=["P06733"],
     rank=1,
-    source="alphadia",
+    source="AlphaDIA",
     metadata={},
     rescoring_features={
         "rt_observed": 3111.141602,
         "mobility_observed": 0.000001,
         "mz_observed": 648.794128,
-        "score": 170.287918,
         "charge": 2,
         "delta_rt": -39.528809,
     },
@@ -37,3 +36,82 @@ def test_iter(self):
             for psm in reader:
                 psm.provenance_data = {}
                 assert psm == test_psm
+
+    def test__parse_peptidoform(self):
+        test_cases = [
+            {
+                "sequence": "DNTTSGCGSDLQSATGTAR",
+                "mods": "Carbamidomethyl@C",
+                "mod_sites": "7",
+                "charge": 2,
+                "expected": "DNTTSGC[Carbamidomethyl]GSDLQSATGTAR/2",
+            },
+            {
+                "sequence": "STCTEGEIACSADGK",
+                "mods": "Carbamidomethyl@C;Carbamidomethyl@C",
+                "mod_sites": "3;10",
+                "charge": 2,
+                "expected": "STC[Carbamidomethyl]TEGEIAC[Carbamidomethyl]SADGK/2",
+            },
+            {
+                "sequence": "MLGETCADCGTILLQDK",
+                "mods": "Oxidation@M;Carbamidomethyl@C;Carbamidomethyl@C",
+                "mod_sites": "1;6;9",
+                "charge": 2,
+                "expected": "M[Oxidation]LGETC[Carbamidomethyl]ADC[Carbamidomethyl]GTILLQDK/2",
+            },
+            {
+                "sequence": "VGLIGSCTNSSYEDMSR",
+                "mods": "Oxidation@M;Carbamidomethyl@C",
+                "mod_sites": "15;7",
+                "charge": 2,
+                "expected": "VGLIGSC[Carbamidomethyl]TNSSYEDM[Oxidation]SR/2",
+            },
+            {
+                "sequence": "STATTTVTTSDQASHPTK",
+                "mods": "Acetyl@Protein_N-term",
+                "mod_sites": "0",
+                "charge": 2,
+                "expected": "[Acetyl]-STATTTVTTSDQASHPTK/2",
+            },
+            {
+                "sequence": "MEPGPDGPAASGPAAIR",
+                "mods": "Acetyl@Protein_N-term;Oxidation@M",
+                "mod_sites": "0;1",
+                "charge": 2,
+                "expected": "[Acetyl]-M[Oxidation]EPGPDGPAASGPAAIR/2",
+            },
+            {
+                "sequence": "AEPQPPSGGLTDEAALSCCSDADPSTK",
+                "mods": "Acetyl@Protein_N-term;Carbamidomethyl@C;Carbamidomethyl@C",
+                "mod_sites": "0;18;19",
+                "charge": 3,
+                "expected": "[Acetyl]-AEPQPPSGGLTDEAALSC[Carbamidomethyl]C[Carbamidomethyl]SDADPSTK/3",
+            },
+            {
+                "sequence": "EPLISAPYLTTTKMSAPATLDAACIFCK",
+                "mods": "Acetyl@Protein_N-term;Oxidation@M;Carbamidomethyl@C;Carbamidomethyl@C",
+                "mod_sites": "0;14;24;27",
+                "charge": 4,
+                "expected": "[Acetyl]-EPLISAPYLTTTKM[Oxidation]SAPATLDAAC[Carbamidomethyl]IFC[Carbamidomethyl]K/4",
+            },
+            {
+                "sequence": "GDIDANAFQHK",
+                "mods": "Amidated@Any_C-term",
+                "mod_sites": "-1",
+                "charge": 2,
+                "expected": "GDIDANAFQHK-[Amidated]/2",
+            },
+            {
+                "sequence": "MNNPAMTIKGEQAK",
+                "mods": "Acetyl@Protein_N-term;Oxidation@M;Oxidation@M;Amidated@Any_C-term",
+                "mod_sites": "0;1;6;-1",
+                "charge": 4,
+                "expected": "[Acetyl]-M[Oxidation]NNPAM[Oxidation]TIKGEQAK-[Amidated]/4",
+            },
+        ]
+
+        for test_case in test_cases:
+            assert AlphaDIAReader._parse_peptidoform(
+                test_case["sequence"], test_case["mods"], test_case["mod_sites"], test_case["charge"]
+            ) == test_case["expected"]
diff --git a/tests/test_io/test_diann.py b/tests/test_io/test_diann.py
index 2ce86ca..12d6036 100644
--- a/tests/test_io/test_diann.py
+++ b/tests/test_io/test_diann.py
@@ -1,6 +1,6 @@
 """Tests for psm_utils.io.diann."""
 
-from psm_utils.io.diann import DIANNReader
+from psm_utils.io.diann import DIANNTSVReader
 from psm_utils.psm import PSM
 
 test_psm = PSM(
@@ -17,7 +17,7 @@
     retention_time=75.2574,
     ion_mobility=0,
     protein_list=["P38156"],
-    rank=1,
+    rank=None,
     source="diann",
     metadata={},
     rescoring_features={
@@ -36,9 +36,20 @@
 )
 
 
-class TestDIANNReader:
+class TestDIANNTSVReader:
     def test_iter(self):
-        with DIANNReader("./tests/test_data/test_diann.tsv") as reader:
+        with DIANNTSVReader("./tests/test_data/test_diann.tsv") as reader:
             for psm in reader:
                 psm.provenance_data = {}
                 assert psm == test_psm
+
+    def test__parse_peptidoform(self):
+        test_cases = [
+            (("ACDE", "4"), "ACDE/4"),
+            (("AC(UniMod:1)DE", "4"), "AC[UNIMOD:1]DE/4"),
+            (("(UniMod:4)ACDE", "4"), "[UNIMOD:4]-ACDE/4"),
+        ]
+
+        reader = DIANNTSVReader("./tests/test_data/test_diann.tsv")
+        for (peptide, charge), expected in test_cases:
+            assert reader._parse_peptidoform(peptide, charge) == expected
diff --git a/tests/test_io/test_fragpipe.py b/tests/test_io/test_fragpipe.py
index 4c1b067..e92f45e 100644
--- a/tests/test_io/test_fragpipe.py
+++ b/tests/test_io/test_fragpipe.py
@@ -1,6 +1,6 @@
 """Tests for psm_utils.io.fragpipe."""
 
-from psm_utils.io.fragpipe import FragpipeReader
+from psm_utils.io.fragpipe import FragPipeReader
 from psm_utils.psm import PSM
 
 test_psm = PSM(
@@ -13,12 +13,12 @@
     score=57.2940,
     qvalue=None,
     pep=1 - 1.0000,
-    precursor_mz=1001.9342,
+    precursor_mz=1001.9336,
     retention_time=2432.1640,
     ion_mobility=None,
     protein_list=["sp|P40159|YNU8_YEAST"],
-    rank=1,
-    source="fragpipe",
+    rank=None,
+    source="FragPipe",
     metadata={},
     rescoring_features={
         "Peptide Length": 20,
@@ -36,7 +36,31 @@
 
 class TestFragpipeReader:
     def test_iter(self):
-        with FragpipeReader("./tests/test_data/test_fragpipe.tsv") as reader:
+        with FragPipeReader("./tests/test_data/test_fragpipe.tsv") as reader:
             for psm in reader:
                 psm.provenance_data = {}
                 assert psm == test_psm
+
+    def test__parse_peptidoform(self):
+        test_cases = [
+            (("LHM[147]TNQNMEKc[17]", "LHMTNQNMEK", "3"), "LHM[147]TNQNMEK-[17]/3"),
+            (("n[43]ANIAVQR", "ANIAVQR", "2"), "[43]-ANIAVQR/2"),
+            ((None, "IPAVTYPK", "2"), "IPAVTYPK/2"),
+            (("", "IPAVTYPK", "2"), "IPAVTYPK/2"),
+            (("", "IPAVTYPK", 2), "IPAVTYPK/2"),
+        ]
+
+        reader = FragPipeReader("./tests/test_data/test_fragpipe.tsv")
+        for (peptide, modified_peptide, charge), expected in test_cases:
+            assert reader._parse_peptidoform(peptide, modified_peptide, charge) == expected
+
+    def test__parse_spectrum_id(self):
+        test_cases = [
+            ("LFQ_Orbitrap_AIF_Human_01.101124.101124.0", "101124"),
+            ("LFQ.Orbitrap.AIF.Human.01.101124.101124.0", "101124"),
+            ("101124", "101124"),
+        ]
+
+        reader = FragPipeReader("./tests/test_data/test_fragpipe.tsv")
+        for spectrum, expected in test_cases:
+            assert reader._parse_spectrum_id(spectrum) == expected

From 00c2714122f9ddd8b86b98cf0bf1f3754a0c4a3b Mon Sep 17 00:00:00 2001
From: RalfG <ralfg@hotmail.be>
Date: Wed, 6 Nov 2024 21:50:17 +0100
Subject: [PATCH 12/13] Add new readers to readme

---
 README.rst | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/README.rst b/README.rst
index 257ee33..b21e247 100644
--- a/README.rst
+++ b/README.rst
@@ -89,6 +89,9 @@ Supported file formats
 ===================================================================================================================== ======================== =============== ===============
  File format                                                                                                           psm_utils tag            Read support    Write support
 ===================================================================================================================== ======================== =============== ===============
+ `AlphaDIA precursors TSV <https://alphadia.readthedocs.io/en/latest/quickstart.html#output-files>`_                   ``alphadia``             ✅              ❌
+ `DIA-NN TSV <https://github.com/vdemichev/DiaNN#output>`_                                                             ``diann``                ✅              ❌
+ `FragPipe PSM TSV <https://fragpipe.nesvilab.org/docs/tutorial_fragpipe_outputs.html#psmtsv>`_                        ``fragpipe``             ✅              ❌
  `ionbot CSV <https://ionbot.cloud/>`_                                                                                 ``ionbot``               ✅              ❌
  `OpenMS idXML <https://www.openms.de/>`_                                                                              ``idxml``                ✅              ✅
  `MaxQuant msms.txt <https://www.maxquant.org/>`_                                                                      ``msms``                 ✅              ❌
@@ -98,10 +101,10 @@ Supported file formats
  `Peptide Record <https://psm-utils.readthedocs.io/en/stable/api/psm_utils.io/#module-psm_utils.io.peptide_record>`_   ``peprec``               ✅              ✅
  `pepXML <http://tools.proteomecenter.org/wiki/index.php?title=Formats:pepXML>`_                                       ``pepxml``               ✅              ❌
  `Percolator tab <https://github.com/percolator/percolator/wiki/Interface>`_                                           ``percolator``           ✅              ✅
- Proteome Discoverer MSF                                                                                               ``proteome_discoverer``  ✅              ❌
+ `Proteome Discoverer MSF <#>`_                                                                                        ``proteome_discoverer``  ✅              ❌
  `Sage Parquet <https://github.com/lazear/sage/blob/v0.14.7/DOCS.md#interpreting-sage-output>`_                        ``sage_parquet``         ✅              ❌
  `Sage TSV <https://github.com/lazear/sage/blob/v0.14.7/DOCS.md#interpreting-sage-output>`_                            ``sage_tsv``             ✅              ❌
- ProteoScape Parquet                                                                                                   ``proteoscape``          ✅              ❌
+ `ProteoScape Parquet <#>`_                                                                                            ``proteoscape``          ✅              ❌
  `TSV <https://psm-utils.readthedocs.io/en/stable/api/psm_utils.io/#module-psm_utils.io.tsv>`_                         ``tsv``                  ✅              ✅
  `X!Tandem XML <https://www.thegpm.org/tandem/>`_                                                                      ``xtandem``              ✅              ❌
 ===================================================================================================================== ======================== =============== ===============

From 029ba11db5fc180ed5e10679a0d6903fb2b5e1a1 Mon Sep 17 00:00:00 2001
From: RalfG <ralfg@hotmail.be>
Date: Wed, 6 Nov 2024 22:00:57 +0100
Subject: [PATCH 13/13] Fix tests

---
 tests/test_io/test_diann.py    | 1 -
 tests/test_io/test_fragpipe.py | 1 -
 2 files changed, 2 deletions(-)

diff --git a/tests/test_io/test_diann.py b/tests/test_io/test_diann.py
index 12d6036..29884b8 100644
--- a/tests/test_io/test_diann.py
+++ b/tests/test_io/test_diann.py
@@ -21,7 +21,6 @@
     source="diann",
     metadata={},
     rescoring_features={
-        "CScore": 0.995107,
         "RT": 75.2574,
         "Predicted.RT": 75.2713,
         "iRT": 33.9222,
diff --git a/tests/test_io/test_fragpipe.py b/tests/test_io/test_fragpipe.py
index e92f45e..9020c41 100644
--- a/tests/test_io/test_fragpipe.py
+++ b/tests/test_io/test_fragpipe.py
@@ -28,7 +28,6 @@
         "Calculated Peptide Mass": 2001.8524,
         "Calculated M/Z": 1001.9335,
         "Delta Mass": 0.0002,
-        "Hyperscore": 57.2940,
         "Number of Missed Cleavages": 0,
     },
 )