From abd1eea41625cfc3e12deb28848f894aa63e6ded Mon Sep 17 00:00:00 2001 From: RalfG Date: Fri, 25 Aug 2023 14:40:17 +0200 Subject: [PATCH 01/13] - Fixed: io.Percolator: Allow rescoring features that are not in feature names (dictwriter extrasaction) - Changed: Make io reader read_file method inheritable. --- psm_utils/io/_base_classes.py | 5 ++--- psm_utils/io/idxml.py | 8 +------- psm_utils/io/maxquant.py | 28 ++++++------------------- psm_utils/io/msamanda.py | 13 ++---------- psm_utils/io/mzid.py | 4 ---- psm_utils/io/peptide_record.py | 28 ++++++------------------- psm_utils/io/percolator.py | 37 +++++++++------------------------- psm_utils/io/sage.py | 13 ++++-------- psm_utils/io/tsv.py | 16 +++++---------- psm_utils/io/xtandem.py | 12 +---------- 10 files changed, 37 insertions(+), 127 deletions(-) diff --git a/psm_utils/io/_base_classes.py b/psm_utils/io/_base_classes.py index 60a5a00..f4c3cfc 100644 --- a/psm_utils/io/_base_classes.py +++ b/psm_utils/io/_base_classes.py @@ -41,11 +41,9 @@ def __exit__(self, *args, **kwargs): def __iter__(self): raise NotImplementedError() - @abstractmethod def read_file(self) -> PSMList: """Read full PSM file into a PSMList object.""" - raise NotImplementedError() - + return PSMList(psm_list=[psm for psm in self.__iter__()]) class WriterBase(ABC): """Abstract base class for PSM file writers.""" @@ -63,6 +61,7 @@ def __exit__(self, *args, **kwargs): @abstractmethod def write_psm(self, psm: PSM): """Write a single PSM to the PSM file.""" + raise NotImplementedError() @abstractmethod def write_file(self, psm_list: PSMList): diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py index b02ba99..fced5ca 100644 --- a/psm_utils/io/idxml.py +++ b/psm_utils/io/idxml.py @@ -35,10 +35,6 @@ def __iter__(self): for peptide_hit in entry["PeptideHit"]: yield self._parse_psm(entry, peptide_hit) - def read_file(self) -> PSMList: - """Read full PSM file into a PSMList object.""" - return PSMList(psm_list=[psm for psm in self.__iter__()]) - @staticmethod def _parse_peptidoform(sequence: str, charge: int): """ @@ -74,9 +70,7 @@ def _parse_is_decoy(target_decoy: str): def _parse_psm(self, entry: dict, peptide_hit: dict) -> PSM: """Parse idXML PSM to :py:class:`~psm_utils.psm.PSM`.""" return PSM( - peptidoform=self._parse_peptidoform( - peptide_hit["sequence"], peptide_hit["charge"] - ), + peptidoform=self._parse_peptidoform(peptide_hit["sequence"], peptide_hit["charge"]), spectrum_id=entry["spectrum_reference"], is_decoy=self._parse_is_decoy(peptide_hit["target_decoy"]), score=peptide_hit["score"], diff --git a/psm_utils/io/maxquant.py b/psm_utils/io/maxquant.py index 89c2c9c..55814be 100644 --- a/psm_utils/io/maxquant.py +++ b/psm_utils/io/maxquant.py @@ -84,10 +84,6 @@ def __iter__(self): psm = self._get_peptide_spectrum_match(psm_dict) yield psm - def read_file(self) -> PSMList: - """Read full MaxQuant msms.txt PSM file into a PSMList object.""" - return PSMList(psm_list=[psm for psm in self.__iter__()]) - def _validate_msms(self) -> None: with open(self.filename, "r") as msms_file: msms_reader = csv.DictReader(msms_file, delimiter="\t") @@ -97,23 +93,17 @@ def _validate_msms(self) -> None: def _evaluate_columns(columns) -> bool: """Case insensitive column evaluation msms file.""" columns = list(map(lambda col: col.lower(), columns)) - column_check = [ - True if col.lower() in columns else False for col in MSMS_REQUIRED_COLUMNS - ] + column_check = [True if col.lower() in columns else False for col in MSMS_REQUIRED_COLUMNS] if not all(column_check): raise 
MSMSParsingError( f"Missing columns: {list(compress(MSMS_REQUIRED_COLUMNS, list(~np.array(column_check))))}" ) - def _get_peptide_spectrum_match( - self, psm_dict: dict[str, str | float] - ) -> PSM: + def _get_peptide_spectrum_match(self, psm_dict: dict[str, str | float]) -> PSM: """Return a PSM object from MaxQuant msms.txt PSM file.""" psm = PSM( - peptidoform=self._parse_peptidoform( - psm_dict["Modified sequence"], psm_dict["Charge"] - ), + peptidoform=self._parse_peptidoform(psm_dict["Modified sequence"], psm_dict["Charge"]), spectrum_id=psm_dict["Scan number"], run=psm_dict["Raw file"], is_decoy=psm_dict["Reverse"] == "+", @@ -148,21 +138,15 @@ def _parse_peptidoform(modified_seq: str, charge: int) -> Peptidoform: # if N-term mod if match.start() == 0: - modified_seq = re.sub( - f"\({se_mod_string}\)", f"[{match[1]}]-", modified_seq - ) + modified_seq = re.sub(f"\({se_mod_string}\)", f"[{match[1]}]-", modified_seq) # if C-term mod elif match.end() == modified_seq_len: - modified_seq = re.sub( - f"\({se_mod_string}\)", f"-[{match[1]}]", modified_seq - ) + modified_seq = re.sub(f"\({se_mod_string}\)", f"-[{match[1]}]", modified_seq) # if modification on amino acid else: - modified_seq = re.sub( - f"\({se_mod_string}\)", f"[{match[1]}]", modified_seq - ) + modified_seq = re.sub(f"\({se_mod_string}\)", f"[{match[1]}]", modified_seq) modified_seq += f"/{charge}" diff --git a/psm_utils/io/msamanda.py b/psm_utils/io/msamanda.py index 77d24bc..b79c7fd 100644 --- a/psm_utils/io/msamanda.py +++ b/psm_utils/io/msamanda.py @@ -13,7 +13,6 @@ from psm_utils.exceptions import PSMUtilsException from psm_utils.io._base_classes import ReaderBase from psm_utils.psm import PSM, Peptidoform -from psm_utils.psm_list import PSMList logger = logging.getLogger(__name__) @@ -66,10 +65,6 @@ def __iter__(self): for psm_dict in reader: yield self._get_peptide_spectrum_match(psm_dict) - def read_file(self) -> PSMList: - """Read full PSM file into a PSMList object.""" - return PSMList(psm_list=[psm for psm in self.__iter__()]) - def _evaluate_columns(self, columns) -> bool: """Column evaluation for MS Amanda file.""" # Check if required columns are present @@ -84,9 +79,7 @@ def _evaluate_columns(self, columns) -> bool: self._present_columns.append("Rank") # Get list of present rescoring features - self._rescoring_feature_columns = [ - col for col in RESCORING_FEATURES if col in columns - ] + self._rescoring_feature_columns = [col for col in RESCORING_FEATURES if col in columns] # Add remaining columns to metadata self._metadata_columns = [ @@ -116,9 +109,7 @@ def _get_peptide_spectrum_match(self, psm_dict: dict[str, str | float]) -> PSM: if col in self._rescoring_feature_columns }, metadata={ - col: str(value) - for col, value in psm_dict.items() - if col in self._metadata_columns + col: str(value) for col, value in psm_dict.items() if col in self._metadata_columns }, ) if self._has_rank_column: diff --git a/psm_utils/io/mzid.py b/psm_utils/io/mzid.py index 8c37657..0aae4ec 100644 --- a/psm_utils/io/mzid.py +++ b/psm_utils/io/mzid.py @@ -166,10 +166,6 @@ def __iter__(self): spectrum_id, spectrum_title, run, rt, entry ) - def read_file(self) -> PSMList: - """Read full mzid file to PSM list object.""" - return PSMList(psm_list=[psm for psm in self]) - @staticmethod def _get_xml_namespace(root_tag): """Get the namespace of the xml root.""" diff --git a/psm_utils/io/peptide_record.py b/psm_utils/io/peptide_record.py index c4b2a2a..2c1e6da 100644 --- a/psm_utils/io/peptide_record.py +++ b/psm_utils/io/peptide_record.py 
@@ -199,9 +199,7 @@ def __init__( # Define named tuple for single Peptide Record entries, based on # configured columns columns = self._peprec.required_columns + self._peprec.optional_columns - self.PeprecEntry = namedtuple( - "PeprecEntry", columns, defaults=[None for _ in columns] - ) + self.PeprecEntry = namedtuple("PeprecEntry", columns, defaults=[None for _ in columns]) def __iter__(self) -> Iterable[PSM]: """Iterate over file and return PSMs one-by-one.""" @@ -212,16 +210,6 @@ def __iter__(self) -> Iterable[PSM]: psm = self._entry_to_psm(entry, filename=self.filename) yield psm - def read_file(self) -> PSMList: - """Read full Peptide Record PSM file into a PSMList object.""" - psm_list = [] - with open(self.filename) as peprec_in: - reader = csv.DictReader(peprec_in, delimiter=self._peprec.separator) - for row in reader: - entry = self.PeprecEntry(**row) - psm_list.append(self._entry_to_psm(entry, filename=self.filename)) - return PSMList(psm_list=psm_list) - @staticmethod def _entry_to_psm(entry: NamedTuple, filename: Optional[str] = None) -> PSM: """Parse single Peptide Record entry to `PSM`.""" @@ -280,8 +268,7 @@ def __enter__(self) -> PeptideRecordWriter: self._open_file = open(self.filename, "wt", newline="") self._writer = csv.DictWriter( self._open_file, - fieldnames=_PeptideRecord.required_columns - + _PeptideRecord.optional_columns, + fieldnames=_PeptideRecord.required_columns + _PeptideRecord.optional_columns, extrasaction="ignore", delimiter=" ", ) @@ -352,9 +339,7 @@ def write_file(self, psm_list: PSMList): """ with open(self.filename, "wt", newline="") as f: - fieldnames = ( - _PeptideRecord.required_columns + _PeptideRecord.optional_columns - ) + fieldnames = _PeptideRecord.required_columns + _PeptideRecord.optional_columns writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter=" ") writer.writeheader() for psm in psm_list: @@ -391,9 +376,7 @@ def peprec_to_proforma( peptide = [""] + list(peptide) + [""] # Add modification labels - for position, label in zip( - modifications.split("|")[::2], modifications.split("|")[1::2] - ): + for position, label in zip(modifications.split("|")[::2], modifications.split("|")[1::2]): try: peptide[int(position)] += f"[{label}]" except ValueError as e: @@ -402,7 +385,8 @@ def peprec_to_proforma( ) from e except IndexError as e: raise InvalidPeprecModificationError( - f"PEPREC modification has invalid position {position} in peptide `{''.join(peptide)}`." + f"PEPREC modification has invalid position {position} in " + f"peptide `{''.join(peptide)}`." 
) from e # Add dashes between residues and termini, and join sequence diff --git a/psm_utils/io/percolator.py b/psm_utils/io/percolator.py index bd23af2..200f40b 100644 --- a/psm_utils/io/percolator.py +++ b/psm_utils/io/percolator.py @@ -111,13 +111,6 @@ def __iter__(self) -> Iterable[PSM]: psm = self._parse_entry(entry) yield psm - def read_file(self) -> PSMList: - """Read full PSM file into a PSMList object.""" - psm_list = [] - for psm in self.__iter__(): - psm_list.append(psm) - return PSMList(psm_list=psm_list) - @staticmethod def _read_header(filename): with open(filename, "rt") as f: @@ -184,19 +177,11 @@ def _parse_entry(self, entry): peptidoform=peptidoform, spectrum_id=entry[self.id_column], is_decoy=is_decoy, - score=float(entry[self.score_column.lower()]) - if self.score_column - else None, + score=float(entry[self.score_column.lower()]) if self.score_column else None, qvalue=entry["q-value"] if "q-value" in entry else None, - pep=entry["posterior_error_prob"] - if "posterior_error_prob" in entry - else None, - precursor_mz=float(entry[self.mz_column.lower()]) - if self.mz_column - else None, - retention_time=float(entry[self.rt_column.lower()]) - if self.rt_column - else None, + pep=entry["posterior_error_prob"] if "posterior_error_prob" in entry else None, + precursor_mz=float(entry[self.mz_column.lower()]) if self.mz_column else None, + retention_time=float(entry[self.rt_column.lower()]) if self.rt_column else None, protein_list=protein_list, source="percolator", provenance_data={"filename": str(self.filename)}, @@ -224,7 +209,7 @@ def __init__( style: str Percolator Tab style. One of {``pin``, ``pout``}. If ``pin``, the columns ``SpecId``, ``Label``, ``ScanNr``, ``ChargeN``, ``PSMScore``, ``Peptide``, and - ``Proteins`` are written alongside the requested feature names + ``Proteins`` are written alongside the requested feature names (see ``feature_names``). If ``pout``, the columns ``PSMId``, ``Label``, ``score``, ``q-value``, ``posterior_error_prob``, ``peptide``, and ``proteinIds`` are written. feature_names: list[str], optional @@ -257,9 +242,7 @@ def __init__( "proteinIds", ] else: - raise ValueError( - "Invalid Percolator Tab style. Should be one of {`pin`, `pout`}." - ) + raise ValueError("Invalid Percolator Tab style. Should be one of {`pin`, `pout`}.") self.style = style self._open_file = None self._writer = None @@ -280,9 +263,7 @@ def __enter__(self) -> PercolatorTabWriter: fieldnames = line.strip().split("\t") break else: - raise ValueError( - f"File {self.filename} is not a valid Percolator Tab file." 
- ) + raise ValueError(f"File {self.filename} is not a valid Percolator Tab file.") # Determine last scan number open_file.seek(0) last_line = None @@ -336,7 +317,9 @@ def write_file(self, psm_list: PSMList): with _PercolatorTabIO( self.filename, "wt", newline="", protein_separator=self._protein_separator ) as f: - writer = csv.DictWriter(f, fieldnames=self._columns, delimiter="\t") + writer = csv.DictWriter( + f, fieldnames=self._columns, delimiter="\t", extrasaction="ignore" + ) writer.writeheader() for i, psm in enumerate(psm_list): entry = self._psm_to_entry(psm) diff --git a/psm_utils/io/sage.py b/psm_utils/io/sage.py index a16d6f0..25abe82 100644 --- a/psm_utils/io/sage.py +++ b/psm_utils/io/sage.py @@ -48,13 +48,6 @@ def __iter__(self) -> Iterable[PSM]: psm = self._get_peptide_spectrum_match(row) yield psm - def read_file(self) -> PSMList: - """Read full PSM file into a PSMList object.""" - psm_list = [] - for psm in self.__iter__(): - psm_list.append(psm) - return PSMList(psm_list=psm_list) - def _get_peptide_spectrum_match(self, psm_dict) -> PSM: """Parse a single PSM from a sage PSM file.""" rescoring_features = {} @@ -96,8 +89,10 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM: spectrum_id=psm_dict["scannr"], run=Path(psm_dict["filename"]).stem, is_decoy=True - if psm_dict["label"] == "-1" else False - if psm_dict["label"] == "1" else None, + if psm_dict["label"] == "-1" + else False + if psm_dict["label"] == "1" + else None, qvalue=psm_dict["spectrum_q"], score=float(psm_dict[self.score_column]), precursor_mz=self._parse_precursor_mz(psm_dict["expmass"], psm_dict["charge"]), diff --git a/psm_utils/io/tsv.py b/psm_utils/io/tsv.py index ee7b92b..60f7cc1 100644 --- a/psm_utils/io/tsv.py +++ b/psm_utils/io/tsv.py @@ -68,10 +68,6 @@ def __iter__(self): for row in reader: yield PSM(**self._parse_entry(row)) - def read_file(self) -> PSMList: - """Read full PSM file into a PSMList object.""" - return PSMList(psm_list=[psm for psm in self.__iter__()]) - @staticmethod def _parse_entry(entry: dict): """Parse single TSV entry to :py:class:`~psm_utils.psm.PSM`.""" @@ -211,7 +207,9 @@ def write_file(self, psm_list: PSMList): if not self.fieldnames: raise ValueError("`example_psm` required when writing to new file.") with open(self.filename, "wt", newline="") as f: - writer = csv.DictWriter(f, fieldnames=self.fieldnames, delimiter="\t") + writer = csv.DictWriter( + f, fieldnames=self.fieldnames, delimiter="\t", extrasaction="ignore" + ) writer.writeheader() for psm in psm_list: writer.writerow(self._psm_to_entry(psm)) @@ -228,15 +226,11 @@ def _psm_to_entry(psm: PSM) -> dict: # Flatten dictionary items if entry["provenance_data"]: - entry.update( - {"provenance:" + k: v for k, v in entry["provenance_data"].items()} - ) + entry.update({"provenance:" + k: v for k, v in entry["provenance_data"].items()}) if entry["metadata"]: entry.update({"meta:" + k: v for k, v in entry["metadata"].items()}) if entry["rescoring_features"]: - entry.update( - {"rescoring:" + k: v for k, v in entry["rescoring_features"].items()} - ) + entry.update({"rescoring:" + k: v for k, v in entry["rescoring_features"].items()}) del entry["provenance_data"] del entry["metadata"] del entry["rescoring_features"] diff --git a/psm_utils/io/xtandem.py b/psm_utils/io/xtandem.py index 8bf60a9..52e8ccd 100644 --- a/psm_utils/io/xtandem.py +++ b/psm_utils/io/xtandem.py @@ -57,9 +57,7 @@ class XTandemReader(ReaderBase): - def __init__( - self, filename: str | Path, *args, decoy_prefix="DECOY_", **kwargs - ) -> None: + def 
__init__(self, filename: str | Path, *args, decoy_prefix="DECOY_", **kwargs) -> None: """ Reader for X!Tandem XML PSM files. @@ -101,14 +99,6 @@ def __iter__(self): psm = self._parse_entry(entry) yield psm - def read_file(self) -> PSMList: - """Read full PSM file into a PSMList object.""" - psm_list = [] - with tandem.read(str(self.filename)) as reader: - for entry in reader: - psm_list.append(self._parse_entry(entry)) - return PSMList(psm_list=psm_list) - def _parse_peptidoform(self, peptide_entry, charge: int) -> Peptidoform: """Parse X!Tandem XML peptide entry to :py:class:`~psm_utils.peptidoform.Peptidoform`.""" if "aa" in peptide_entry: From 82da9a2a7cc6906cabdc036bb13bf5645e0f8b59 Mon Sep 17 00:00:00 2001 From: RalfG Date: Fri, 25 Aug 2023 16:45:24 +0200 Subject: [PATCH 02/13] Added: - Support for Proteome Discoverer MSF files - Peptidoform: Added support for initialization from a pyteomics.proforma.ProForma object - PSMList: Added append and extend methods. Changed: - PSM: Values of the rescoring_features dictionary are now coerced to floats Fixed: - Fix issue where `psm_list["protein_list"]` resulted in a Numpy error due to inconsistent shape of the lists. --- .gitignore | 7 +- README.rst | 27 +- docs/source/api/psm_utils.io.rst | 22 +- psm_utils/io/__init__.py | 7 + psm_utils/io/_pd_msf_tables.py | 799 ++++++++++++++++++++++++++++ psm_utils/io/peptideshaker.py | 201 ------- psm_utils/io/proteome_discoverer.py | 304 +++++++++++ psm_utils/peptidoform.py | 54 +- psm_utils/psm.py | 2 +- psm_utils/psm_list.py | 28 +- pyproject.toml | 25 +- tests/test_io/test_msamanda.py | 1 - 12 files changed, 1203 insertions(+), 274 deletions(-) create mode 100644 psm_utils/io/_pd_msf_tables.py delete mode 100644 psm_utils/io/peptideshaker.py create mode 100644 psm_utils/io/proteome_discoverer.py diff --git a/.gitignore b/.gitignore index b6e4761..0432116 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +# Ruff +.ruff_cache/ + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] @@ -103,7 +106,8 @@ celerybeat.pid # Environments .env -.venv +.venv/ +.venv*/ env/ venv/ ENV/ @@ -127,3 +131,4 @@ dmypy.json # Pyre type checker .pyre/ +.vscode/settings.json diff --git a/README.rst b/README.rst index 86008a6..23a293d 100644 --- a/README.rst +++ b/README.rst @@ -86,19 +86,20 @@ Goals and non-goals Supported file formats ********************** -===================================================================================================================== =============== =============== =============== - File format psm_utils tag Read support Write support -===================================================================================================================== =============== =============== =============== - `OpenMS idXML `_ ``idxml`` ✅ ❌ - `MaxQuant msms.txt `_ ``msms`` ✅ ❌ - `MS Amanda CSV `_ ``msamanda`` ✅ ❌ - `mzIdentML `_ ``mzid`` ✅ ✅ - `Peptide Record `_ ``peprec`` ✅ ✅ - `Percolator tab `_ ``percolator`` ✅ ✅ - `Sage `_ ``sage`` ✅ ❌ - `TSV `_ ``tsv`` ✅ ✅ - `X!Tandem XML `_ ``xtandem`` ✅ ❌ -===================================================================================================================== =============== =============== =============== +===================================================================================================================== ======================== =============== =============== + File format psm_utils tag Read support Write support 
+===================================================================================================================== ======================== =============== =============== + `OpenMS idXML `_ ``idxml`` ✅ ❌ + `MaxQuant msms.txt `_ ``msms`` ✅ ❌ + `MS Amanda CSV `_ ``msamanda`` ✅ ❌ + `mzIdentML `_ ``mzid`` ✅ ✅ + `Peptide Record `_ ``peprec`` ✅ ✅ + `Percolator tab `_ ``percolator`` ✅ ✅ + Proteome Discoverer MSF ``proteome_discoverer`` ✅ ❌ + `Sage `_ ``sage`` ✅ ❌ + `TSV `_ ``tsv`` ✅ ✅ + `X!Tandem XML `_ ``xtandem`` ✅ ❌ +===================================================================================================================== ======================== =============== =============== Legend: ✅ Supported, ❌ Unsupported diff --git a/docs/source/api/psm_utils.io.rst b/docs/source/api/psm_utils.io.rst index 3f4956d..dcd169b 100644 --- a/docs/source/api/psm_utils.io.rst +++ b/docs/source/api/psm_utils.io.rst @@ -8,7 +8,7 @@ psm_utils.io psm_utils.io.idxml -##################### +################## .. automodule:: psm_utils.io.idxml :members: @@ -25,7 +25,7 @@ psm_utils.io.maxquant psm_utils.io.msamanda -########################## +##################### .. automodule:: psm_utils.io.msamanda :members: @@ -34,7 +34,7 @@ psm_utils.io.msamanda psm_utils.io.mzid -##################### +################# .. automodule:: psm_utils.io.mzid :members: @@ -52,7 +52,7 @@ psm_utils.io.peptide_record psm_utils.io.percolator -########################### +####################### .. automodule:: psm_utils.io.percolator :members: @@ -60,8 +60,16 @@ psm_utils.io.percolator +psm_utils.io.proteome_discoverer +################################ +.. automodule:: psm_utils.io.proteome_discoverer + :members: + :inherited-members: + + + psm_utils.io.sage -########################### +################# .. automodule:: psm_utils.io.sage :members: @@ -70,7 +78,7 @@ psm_utils.io.sage psm_utils.io.tsv -########################## +################ .. automodule:: psm_utils.io.tsv :members: @@ -79,7 +87,7 @@ psm_utils.io.tsv psm_utils.io.xtandem -########################## +#################### .. 
automodule:: psm_utils.io.xtandem :members: diff --git a/psm_utils/io/__init__.py b/psm_utils/io/__init__.py index 0d27a5c..be0b424 100644 --- a/psm_utils/io/__init__.py +++ b/psm_utils/io/__init__.py @@ -14,6 +14,7 @@ import psm_utils.io.mzid as mzid import psm_utils.io.peptide_record as peptide_record import psm_utils.io.percolator as percolator +import psm_utils.io.proteome_discoverer as proteome_discoverer import psm_utils.io.tsv as tsv import psm_utils.io.xtandem as xtandem import psm_utils.io.sage as sage @@ -53,6 +54,12 @@ "extension": ".percolator.txt", "filename_pattern": r"^.*\.(?:(?:pin)|(?:pout))$", }, + "proteome_discoverer": { + "reader": proteome_discoverer.MSFReader, + "writer": None, + "extension": ".msf", + "filename_pattern": r"^.*\.msf$", + }, "tsv": { "reader": tsv.TSVReader, "writer": tsv.TSVWriter, diff --git a/psm_utils/io/_pd_msf_tables.py b/psm_utils/io/_pd_msf_tables.py new file mode 100644 index 0000000..a125da8 --- /dev/null +++ b/psm_utils/io/_pd_msf_tables.py @@ -0,0 +1,799 @@ +"""SQLAlchemy models for Mascot MSF files.""" + +from sqlalchemy import ( + CHAR, + BigInteger, + Boolean, + Column, + DateTime, + Float, + Index, + Integer, + LargeBinary, + SmallInteger, + String, + Table, + Text, + UniqueConstraint, + text, +) +from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.sql.sqltypes import NullType + +Base = declarative_base() +metadata = Base.metadata + + +class AminoAcidModification(Base): + __tablename__ = "AminoAcidModifications" + + AminoAcidModificationID = Column(Integer, primary_key=True) + ModificationName = Column(String, nullable=False) + DeltaMass = Column(Float) + Substitution = Column(String) + LeavingGroup = Column(String) + Abbreviation = Column(String, nullable=False) + PositionType = Column(Integer, nullable=False) + IsActive = Column(Boolean) + DeltaAverageMass = Column(Float) + UnimodAccession = Column(String) + IsSubstitution = Column(Boolean, nullable=False, server_default=text("0")) + + +class AminoAcidModificationsAminoAcid(Base): + __tablename__ = "AminoAcidModificationsAminoAcids" + + AminoAcidModificationID = Column(Integer, primary_key=True, nullable=False) + AminoAcidID = Column(Integer, primary_key=True, nullable=False) + Classification = Column(Integer, nullable=False) + + +class AminoAcidModificationsAminoAcidsNL(Base): + __tablename__ = "AminoAcidModificationsAminoAcidsNL" + + AminoAcidModificationID = Column(Integer, primary_key=True, nullable=False) + AminoAcidID = Column(Integer, primary_key=True, nullable=False) + NeutralLossID = Column(Integer, primary_key=True, nullable=False) + + +class AminoAcidModificationsNeutralLoss(Base): + __tablename__ = "AminoAcidModificationsNeutralLosses" + + NeutralLossID = Column(Integer, primary_key=True) + Name = Column(String, nullable=False) + MonoisotopicMass = Column(Float, nullable=False) + AverageMass = Column(Float, nullable=False) + + +class AminoAcid(Base): + __tablename__ = "AminoAcids" + + AminoAcidID = Column(Integer, primary_key=True) + AminoAcidName = Column(String, nullable=False) + OneLetterCode = Column(CHAR) + ThreeLetterCode = Column(CHAR) + MonoisotopicMass = Column(Float, nullable=False) + AverageMass = Column(Float, nullable=False) + SumFormula = Column(String) + + +class AnnotationDataVersion(Base): + __tablename__ = "AnnotationDataVersion" + + PcDataVersion = Column(Integer, primary_key=True) + PcDataRelease = Column(BigInteger, nullable=False) + + +class AnnotationDataset(Base): + __tablename__ = "AnnotationDataset" + + DatasetId = 
Column(Integer, primary_key=True) + Name = Column(String, nullable=False) + DisplayName = Column(String, nullable=False) + Guid = Column(String, nullable=False) + Description = Column(Text) + + +class AnnotationGroup(Base): + __tablename__ = "AnnotationGroups" + + AnnotationGroupId = Column(Integer, primary_key=True, nullable=False) + Description = Column(Text) + DatasetId = Column(Integer, primary_key=True, nullable=False) + Position = Column(Integer, nullable=False) + ColorR = Column(Integer, nullable=False) + ColorG = Column(Integer, nullable=False) + ColorB = Column(Integer, nullable=False) + GroupDefinition = Column(LargeBinary) + + +class AnnotationType(Base): + __tablename__ = "AnnotationTypes" + + AnnotationTypeId = Column(Integer, primary_key=True) + Name = Column(String, nullable=False) + Description = Column(Text) + + +class Annotation(Base): + __tablename__ = "Annotations" + + AnnotationId = Column(Integer, primary_key=True) + Accession = Column(String, nullable=False) + Description = Column(Text) + type = Column(Integer) + + +class AnnotationsAnnotationGroup(Base): + __tablename__ = "AnnotationsAnnotationGroups" + + AnnotationId = Column(Integer, primary_key=True, nullable=False) + AnnotationGroupId = Column(Integer, primary_key=True, nullable=False) + + +class AnnotationsProtein(Base): + __tablename__ = "AnnotationsProtein" + + proteinID = Column(Integer, primary_key=True, nullable=False) + AnnotationId = Column(Integer, primary_key=True, nullable=False) + Evidence = Column(Integer, primary_key=True) + PositionBegin = Column(Integer, primary_key=True) + PositionEnd = Column(Integer) + ProteinAccession = Column(String, primary_key=True, nullable=False) + + +class Chromatogram(Base): + __tablename__ = "Chromatograms" + + FileID = Column(Integer, primary_key=True, nullable=False) + TraceType = Column(Integer, primary_key=True, nullable=False) + Chromatogram = Column(String, nullable=False) + + +class CustomDataField(Base): + __tablename__ = "CustomDataFields" + + FieldID = Column(Integer, primary_key=True) + Guid = Column(String, nullable=False) + DisplayName = Column(String, nullable=False) + SourceNodeNumber = Column(Integer, nullable=False) + TargetNodeNumber = Column(Integer, nullable=False) + DataType = Column(Integer, nullable=False) + DataTarget = Column(Integer, nullable=False) + Version = Column(Float, nullable=False) + AccessMode = Column(Integer, server_default=text("0")) + Visibility = Column(Integer, server_default=text("0")) + GroupVisibility = Column(Integer, server_default=text("0")) + Format = Column(String) + PlotType = Column(Integer, nullable=False) + DataPurpose = Column(String) + + +class CustomDataPeptide(Base): + __tablename__ = "CustomDataPeptides" + + FieldID = Column(Integer, primary_key=True, nullable=False) + PeptideID = Column(Integer, primary_key=True, nullable=False, index=True) + FieldValue = Column(String) + + +class CustomDataPeptidesDecoy(Base): + __tablename__ = "CustomDataPeptides_decoy" + + FieldID = Column(Integer, primary_key=True, nullable=False) + PeptideID = Column(Integer, primary_key=True, nullable=False, index=True) + FieldValue = Column(String) + + +class CustomDataProcessingNode(Base): + __tablename__ = "CustomDataProcessingNodes" + + FieldID = Column(Integer, primary_key=True, nullable=False) + ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False, index=True) + FieldValue = Column(String) + + +class CustomDataProtein(Base): + __tablename__ = "CustomDataProteins" + + FieldID = Column(Integer, primary_key=True, 
nullable=False) + ProteinID = Column(Integer, primary_key=True, nullable=False, index=True) + FieldValue = Column(String) + + +class CustomDataProteinsDecoy(Base): + __tablename__ = "CustomDataProteins_decoy" + + FieldID = Column(Integer, primary_key=True, nullable=False) + ProteinID = Column(Integer, primary_key=True, nullable=False, index=True) + FieldValue = Column(String) + + +class CustomDataSpectra(Base): + __tablename__ = "CustomDataSpectra" + + FieldID = Column(Integer, primary_key=True, nullable=False) + SpectrumID = Column(Integer, primary_key=True, nullable=False, index=True) + FieldValue = Column(String) + + +class Enzyme(Base): + __tablename__ = "Enzymes" + + EnzymeID = Column(Integer, primary_key=True) + Name = Column(String, nullable=False) + Abbreviation = Column(String, nullable=False) + Seperator = Column(String, nullable=False) + NonSeperator = Column(String, nullable=False) + Offset = Column(Integer, nullable=False) + + +class EnzymesCleavageSpecificity(Base): + __tablename__ = "EnzymesCleavageSpecificities" + + EnzymeID = Column(Integer, primary_key=True, nullable=False) + Specificity = Column(Integer, primary_key=True, nullable=False) + + +class EventAnnotation(Base): + __tablename__ = "EventAnnotations" + __table_args__ = ( + Index( + "IX_EventAnnotations_IsotopePatternID_QuanResultID", "IsotopePatternID", "QuanResultID" + ), + Index("IX_EventAnnotations_QuanResultID_QuanChannelID", "QuanResultID", "QuanChannelID"), + ) + + EventID = Column(Integer, primary_key=True) + Charge = Column(SmallInteger, nullable=False) + IsotopePatternID = Column(Integer, nullable=False) + QuanResultID = Column(Integer, nullable=False) + QuanChannelID = Column(Integer, nullable=False) + + +class EventAreaAnnotation(Base): + __tablename__ = "EventAreaAnnotations" + + EventID = Column(Integer, primary_key=True) + Charge = Column(SmallInteger, nullable=False) + IsotopePatternID = Column(Integer, nullable=False, index=True) + QuanResultID = Column(Integer, nullable=False) + + +class Event(Base): + __tablename__ = "Events" + __table_args__ = ( + Index("IX_Events_FileID_LeftRT_RightRT", "FileID", "LeftRT", "RightRT"), + Index("IX_Events_FileID_RT", "FileID", "RT"), + ) + + EventID = Column(Integer, primary_key=True) + Mass = Column(Float, nullable=False) + MassAvg = Column(Float, nullable=False) + Area = Column(Float, nullable=False) + Intensity = Column(Float, nullable=False) + PeakWidth = Column(Float, nullable=False) + RT = Column(Float, nullable=False) + LeftRT = Column(Float, nullable=False) + RightRT = Column(Float, nullable=False) + SN = Column(Float, nullable=False, server_default=text("0.0")) + FileID = Column(Integer, nullable=False) + + +class FastaFile(Base): + __tablename__ = "FastaFiles" + + FastaFileID = Column(Integer, primary_key=True) + FileName = Column(String, nullable=False) + State = Column(Integer, nullable=False) + VirtualFileName = Column(String, nullable=False) + FileSize = Column(BigInteger, nullable=False) + FileTime = Column(BigInteger, nullable=False) + NumberOfProteins = Column(BigInteger) + NumberOfAminoAcids = Column(BigInteger) + FileHashCode = Column(BigInteger) + Hidden = Column(Boolean, nullable=False) + IsSrfImport = Column(Boolean, nullable=False) + IsScheduledForDeletion = Column(Boolean, nullable=False, server_default=text("0")) + + +class FastaFilesProteinAnnotation(Base): + __tablename__ = "FastaFilesProteinAnnotations" + + FastaFileID = Column(Integer, primary_key=True, nullable=False) + ProteinAnnotationID = Column(Integer, primary_key=True, 
nullable=False, index=True) + + +class FileInfo(Base): + __tablename__ = "FileInfos" + + FileID = Column(Integer, primary_key=True) + FileName = Column(String, nullable=False) + FileTime = Column(String, nullable=False) + FileSize = Column(BigInteger, nullable=False) + PhysicalFileName = Column(String, nullable=False) + FileType = Column(SmallInteger, nullable=False) + + +class MassPeakRelation(Base): + __tablename__ = "MassPeakRelations" + + MassPeakID = Column(Integer, primary_key=True, nullable=False) + RelatedMassPeakID = Column(Integer, primary_key=True, nullable=False) + + +class MassPeak(Base): + __tablename__ = "MassPeaks" + + MassPeakID = Column(Integer, primary_key=True) + Charge = Column(SmallInteger) + Intensity = Column(Float) + Mass = Column(Float) + ScanNumbers = Column(String) + FileID = Column(Integer) + PercentIsolationInterference = Column(Float) + IonInjectTime = Column(Integer) + + +class PeptideScore(Base): + __tablename__ = "PeptideScores" + + ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + PeptideID = Column(Integer, primary_key=True, nullable=False) + ScoreID = Column(Integer, primary_key=True, nullable=False) + ProcessingNodeID = Column(Integer) + ScoreValue = Column(Float, nullable=False) + + +class PeptideScoreDecoy(Base): + __tablename__ = "PeptideScores_decoy" + + ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + PeptideID = Column(Integer, primary_key=True, nullable=False) + ScoreID = Column(Integer, primary_key=True, nullable=False) + ProcessingNodeID = Column(Integer) + ScoreValue = Column(Float, nullable=False) + + +class Peptide(Base): + __tablename__ = "Peptides" + + ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + PeptideID = Column(Integer, primary_key=True, nullable=False, index=True) + SpectrumID = Column(Integer, nullable=False, index=True) + TotalIonsCount = Column(SmallInteger, nullable=False) + MatchedIonsCount = Column(SmallInteger, nullable=False) + ConfidenceLevel = Column(SmallInteger, nullable=False) + SearchEngineRank = Column(Integer, nullable=False) + Hidden = Column(Boolean, nullable=False, server_default=text("0")) + Sequence = Column(String) + Annotation = Column(String) + UniquePeptideSequenceID = Column(Integer, nullable=False, server_default=text("1")) + MissedCleavages = Column(SmallInteger, nullable=False) + + +class PeptidesAminoAcidModification(Base): + __tablename__ = "PeptidesAminoAcidModifications" + + ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + PeptideID = Column(Integer, primary_key=True, nullable=False) + AminoAcidModificationID = Column(Integer, primary_key=True, nullable=False) + Position = Column(Integer, primary_key=True, nullable=False) + + +class PeptidesAminoAcidModificationsDecoy(Base): + __tablename__ = "PeptidesAminoAcidModifications_decoy" + + ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + PeptideID = Column(Integer, primary_key=True, nullable=False) + AminoAcidModificationID = Column(Integer, primary_key=True, nullable=False) + Position = Column(Integer, primary_key=True, nullable=False) + + +class PeptidesProtein(Base): + __tablename__ = "PeptidesProteins" + + ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + PeptideID = Column(Integer, primary_key=True, nullable=False, index=True) + ProteinID = Column(Integer, primary_key=True, nullable=False) + + +class PeptidesProteinDecoy(Base): + __tablename__ = "PeptidesProteins_decoy" + + ProcessingNodeNumber = 
Column(Integer, primary_key=True, nullable=False) + PeptideID = Column(Integer, primary_key=True, nullable=False, index=True) + ProteinID = Column(Integer, primary_key=True, nullable=False) + + +class PeptidesReferenceSpectra(Base): + __tablename__ = "PeptidesReferenceSpectra" + + PeptideID = Column(Integer, primary_key=True) + ReferenceSpectrumID = Column(Integer, nullable=False) + + +class PeptidesTerminalModification(Base): + __tablename__ = "PeptidesTerminalModifications" + + ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + PeptideID = Column(Integer, primary_key=True, nullable=False) + TerminalModificationID = Column(Integer, primary_key=True, nullable=False) + + +class PeptidesTerminalModificationDecoy(Base): + __tablename__ = "PeptidesTerminalModifications_decoy" + + ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + PeptideID = Column(Integer, primary_key=True, nullable=False) + TerminalModificationID = Column(Integer, primary_key=True, nullable=False) + + +class PeptideDecoy(Base): + __tablename__ = "Peptides_decoy" + + ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + PeptideID = Column(Integer, primary_key=True, nullable=False, index=True) + SpectrumID = Column(Integer, nullable=False, index=True) + TotalIonsCount = Column(SmallInteger, nullable=False) + MatchedIonsCount = Column(SmallInteger, nullable=False) + ConfidenceLevel = Column(SmallInteger, nullable=False) + SearchEngineRank = Column(Integer, nullable=False) + Sequence = Column(String) + Annotation = Column(String) + UniquePeptideSequenceID = Column(Integer, nullable=False, server_default=text("1")) + MissedCleavages = Column(SmallInteger, nullable=False) + + +t_PrecursorIonAreaSearchSpectra = Table( + "PrecursorIonAreaSearchSpectra", + metadata, + Column("QuanResultID", Integer, nullable=False, index=True), + Column("SearchSpectrumID", Integer), +) + + +t_PrecursorIonQuanResults = Table( + "PrecursorIonQuanResults", + metadata, + Column("QuanChannelID", Integer, nullable=False), + Column("QuanResultID", Integer, nullable=False), + Column("Mass", Float, nullable=False), + Column("Charge", Integer, nullable=False), + Column("Area", Float), + Column("RetentionTime", Float), + Index( + "IX_PrecursorIonQuanResults_QuanResultID_QuanChannelID", "QuanResultID", "QuanChannelID" + ), +) + + +t_PrecursorIonQuanResultsSearchSpectra = Table( + "PrecursorIonQuanResultsSearchSpectra", + metadata, + Column("ProcessingNodeNumber", Integer, nullable=False), + Column("QuanResultID", Integer, nullable=False, index=True), + Column("SearchSpectrumID", Integer, index=True), +) + + +t_ProcessingNodeConnectionPoints = Table( + "ProcessingNodeConnectionPoints", + metadata, + Column("ProcessingNodeID", Integer, nullable=False), + Column("Interface", String, nullable=False), + Column("ConnectionDirection", Integer, nullable=False), + Column("ConnectionMode", Integer, nullable=False), + Column("ConnectionMultiplicity", Integer, nullable=False), + Column("ConnectionRequirement", Integer, nullable=False), + Column("DataTypeSpecialization", String, nullable=False), + Column("ConnectionDisplayName", String, nullable=False), +) + + +class ProcessingNodeExtension(Base): + __tablename__ = "ProcessingNodeExtensions" + + ExtensionID = Column(Integer, primary_key=True) + ProcessingNodeNumber = Column(Integer, nullable=False) + Guid = Column(String, nullable=False) + Purpose = Column(String, nullable=False) + PurposeDetail = Column(String) + MajorVersion = Column(Integer, 
nullable=False) + MinorVersion = Column(Integer, nullable=False) + Settings = Column(Text) + + +class ProcessingNodeFilterParameter(Base): + __tablename__ = "ProcessingNodeFilterParameters" + + ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + FilterParameterName = Column(String, primary_key=True, nullable=False) + FilterModuleTypeID = Column(Integer, nullable=False) + FilterModuleNumber = Column(Integer, nullable=False) + ProcessingNodeID = Column(Integer, nullable=False) + FilterParameterValue = Column(Float, nullable=False) + + +t_ProcessingNodeInterfaces = Table( + "ProcessingNodeInterfaces", + metadata, + Column("ProcessingNodeID", Integer, nullable=False), + Column("InterfaceKind", Integer, nullable=False), + Column("InterfaceName", String, nullable=False), +) + + +class ProcessingNodeParameter(Base): + __tablename__ = "ProcessingNodeParameters" + + ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + ParameterName = Column(String, primary_key=True, nullable=False) + FriendlyName = Column(String, nullable=False) + ProcessingNodeID = Column(Integer, nullable=False) + IntendedPurpose = Column(Integer, nullable=False) + PurposeDetails = Column(String, nullable=False) + Hidden = Column(Boolean, nullable=False) + Advanced = Column(Boolean, nullable=False) + Category = Column(String, nullable=False) + Position = Column(Integer, nullable=False) + ParameterValue = Column(String, nullable=False) + ValueDisplayString = Column(String, nullable=False) + + +class ProcessingNodeScore(Base): + __tablename__ = "ProcessingNodeScores" + __table_args__ = (UniqueConstraint("ProcessingNodeID", "ScoreName"),) + + ProcessingNodeID = Column(Integer, nullable=False) + ScoreID = Column(Integer, primary_key=True) + ScoreName = Column(String, nullable=False) + FriendlyName = Column(String, nullable=False) + Description = Column(String, nullable=False) + FormatString = Column(String, nullable=False) + ScoreCategory = Column(Integer, nullable=False) + Hidden = Column(Boolean, nullable=False) + IsMainScore = Column(Boolean, nullable=False) + ScoreGUID = Column(String, nullable=False) + + +class ProcessingNode(Base): + __tablename__ = "ProcessingNodes" + + ProcessingNodeNumber = Column(Integer, primary_key=True) + ProcessingNodeID = Column(Integer, nullable=False) + ProcessingNodeParentNumber = Column(String, nullable=False) + NodeName = Column(String) + FriendlyName = Column(String, nullable=False) + MajorVersion = Column(Integer, nullable=False) + MinorVersion = Column(Integer, nullable=False) + NodeComment = Column(String) + NodeGUID = Column(String, nullable=False) + ProcessingNodeState = Column(Integer, nullable=False, server_default=text("0")) + + +class ProcessingNodesSpectra(Base): + __tablename__ = "ProcessingNodesSpectra" + + SendingProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + SpectrumID = Column(Integer, primary_key=True, nullable=False, index=True) + + +class ProteinAnnotation(Base): + __tablename__ = "ProteinAnnotations" + __table_args__ = ( + Index( + "IX_ProteinAnnotations_ProteinID_DescriptionHashCode", + "ProteinID", + "DescriptionHashCode", + ), + ) + + ProteinAnnotationID = Column(Integer, primary_key=True) + ProteinID = Column(Integer, nullable=False) + DescriptionHashCode = Column(BigInteger, nullable=False) + Description = Column(Text, nullable=False) + TaxonomyID = Column(Integer, nullable=False, index=True) + + +class ProteinIdentificationGroup(Base): + __tablename__ = "ProteinIdentificationGroups" + + 
ProteinIdentificationGroupId = Column(Integer, primary_key=True, nullable=False) + ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + + +class ProteinScore(Base): + __tablename__ = "ProteinScores" + + ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + ProteinID = Column(Integer, primary_key=True, nullable=False) + ProteinIdentificationGroupID = Column(Integer, nullable=False) + ProteinScore = Column(Float, nullable=False) + Coverage = Column(Float, nullable=False, server_default=text("0")) + + +class ProteinScoresDecoy(Base): + __tablename__ = "ProteinScores_decoy" + + ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + ProteinID = Column(Integer, primary_key=True, nullable=False) + ProteinIdentificationGroupID = Column(Integer, nullable=False) + ProteinScore = Column(Float, nullable=False) + Coverage = Column(Float, nullable=False, server_default=text("0")) + + +class Protein(Base): + __tablename__ = "Proteins" + + ProteinID = Column(Integer, primary_key=True) + Sequence = Column(Text, nullable=False) + SequenceHashCode = Column(BigInteger, nullable=False, index=True) + IsMasterProtein = Column(Boolean, nullable=False, server_default=text("0")) + + +t_ProteinsProteinGroups = Table( + "ProteinsProteinGroups", + metadata, + Column("ProteinID", Integer, nullable=False), + Column("ProteinGroupID", Integer, nullable=False), +) + + +class PtmAnnotationDatum(Base): + __tablename__ = "PtmAnnotationData" + + AnnotationType = Column(Integer, primary_key=True, nullable=False) + ProteinId = Column(Integer, primary_key=True, nullable=False) + AnnotationId = Column(Integer, primary_key=True, nullable=False) + Position = Column(Integer, primary_key=True, nullable=False) + Annotation = Column(String) + + +class ReferenceSpectra(Base): + __tablename__ = "ReferenceSpectra" + + ReferenceSpectrumId = Column(Integer, primary_key=True) + Sequence = Column(String, nullable=False) + SequenceHashCode = Column(BigInteger, nullable=False) + Spectrum = Column(String, nullable=False) + SpectrumHashCode = Column(BigInteger, nullable=False) + Comment = Column(Text) + CommentHashCode = Column(BigInteger, nullable=False) + + +class ReporterIonQuanResult(Base): + __tablename__ = "ReporterIonQuanResults" + __table_args__ = ( + Index( + "IX_ReporterIonQuanResults_ProcessingNodeNumber_SpectrumID", + "ProcessingNodeNumber", + "SpectrumID", + ), + ) + + ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + QuanChannelID = Column(Integer, primary_key=True, nullable=False) + SpectrumID = Column(Integer, primary_key=True, nullable=False) + Mass = Column(Float, nullable=False) + Height = Column(Float) + + +t_ReporterIonQuanResultsSearchSpectra = Table( + "ReporterIonQuanResultsSearchSpectra", + metadata, + Column("ProcessingNodeNumber", Integer, nullable=False), + Column("SpectrumID", Integer, nullable=False), + Column("SearchSpectrumID", Integer, index=True), +) + + +class ScanEvent(Base): + __tablename__ = "ScanEvents" + + ScanEventID = Column(Integer, primary_key=True) + MSLevel = Column(Integer, nullable=False) + Polarity = Column(Integer, nullable=False) + ScanType = Column(Integer, nullable=False) + Ionization = Column(Integer, nullable=False) + MassAnalyzer = Column(Integer, nullable=False) + ActivationType = Column(Integer, nullable=False) + + +class SchemaInfo(Base): + __tablename__ = "SchemaInfo" + + Version = Column(Integer, primary_key=True) + Kind = Column(String, nullable=False) + Date = Column(DateTime, nullable=False) + 
SoftwareVersion = Column(String, nullable=False) + Comment = Column(Text, nullable=False) + + +class Spectrum(Base): + __tablename__ = "Spectra" + + UniqueSpectrumID = Column(Integer, primary_key=True) + Spectrum = Column(String, nullable=False) + SpectrumHashCode = Column(BigInteger) + + +class SpectrumHeader(Base): + __tablename__ = "SpectrumHeaders" + + SpectrumID = Column(Integer, primary_key=True) + MassPeakID = Column(Integer) + ScanEventID = Column(Integer) + LastScan = Column(Integer) + FirstScan = Column(Integer) + RetentionTime = Column(Float) + Hidden = Column(Boolean, nullable=False, server_default=text("0")) + ScanNumbers = Column(String) + Charge = Column(SmallInteger) + Mass = Column(Float) + CreatingProcessingNodeNumber = Column(Integer, nullable=False) + UniqueSpectrumID = Column(Integer, nullable=False, server_default=text("0")) + + +class SpectrumScore(Base): + __tablename__ = "SpectrumScores" + + ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + SpectrumID = Column(Integer, primary_key=True, nullable=False) + Score = Column(Float, nullable=False) + + +t_TaxonomyNames = Table( + "TaxonomyNames", + metadata, + Column("TaxonomyID", Integer, nullable=False, index=True), + Column("Name", String), + Column("NameCategory", Integer, nullable=False), +) + + +class TaxonomyNode(Base): + __tablename__ = "TaxonomyNodes" + __table_args__ = ( + Index("IX_TaxonomyNodes_LeftNodeIndex_RightNodeIndex", "LeftNodeIndex", "RightNodeIndex"), + ) + + TaxonomyID = Column(Integer, primary_key=True, unique=True) + ParentTaxonomyID = Column(Integer, nullable=False) + TaxonomyRank = Column(Integer, nullable=False) + LeftNodeIndex = Column(Integer, nullable=False) + RightNodeIndex = Column(Integer, nullable=False) + + +t_WorkflowInfo = Table( + "WorkflowInfo", + metadata, + Column("WorkflowName", String, nullable=False), + Column("WorkflowDescription", String, nullable=False), + Column("WorkflowState", Integer, nullable=False, server_default=text("0")), + Column("WorkflowStartDate", DateTime, nullable=False), + Column("WorkflowTemplate", String, nullable=False), + Column("User", String, nullable=False), + Column("WorkflowGUID", String, nullable=False), + Column("MachineGUID", String, nullable=False), + Column("MachineName", String, nullable=False), + Column("MergeSimilarIdentificationResults", Boolean, nullable=False), + Column("IsValid", Boolean, nullable=False), + Column("Version", Integer, nullable=False), +) + + +class WorkflowMessage(Base): + __tablename__ = "WorkflowMessages" + + MessageID = Column(Integer, primary_key=True) + ProcessingNodeID = Column(Integer, nullable=False) + ProcessingNodeNumber = Column(Integer, nullable=False) + Time = Column(BigInteger, nullable=False) + MessageKind = Column(Integer, nullable=False) + Message = Column(String, nullable=False) + + +t_sqlite_sequence = Table( + "sqlite_sequence", metadata, Column("name", NullType), Column("seq", NullType) +) diff --git a/psm_utils/io/peptideshaker.py b/psm_utils/io/peptideshaker.py deleted file mode 100644 index e7a876d..0000000 --- a/psm_utils/io/peptideshaker.py +++ /dev/null @@ -1,201 +0,0 @@ -"""PeptideShaker Extended PSM Report.""" - -import logging -import os -from typing import Union - -import click -import numpy as np -import pandas as pd - -from psm_utils.io.peptide_record import PeptideRecord - -logger = logging.getLogger(__name__) - - -@pd.api.extensions.register_dataframe_accessor("ext_psm_report") -class ExtendedPsmReportAccessor: - """ - Pandas extension for PeptideShaker Extended 
PSM Reports. - - Examples - -------- - >>> import pandas as pd - >>> from ms2rescore.peptideshaker import ExtendedPsmReportAccessor - >>> psm_report = pd.DataFrame.ext_psm_report.from_tsv(kwargs["input_psm_report"]) - >>> peprec = psm_report.ext_psm_report.to_peprec() - """ - - def __init__(self, pandas_obj: pd.DataFrame) -> None: - """Pandas extension for PeptideShaker Extended PSM Reports.""" - self._obj = pandas_obj - self._validate() - - def _validate(self): - """Validate Pandas DataFrame as Extended PSM Report.""" - # TODO: Implement validation of PSM report DataFrame - self.drop_invalid_amino_acids() - - def drop_invalid_amino_acids(self, invalid_amino_acids=r"[BJOUXZ]"): - """Drop all PSMs (rows) with peptides containing invalid amino acids.""" - to_drop = self._obj[ - self._obj['Sequence'].str.contains(invalid_amino_acids, regex=True) - ].index - if len(to_drop) > 0: - logger.warning( - "Dropping %i PSMs from report due to invalid amino acids (%s)", - len(to_drop), - invalid_amino_acids - ) - self._obj = self._obj.drop(index=to_drop) - - @staticmethod - def from_tsv(path: Union[str, os.PathLike]) -> pd.DataFrame: - """Read Extended PSM Report from TSV file.""" - ext_psm_report = pd.read_csv(path, sep="\t", index_col=0) - ext_psm_report.ext_psm_report._validate() - return ext_psm_report - - @staticmethod - def from_xls(path: Union[str, os.PathLike]) -> pd.DataFrame: - """Read Extended PSM Report from XLS file.""" - ext_psm_report = pd.read_excel(path, sheet_name=0, index_col=0) - pd.ext_psm_report._validate(ext_psm_report) - return ext_psm_report - - @staticmethod - def from_file(path: Union[str, os.PathLike]) -> pd.DataFrame: - """Read Extended PSM Report from file, inferring filetype from extension.""" - ext = os.path.splitext(path)[-1].lower() - if (ext == ".tsv") or (ext == ".txt"): - return pd.DataFrame.ext_psm_report.from_tsv(path) - elif (ext == ".xls") or (ext == ".xlsx"): - return pd.DataFrame.ext_psm_report.from_xls(path) - else: - raise NotImplementedError( - f"Extended PSM Report with filetype extension {ext} is not supported." - ) - - @staticmethod - def _parse_modification(modified_seq): - """ - Parse modified sequence to peprec modification string. - - TODO: Do not hardcode modification mapping. - TODO: Refactor method (e.g. use regex for matching). - TODO: Parse C-term modifications - - """ - # Initiate variables for nterm, seq and cterm - mod_list = list() - nterm, seq, cterm = modified_seq.split("-") - - # Initiatle variable for nterm - pyro_bool = False - - # Initiate variables for seq - mod_index = 0 - mod_description = False # to check if it's an amino acid (False) or a description in < ... 
> (True) - - # Check amino terminus for modifications - if nterm == "ace": - mod_list.append("0|Acetyl") - elif nterm == "pyro": - pyro_bool = True - elif nterm != "NH2": - print("Unknown N-terminal modification: {}".format(nterm)) - - # Check internal sequence - for char in seq: - if char == "<": - mod_peprec = "{}|".format(mod_index) - mod_name = "" - mod_description = True - elif char == ">": - mod_description = False - if mod_name == 'ox': - mod_peprec += 'Oxidation' - elif mod_name == 'cmm': - mod_peprec += 'Carbamidomethyl' - elif mod_name == 'deam': - mod_peprec += 'Deamidated' - else: - logger.warning("Unknown internal modification: %s", mod_name) - mod_list.append("{}".format(mod_peprec)) # peprec format - mod_peprec = "" - - else: - if pyro_bool: - if char == 'C': - mod_name = "Pyro-carbamidomethyl" - elif char == 'Q': - mod_name = "Gln->pyro-Glu" - elif char == 'E': - mod_name = "Glu->pyro-Glu" - elif char == 'P': - mod_name = "Pro->pyro-Glu" - else: - logger.warning("Unknown N-terminal pyro modification from %s", char) - mod_list.append("1|{}".format(mod_name)) - pyro_bool = False - mod_index += 1 - mod_name = "" - else: - if mod_description: - mod_name += char - else: - mod_index += 1 - - mods_peprec = "|".join(mod_list) - if mods_peprec == "": - mods_peprec = "-" - - return mods_peprec - - def to_peprec(self): - """Convert Extended PSM Report to PEPREC.""" - column_mapping = { - "Spectrum Title": "spec_id", - "Modified Sequence": "modifications", - "Sequence": "peptide", - "Measured Charge": "charge", - "Decoy": "Label", - "RT": "observed_retention_time", - "Confidence [%]": "psm_score", - } - - # Convert DataFrame to PEPREC - df = self._obj[column_mapping.keys()].rename(columns=column_mapping) - df["charge"] = df["charge"].str.strip("+") - df["modifications"] = df["modifications"].apply(self._parse_modification) - df["Label"] = df["Label"].apply( - lambda x: 1 if x == 0 else (-1 if x == 1 else np.nan) - ) - if df["Label"].isna().any(): - raise ValueError( - "Missing target/decoy labels in PeptideShaker Extended PSM " - "Report." - ) - - peprec = PeptideRecord() - peprec.df = df - return peprec - - def get_search_engine_features(self): - """Get pandas.DataFrame with search engine features.""" - # TODO: Implement this! 
- raise NotImplementedError - - -@click.command() -@click.argument("input-psm-report") -@click.argument("output-peprec") -def main(**kwargs): - """Convert Extended PSM Report to PEPREC.""" - psm_report = pd.DataFrame.ext_psm_report.from_file(kwargs["input_psm_report"]) - peprec = psm_report.ext_psm_report.to_peprec() - peprec.to_csv(kwargs["output_peprec"]) - - -if __name__ == "__main__": - main() diff --git a/psm_utils/io/proteome_discoverer.py b/psm_utils/io/proteome_discoverer.py new file mode 100644 index 0000000..edb7a5e --- /dev/null +++ b/psm_utils/io/proteome_discoverer.py @@ -0,0 +1,304 @@ +"""Reader for Proteome Discoverer MSF PSM files.""" + +import logging +import re +from collections import defaultdict +from pathlib import Path +from typing import Dict, List, Tuple, Union + +import pyteomics.proforma as proforma +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker + +import psm_utils.io._pd_msf_tables as msf +from psm_utils import PSM, Peptidoform +from psm_utils.io._base_classes import ReaderBase + +logger = logging.getLogger(__name__) + +COMPATIBLE_VERSIONS = [79] + + +class MSFReader(ReaderBase): + """Reader for Proteome Discoverer MSF files.""" + + def __init__( + self, + filename: Union[str, Path], + *args, + **kwargs, + ) -> None: + """ + Reader for Proteome Discoverer MSF file. + + Parameters + ---------- + filename: str, pathlib.Path + Path to MSF file. + + """ + super().__init__(filename, *args, **kwargs) + + self._engine = create_engine(f"sqlite:///{self.filename.as_posix()}") + self._session = sessionmaker(bind=self._engine)() + + self._check_version() + + def __len__(self): + """Return number of PSMs in file.""" + return sum( + self._session.query(peptide).count() for peptide in [msf.Peptide, msf.PeptideDecoy] + ) + + def __iter__(self): + """Iterate over file and return PSMs one-by-one.""" + for is_decoy in [False, True]: + modifications = self._get_modifications(is_decoy) + terminal_modifications = self._get_terminal_modifications(is_decoy) + protein_entries = self._get_protein_entries(is_decoy) + main_score = self._get_main_score(is_decoy) + secondary_scores = self._get_secondary_scores(is_decoy) + + for entry in self._iter_peptides(is_decoy): + peptide_id = entry.PeptideDecoy.PeptideID if is_decoy else entry.Peptide.PeptideID + yield self._parse_entry( + entry, + modifications[peptide_id], + terminal_modifications[peptide_id], + protein_entries[peptide_id], + main_score[peptide_id], + secondary_scores[peptide_id], + is_decoy, + ) + + def _check_version(self): + """Check if MSF file version is compatible.""" + version = self._session.query(msf.SchemaInfo.Version).first()[0] + if version not in COMPATIBLE_VERSIONS: + logger.warning( + f"MSF file version {version} might not be compatible with this reader. " + f"Checked versions are: {COMPATIBLE_VERSIONS}." 
+            )
+
+    def _iter_peptides(self, is_decoy: bool):
+        """Iterate over peptides in MSF file."""
+        Peptide = msf.PeptideDecoy if is_decoy else msf.Peptide
+        for entry in (
+            self._session.query(Peptide, msf.SpectrumHeader, msf.MassPeak, msf.FileInfo)
+            .select_from(Peptide)
+            .join(msf.SpectrumHeader, Peptide.SpectrumID == msf.SpectrumHeader.SpectrumID)
+            .join(msf.MassPeak, msf.MassPeak.MassPeakID == msf.SpectrumHeader.MassPeakID)
+            .join(msf.FileInfo, msf.FileInfo.FileID == msf.MassPeak.FileID)
+        ):
+            yield entry
+
+    def _get_modifications(self, is_decoy: bool) -> Dict[int, List[Tuple[int, int]]]:
+        """Get all modifications per peptide ID."""
+        PeptidesAminoAcidModification = (
+            msf.PeptidesAminoAcidModificationsDecoy
+            if is_decoy
+            else msf.PeptidesAminoAcidModification
+        )
+        query = (
+            self._session.query(
+                PeptidesAminoAcidModification.PeptideID,
+                PeptidesAminoAcidModification.Position,
+                msf.AminoAcidModification.UnimodAccession,
+            )
+            .select_from(PeptidesAminoAcidModification)
+            .join(
+                msf.AminoAcidModification,
+                PeptidesAminoAcidModification.AminoAcidModificationID
+                == msf.AminoAcidModification.AminoAcidModificationID,
+            )
+        )
+        modifications_by_peptide = defaultdict(list)
+        for peptide_id, position, unimod_accession in query:
+            modifications_by_peptide[peptide_id].append((position, unimod_accession))
+
+        return modifications_by_peptide
+
+    def _get_terminal_modifications(self, is_decoy: bool) -> Dict[int, List[Tuple[int, int]]]:
+        """Get terminal modifications for a peptide."""
+        PeptidesTerminalModification = (
+            msf.PeptidesTerminalModificationDecoy if is_decoy else msf.PeptidesTerminalModification
+        )
+        query = (
+            self._session.query(
+                PeptidesTerminalModification.PeptideID,
+                msf.AminoAcidModification.PositionType,
+                msf.AminoAcidModification.UnimodAccession,
+            )
+            .select_from(msf.AminoAcidModification)
+            .join(
+                PeptidesTerminalModification,
+                PeptidesTerminalModification.TerminalModificationID
+                == msf.AminoAcidModification.AminoAcidModificationID,
+            )
+        )
+        terminal_modifications = defaultdict(list)
+        for peptide_id, position_type, unimod_accession in query:
+            terminal_modifications[peptide_id].append((position_type, unimod_accession))
+        return terminal_modifications
+
+    def _get_protein_entries(self, is_decoy: bool) -> Dict[int, List[str]]:
+        """Get protein descriptions for a peptide."""
+        PeptidesProtein = msf.PeptidesProteinDecoy if is_decoy else msf.PeptidesProtein
+        query = (
+            self._session.query(PeptidesProtein.PeptideID, msf.ProteinAnnotation.Description)
+            .select_from(PeptidesProtein)
+            .join(
+                msf.ProteinAnnotation,
+                PeptidesProtein.ProteinID == msf.ProteinAnnotation.ProteinID,
+            )
+        )
+        proteins = defaultdict(list)
+        for peptide_id, description in query:
+            proteins[peptide_id].append(re.sub(r"^>", "", description))
+        return proteins
+
+    def _get_main_score(self, is_decoy: bool) -> Dict[int, Tuple[float, str]]:
+        """Get main score and its name for a peptide."""
+        PeptideScore = msf.PeptideScoreDecoy if is_decoy else msf.PeptideScore
+        query = (
+            self._session.query(
+                PeptideScore.PeptideID, PeptideScore.ScoreValue, msf.ProcessingNodeScore.ScoreName
+            )
+            .select_from(PeptideScore)
+            .join(
+                msf.ProcessingNodeScore,
+                msf.ProcessingNodeScore.ScoreID == PeptideScore.ScoreID,
+            )
+            .filter(msf.ProcessingNodeScore.IsMainScore == True)  # noqa: E712
+        )
+        scores = dict()
+        for peptide_id, score_value, score_name in query:
+            scores[peptide_id] = (score_value, score_name)
+        return scores
+
+    def _get_secondary_scores(self, is_decoy: bool) -> Dict[int, Dict[str,
float]]: + """Get secondary scores and their names for a peptide.""" + PeptideScore = msf.PeptideScoreDecoy if is_decoy else msf.PeptideScore + query = ( + self._session.query( + PeptideScore.PeptideID, PeptideScore.ScoreValue, msf.ProcessingNodeScore.ScoreName + ) + .select_from(PeptideScore) + .join( + msf.ProcessingNodeScore, + msf.ProcessingNodeScore.ScoreID == PeptideScore.ScoreID, + ) + .filter(msf.ProcessingNodeScore.IsMainScore == False) # noqa: E712 + ) + scores = defaultdict(dict) + for peptide_id, score_value, score_name in query: + scores[peptide_id][score_name] = score_value + return scores + + def _compile_peptidoform( + self, + sequence: str, + charge: int, + modifications: List[Tuple[int, int]], + terminal_modifications: List[Tuple[int, int]], + ) -> Peptidoform: + """ + Compile a peptidoform from a sequence, charge, and list of (terminal) modifications. + + Parameters + ---------- + sequence + The stripped sequence of the peptidoform. + charge + Precursor charge. + modifications + List of tuples of the form (position, unimod identifier). + terminal_modifications + List of tuples of the form (position type, unimod identifier). + + Notes + ----- + The position type is either 1 (Any N-term), 2 (Any C-term), 3 (Protein N-term), or 4 + (Protein C-term). Position type 0 (Anywhere) should not be present in the + terminal_modifications list. + + """ + modifications_dict = defaultdict(list) + for position, unimod_id in modifications: + modifications_dict[position].append(proforma.process_tag_tokens(f"U:{unimod_id}")) + + n_term = [ + proforma.process_tag_tokens(f"U:{unimod_id}") + for position_type, unimod_id in terminal_modifications + if position_type in [1, 3] # Position types 'Any N-term' or 'Protein N-term' + ] + c_term = [ + proforma.process_tag_tokens(f"U:{unimod_id}") + for position_type, unimod_id in terminal_modifications + if position_type in [2, 4] # Position types 'Any C-term' or 'Protein C-term' + ] + + sequence = [(aa, modifications_dict[i] or None) for i, aa in enumerate(sequence)] + properties = { + "n_term": n_term, + "c_term": c_term, + "charge_state": proforma.ChargeState(charge), + "unlocalized_modifications": [], + "labile_modifications": [], + "fixed_modifications": [], + "intervals": [], + "isotopes": [], + "group_ids": [], + } + + return Peptidoform(proforma.ProForma(sequence, properties)) + + def _parse_entry( + self, + entry: Tuple[msf.Peptide, msf.SpectrumHeader, msf.MassPeak, msf.FileInfo], + modifications: List[Tuple[int, int]], + terminal_modifications: List[Tuple[int, int]], + protein_entries: List[str], + main_score: Tuple[float, str], + secondary_scores: Dict[str, float], + is_decoy: bool, + ) -> PSM: + """Parse an entry from the MSF file.""" + peptide = entry.PeptideDecoy if is_decoy else entry.Peptide + return PSM( + peptidoform=self._compile_peptidoform( + peptide.Sequence, + entry.SpectrumHeader.Charge, + modifications, + terminal_modifications, + ), + spectrum_id=entry.SpectrumHeader.LastScan, + run=Path(entry.FileInfo.FileName).stem, + is_decoy=is_decoy, + score=main_score[0], + qvalue=None, + pep=None, + precursor_mz=entry.MassPeak.Mass, + retention_time=entry.SpectrumHeader.RetentionTime, + ion_mobility=None, + protein_list=protein_entries, + rank=peptide.SearchEngineRank, + source="proteome_discoverer", + provenance_data={ + "scan_numbers": entry.SpectrumHeader.ScanNumbers, + }, + metadata={ + "ms1_intensity": str(entry.MassPeak.Intensity), + "ms1_percent_isolation_interference": str( + entry.MassPeak.PercentIsolationInterference + ), + 
"ms1_ion_inject_time": str(entry.MassPeak.IonInjectTime), + "main_score_name": main_score[1], + **secondary_scores, + }, + rescoring_features={ + "missed_cleavages": peptide.MissedCleavages, + "total_ions_count": peptide.TotalIonsCount, + "matched_ions_count": peptide.MatchedIonsCount, + }, + ) diff --git a/psm_utils/peptidoform.py b/psm_utils/peptidoform.py index f7de6a1..814a8df 100644 --- a/psm_utils/peptidoform.py +++ b/psm_utils/peptidoform.py @@ -12,20 +12,15 @@ class Peptidoform: Peptide sequence, modifications and charge state represented in ProForma notation. """ - def __init__(self, proforma_sequence: str) -> None: + def __init__(self, proforma_sequence: [str, proforma.ProForma]) -> None: """ Peptide sequence, modifications and charge state represented in ProForma notation. Parameters ---------- - proforma_sequence : str - Peptidoform sequence in ProForma v2 notation. - - Examples - -------- - >>> peptidoform = Peptidoform("ACDM[Oxidation]EK") - >>> peptidoform.theoretical_mass - 711.2567622919099 + proforma_sequence + Peptidoform sequence in ProForma v2 notation as :py:class:`str` or + :py:class:`pyteomics.proforma.ProForma` object. Attributes ---------- @@ -34,18 +29,30 @@ def __init__(self, proforma_sequence: str) -> None: properties : dict[str, Any] Dict with sequence-wide properties. + Examples + -------- + >>> peptidoform = Peptidoform("ACDM[Oxidation]EK") + >>> peptidoform.theoretical_mass + 711.2567622919099 + """ - try: - self.parsed_sequence, self.properties = proforma.parse(proforma_sequence) - except proforma.ProFormaError as e: - raise PeptidoformException( - f"Could not parse ProForma sequence: {proforma_sequence}" - ) from e + if isinstance(proforma_sequence, str): + try: + self.parsed_sequence, self.properties = proforma.parse(proforma_sequence) + except proforma.ProFormaError as e: + raise PeptidoformException( + f"Could not parse ProForma sequence: {proforma_sequence}" + ) from e + elif isinstance(proforma_sequence, proforma.ProForma): + self.parsed_sequence = proforma_sequence.sequence + self.properties = proforma_sequence.properties + else: + raise TypeError( + f"Expected ProForma sequence or ProForma object, got {type(proforma_sequence)}." + ) if self.properties["isotopes"]: - raise NotImplementedError( - "Peptidoforms with isotopes are currently not supported." - ) + raise NotImplementedError("Peptidoforms with isotopes are currently not supported.") def __repr__(self) -> str: return f"{self.__class__.__qualname__}('{self.proforma}')" @@ -186,8 +193,7 @@ def sequential_composition(self) -> list[mass.Composition]: position_comp += tag.composition except (AttributeError, KeyError) as e: raise ModificationException( - "Cannot resolve composition for modification " - f"{tag.value}." + "Cannot resolve composition for modification " f"{tag.value}." ) from e comp_list.append(position_comp) @@ -282,9 +288,7 @@ def sequential_theoretical_mass(self) -> float: try: position_mass = mass.std_aa_mass[aa] except (AttributeError, KeyError) as e: - raise AmbiguousResidueException( - f"Cannot resolve mass for amino acid {aa}." 
- ) from e + raise AmbiguousResidueException(f"Cannot resolve mass for amino acid {aa}.") from e # Fixed modifications if aa in fixed_rules: position_mass += fixed_rules[aa] @@ -428,9 +432,7 @@ def _rename_modification_list(mods): "fixed_modifications", ]: if self.properties[mod_type]: - self.properties[mod_type] = _rename_modification_list( - self.properties[mod_type] - ) + self.properties[mod_type] = _rename_modification_list(self.properties[mod_type]) def add_fixed_modifications( self, modification_rules: list[tuple[str, list[str]]] | dict[str, list[str]] diff --git a/psm_utils/psm.py b/psm_utils/psm.py index 8dbf5e7..affa2ab 100644 --- a/psm_utils/psm.py +++ b/psm_utils/psm.py @@ -27,7 +27,7 @@ class PSM(BaseModel): source: Optional[str] = None provenance_data: Optional[Dict[str, str]] = dict() metadata: Optional[Dict[str, str]] = dict() - rescoring_features: Optional[Dict[str, str]] = dict() + rescoring_features: Optional[Dict[str, float]] = dict() class Config: arbitrary_types_allowed = True # Allows non-pydantic class Peptidoform diff --git a/psm_utils/psm_list.py b/psm_utils/psm_list.py index 8efaf6a..ff71c32 100644 --- a/psm_utils/psm_list.py +++ b/psm_utils/psm_list.py @@ -1,7 +1,6 @@ from __future__ import annotations import re -from itertools import compress from typing import Iterable, List, Sequence import numpy as np @@ -99,7 +98,12 @@ def __getitem__(self, item) -> PSM | list[PSM]: return PSMList(psm_list=self.psm_list[item]) elif isinstance(item, str): # Return PSM property as array across full PSMList - return np.array([psm[item] for psm in self.psm_list]) + try: + # Let NumPy coerce dtype (e.g., multidimensional arrays) + return np.array([psm[item] for psm in self.psm_list]) + except ValueError: + # If dtype is not consistent, force dtype to be object + return np.array([psm[item] for psm in self.psm_list], dtype=object) elif _is_iterable_of_bools(item): # Return new PSMList with items that were True return PSMList(psm_list=[self.psm_list[i] for i in np.flatnonzero(item)]) @@ -121,7 +125,7 @@ def __setitem__(self, item, values: Sequence) -> None: @property def collections(self) -> list: """List of collections in :py:class:`PSMList`.""" - if (self["collection"] != None).any(): + if (self["collection"] != None).any(): # noqa: E711 return list(np.unique(self["collection"])) else: return [None] @@ -129,11 +133,19 @@ def collections(self) -> list: @property def runs(self) -> list: """List of runs in :py:class:`PSMList`.""" - if (self["run"] != None).any(): + if (self["run"] != None).any(): # noqa: E711 return list(np.unique(self["run"])) else: return [None] + def append(self, psm: PSM) -> None: + """Append PSM to :py:class:`PSMList`.""" + self.psm_list.append(psm) + + def extend(self, psm_list: PSMList) -> None: + """Extend :py:class:`PSMList` with another :py:class:`PSMList`.""" + self.psm_list.extend(psm_list) + def get_psm_dict(self): """Get nested dictionary of PSMs by collection, run, and spectrum_id.""" psm_dict = {} @@ -199,9 +211,7 @@ def find_decoys(self, decoy_pattern: str) -> None: """ decoy_pattern = re.compile(decoy_pattern) for psm in self: - psm.is_decoy = all( - [decoy_pattern.search(p) is not None for p in psm.protein_list] - ) + psm.is_decoy = all([decoy_pattern.search(p) is not None for p in psm.protein_list]) def calculate_qvalues(self, reverse: bool = True, **kwargs) -> None: """ @@ -289,9 +299,7 @@ def add_fixed_modifications( ] for psm in self.psm_list: if psm.peptidoform.properties["fixed_modifications"]: - 
psm.peptidoform.properties["fixed_modifications"].extend( - modification_rules - ) + psm.peptidoform.properties["fixed_modifications"].extend(modification_rules) else: psm.peptidoform.properties["fixed_modifications"] = modification_rules diff --git a/pyproject.toml b/pyproject.toml index d2005e0..f47b71e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,12 +2,12 @@ name = "psm-utils" description = "Common utilities for parsing and handling peptide-spectrum matches and search engine results." readme = "README.rst" -license = {file = "LICENSE"} +license = { file = "LICENSE" } keywords = ["proteomics", "peptide", "spectrum", "identification", "parsing"] authors = [ - {name = "Ralf Gabriels", email = "ralf@gabriels.dev"}, - {name = "Robbin Bouwmeester", email = "robbin.bouwmeester@ugent.be"}, - {name = "Arthur Declercq", email = "arthur.declercq@ugent.be"}, + { name = "Ralf Gabriels", email = "ralf@gabriels.dev" }, + { name = "Robbin Bouwmeester", email = "robbin.bouwmeester@ugent.be" }, + { name = "Arthur Declercq", email = "arthur.declercq@ugent.be" }, ] classifiers = [ "Intended Audience :: Science/Research", @@ -28,15 +28,11 @@ dependencies = [ "click", "rich", "pydantic", + "sqlalchemy", ] [project.optional-dependencies] -dev = [ - "black", - "isort>5", - "pytest", - "pytest-cov" -] +dev = ["black", "isort>5", "pytest", "pytest-cov"] docs = [ "sphinx", "numpydoc>=1,<2", @@ -47,10 +43,7 @@ docs = [ "sphinx_rtd_theme", "sphinx-autobuild", ] -online = [ - "streamlit", - "plotly", -] +online = ["streamlit", "plotly"] [project.urls] GitHub = "https://github.com/compomics/psm_utils" @@ -74,3 +67,7 @@ profile = "black" [tool.black] line-length = 99 target-version = ['py37'] + +[tool.ruff] +line-length = 99 +target-version = 'py37' diff --git a/tests/test_io/test_msamanda.py b/tests/test_io/test_msamanda.py index cecb62e..8007b02 100644 --- a/tests/test_io/test_msamanda.py +++ b/tests/test_io/test_msamanda.py @@ -1,7 +1,6 @@ import pytest import psm_utils.io.msamanda as msamanda -from psm_utils import peptidoform, psm, psm_list TEST_COL = [ "Title", From ab8390d790209e871cf4baab0a630664a9cc918d Mon Sep 17 00:00:00 2001 From: RalfG Date: Fri, 25 Aug 2023 16:53:25 +0200 Subject: [PATCH 03/13] Black formatting (line length 99) --- psm_utils/io/__init__.py | 8 ++------ psm_utils/io/_base_classes.py | 1 + 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/psm_utils/io/__init__.py b/psm_utils/io/__init__.py index be0b424..f77e6f5 100644 --- a/psm_utils/io/__init__.py +++ b/psm_utils/io/__init__.py @@ -92,9 +92,7 @@ def _infer_filetype(filename: str): """Infer filetype from filename.""" for filetype, properties in FILETYPES.items(): - if re.fullmatch( - properties["filename_pattern"], str(filename), flags=re.IGNORECASE - ): + if re.fullmatch(properties["filename_pattern"], str(filename), flags=re.IGNORECASE): return filetype else: raise PSMUtilsIOException("Could not infer filetype.") @@ -260,9 +258,7 @@ def convert( if _supports_write_psm(writer_cls): # Setup iterator, potentially with progress bar iterator = ( - track(reader, description="[green]Converting file") - if show_progressbar - else reader + track(reader, description="[green]Converting file") if show_progressbar else reader ) # Get example PSM and instantiate writer diff --git a/psm_utils/io/_base_classes.py b/psm_utils/io/_base_classes.py index f4c3cfc..60f9c19 100644 --- a/psm_utils/io/_base_classes.py +++ b/psm_utils/io/_base_classes.py @@ -45,6 +45,7 @@ def read_file(self) -> PSMList: """Read full PSM file into a PSMList 
object.""" return PSMList(psm_list=[psm for psm in self.__iter__()]) + class WriterBase(ABC): """Abstract base class for PSM file writers.""" From c20f728465975ef28b958f596e22431dff3311d8 Mon Sep 17 00:00:00 2001 From: RalfG Date: Fri, 25 Aug 2023 17:50:18 +0200 Subject: [PATCH 04/13] `io.mzid`: Allow score key not to be present in all PSMs in a single mzid file `io.mzid`: Add support for user to define custom score key `io.mzid`: Add `Proteome Discoverer Delta Score` to known scores --- psm_utils/io/mzid.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/psm_utils/io/mzid.py b/psm_utils/io/mzid.py index 0aae4ec..0a1ef20 100644 --- a/psm_utils/io/mzid.py +++ b/psm_utils/io/mzid.py @@ -57,6 +57,7 @@ "ProteinProspector:score", "ProteinScape:SequestMetaScore", "ProteomeDiscoverer:Delta Score", + "Proteome Discoverer Delta Score", "SEQUEST:xcorr", "SIM-XL score ", "SQID:score ", @@ -87,7 +88,7 @@ class MzidReader(ReaderBase): - def __init__(self, filename: str | Path, *args, **kwargs) -> None: + def __init__(self, filename: str | Path, *args, score_key: str = None, **kwargs) -> None: """ Reader for mzIdentML PSM files. @@ -95,6 +96,9 @@ def __init__(self, filename: str | Path, *args, **kwargs) -> None: ---------- filename: str, pathlib.Path Path to PSM file. + score_key: str, optional + Name of the score metric to use as PSM score. If not provided, the score metric is + inferred from the file if one of the child parameters of ``MS:1001143`` is present. Examples -------- @@ -127,8 +131,10 @@ def __init__(self, filename: str | Path, *args, **kwargs) -> None: """ super().__init__(filename, *args, **kwargs) + + self.score_key = score_key + self._non_metadata_keys = None - self._score_key = None self._rt_key = None self._spectrum_rt_key = None self._qvalue_key = None @@ -258,12 +264,17 @@ def _get_peptide_spectrum_match( else: psm_spectrum_id = spectrum_id + try: + score = sii[self.score_key] + except KeyError: + score = None + psm = PSM( peptidoform=peptidoform, spectrum_id=psm_spectrum_id, run=run, is_decoy=is_decoy, - score=sii[self._score_key], + score=score, qvalue=sii[self._qvalue_key] if self._qvalue_key else None, pep=sii[self._pep_key] if self._pep_key else None, precursor_mz=precursor_mz, @@ -288,8 +299,9 @@ def _get_non_metadata_keys(self, keys: list): "Modification", ] # Get the score key and add to default keys - self._score_key = self._infer_score_name(keys) - default_keys.append(self._score_key) + if not self.score_key: + self.score_key = self._infer_score_name(keys) + default_keys.append(self.score_key) # Get the q-value key and add to default keys self._qvalue_key = self._infer_qvalue_name(keys) From 86a0b1d3c734252280135137fdb4536b1d99da46 Mon Sep 17 00:00:00 2001 From: RalfG Date: Fri, 25 Aug 2023 18:12:11 +0200 Subject: [PATCH 05/13] Fix sage test with coerced float rescoring_features --- tests/test_data/results.sage.tsv | 2 +- tests/test_io/test_sage.py | 37 ++++++++++++++++++++++++++------ 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/tests/test_data/results.sage.tsv b/tests/test_data/results.sage.tsv index 6840337..9d6ec16 100644 --- a/tests/test_data/results.sage.tsv +++ b/tests/test_data/results.sage.tsv @@ -1,2 +1,2 @@ peptide proteins num_proteins filename scannr rank label expmass calcmass charge peptide_len missed_cleavages isotope_error precursor_ppm fragment_ppm hyperscore delta_next delta_best rt aligned_rt predicted_rt delta_rt_model matched_peaks longest_b longest_y longest_y_pct matched_intensity_pct 
scored_candidates poisson sage_discriminant_score posterior_error spectrum_q peptide_q protein_q ms1_intensity ms2_intensity -LQSRPAAPPAPGPGQLTLR sp|Q99536|VAT1_HUMAN 1 LQSRPAAPPAPGPGQLTLR.mzML controllerType=0 controllerNumber=1 scan=30069 1 1 1926.0815 1926.08 3 19 0 0.0 0.8239083 0.5347518 71.78844460255384 71.78844460255384 0.0 108.2854 NaN 0.0 NaN 22 9 12 0.6315789 50.785 1 -1.9562811911083433 1.2944585 1.0 1.0 1.0 1.0 306146180.0 56930696.0 +LQSRPAAPPAPGPGQLTLR sp|Q99536|VAT1_HUMAN 1 LQSRPAAPPAPGPGQLTLR.mzML controllerType=0 controllerNumber=1 scan=30069 1 1 1926.0815 1926.08 3 19 0 0.0 0.8239083 0.5347518 71.78844460255384 71.78844460255384 0.0 108.2854 0.0 0.0 0.0 22 9 12 0.6315789 50.785 1 -1.9562811911083433 1.2944585 1.0 1.0 1.0 1.0 306146180.0 56930696.0 diff --git a/tests/test_io/test_sage.py b/tests/test_io/test_sage.py index d7addd6..10d2bcc 100644 --- a/tests/test_io/test_sage.py +++ b/tests/test_io/test_sage.py @@ -1,12 +1,12 @@ """Tests for psm_utils.io.sage.""" -from psm_utils.psm import PSM from psm_utils.io.sage import SageReader +from psm_utils.psm import PSM test_psm = PSM( - peptidoform='LQSRPAAPPAPGPGQLTLR/3', - spectrum_id='controllerType=0 controllerNumber=1 scan=30069', - run='LQSRPAAPPAPGPGQLTLR', + peptidoform="LQSRPAAPPAPGPGQLTLR/3", + spectrum_id="controllerType=0 controllerNumber=1 scan=30069", + run="LQSRPAAPPAPGPGQLTLR", collection=None, spectrum=None, is_decoy=False, @@ -16,11 +16,34 @@ precursor_mz=643.0349916987367, retention_time=108.2854, ion_mobility=None, - protein_list=['sp|Q99536|VAT1_HUMAN'], + protein_list=["sp|Q99536|VAT1_HUMAN"], rank=1, - source='sage', + source="sage", metadata={}, - rescoring_features={'expmass': '1926.0815', 'calcmass': '1926.08', 'peptide_len': '19', 'missed_cleavages': '0', 'isotope_error': '0.0', 'precursor_ppm': '0.8239083', 'fragment_ppm': '0.5347518', 'hyperscore': '71.78844460255384', 'delta_next': '71.78844460255384', 'delta_best': '0.0', 'delta_rt_model': 'NaN', 'aligned_rt': 'NaN', 'predicted_rt': '0.0', 'matched_peaks': '22', 'longest_b': '9', 'longest_y': '12', 'longest_y_pct': '0.6315789', 'matched_intensity_pct': '50.785', 'scored_candidates': '1', 'poisson': '-1.9562811911083433', 'ms1_intensity': '306146180.0', 'ms2_intensity': '56930696.0'}, + rescoring_features={ + "expmass": 1926.0815, + "calcmass": 1926.08, + "peptide_len": 19.0, + "missed_cleavages": 0.0, + "isotope_error": 0.0, + "precursor_ppm": 0.8239083, + "fragment_ppm": 0.5347518, + "hyperscore": 71.78844460255384, + "delta_next": 71.78844460255384, + "delta_best": 0.0, + "delta_rt_model": 0.0, + "aligned_rt": 0.0, + "predicted_rt": 0.0, + "matched_peaks": 22.0, + "longest_b": 9.0, + "longest_y": 12.0, + "longest_y_pct": 0.6315789, + "matched_intensity_pct": 50.785, + "scored_candidates": 1.0, + "poisson": -1.9562811911083433, + "ms1_intensity": 306146180.0, + "ms2_intensity": 56930696.0, + }, ) From 22373f9b670858bf990b81aab0c1134cdf2b1c77 Mon Sep 17 00:00:00 2001 From: Buur Louise <77845950+louisebuur@users.noreply.github.com> Date: Tue, 12 Sep 2023 16:31:43 +0200 Subject: [PATCH 06/13] Update msamanda.py --- psm_utils/io/msamanda.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/psm_utils/io/msamanda.py b/psm_utils/io/msamanda.py index b79c7fd..3040a74 100644 --- a/psm_utils/io/msamanda.py +++ b/psm_utils/io/msamanda.py @@ -121,16 +121,15 @@ def _parse_peptidoform(seq, modifications, charge): "Parse MSAmanda sequence, modifications and charge to proforma sequence" peptide = [""] + [aa.upper() for aa in seq] + 
[""] pattern = re.compile( - r"(?P[A-Z])(?P-term|\d+)\((?P[A-Za-z]+)\|([-0-9.]+)\|(variable|fixed)\);?" + r"(?:(?:(?P[A-Z])(?P\d+))|(?P[CN]-Term))\((?P[^|()]+)\|(?P[-0-9.]+)\|(?Pvariable|fixed)\);?" ) for match in pattern.finditer(modifications): - if match.group("loc") == "-term": - if match.group("site") == "N": - peptide[0] = peptide[0] + f'[{match.group("mod_name")}]' - elif match.group("site") == "C": - peptide[-1] = peptide[-1] + f'[{match.group("mod_name")}]' - else: + if match.group("term") == "N-Term": + peptide[0] = peptide[0] + f'[{match.group("mod_name")}]' + elif match.group("term") == "C-Term": + peptide[-1] = peptide[-1] + f'[{match.group("mod_name")}]' + if match.group("loc") is not None: peptide[int(match.group("loc"))] = ( peptide[int(match.group("loc"))] + f'[{match.group("mod_name")}]' ) From dcf28dba8a405b4fb7c6c34a5061a022769025b9 Mon Sep 17 00:00:00 2001 From: louisebuur <77845950+louisebuur@users.noreply.github.com> Date: Wed, 13 Sep 2023 14:57:14 +0200 Subject: [PATCH 07/13] Update test_msamanda.py --- tests/test_io/test_msamanda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_io/test_msamanda.py b/tests/test_io/test_msamanda.py index 8007b02..48f6820 100644 --- a/tests/test_io/test_msamanda.py +++ b/tests/test_io/test_msamanda.py @@ -50,7 +50,7 @@ def test_parse_peptidoform(self): 4, ), ("TLPMFHDEEHAR", "", 3), - ("VSAGEIAVTGAGR", "C-term(Amidated|-0.984016|variable)", 2), + ("VSAGEIAVTGAGR", "C-Term(Amidated|-0.984016|variable)", 2), ("VQAELDETK", "", 2), ], "expected_output": [ From 7be325ed8736b42ff68d86049915d694f16bf19e Mon Sep 17 00:00:00 2001 From: louisebuur <77845950+louisebuur@users.noreply.github.com> Date: Wed, 13 Sep 2023 15:08:51 +0200 Subject: [PATCH 08/13] Update test_msamanda.py --- tests/test_io/test_msamanda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_io/test_msamanda.py b/tests/test_io/test_msamanda.py index 48f6820..82e94f1 100644 --- a/tests/test_io/test_msamanda.py +++ b/tests/test_io/test_msamanda.py @@ -46,7 +46,7 @@ def test_parse_peptidoform(self): ), ( "LRDTcLQK", - "N-term(Acetyl|40|variable);C5(Carbamidomethyl|57.021464|fixed)", + "N-Term(Acetyl|40|variable);C5(Carbamidomethyl|57.021464|fixed)", 4, ), ("TLPMFHDEEHAR", "", 3), From adb0f875be9b429ffc96578d9858c3abc86c50c1 Mon Sep 17 00:00:00 2001 From: RalfG Date: Wed, 13 Sep 2023 15:45:57 +0200 Subject: [PATCH 09/13] Set newer build>os configuration for readthedocs --- .readthedocs.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.readthedocs.yml b/.readthedocs.yml index 0a400bd..3f8e86a 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -1,7 +1,11 @@ version: 2 +build: + os: ubuntu-22.04 + tools: + python: "3.11" + python: - version: '3.8' install: - method: pip path: . From 357c7495336b2276bb2564119b90bd380bcfcca3 Mon Sep 17 00:00:00 2001 From: RalfG Date: Wed, 13 Sep 2023 15:47:27 +0200 Subject: [PATCH 10/13] io.mzid: Give `PeptideShaker PSM score` priority over other potential search engine scores (required for correct PeptideShaker mzid parsing) --- psm_utils/io/mzid.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/psm_utils/io/mzid.py b/psm_utils/io/mzid.py index 0a1ef20..315b153 100644 --- a/psm_utils/io/mzid.py +++ b/psm_utils/io/mzid.py @@ -29,7 +29,9 @@ # Excerpt from MS:1001143 items (PSM-level search engine specific statistic) # Not all child terms are used, as not all statistics are direct scores. 
+# Items are sorted by priority (if more scores are present, the first found one is used) STANDARD_SEARCHENGINE_SCORES = [ + "PeptideShaker PSM score", "Amanda:AmandaScore", "Andromeda:score", "Byonic:Score", @@ -49,7 +51,6 @@ "OMSSA:evalue", "OpenPepXL:score", "PEAKS:peptideScore", - "PeptideShaker PSM score", "Phenyx:Pepzscore", "ProLuCID:xcorr", "ProSight:specral C-score", From 5434309255366087bd6391abd066c187819b0540 Mon Sep 17 00:00:00 2001 From: RalfG Date: Wed, 13 Sep 2023 15:47:59 +0200 Subject: [PATCH 11/13] Upgrade Github action versions --- .github/workflows/publish.yml | 34 +++++++------- .github/workflows/test.yml | 87 +++++++++++++++++------------------ 2 files changed, 60 insertions(+), 61 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 29b3934..91ac915 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -8,24 +8,24 @@ jobs: publish: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.8' + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.8" - - name: Install dependencies - run: | - python -m pip install --upgrade pip flit - pip install --editable .[dev] + - name: Install dependencies + run: | + python -m pip install --upgrade pip flit + pip install --editable .[dev] - - name: Test package - run: | - pytest + - name: Test package + run: | + pytest - - name: Build and publish to PyPI - env: - FLIT_USERNAME: ${{ secrets.PYPI_USERNAME }} - FLIT_PASSWORD: ${{ secrets.PYPI_PASSWORD }} - run: flit publish + - name: Build and publish to PyPI + env: + FLIT_USERNAME: ${{ secrets.PYPI_USERNAME }} + FLIT_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + run: flit publish diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 766f11e..bea1eee 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -10,64 +10,63 @@ jobs: test-with-codecov: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - - name: Set up Python 3.8 - uses: actions/setup-python@v4 - with: - python-version: '3.8' + - name: Set up Python 3.8 + uses: actions/setup-python@v4 + with: + python-version: "3.8" - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install flit flake8 + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install flit flake8 - - name: Lint with flake8 - run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - - name: Install with Flit - run: flit install --deps all + - name: Install with Flit + run: flit install --deps all - - name: Test with pytest and codecov - run: | - pytest --cov=psm_utils tests - - - name: Upload coverage reports to Codecov - uses: codecov/codecov-action@v3 + - name: Test with pytest and codecov + run: | + pytest --cov=psm_utils tests + - name: Upload coverage reports to Codecov + uses: codecov/codecov-action@v3 test-platforms: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.8', '3.9', '3.10', '3.11'] + python-version: ["3.8", "3.9", "3.10", "3.11"] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} - - name: Install package and its dependencies - run: | - python -m pip install --upgrade pip - pip install flit flake8 + - name: Install package and its dependencies + run: | + python -m pip install --upgrade pip + pip install flit flake8 - - name: Lint with flake8 - run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - - name: Install with Flit - run: flit install --deps all + - name: Install with Flit + run: flit install --deps all - - name: Test with pytest - run: | - pytest + - name: Test with pytest + run: | + pytest From 7fc6cd775b93835a936f5c82ec55af2706162c03 Mon Sep 17 00:00:00 2001 From: RalfG Date: Wed, 13 Sep 2023 15:58:38 +0200 Subject: [PATCH 12/13] online: Remove useless `== True` --- online/pages/1_PSM_file_statistics.py | 31 ++++++++------------------- 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/online/pages/1_PSM_file_statistics.py b/online/pages/1_PSM_file_statistics.py index f143853..9daf6f7 100644 --- a/online/pages/1_PSM_file_statistics.py +++ b/online/pages/1_PSM_file_statistics.py @@ -92,14 +92,14 @@ def _input_form(self): "Score type: order", options=[True, False], format_func=lambda x: "Higher score is better" - if x == True + if x else "Lower score is better", ) self.state["log_scale"] = row[1].radio( "Score type: scale", options=[False, True], format_func=lambda x: "Logarithmic scale (e.g., e-value)" - if x == True + if x else "Linear scale (e.g., Andromeda score)", help=( """ @@ -142,9 +142,7 @@ def _read_file(self): # Write file to disk for psm_utils; then read with NamedTemporaryFile(mode="wb", delete=False) as tmp_file: if self.state["input_file"].name.lower().endswith(".gz"): - tmp_file.write( - gzip.decompress(self.state["input_file"].getvalue()) - ) + tmp_file.write(gzip.decompress(self.state["input_file"].getvalue())) else: tmp_file.write(self.state["input_file"].getvalue()) tmp_file.flush() @@ -179,7 +177,7 @@ def _prepare_psms(self): ) # If no q-values, try to calculate - if (psm_list["qvalue"] == None).any(): + if (psm_list["qvalue"] == None).any(): # noqa: E711 # If no decoys, display error if percent_decoys == 0.0: st.error( @@ -231,13 +229,9 @@ def _show_results(self): n_collections = psm_df["collection"].unique().shape[0] n_runs = psm_df[["run", "collection"]].drop_duplicates().shape[0] - n_spectra = ( - psm_df[["spectrum_id", "run", "collection"]].drop_duplicates().shape[0] - ) + n_spectra = psm_df[["spectrum_id", "run", "collection"]].drop_duplicates().shape[0] n_psms = psm_df.shape[0] - n_peptidoforms = ( - psm_df["peptidoform"].apply(lambda x: x.proforma).unique().shape[0] - ) + n_peptidoforms = psm_df["peptidoform"].apply(lambda x: x.proforma).unique().shape[0] percent_decoys = np.count_nonzero(psm_list["is_decoy"]) / len(psm_list) row_1 = st.columns(3) @@ -262,19 +256,12 @@ def _show_results(self): else: psm_df_filtered = psm_df[psm_df["qvalue"] <= self.state["fdr_threshold"]] n_spectra = ( - psm_df_filtered[["spectrum_id", "run", "collection"]] - .drop_duplicates() - .shape[0] + psm_df_filtered[["spectrum_id", "run", "collection"]].drop_duplicates().shape[0] ) n_psms = psm_df_filtered.shape[0] - n_peptides = ( - psm_df["peptidoform"].apply(lambda x: x.sequence).unique().shape[0] - ) + n_peptides = psm_df["peptidoform"].apply(lambda x: x.sequence).unique().shape[0] n_peptidoforms = ( - psm_df_filtered["peptidoform"] - .apply(lambda x: x.proforma) - .unique() - .shape[0] + psm_df_filtered["peptidoform"].apply(lambda x: x.proforma).unique().shape[0] ) row_3 = st.columns(4) From 600697b01b53d4b478389dc6bc0c0be54688718e Mon Sep 17 00:00:00 2001 From: RalfG Date: Fri, 15 Sep 2023 11:51:33 +0200 Subject: [PATCH 13/13] io.tsv: Raise more descriptive error if TSV row cannot be parsed into PSM --- psm_utils/io/tsv.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 
deletions(-) diff --git a/psm_utils/io/tsv.py b/psm_utils/io/tsv.py index 60f7cc1..98751f8 100644 --- a/psm_utils/io/tsv.py +++ b/psm_utils/io/tsv.py @@ -52,6 +52,8 @@ from pathlib import Path from typing import Optional +from pydantic import ValidationError + from psm_utils.io._base_classes import ReaderBase, WriterBase from psm_utils.io.exceptions import PSMUtilsIOException from psm_utils.psm import PSM @@ -66,10 +68,13 @@ def __iter__(self): with open(self.filename, "rt") as open_file: reader = csv.DictReader(open_file, delimiter="\t") for row in reader: - yield PSM(**self._parse_entry(row)) + try: + yield PSM(**self._parse_entry(row)) + except ValidationError as e: + raise PSMUtilsIOException(f"Could not parse PSM from row: `{row}`") from e @staticmethod - def _parse_entry(entry: dict): + def _parse_entry(entry: dict) -> dict: """Parse single TSV entry to :py:class:`~psm_utils.psm.PSM`.""" # Replace empty strings with None entry = {k: v if v else None for k, v in entry.items()}
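
A minimal usage sketch of the reader-facing changes in this series. File paths and the chosen score
name are illustrative only; `MzidReader` and its new `score_key` argument come from the mzid patches
above, `PSMUtilsIOException` is imported as shown in the tsv.py diff, and `TSVReader` is assumed to
be the reader class defined in `psm_utils/io/tsv.py`.

    from psm_utils.io.exceptions import PSMUtilsIOException
    from psm_utils.io.mzid import MzidReader
    from psm_utils.io.tsv import TSVReader

    # Override score inference with an explicit score key; "PeptideShaker PSM score"
    # is one of the known scores and now takes priority during inference as well.
    psm_list = MzidReader("peptide_shaker.mzid", score_key="PeptideShaker PSM score").read_file()

    # TSV rows that cannot be validated into a PSM now surface as a descriptive
    # PSMUtilsIOException instead of a bare pydantic ValidationError.
    try:
        TSVReader("results.tsv").read_file()
    except PSMUtilsIOException as exc:
        print(exc)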