Merge pull request #94 from compomics/fix/percolator-scannr

Percolator fixes: ScanNr writing and inferring style from file extension
compomics · Aug 28, 2024 · 5e42ace · 5e42ace
2 parents 50f9a9c + 711fd10
commit 5e42ace
Show file tree

Hide file tree

Showing 4 changed files with 194 additions and 41 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.0.1] - 2024-08-28
+
+### Fixed
+
+- `io.percolator`: Fix and improve ScanNr inference and writing
+- `io.percolator`: Infer style from file extension if not provided (enables dynamic style determination in, for instance, the `convert` function).
+
 ## [1.0.0] - 2024-08-14
 
 ### Added

diff --git a/psm_utils/__init__.py b/psm_utils/__init__.py
@@ -1,6 +1,6 @@
 """Common utilities for parsing and handling PSMs, and search engine results."""
 
-__version__ = "1.0.0"
+__version__ = "1.0.1"
 __all__ = ["Peptidoform", "PSM", "PSMList"]
 
 from warnings import filterwarnings

diff --git a/psm_utils/io/percolator.py b/psm_utils/io/percolator.py
@@ -16,17 +16,19 @@
 from __future__ import annotations
 
 import csv
+import logging
 import re
 from pathlib import Path
-from typing import Iterable, Optional
+from typing import Iterable, List, Optional, Tuple, Union
 
 from psm_utils.io._base_classes import ReaderBase, WriterBase
+from psm_utils.io._utils import set_csv_field_size_limit
 from psm_utils.io.exceptions import PSMUtilsIOException
 from psm_utils.peptidoform import Peptidoform
 from psm_utils.psm import PSM
 from psm_utils.psm_list import PSMList
-from psm_utils.io._utils import set_csv_field_size_limit
 
+LOGGER = logging.getLogger(__name__)
 set_csv_field_size_limit()
 
 
@@ -96,7 +98,7 @@ def __init__(
  # Validate column names from parameters
  for col in [self.score_column, self.rt_column, self.mz_column]:
  if col and col.lower() not in self.fieldnames:
- raise ValueError(
+ raise PercolatorIOException(
  f"Column `{col}` not found in header of Percolator Tab file "
  f"`{self.filename}`."
  )
@@ -197,7 +199,7 @@ class PercolatorTabWriter(WriterBase):
  def __init__(
  self,
  filename: str | Path,
- style: str = "pin",
+ style: Optional[str] = None,
  feature_names: Optional[list[str]] = None,
  add_basic_features: bool = False,
  *args,
@@ -210,12 +212,13 @@ def __init__(
  ----------
  filename: str, pathlib.Path
  Path to PSM file.
- style: str
+ style: str, optional
  Percolator Tab style. One of {``pin``, ``pout``}. If ``pin``, the columns
  ``SpecId``, ``Label``, ``ScanNr``, ``ChargeN``, ``PSMScore``, ``Peptide``, and
  ``Proteins`` are written alongside the requested feature names
  (see ``feature_names``). If ``pout``, the columns ``PSMId``, ``Label``, ``score``,
  ``q-value``, ``posterior_error_prob``, ``peptide``, and ``proteinIds`` are written.
+ By default, the style is inferred from the file name extension.
  feature_names: list[str], optional
  List of feature names to extract from PSMs and write to file. List values
  should correspond to keys in the
@@ -232,15 +235,27 @@ def __init__(
  self.feature_names = list(feature_names) if feature_names else []
  self.add_basic_features = add_basic_features
 
- if style == "pin":
+ if not style:
+ suffix = self.filename.suffix.lower()
+ if suffix == ".pin":
+ self.style = "pin"
+ elif suffix == ".pout":
+ self.style = "pout"
+ else:
+ raise PercolatorIOException(
+ f"Could not infer Percolator Tab style from file extension `{suffix}`. "
+ "Please provide the `style` parameter."
+ )
+
+ if self.style == "pin":
  basic_features = ["PSMScore", "ChargeN"] if add_basic_features else []
  self._columns = (
  ["SpecId", "Label", "ScanNr"]
  + basic_features
  + self.feature_names
  + ["Peptide", "Proteins"]
  )
- elif style == "pout":
+ elif self.style == "pout":
  self._columns = [
  "PSMId",
  "Label",
@@ -252,11 +267,11 @@ def __init__(
  ]
  else:
  raise ValueError("Invalid Percolator Tab style. Should be one of {`pin`, `pout`}.")
- self.style = style
+
  self._open_file = None
  self._writer = None
  self._protein_separator = "|||"
- self._last_scan_number = None
+ self._current_scannr = 0
 
  def __enter__(self) -> PercolatorTabWriter:
  """Either open existing file in append mode or new file in write mode."""
@@ -266,28 +281,10 @@ def __enter__(self) -> PercolatorTabWriter:
  self.filename, mode, newline="", protein_separator=self._protein_separator
  )
  if file_existed:
- with open(self.filename, "rt") as open_file:
- # Read header
- for line in open_file:
- fieldnames = line.strip().split("\t")
- break
- else:
- raise ValueError(f"File {self.filename} is not a valid Percolator Tab file.")
- # Determine last scan number
- open_file.seek(0)
- last_line = None
- for line in open_file:
- if line.strip():
- last_line = line
- if last_line:
- last_line = {k: v for k, v in zip(fieldnames, last_line.strip().split("\t"))}
- try:
- self._last_scan_number = int(last_line["ScanNr"])
- except ValueError:
- self._last_scan_number = None
+ fieldnames, self._current_scannr = self._parse_existing_file(self.filename)
  else:
  fieldnames = self._columns
- self._last_scan_number = -1
+ self._current_scannr = -1
  self._writer = csv.DictWriter(
  self._open_file,
  fieldnames=fieldnames,
@@ -302,15 +299,13 @@ def __exit__(self, *args, **kwargs) -> None:
  self._open_file.close()
  self._open_file = None
  self._writer = None
- self._last_scan_number = None
+ self._current_scannr = None
 
  def write_psm(self, psm: PSM):
  """Write a single PSM to the PSM file."""
  entry = self._psm_to_entry(psm)
- if self._last_scan_number is not None:
- entry["ScanNr"] = self._last_scan_number + 1
- else:
- entry["ScanNr"] = None
+ self._current_scannr += 1
+ entry["ScanNr"] = self._current_scannr
  try:
  self._writer.writerow(entry)
  except AttributeError as e:
@@ -319,7 +314,7 @@ def write_psm(self, psm: PSM):
  "is opened in context (i.e., using the `with` statement)."
  ) from e
  else:
- self._last_scan_number = entry["ScanNr"]
+ self._current_scannr = entry["ScanNr"]
 
  def write_file(self, psm_list: PSMList):
  """Write an entire PSMList to the PSM file."""
@@ -330,10 +325,8 @@ def write_file(self, psm_list: PSMList):
  f, fieldnames=self._columns, delimiter="\t", extrasaction="ignore"
  )
  writer.writeheader()
- for i, psm in enumerate(psm_list):
- entry = self._psm_to_entry(psm)
- entry["ScanNr"] = i
- writer.writerow(entry)
+ for psm in psm_list:
+ writer.writerow(self._psm_to_entry(psm))
 
  def _psm_to_entry(self, psm: PSM):
  """Parse PSM to Percolator Tab entry."""
@@ -366,6 +359,50 @@ def _psm_to_entry(self, psm: PSM):
  }
  return entry
 
+    @staticmethod
+    def _parse_existing_file(
+        filename: Union[str, Path], style: Optional[str] = None
+    ) -> Tuple[List[str], Optional[int]]:
+        """
+        Parse existing Percolator Tab file to determine fieldnames and last ScanNr.
+
+        Parameters
+        ----------
+        filename: str, pathlib.Path
+            Path to the existing Percolator Tab file.
+        style: str, optional
+            Percolator Tab style to validate the header against. One of {``pin``, ``pout``}.
+            If not provided, the header is not validated.
+
+        Returns
+        -------
+        fieldnames: list[str]
+            Column names parsed from the first line of the file.
+        last_scannr: int
+            ``ScanNr`` value of the last non-empty data line, or ``-1`` if the file has no
+            ``ScanNr`` column, has no data lines, or holds a non-integer ``ScanNr`` value.
+
+        Raises
+        ------
+        PercolatorIOException
+            If the file is empty, or if the header is not valid for the given ``style``.
+
+        """
+        # Single pass over the file: the first line is the header, the last non-blank line
+        # after it is the last data row.
+        fieldnames = None
+        last_row = None
+        with open(filename, "rt") as open_file:
+            for line in open_file:
+                if fieldnames is None:
+                    fieldnames = line.strip().split("\t")
+                elif line.strip():
+                    last_row = line
+
+        if fieldnames is None:
+            raise PercolatorIOException(
+                f"Existing file {filename} is not a valid Percolator Tab file."
+            )
+        if style is not None and not _fieldnames_are_valid(fieldnames, style):
+            raise PercolatorIOException(
+                f"Existing file {filename} is not a valid Percolator Tab file of style {style}."
+            )
+
+        # Without a ScanNr column or without data rows there is no number to continue from.
+        # Both situations are expected, so numbering restarts at 0 without a warning.
+        if last_row is None or "ScanNr" not in fieldnames:
+            return fieldnames, -1
+
+        # Strip the line ending only: stripping all whitespace would drop a leading empty
+        # field and shift every value one column to the left.
+        last_row_items = dict(zip(fieldnames, last_row.rstrip("\r\n").split("\t")))
+        try:
+            return fieldnames, int(last_row_items["ScanNr"])
+        except (KeyError, ValueError):
+            # Row is too short to reach the ScanNr column, or the value is not an integer.
+            LOGGER.warning(
+                "Could not determine last ScanNr from file %s. Starting ScanNr from 0.",
+                filename,
+            )
+            return fieldnames, -1
+
 
 class _PercolatorTabIO:
  def __init__(self, *args, protein_separator="|||", **kwargs) -> None:
@@ -405,6 +442,17 @@ def write(self, __s: str):
  self._open_file.write(__s)
 
 
+def _fieldnames_are_valid(fieldnames: List[str], style: str) -> bool:
+    """
+    Tell whether a header holds every column required by a Percolator Tab style.
+
+    For ``pin``, the columns ``SpecId``, ``Label``, and ``ScanNr`` are required. For
+    ``pout``, the columns ``PSMId``, ``score``, ``q-value``, and ``posterior_error_prob``
+    are required. Names are compared case-sensitively; any other style raises a
+    ``ValueError``.
+    """
+    if style not in ("pin", "pout"):
+        raise ValueError("Invalid Percolator Tab style. Should be one of {`pin`, `pout`}.")
+    if style == "pin":
+        mandatory = ("SpecId", "Label", "ScanNr")
+    else:
+        mandatory = ("PSMId", "score", "q-value", "posterior_error_prob")
+    return not any(name not in fieldnames for name in mandatory)
+
+
 def join_pout_files(
  target_filename: str | Path,
  decoy_filename: str | Path,
@@ -429,3 +477,9 @@ def join_pout_files(
  for psm in decoy_reader:
  psm.is_decoy = True
  writer.write_psm(psm)
+
+
+class PercolatorIOException(PSMUtilsIOException):
+    """Raised for problems encountered while reading or writing Percolator Tab files."""
diff --git a/tests/test_io/test_percolator.py b/tests/test_io/test_percolator.py
@@ -1,6 +1,10 @@
 """Tests for psm_utils.io.percolator."""
 
-from psm_utils.io.percolator import PercolatorTabReader
+from unittest.mock import mock_open, patch
+
+import pytest
+
+from psm_utils.io.percolator import PercolatorIOException, PercolatorTabReader, PercolatorTabWriter
 
 
 class TestPercolatorTabReader:
@@ -32,3 +36,91 @@ def test_parse_peptidoform(self):
  ]
  for test_in, expected_out in test_cases:
  assert expected_out == PercolatorTabReader._parse_peptidoform(*test_in).proforma
+
+
+@pytest.fixture
+def valid_pout_file():
+    """Contents of a pout-style file: a header line followed by two PSM rows."""
+    lines = [
+        ["PSMId", "Label", "score", "q-value", "posterior_error_prob", "peptide", "proteinIds"],
+        ["PSM1", "1", "0.8", "0.01", "0.001", "PEPTIDE", "PROT1"],
+        ["PSM2", "-1", "0.7", "0.02", "0.002", "PEPTIDER", "PROT2"],
+    ]
+    return "".join("\t".join(fields) + "\n" for fields in lines)
+
+
+@pytest.fixture
+def invalid_pout_file():
+    """Contents of a file whose first line is not a Percolator Tab header."""
+    first_line = "Not a valid header"
+    second_line = "Some invalid content"
+    return "\n".join([first_line, second_line])
+
+
+@pytest.fixture
+def empty_file():
+    """Contents of a file without any lines."""
+    return str()
+
+
+def test_parse_existing_file_valid(valid_pout_file):
+    """A valid pout file yields its header columns and a last ScanNr of -1."""
+    expected = "PSMId Label score q-value posterior_error_prob peptide proteinIds".split()
+    fake_open = mock_open(read_data=valid_pout_file)
+    with patch("builtins.open", fake_open):
+        parsed = PercolatorTabWriter._parse_existing_file("dummy_path", "pout")
+        fieldnames, last_scannr = parsed
+        assert fieldnames == expected
+        # No ScanNr in POUT style, so it should be -1 by default
+        assert last_scannr == -1
+
+
+def test_parse_existing_file_invalid(invalid_pout_file):
+    """A header without the required pin columns is rejected."""
+    fake_open = mock_open(read_data=invalid_pout_file)
+    with patch("builtins.open", fake_open), pytest.raises(PercolatorIOException):
+        PercolatorTabWriter._parse_existing_file("dummy_path", "pin")
+
+
+def test_parse_existing_file_empty(empty_file):
+    """A file without any lines is rejected."""
+    fake_open = mock_open(read_data=empty_file)
+    with patch("builtins.open", fake_open), pytest.raises(PercolatorIOException):
+        PercolatorTabWriter._parse_existing_file("dummy_path", "pin")
+
+
+def test_parse_existing_file_no_scannr_column():
+    """A valid header without a ScanNr column gives a last ScanNr of -1."""
+    # Simulate a file without ScanNr column but a valid header and entries
+    header = ["PSMId", "Label", "score", "q-value", "posterior_error_prob", "peptide", "proteinIds"]
+    row = ["PSM1", "1", "0.8", "0.01", "0.001", "PEPTIDE", "PROT1"]
+    data = "".join("\t".join(fields) + "\n" for fields in (header, row))
+    with patch("builtins.open", mock_open(read_data=data)):
+        parsed = PercolatorTabWriter._parse_existing_file("dummy_path", "pout")
+        fieldnames, last_scannr = parsed
+        assert fieldnames == header
+        # Since no ScanNr column exists, should default to -1
+        assert last_scannr == -1
+
+
+def test_parse_existing_file_with_scannr():
+    """The ScanNr of the last data row is returned when the column is present."""
+    # Simulate a file with ScanNr column and entries
+    header = [
+        "PSMId",
+        "Label",
+        "ScanNr",
+        "score",
+        "q-value",
+        "posterior_error_prob",
+        "peptide",
+        "proteinIds",
+    ]
+    rows = [
+        ["PSM1", "1", "0", "0.8", "0.01", "0.001", "PEPTIDE", "PROT1"],
+        ["PSM2", "1", "1", "0.7", "0.02", "0.002", "PEPTIDER", "PROT2"],
+    ]
+    data = "".join("\t".join(fields) + "\n" for fields in [header, *rows])
+    with patch("builtins.open", mock_open(read_data=data)):
+        parsed = PercolatorTabWriter._parse_existing_file("dummy_path", "pout")
+        fieldnames, last_scannr = parsed
+        assert fieldnames == header
+        # The last ScanNr should be 1
+        assert last_scannr == 1