From 9106acc9e8f79fd79a9f6eb63547b6aefbc88e6a Mon Sep 17 00:00:00 2001 From: RalfG Date: Wed, 28 Aug 2024 12:53:36 +0200 Subject: [PATCH 1/3] io.percolator: Fix and improve scannr inferring and writing for PercolatorTabWriter --- psm_utils/io/percolator.py | 109 +++++++++++++++++++++---------- tests/test_io/test_percolator.py | 94 +++++++++++++++++++++++++- 2 files changed, 168 insertions(+), 35 deletions(-) diff --git a/psm_utils/io/percolator.py b/psm_utils/io/percolator.py index 586d9ee..ee7f73c 100644 --- a/psm_utils/io/percolator.py +++ b/psm_utils/io/percolator.py @@ -16,17 +16,19 @@ from __future__ import annotations import csv +import logging import re from pathlib import Path -from typing import Iterable, Optional +from typing import Iterable, List, Optional, Tuple, Union from psm_utils.io._base_classes import ReaderBase, WriterBase +from psm_utils.io._utils import set_csv_field_size_limit from psm_utils.io.exceptions import PSMUtilsIOException from psm_utils.peptidoform import Peptidoform from psm_utils.psm import PSM from psm_utils.psm_list import PSMList -from psm_utils.io._utils import set_csv_field_size_limit +LOGGER = logging.getLogger(__name__) set_csv_field_size_limit() @@ -96,7 +98,7 @@ def __init__( # Validate column names from parameters for col in [self.score_column, self.rt_column, self.mz_column]: if col and col.lower() not in self.fieldnames: - raise ValueError( + raise PercolatorIOException( f"Column `{col}` not found in header of Percolator Tab file " f"`{self.filename}`." ) @@ -256,7 +258,7 @@ def __init__( self._open_file = None self._writer = None self._protein_separator = "|||" - self._last_scan_number = None + self._current_scannr = 0 def __enter__(self) -> PercolatorTabWriter: """Either open existing file in append mode or new file in write mode.""" @@ -266,28 +268,10 @@ def __enter__(self) -> PercolatorTabWriter: self.filename, mode, newline="", protein_separator=self._protein_separator ) if file_existed: - with open(self.filename, "rt") as open_file: - # Read header - for line in open_file: - fieldnames = line.strip().split("\t") - break - else: - raise ValueError(f"File {self.filename} is not a valid Percolator Tab file.") - # Determine last scan number - open_file.seek(0) - last_line = None - for line in open_file: - if line.strip(): - last_line = line - if last_line: - last_line = {k: v for k, v in zip(fieldnames, last_line.strip().split("\t"))} - try: - self._last_scan_number = int(last_line["ScanNr"]) - except ValueError: - self._last_scan_number = None + fieldnames, self._current_scannr = self._parse_existing_file(self.filename) else: fieldnames = self._columns - self._last_scan_number = -1 + self._current_scannr = -1 self._writer = csv.DictWriter( self._open_file, fieldnames=fieldnames, @@ -302,15 +286,13 @@ def __exit__(self, *args, **kwargs) -> None: self._open_file.close() self._open_file = None self._writer = None - self._last_scan_number = None + self._current_scannr = None def write_psm(self, psm: PSM): """Write a single PSM to the PSM file.""" entry = self._psm_to_entry(psm) - if self._last_scan_number is not None: - entry["ScanNr"] = self._last_scan_number + 1 - else: - entry["ScanNr"] = None + self._current_scannr += 1 + entry["ScanNr"] = self._current_scannr try: self._writer.writerow(entry) except AttributeError as e: @@ -319,7 +301,7 @@ def write_psm(self, psm: PSM): "is opened in context (i.e., using the `with` statement)." ) from e else: - self._last_scan_number = entry["ScanNr"] + self._current_scannr = entry["ScanNr"] def write_file(self, psm_list: PSMList): """Write an entire PSMList to the PSM file.""" @@ -330,10 +312,8 @@ def write_file(self, psm_list: PSMList): f, fieldnames=self._columns, delimiter="\t", extrasaction="ignore" ) writer.writeheader() - for i, psm in enumerate(psm_list): - entry = self._psm_to_entry(psm) - entry["ScanNr"] = i - writer.writerow(entry) + for psm in psm_list: + writer.writerow(self._psm_to_entry(psm)) def _psm_to_entry(self, psm: PSM): """Parse PSM to Percolator Tab entry.""" @@ -366,6 +346,50 @@ def _psm_to_entry(self, psm: PSM): } return entry + @staticmethod + def _parse_existing_file( + filename: Union[str, Path], style: str + ) -> Tuple[List[str], Optional[int]]: + """Parse existing Percolator Tab file to determine fieldnames and last ScanNr.""" + # Get fieldnames + with open(filename, "rt") as open_file: + for line in open_file: + fieldnames = line.strip().split("\t") + break + else: + raise PercolatorIOException( + f"Existing file {filename} is not a valid Percolator Tab file." + ) + if not _fieldnames_are_valid(fieldnames, style): + raise PercolatorIOException( + f"Existing file {filename} is not a valid Percolator Tab file of style {style}." + ) + + # Get last ScanNr + last_scannr = None + with open(filename, "rt") as open_file: + # Read last line + open_file.seek(0) + last_line = None + for line in open_file: + if line.strip(): + last_line = line + if last_line: + # Parse last line + last_line_items = {k: v for k, v in zip(fieldnames, last_line.strip().split("\t"))} + try: + last_scannr = int(last_line_items["ScanNr"]) + except (KeyError, ValueError): + pass + + if last_scannr is None: + last_scannr = -1 + LOGGER.warning( + f"Could not determine last ScanNr from file {filename}. Starting ScanNr from 0." + ) + + return fieldnames, last_scannr + class _PercolatorTabIO: def __init__(self, *args, protein_separator="|||", **kwargs) -> None: @@ -405,6 +429,17 @@ def write(self, __s: str): self._open_file.write(__s) +def _fieldnames_are_valid(fieldnames: List[str], style: str) -> bool: + """Check if fieldnames are valid for Percolator Tab style.""" + if style == "pin": + required_columns = ["SpecId", "Label", "ScanNr"] + elif style == "pout": + required_columns = ["PSMId", "score", "q-value", "posterior_error_prob"] + else: + raise ValueError("Invalid Percolator Tab style. Should be one of {`pin`, `pout`}.") + return all(col in fieldnames for col in required_columns) + + def join_pout_files( target_filename: str | Path, decoy_filename: str | Path, @@ -429,3 +464,9 @@ def join_pout_files( for psm in decoy_reader: psm.is_decoy = True writer.write_psm(psm) + + +class PercolatorIOException(PSMUtilsIOException): + """Exception for Percolator Tab file I/O errors.""" + + pass diff --git a/tests/test_io/test_percolator.py b/tests/test_io/test_percolator.py index 692f5f9..fb5a794 100644 --- a/tests/test_io/test_percolator.py +++ b/tests/test_io/test_percolator.py @@ -1,6 +1,10 @@ """Tests for psm_utils.io.percolator.""" -from psm_utils.io.percolator import PercolatorTabReader +from unittest.mock import mock_open, patch + +import pytest + +from psm_utils.io.percolator import PercolatorIOException, PercolatorTabReader, PercolatorTabWriter class TestPercolatorTabReader: @@ -32,3 +36,91 @@ def test_parse_peptidoform(self): ] for test_in, expected_out in test_cases: assert expected_out == PercolatorTabReader._parse_peptidoform(*test_in).proforma + + +@pytest.fixture +def valid_pout_file(): + return ( + "PSMId\tLabel\tscore\tq-value\tposterior_error_prob\tpeptide\tproteinIds\n" + "PSM1\t1\t0.8\t0.01\t0.001\tPEPTIDE\tPROT1\n" + "PSM2\t-1\t0.7\t0.02\t0.002\tPEPTIDER\tPROT2\n" + ) + + +@pytest.fixture +def invalid_pout_file(): + return "Not a valid header\nSome invalid content" + + +@pytest.fixture +def empty_file(): + return "" + + +def test_parse_existing_file_valid(valid_pout_file): + with patch("builtins.open", mock_open(read_data=valid_pout_file)): + fieldnames, last_scannr = PercolatorTabWriter._parse_existing_file("dummy_path", "pout") + assert fieldnames == [ + "PSMId", + "Label", + "score", + "q-value", + "posterior_error_prob", + "peptide", + "proteinIds", + ] + assert last_scannr == -1 # No ScanNr in POUT style, so it should be -1 by default + + +def test_parse_existing_file_invalid(invalid_pout_file): + with patch("builtins.open", mock_open(read_data=invalid_pout_file)): + with pytest.raises(PercolatorIOException): + PercolatorTabWriter._parse_existing_file("dummy_path", "pin") + + +def test_parse_existing_file_empty(empty_file): + with patch("builtins.open", mock_open(read_data=empty_file)): + with pytest.raises(PercolatorIOException): + PercolatorTabWriter._parse_existing_file("dummy_path", "pin") + + +def test_parse_existing_file_no_scannr_column(): + # Simulate a file without ScanNr column but a valid header and entries + data = ( + "PSMId\tLabel\tscore\tq-value\tposterior_error_prob\tpeptide\tproteinIds\n" + "PSM1\t1\t0.8\t0.01\t0.001\tPEPTIDE\tPROT1\n" + ) + with patch("builtins.open", mock_open(read_data=data)): + fieldnames, last_scannr = PercolatorTabWriter._parse_existing_file("dummy_path", "pout") + assert fieldnames == [ + "PSMId", + "Label", + "score", + "q-value", + "posterior_error_prob", + "peptide", + "proteinIds", + ] + assert last_scannr == -1 # Since no ScanNr column exists, should default to -1 + + +def test_parse_existing_file_with_scannr(): + # Simulate a file with ScanNr column and entries + data = ( + "PSMId\tLabel\tScanNr\tscore\tq-value\tposterior_error_prob\tpeptide\tproteinIds\n" + "PSM1\t1\t0\t0.8\t0.01\t0.001\tPEPTIDE\tPROT1\n" + "PSM2\t1\t1\t0.7\t0.02\t0.002\tPEPTIDER\tPROT2\n" + ) + with patch("builtins.open", mock_open(read_data=data)): + fieldnames, last_scannr = PercolatorTabWriter._parse_existing_file("dummy_path", "pout") + assert fieldnames == [ + "PSMId", + "Label", + "ScanNr", + "score", + "q-value", + "posterior_error_prob", + "peptide", + "proteinIds", + ] + assert last_scannr == 1 # The last ScanNr should be 1 From 4a72fbd8625289c77e633c491be9ed9b848461eb Mon Sep 17 00:00:00 2001 From: RalfG Date: Wed, 28 Aug 2024 16:31:45 +0200 Subject: [PATCH 2/3] io.percolator: Infer style from filename suffix if not defined --- psm_utils/io/percolator.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/psm_utils/io/percolator.py b/psm_utils/io/percolator.py index ee7f73c..fefed62 100644 --- a/psm_utils/io/percolator.py +++ b/psm_utils/io/percolator.py @@ -199,7 +199,7 @@ class PercolatorTabWriter(WriterBase): def __init__( self, filename: str | Path, - style: str = "pin", + style: Optional[str] = None, feature_names: Optional[list[str]] = None, add_basic_features: bool = False, *args, @@ -212,12 +212,13 @@ def __init__( ---------- filename: str, pathlib.Path Path to PSM file. - style: str + style: str, optional Percolator Tab style. One of {``pin``, ``pout``}. If ``pin``, the columns ``SpecId``, ``Label``, ``ScanNr``, ``ChargeN``, ``PSMScore``, ``Peptide``, and ``Proteins`` are written alongside the requested feature names (see ``feature_names``). If ``pout``, the columns ``PSMId``, ``Label``, ``score``, ``q-value``, ``posterior_error_prob``, ``peptide``, and ``proteinIds`` are written. + By default, the style is inferred from the file name extension. feature_names: list[str], optional List of feature names to extract from PSMs and write to file. List values should correspond to keys in the @@ -234,7 +235,19 @@ def __init__( self.feature_names = list(feature_names) if feature_names else [] self.add_basic_features = add_basic_features - if style == "pin": + if not style: + suffix = self.filename.suffix.lower() + if suffix == ".pin": + self.style = "pin" + elif suffix == ".pout": + self.style = "pout" + else: + raise PercolatorIOException( + f"Could not infer Percolator Tab style from file extension `{suffix}`. " + "Please provide the `style` parameter." + ) + + if self.style == "pin": basic_features = ["PSMScore", "ChargeN"] if add_basic_features else [] self._columns = ( ["SpecId", "Label", "ScanNr"] @@ -242,7 +255,7 @@ def __init__( + self.feature_names + ["Peptide", "Proteins"] ) - elif style == "pout": + elif self.style == "pout": self._columns = [ "PSMId", "Label", @@ -254,7 +267,7 @@ def __init__( ] else: raise ValueError("Invalid Percolator Tab style. Should be one of {`pin`, `pout`}.") - self.style = style + self._open_file = None self._writer = None self._protein_separator = "|||" From 711fd10a7f8d315037dce2cff7c73c149ad952ea Mon Sep 17 00:00:00 2001 From: RalfG Date: Wed, 28 Aug 2024 16:38:04 +0200 Subject: [PATCH 3/3] Update changelog; bump version --- CHANGELOG.md | 7 +++++++ psm_utils/__init__.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 04f8319..a85b873 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.0.1] - 2024-08-28 + +### Fixed + +- `io.percolator`: Fix and improve ScanNr inferring and writing +- `io.percolator`: Infer style from file extension if not provided (enables dynamic style determination in, for instance, `convert` function). + ## [1.0.0] - 2024-08-14 ### Added diff --git a/psm_utils/__init__.py b/psm_utils/__init__.py index fdca612..b82c8ee 100644 --- a/psm_utils/__init__.py +++ b/psm_utils/__init__.py @@ -1,6 +1,6 @@ """Common utilities for parsing and handling PSMs, and search engine results.""" -__version__ = "1.0.0" +__version__ = "1.0.1" __all__ = ["Peptidoform", "PSM", "PSMList"] from warnings import filterwarnings