Skip to content

Commit

Permalink
Merge pull request #94 from compomics/fix/percolator-scannr
Browse files Browse the repository at this point in the history
Percolator fixes: ScanNr writing and inferring style from file extension
  • Loading branch information
RalfG authored Aug 28, 2024
2 parents 50f9a9c + 711fd10 commit 5e42ace
Show file tree
Hide file tree
Showing 4 changed files with 194 additions and 41 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [1.0.1] - 2024-08-28

### Fixed

- `io.percolator`: Fix and improve ScanNr inference and writing
- `io.percolator`: Infer style from file extension if not provided (enables dynamic style determination in, for instance, the `convert` function).

## [1.0.0] - 2024-08-14

### Added
Expand Down
2 changes: 1 addition & 1 deletion psm_utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Common utilities for parsing and handling PSMs, and search engine results."""

__version__ = "1.0.0"
__version__ = "1.0.1"
__all__ = ["Peptidoform", "PSM", "PSMList"]

from warnings import filterwarnings
Expand Down
132 changes: 93 additions & 39 deletions psm_utils/io/percolator.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,19 @@
from __future__ import annotations

import csv
import logging
import re
from pathlib import Path
from typing import Iterable, Optional
from typing import Iterable, List, Optional, Tuple, Union

from psm_utils.io._base_classes import ReaderBase, WriterBase
from psm_utils.io._utils import set_csv_field_size_limit
from psm_utils.io.exceptions import PSMUtilsIOException
from psm_utils.peptidoform import Peptidoform
from psm_utils.psm import PSM
from psm_utils.psm_list import PSMList
from psm_utils.io._utils import set_csv_field_size_limit

LOGGER = logging.getLogger(__name__)
set_csv_field_size_limit()


Expand Down Expand Up @@ -96,7 +98,7 @@ def __init__(
# Validate column names from parameters
for col in [self.score_column, self.rt_column, self.mz_column]:
if col and col.lower() not in self.fieldnames:
raise ValueError(
raise PercolatorIOException(
f"Column `{col}` not found in header of Percolator Tab file "
f"`{self.filename}`."
)
Expand Down Expand Up @@ -197,7 +199,7 @@ class PercolatorTabWriter(WriterBase):
def __init__(
self,
filename: str | Path,
style: str = "pin",
style: Optional[str] = None,
feature_names: Optional[list[str]] = None,
add_basic_features: bool = False,
*args,
Expand All @@ -210,12 +212,13 @@ def __init__(
----------
filename: str, pathlib.Path
Path to PSM file.
style: str
style: str, optional
Percolator Tab style. One of {``pin``, ``pout``}. If ``pin``, the columns
``SpecId``, ``Label``, ``ScanNr``, ``ChargeN``, ``PSMScore``, ``Peptide``, and
``Proteins`` are written alongside the requested feature names
(see ``feature_names``). If ``pout``, the columns ``PSMId``, ``Label``, ``score``,
``q-value``, ``posterior_error_prob``, ``peptide``, and ``proteinIds`` are written.
By default, the style is inferred from the file name extension.
feature_names: list[str], optional
List of feature names to extract from PSMs and write to file. List values
should correspond to keys in the
Expand All @@ -232,15 +235,27 @@ def __init__(
self.feature_names = list(feature_names) if feature_names else []
self.add_basic_features = add_basic_features

if style == "pin":
if not style:
suffix = self.filename.suffix.lower()
if suffix == ".pin":
self.style = "pin"
elif suffix == ".pout":
self.style = "pout"
else:
raise PercolatorIOException(
f"Could not infer Percolator Tab style from file extension `{suffix}`. "
"Please provide the `style` parameter."
)

if self.style == "pin":
basic_features = ["PSMScore", "ChargeN"] if add_basic_features else []
self._columns = (
["SpecId", "Label", "ScanNr"]
+ basic_features
+ self.feature_names
+ ["Peptide", "Proteins"]
)
elif style == "pout":
elif self.style == "pout":
self._columns = [
"PSMId",
"Label",
Expand All @@ -252,11 +267,11 @@ def __init__(
]
else:
raise ValueError("Invalid Percolator Tab style. Should be one of {`pin`, `pout`}.")
self.style = style

self._open_file = None
self._writer = None
self._protein_separator = "|||"
self._last_scan_number = None
self._current_scannr = 0

def __enter__(self) -> PercolatorTabWriter:
"""Either open existing file in append mode or new file in write mode."""
Expand All @@ -266,28 +281,10 @@ def __enter__(self) -> PercolatorTabWriter:
self.filename, mode, newline="", protein_separator=self._protein_separator
)
if file_existed:
with open(self.filename, "rt") as open_file:
# Read header
for line in open_file:
fieldnames = line.strip().split("\t")
break
else:
raise ValueError(f"File {self.filename} is not a valid Percolator Tab file.")
# Determine last scan number
open_file.seek(0)
last_line = None
for line in open_file:
if line.strip():
last_line = line
if last_line:
last_line = {k: v for k, v in zip(fieldnames, last_line.strip().split("\t"))}
try:
self._last_scan_number = int(last_line["ScanNr"])
except ValueError:
self._last_scan_number = None
fieldnames, self._current_scannr = self._parse_existing_file(self.filename)
else:
fieldnames = self._columns
self._last_scan_number = -1
self._current_scannr = -1
self._writer = csv.DictWriter(
self._open_file,
fieldnames=fieldnames,
Expand All @@ -302,15 +299,13 @@ def __exit__(self, *args, **kwargs) -> None:
self._open_file.close()
self._open_file = None
self._writer = None
self._last_scan_number = None
self._current_scannr = None

def write_psm(self, psm: PSM):
"""Write a single PSM to the PSM file."""
entry = self._psm_to_entry(psm)
if self._last_scan_number is not None:
entry["ScanNr"] = self._last_scan_number + 1
else:
entry["ScanNr"] = None
self._current_scannr += 1
entry["ScanNr"] = self._current_scannr
try:
self._writer.writerow(entry)
except AttributeError as e:
Expand All @@ -319,7 +314,7 @@ def write_psm(self, psm: PSM):
"is opened in context (i.e., using the `with` statement)."
) from e
else:
self._last_scan_number = entry["ScanNr"]
self._current_scannr = entry["ScanNr"]

def write_file(self, psm_list: PSMList):
"""Write an entire PSMList to the PSM file."""
Expand All @@ -330,10 +325,8 @@ def write_file(self, psm_list: PSMList):
f, fieldnames=self._columns, delimiter="\t", extrasaction="ignore"
)
writer.writeheader()
for i, psm in enumerate(psm_list):
entry = self._psm_to_entry(psm)
entry["ScanNr"] = i
writer.writerow(entry)
for psm in psm_list:
writer.writerow(self._psm_to_entry(psm))

def _psm_to_entry(self, psm: PSM):
"""Parse PSM to Percolator Tab entry."""
Expand Down Expand Up @@ -366,6 +359,50 @@ def _psm_to_entry(self, psm: PSM):
}
return entry

@staticmethod
def _parse_existing_file(
    filename: Union[str, Path], style: str
) -> Tuple[List[str], Optional[int]]:
    """
    Parse existing Percolator Tab file to determine fieldnames and last ScanNr.

    Parameters
    ----------
    filename
        Path to the existing Percolator Tab file.
    style
        Percolator Tab style (``pin`` or ``pout``) that the header is validated against.

    Returns
    -------
    fieldnames
        Column names taken from the first line of the file.
    last_scannr
        ``ScanNr`` value of the last non-empty data row, or ``-1`` if it could not be
        determined (no data rows, no ``ScanNr`` column, or a non-integer value).

    Raises
    ------
    PercolatorIOException
        If the file is empty or its header lacks the columns required for ``style``.

    """
    # Header and last data row are collected in a single pass over the file.
    with open(filename, "rt") as open_file:
        header = open_file.readline()
        if not header:
            raise PercolatorIOException(
                f"Existing file {filename} is not a valid Percolator Tab file."
            )
        fieldnames = header.strip().split("\t")
        if not _fieldnames_are_valid(fieldnames, style):
            raise PercolatorIOException(
                f"Existing file {filename} is not a valid Percolator Tab file of style {style}."
            )

        # Remember the last non-blank line after the header.
        last_line = None
        for line in open_file:
            if line.strip():
                last_line = line

    last_scannr = None
    if last_line is not None:
        # Only strip the line terminator: stripping all whitespace would drop leading
        # empty fields and shift every value to the wrong column.
        last_line_items = dict(zip(fieldnames, last_line.rstrip("\r\n").split("\t")))
        try:
            last_scannr = int(last_line_items["ScanNr"])
        except (KeyError, ValueError):
            pass

    if last_scannr is None:
        last_scannr = -1
        # Warn only if a ScanNr column and data rows exist but the value was unusable;
        # files without the column (e.g., POUT) or without rows simply start from 0.
        if last_line is not None and "ScanNr" in fieldnames:
            LOGGER.warning(
                "Could not determine last ScanNr from file %s. Starting ScanNr from 0.",
                filename,
            )

    return fieldnames, last_scannr


class _PercolatorTabIO:
def __init__(self, *args, protein_separator="|||", **kwargs) -> None:
Expand Down Expand Up @@ -405,6 +442,17 @@ def write(self, __s: str):
self._open_file.write(__s)


def _fieldnames_are_valid(fieldnames: List[str], style: str) -> bool:
"""Check if fieldnames are valid for Percolator Tab style."""
if style == "pin":
required_columns = ["SpecId", "Label", "ScanNr"]
elif style == "pout":
required_columns = ["PSMId", "score", "q-value", "posterior_error_prob"]
else:
raise ValueError("Invalid Percolator Tab style. Should be one of {`pin`, `pout`}.")
return all(col in fieldnames for col in required_columns)


def join_pout_files(
target_filename: str | Path,
decoy_filename: str | Path,
Expand All @@ -429,3 +477,9 @@ def join_pout_files(
for psm in decoy_reader:
psm.is_decoy = True
writer.write_psm(psm)


class PercolatorIOException(PSMUtilsIOException):
    """Raised when a Percolator Tab file cannot be read, written, or validated."""
94 changes: 93 additions & 1 deletion tests/test_io/test_percolator.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
"""Tests for psm_utils.io.percolator."""

from psm_utils.io.percolator import PercolatorTabReader
from unittest.mock import mock_open, patch

import pytest

from psm_utils.io.percolator import PercolatorIOException, PercolatorTabReader, PercolatorTabWriter


class TestPercolatorTabReader:
Expand Down Expand Up @@ -32,3 +36,91 @@ def test_parse_peptidoform(self):
]
for test_in, expected_out in test_cases:
assert expected_out == PercolatorTabReader._parse_peptidoform(*test_in).proforma


@pytest.fixture
def valid_pout_file():
    """Return the text of a well-formed POUT file: one header line and two PSM rows."""
    rows = [
        ["PSMId", "Label", "score", "q-value", "posterior_error_prob", "peptide", "proteinIds"],
        ["PSM1", "1", "0.8", "0.01", "0.001", "PEPTIDE", "PROT1"],
        ["PSM2", "-1", "0.7", "0.02", "0.002", "PEPTIDER", "PROT2"],
    ]
    # Every row, including the last one, is terminated by a newline.
    return "".join("\t".join(row) + "\n" for row in rows)


@pytest.fixture
def invalid_pout_file():
    """Return file text whose first line is not a Percolator Tab header."""
    lines = ("Not a valid header", "Some invalid content")
    return "\n".join(lines)


@pytest.fixture
def empty_file():
    """Return the contents of an empty file."""
    contents = ""
    return contents


def test_parse_existing_file_valid(valid_pout_file):
    """A valid POUT file yields its header columns and the default ScanNr."""
    expected_fieldnames = [
        "PSMId",
        "Label",
        "score",
        "q-value",
        "posterior_error_prob",
        "peptide",
        "proteinIds",
    ]
    fake_open = mock_open(read_data=valid_pout_file)
    with patch("builtins.open", fake_open):
        parsed = PercolatorTabWriter._parse_existing_file("dummy_path", "pout")

    fieldnames, last_scannr = parsed
    assert fieldnames == expected_fieldnames
    # No ScanNr in POUT style, so it should be -1 by default
    assert last_scannr == -1


def test_parse_existing_file_invalid(invalid_pout_file):
    """A file whose header is not a PIN header is rejected."""
    fake_open = mock_open(read_data=invalid_pout_file)
    with patch("builtins.open", fake_open), pytest.raises(PercolatorIOException):
        PercolatorTabWriter._parse_existing_file("dummy_path", "pin")


def test_parse_existing_file_empty(empty_file):
    """An empty file has no header and is rejected."""
    fake_open = mock_open(read_data=empty_file)
    with patch("builtins.open", fake_open), pytest.raises(PercolatorIOException):
        PercolatorTabWriter._parse_existing_file("dummy_path", "pin")


def test_parse_existing_file_no_scannr_column():
    """A valid header without a ScanNr column falls back to ScanNr -1."""
    header = [
        "PSMId",
        "Label",
        "score",
        "q-value",
        "posterior_error_prob",
        "peptide",
        "proteinIds",
    ]
    row = ["PSM1", "1", "0.8", "0.01", "0.001", "PEPTIDE", "PROT1"]
    # Simulate a file without ScanNr column but a valid header and entries
    data = "\t".join(header) + "\n" + "\t".join(row) + "\n"

    with patch("builtins.open", mock_open(read_data=data)):
        parsed = PercolatorTabWriter._parse_existing_file("dummy_path", "pout")

    fieldnames, last_scannr = parsed
    assert fieldnames == header
    # Since no ScanNr column exists, should default to -1
    assert last_scannr == -1


def test_parse_existing_file_with_scannr():
    """The ScanNr of the last data row is returned when the column is present."""
    header = [
        "PSMId",
        "Label",
        "ScanNr",
        "score",
        "q-value",
        "posterior_error_prob",
        "peptide",
        "proteinIds",
    ]
    rows = [
        ["PSM1", "1", "0", "0.8", "0.01", "0.001", "PEPTIDE", "PROT1"],
        ["PSM2", "1", "1", "0.7", "0.02", "0.002", "PEPTIDER", "PROT2"],
    ]
    # Simulate a file with ScanNr column and entries
    data = "".join("\t".join(fields) + "\n" for fields in [header, *rows])

    with patch("builtins.open", mock_open(read_data=data)):
        parsed = PercolatorTabWriter._parse_existing_file("dummy_path", "pout")

    fieldnames, last_scannr = parsed
    assert fieldnames == header
    # The last ScanNr should be 1
    assert last_scannr == 1

0 comments on commit 5e42ace

Please sign in to comment.