Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Percolator fixes: ScanNr writing and inferring style from file extension #94

Merged
merged 3 commits into from
Aug 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [1.0.1] - 2024-08-28

### Fixed

- `io.percolator`: Fix and improve ScanNr inference and writing
- `io.percolator`: Infer style from file extension if not provided (enables dynamic style determination in, for instance, the `convert` function).

## [1.0.0] - 2024-08-14

### Added
Expand Down
2 changes: 1 addition & 1 deletion psm_utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Common utilities for parsing and handling PSMs, and search engine results."""

__version__ = "1.0.0"
__version__ = "1.0.1"
__all__ = ["Peptidoform", "PSM", "PSMList"]

from warnings import filterwarnings
Expand Down
132 changes: 93 additions & 39 deletions psm_utils/io/percolator.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,19 @@
from __future__ import annotations

import csv
import logging
import re
from pathlib import Path
from typing import Iterable, Optional
from typing import Iterable, List, Optional, Tuple, Union

from psm_utils.io._base_classes import ReaderBase, WriterBase
from psm_utils.io._utils import set_csv_field_size_limit
from psm_utils.io.exceptions import PSMUtilsIOException
from psm_utils.peptidoform import Peptidoform
from psm_utils.psm import PSM
from psm_utils.psm_list import PSMList
from psm_utils.io._utils import set_csv_field_size_limit

LOGGER = logging.getLogger(__name__)
set_csv_field_size_limit()


Expand Down Expand Up @@ -96,7 +98,7 @@
# Validate column names from parameters
for col in [self.score_column, self.rt_column, self.mz_column]:
if col and col.lower() not in self.fieldnames:
raise ValueError(
raise PercolatorIOException(

Check warning on line 101 in psm_utils/io/percolator.py

View check run for this annotation

Codecov / codecov/patch

psm_utils/io/percolator.py#L101

Added line #L101 was not covered by tests
f"Column `{col}` not found in header of Percolator Tab file "
f"`{self.filename}`."
)
Expand Down Expand Up @@ -197,7 +199,7 @@
def __init__(
self,
filename: str | Path,
style: str = "pin",
style: Optional[str] = None,
feature_names: Optional[list[str]] = None,
add_basic_features: bool = False,
*args,
Expand All @@ -210,12 +212,13 @@
----------
filename: str, pathlib.Path
Path to PSM file.
style: str
style: str, optional
Percolator Tab style. One of {``pin``, ``pout``}. If ``pin``, the columns
``SpecId``, ``Label``, ``ScanNr``, ``ChargeN``, ``PSMScore``, ``Peptide``, and
``Proteins`` are written alongside the requested feature names
(see ``feature_names``). If ``pout``, the columns ``PSMId``, ``Label``, ``score``,
``q-value``, ``posterior_error_prob``, ``peptide``, and ``proteinIds`` are written.
By default, the style is inferred from the file name extension.
feature_names: list[str], optional
List of feature names to extract from PSMs and write to file. List values
should correspond to keys in the
Expand All @@ -232,15 +235,27 @@
self.feature_names = list(feature_names) if feature_names else []
self.add_basic_features = add_basic_features

if style == "pin":
if not style:
suffix = self.filename.suffix.lower()
if suffix == ".pin":
self.style = "pin"
elif suffix == ".pout":
self.style = "pout"

Check warning on line 243 in psm_utils/io/percolator.py

View check run for this annotation

Codecov / codecov/patch

psm_utils/io/percolator.py#L238-L243

Added lines #L238 - L243 were not covered by tests
else:
raise PercolatorIOException(

Check warning on line 245 in psm_utils/io/percolator.py

View check run for this annotation

Codecov / codecov/patch

psm_utils/io/percolator.py#L245

Added line #L245 was not covered by tests
f"Could not infer Percolator Tab style from file extension `{suffix}`. "
"Please provide the `style` parameter."
)

if self.style == "pin":

Check warning on line 250 in psm_utils/io/percolator.py

View check run for this annotation

Codecov / codecov/patch

psm_utils/io/percolator.py#L250

Added line #L250 was not covered by tests
basic_features = ["PSMScore", "ChargeN"] if add_basic_features else []
self._columns = (
["SpecId", "Label", "ScanNr"]
+ basic_features
+ self.feature_names
+ ["Peptide", "Proteins"]
)
elif style == "pout":
elif self.style == "pout":

Check warning on line 258 in psm_utils/io/percolator.py

View check run for this annotation

Codecov / codecov/patch

psm_utils/io/percolator.py#L258

Added line #L258 was not covered by tests
self._columns = [
"PSMId",
"Label",
Expand All @@ -252,11 +267,11 @@
]
else:
raise ValueError("Invalid Percolator Tab style. Should be one of {`pin`, `pout`}.")
self.style = style

self._open_file = None
self._writer = None
self._protein_separator = "|||"
self._last_scan_number = None
self._current_scannr = 0

Check warning on line 274 in psm_utils/io/percolator.py

View check run for this annotation

Codecov / codecov/patch

psm_utils/io/percolator.py#L274

Added line #L274 was not covered by tests

def __enter__(self) -> PercolatorTabWriter:
"""Either open existing file in append mode or new file in write mode."""
Expand All @@ -266,28 +281,10 @@
self.filename, mode, newline="", protein_separator=self._protein_separator
)
if file_existed:
with open(self.filename, "rt") as open_file:
# Read header
for line in open_file:
fieldnames = line.strip().split("\t")
break
else:
raise ValueError(f"File {self.filename} is not a valid Percolator Tab file.")
# Determine last scan number
open_file.seek(0)
last_line = None
for line in open_file:
if line.strip():
last_line = line
if last_line:
last_line = {k: v for k, v in zip(fieldnames, last_line.strip().split("\t"))}
try:
self._last_scan_number = int(last_line["ScanNr"])
except ValueError:
self._last_scan_number = None
fieldnames, self._current_scannr = self._parse_existing_file(self.filename)

Check warning on line 284 in psm_utils/io/percolator.py

View check run for this annotation

Codecov / codecov/patch

psm_utils/io/percolator.py#L284

Added line #L284 was not covered by tests
else:
fieldnames = self._columns
self._last_scan_number = -1
self._current_scannr = -1

Check warning on line 287 in psm_utils/io/percolator.py

View check run for this annotation

Codecov / codecov/patch

psm_utils/io/percolator.py#L287

Added line #L287 was not covered by tests
self._writer = csv.DictWriter(
self._open_file,
fieldnames=fieldnames,
Expand All @@ -302,15 +299,13 @@
self._open_file.close()
self._open_file = None
self._writer = None
self._last_scan_number = None
self._current_scannr = None

Check warning on line 302 in psm_utils/io/percolator.py

View check run for this annotation

Codecov / codecov/patch

psm_utils/io/percolator.py#L302

Added line #L302 was not covered by tests

def write_psm(self, psm: PSM):
"""Write a single PSM to the PSM file."""
entry = self._psm_to_entry(psm)
if self._last_scan_number is not None:
entry["ScanNr"] = self._last_scan_number + 1
else:
entry["ScanNr"] = None
self._current_scannr += 1
entry["ScanNr"] = self._current_scannr

Check warning on line 308 in psm_utils/io/percolator.py

View check run for this annotation

Codecov / codecov/patch

psm_utils/io/percolator.py#L307-L308

Added lines #L307 - L308 were not covered by tests
try:
self._writer.writerow(entry)
except AttributeError as e:
Expand All @@ -319,7 +314,7 @@
"is opened in context (i.e., using the `with` statement)."
) from e
else:
self._last_scan_number = entry["ScanNr"]
self._current_scannr = entry["ScanNr"]

Check warning on line 317 in psm_utils/io/percolator.py

View check run for this annotation

Codecov / codecov/patch

psm_utils/io/percolator.py#L317

Added line #L317 was not covered by tests

def write_file(self, psm_list: PSMList):
"""Write an entire PSMList to the PSM file."""
Expand All @@ -330,10 +325,8 @@
f, fieldnames=self._columns, delimiter="\t", extrasaction="ignore"
)
writer.writeheader()
for i, psm in enumerate(psm_list):
entry = self._psm_to_entry(psm)
entry["ScanNr"] = i
writer.writerow(entry)
for psm in psm_list:
writer.writerow(self._psm_to_entry(psm))

Check warning on line 329 in psm_utils/io/percolator.py

View check run for this annotation

Codecov / codecov/patch

psm_utils/io/percolator.py#L328-L329

Added lines #L328 - L329 were not covered by tests

def _psm_to_entry(self, psm: PSM):
"""Parse PSM to Percolator Tab entry."""
Expand Down Expand Up @@ -366,6 +359,50 @@
}
return entry

@staticmethod
def _parse_existing_file(
    filename: Union[str, Path], style: Optional[str] = None
) -> Tuple[List[str], int]:
    """
    Parse existing Percolator Tab file to determine fieldnames and last ScanNr.

    Parameters
    ----------
    filename: str, pathlib.Path
        Path to the existing Percolator Tab file.
    style: str, optional
        Percolator Tab style to validate the header against. One of {``pin``, ``pout``}.
        If not provided, the header is accepted when it is valid for either style.

    Returns
    -------
    fieldnames: list[str]
        Tab-separated column names from the first line of the file.
    last_scannr: int
        ``ScanNr`` value of the last non-blank data line, or ``-1`` if it could not be
        determined (no data lines, no ``ScanNr`` column, or a non-integer value).

    Raises
    ------
    PercolatorIOException
        If the file is empty or its header lacks the columns required for the style.
    ValueError
        If ``style`` is provided but is not one of the known styles.

    """
    # Single pass over the file: the first line is the header, and the last non-blank
    # line after it is remembered for the ScanNr lookup.
    fieldnames = None
    last_line = None
    with open(filename, "rt") as open_file:
        for line in open_file:
            if fieldnames is None:
                fieldnames = line.strip().split("\t")
            elif line.strip():
                last_line = line

    if fieldnames is None:
        raise PercolatorIOException(
            f"Existing file {filename} is not a valid Percolator Tab file."
        )

    # ``style`` is optional so that callers that only know the filename still get
    # header validation instead of a ``TypeError`` for a missing argument.
    if style is None:
        if not any(_fieldnames_are_valid(fieldnames, known) for known in ("pin", "pout")):
            raise PercolatorIOException(
                f"Existing file {filename} is not a valid Percolator Tab file."
            )
    elif not _fieldnames_are_valid(fieldnames, style):
        raise PercolatorIOException(
            f"Existing file {filename} is not a valid Percolator Tab file of style {style}."
        )

    # Get last ScanNr
    last_scannr = None
    if last_line is not None:
        last_line_items = dict(zip(fieldnames, last_line.strip().split("\t")))
        try:
            last_scannr = int(last_line_items["ScanNr"])
        except (KeyError, ValueError):
            # Best effort: a missing column or non-integer value falls through to the
            # default below, which also logs a warning.
            last_scannr = None

    if last_scannr is None:
        # -1 makes the first PSM written afterwards receive ScanNr 0.
        last_scannr = -1
        LOGGER.warning(
            f"Could not determine last ScanNr from file {filename}. Starting ScanNr from 0."
        )

    return fieldnames, last_scannr


class _PercolatorTabIO:
def __init__(self, *args, protein_separator="|||", **kwargs) -> None:
Expand Down Expand Up @@ -405,6 +442,17 @@
self._open_file.write(__s)


def _fieldnames_are_valid(fieldnames: List[str], style: str) -> bool:
"""Check if fieldnames are valid for Percolator Tab style."""
if style == "pin":
required_columns = ["SpecId", "Label", "ScanNr"]
elif style == "pout":
required_columns = ["PSMId", "score", "q-value", "posterior_error_prob"]
else:
raise ValueError("Invalid Percolator Tab style. Should be one of {`pin`, `pout`}.")

Check warning on line 452 in psm_utils/io/percolator.py

View check run for this annotation

Codecov / codecov/patch

psm_utils/io/percolator.py#L452

Added line #L452 was not covered by tests
return all(col in fieldnames for col in required_columns)


def join_pout_files(
target_filename: str | Path,
decoy_filename: str | Path,
Expand All @@ -429,3 +477,9 @@
for psm in decoy_reader:
psm.is_decoy = True
writer.write_psm(psm)


class PercolatorIOException(PSMUtilsIOException):
    """
    Exception for Percolator Tab file I/O errors.

    Raised in this module when a requested column is missing from a file header, when
    the Percolator Tab style cannot be inferred from the file extension, and when an
    existing file is not a valid Percolator Tab file.
    """
94 changes: 93 additions & 1 deletion tests/test_io/test_percolator.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
"""Tests for psm_utils.io.percolator."""

from psm_utils.io.percolator import PercolatorTabReader
from unittest.mock import mock_open, patch

import pytest

from psm_utils.io.percolator import PercolatorIOException, PercolatorTabReader, PercolatorTabWriter


class TestPercolatorTabReader:
Expand Down Expand Up @@ -32,3 +36,91 @@ def test_parse_peptidoform(self):
]
for test_in, expected_out in test_cases:
assert expected_out == PercolatorTabReader._parse_peptidoform(*test_in).proforma


@pytest.fixture
def valid_pout_file():
    """Contents of a well-formed POUT file: a header line followed by two PSM lines."""
    header = "PSMId\tLabel\tscore\tq-value\tposterior_error_prob\tpeptide\tproteinIds\n"
    first_psm = "PSM1\t1\t0.8\t0.01\t0.001\tPEPTIDE\tPROT1\n"
    second_psm = "PSM2\t-1\t0.7\t0.02\t0.002\tPEPTIDER\tPROT2\n"
    return header + first_psm + second_psm


@pytest.fixture
def invalid_pout_file():
    """Contents of a file whose first line is not a Percolator Tab header."""
    contents = "Not a valid header\nSome invalid content"
    return contents


@pytest.fixture
def empty_file():
    """Contents of a completely empty file."""
    contents = ""
    return contents


def test_parse_existing_file_valid(valid_pout_file):
    """A well-formed POUT file yields its header columns and the default ScanNr."""
    expected_fieldnames = [
        "PSMId",
        "Label",
        "score",
        "q-value",
        "posterior_error_prob",
        "peptide",
        "proteinIds",
    ]
    fake_open = mock_open(read_data=valid_pout_file)
    with patch("builtins.open", fake_open):
        parsed = PercolatorTabWriter._parse_existing_file("dummy_path", "pout")
    fieldnames, last_scannr = parsed

    assert fieldnames == expected_fieldnames
    # No ScanNr in POUT style, so it should be -1 by default
    assert last_scannr == -1


def test_parse_existing_file_invalid(invalid_pout_file):
    """A header without the required PIN columns is rejected."""
    fake_open = mock_open(read_data=invalid_pout_file)
    with patch("builtins.open", fake_open), pytest.raises(PercolatorIOException):
        PercolatorTabWriter._parse_existing_file("dummy_path", "pin")


def test_parse_existing_file_empty(empty_file):
    """An empty file has no header line and is rejected."""
    fake_open = mock_open(read_data=empty_file)
    with patch("builtins.open", fake_open), pytest.raises(PercolatorIOException):
        PercolatorTabWriter._parse_existing_file("dummy_path", "pin")


def test_parse_existing_file_no_scannr_column():
    """Without a ScanNr column, the header is still returned and ScanNr defaults to -1."""
    # Simulate a file without ScanNr column but a valid header and entries
    header_line = "PSMId\tLabel\tscore\tq-value\tposterior_error_prob\tpeptide\tproteinIds\n"
    psm_line = "PSM1\t1\t0.8\t0.01\t0.001\tPEPTIDE\tPROT1\n"
    expected_fieldnames = [
        "PSMId",
        "Label",
        "score",
        "q-value",
        "posterior_error_prob",
        "peptide",
        "proteinIds",
    ]

    with patch("builtins.open", mock_open(read_data=header_line + psm_line)):
        parsed = PercolatorTabWriter._parse_existing_file("dummy_path", "pout")
    fieldnames, last_scannr = parsed

    assert fieldnames == expected_fieldnames
    # Since no ScanNr column exists, should default to -1
    assert last_scannr == -1


def test_parse_existing_file_with_scannr():
    """With a ScanNr column, the value from the last data line is returned."""
    # Simulate a file with ScanNr column and entries
    header_line = (
        "PSMId\tLabel\tScanNr\tscore\tq-value\tposterior_error_prob\tpeptide\tproteinIds\n"
    )
    first_psm = "PSM1\t1\t0\t0.8\t0.01\t0.001\tPEPTIDE\tPROT1\n"
    second_psm = "PSM2\t1\t1\t0.7\t0.02\t0.002\tPEPTIDER\tPROT2\n"
    expected_fieldnames = [
        "PSMId",
        "Label",
        "ScanNr",
        "score",
        "q-value",
        "posterior_error_prob",
        "peptide",
        "proteinIds",
    ]

    fake_open = mock_open(read_data=header_line + first_psm + second_psm)
    with patch("builtins.open", fake_open):
        parsed = PercolatorTabWriter._parse_existing_file("dummy_path", "pout")
    fieldnames, last_scannr = parsed

    assert fieldnames == expected_fieldnames
    # The last ScanNr should be 1
    assert last_scannr == 1
Loading