From f6e4086b9c5418acaf41ef08f231be5250cac81c Mon Sep 17 00:00:00 2001 From: RalfG Date: Wed, 14 Aug 2024 14:38:11 +0200 Subject: [PATCH 1/2] TSV: Raise exception if three consecutive rows cannot be parsed --- psm_utils/io/tsv.py | 15 ++++++++++++--- tests/test_data/test.tsv | 4 ++++ tests/test_io/test_tsv.py | 19 +++++++++++++++++++ 3 files changed, 35 insertions(+), 3 deletions(-) create mode 100644 tests/test_data/test.tsv diff --git a/psm_utils/io/tsv.py b/psm_utils/io/tsv.py index ff4c55f..268cf96 100644 --- a/psm_utils/io/tsv.py +++ b/psm_utils/io/tsv.py @@ -57,10 +57,10 @@ from pydantic import ValidationError from psm_utils.io._base_classes import ReaderBase, WriterBase +from psm_utils.io._utils import set_csv_field_size_limit from psm_utils.io.exceptions import PSMUtilsIOException from psm_utils.psm import PSM from psm_utils.psm_list import PSMList -from psm_utils.io._utils import set_csv_field_size_limit set_csv_field_size_limit() @@ -74,12 +74,21 @@ def __iter__(self): """Iterate over file and return PSMs one-by-one.""" with open(self.filename, "rt") as open_file: reader = csv.DictReader(open_file, delimiter="\t") + failed_rows = 0 for row in reader: try: yield PSM(**self._parse_entry(row)) - except ValidationError: + except ValidationError as e: + failed_rows += 1 logger.warning(f"Could not parse PSM from row: `{row}`") - continue + if failed_rows >= 3: + raise PSMUtilsIOException( + "Could not parse PSM from three consecutive rows. Verify that the " + "file is formatted correctly as a psm_utils TSV file or that the " + "correct file type reader is used." + ) from e + else: + failed_rows = 0 @staticmethod def _parse_entry(entry: dict) -> dict: diff --git a/tests/test_data/test.tsv b/tests/test_data/test.tsv new file mode 100644 index 0000000..db77d9b --- /dev/null +++ b/tests/test_data/test.tsv @@ -0,0 +1,4 @@ +spectrum_id peptidoform +peptide1 ACDEK/2 +peptide2 AC[Carbamidomethyl]DEFGR/3 +peptide3 [Acetyl]-AC[Carbamidomethyl]DEFGHIK/2 diff --git a/tests/test_io/test_tsv.py b/tests/test_io/test_tsv.py index e4a4ef6..b299bd0 100644 --- a/tests/test_io/test_tsv.py +++ b/tests/test_io/test_tsv.py @@ -1,5 +1,8 @@ """Tests for psm_utils.io.tsv.""" +import pytest + +from psm_utils.io.exceptions import PSMUtilsIOException # noqa: F401 from psm_utils.io.tsv import TSVReader, TSVWriter # noqa: F401 test_cases = [ @@ -30,3 +33,19 @@ class TestTSVReader: def test__parse_entry(self): for test_in, expected_out in test_cases: assert TSVReader._parse_entry(test_in) == expected_out + + def test_iter(self): + reader = TSVReader("tests/test_data/test.tsv") + for psm in reader: + assert psm.peptidoform == "ACDEK/2" + assert psm.spectrum_id == "peptide1" + assert psm.provenance_data == {} + assert psm.metadata == {} + assert psm.rescoring_features == {} + break + + def test_iter_raises(self): + with TSVReader("tests/test_data/peprec.tsv") as reader: + with pytest.raises(PSMUtilsIOException): + for psm in reader: + pass From dd9aac07ba4223b1f5c745483e757b6ed1da6eb9 Mon Sep 17 00:00:00 2001 From: RalfG Date: Wed, 14 Aug 2024 15:03:14 +0200 Subject: [PATCH 2/2] Fix tsv tests --- tests/test_io/test_tsv.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/test_io/test_tsv.py b/tests/test_io/test_tsv.py index b299bd0..8e77395 100644 --- a/tests/test_io/test_tsv.py +++ b/tests/test_io/test_tsv.py @@ -2,8 +2,9 @@ import pytest -from psm_utils.io.exceptions import PSMUtilsIOException # noqa: F401 -from psm_utils.io.tsv import TSVReader, TSVWriter # noqa: F401 +from psm_utils.io.exceptions import PSMUtilsIOException +from psm_utils.io.tsv import TSVReader +from psm_utils.peptidoform import Peptidoform test_cases = [ ( @@ -37,7 +38,7 @@ def test__parse_entry(self): def test_iter(self): reader = TSVReader("tests/test_data/test.tsv") for psm in reader: - assert psm.peptidoform == "ACDEK/2" + assert psm.peptidoform == Peptidoform("ACDEK/2") assert psm.spectrum_id == "peptide1" assert psm.provenance_data == {} assert psm.metadata == {}