From deca7e6062c15d3dff13144d31dde3baaea2e594 Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Tue, 13 Feb 2024 16:31:55 +0100 Subject: [PATCH 1/7] overwrite set_meta for Fasta and do not use the slower one from Sequence xref https://github.com/galaxyproject/galaxy/issues/17451#issuecomment-1939610089 --- lib/galaxy/datatypes/sequence.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/lib/galaxy/datatypes/sequence.py b/lib/galaxy/datatypes/sequence.py index d7fc89a30363..a2b3d0f1b900 100644 --- a/lib/galaxy/datatypes/sequence.py +++ b/lib/galaxy/datatypes/sequence.py @@ -349,6 +349,22 @@ class Fasta(Sequence): edam_format = "format_1929" file_ext = "fasta" + def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None: + """ + Set the number of sequences and the number of data lines in a FASTA dataset. + """ + data_lines = 0 + sequences = 0 + with compression_utils.get_fileobj(dataset.get_file_name()) as fh: + for line in fh: + if line.startswith(">"): + sequences += 1 + data_lines += 1 + else: + data_lines += 1 + dataset.metadata.data_lines = data_lines + dataset.metadata.sequences = sequences + def sniff_prefix(self, file_prefix: FilePrefix) -> bool: """ Determines whether the file is in fasta format From 6246d6b9da5837d038b9dfd1dde983fe34440d76 Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Tue, 13 Feb 2024 16:42:44 +0100 Subject: [PATCH 2/7] stricter FASTQ metadata setting --- lib/galaxy/datatypes/sequence.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/lib/galaxy/datatypes/sequence.py b/lib/galaxy/datatypes/sequence.py index a2b3d0f1b900..b7cdcd07b0ad 100644 --- a/lib/galaxy/datatypes/sequence.py +++ b/lib/galaxy/datatypes/sequence.py @@ -715,24 +715,11 @@ def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> N return data_lines = 0 sequences = 0 - seq_counter = 0 # blocks should be 4 lines long with compression_utils.get_fileobj(dataset.get_file_name()) as in_file: for line in in_file: - line = line.strip() - if line and line.startswith("#") and not data_lines: - # We don't count comment lines for sequence data types - continue - seq_counter += 1 + if line.startswith("@"): + sequences += 1 data_lines += 1 - if line and line.startswith("@"): - if seq_counter >= 4: - # count previous block - # blocks should be 4 lines long - sequences += 1 - seq_counter = 1 - if seq_counter >= 4: - # count final block - sequences += 1 dataset.metadata.data_lines = data_lines dataset.metadata.sequences = sequences From fcb6fd2517ea2d7376a2ea77a0f7597aee26e296 Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Tue, 13 Feb 2024 17:08:20 +0100 Subject: [PATCH 3/7] add tests for fasta/fastq set_meta --- lib/galaxy/datatypes/test/1.fasta | 1 + test/unit/data/datatypes/test_sequence.py | 52 +++++++++++++++++++++++ 2 files changed, 53 insertions(+) create mode 120000 lib/galaxy/datatypes/test/1.fasta create mode 100644 test/unit/data/datatypes/test_sequence.py diff --git a/lib/galaxy/datatypes/test/1.fasta b/lib/galaxy/datatypes/test/1.fasta new file mode 120000 index 000000000000..b10b7978f162 --- /dev/null +++ b/lib/galaxy/datatypes/test/1.fasta @@ -0,0 +1 @@ +../../../../test-data/1.fasta \ No newline at end of file diff --git a/test/unit/data/datatypes/test_sequence.py b/test/unit/data/datatypes/test_sequence.py new file mode 100644 index 000000000000..f83d0860f1c1 --- /dev/null +++ b/test/unit/data/datatypes/test_sequence.py @@ -0,0 +1,52 @@ +from galaxy.datatypes.sequence import Fasta, FastqSanger, FastqSolexa +from .util import ( + get_dataset, +) + + +def test_fasta_set_meta(): + b = Fasta() + with get_dataset("1.fasta") as dataset: + b.set_meta(dataset=dataset) + assert dataset.metadata.data_lines == 2 + assert dataset.metadata.sequences == 1 + + +def test_fastagz_set_meta(): + b = Fasta() + with get_dataset("1.fasta.gz") as dataset: + b.set_meta(dataset=dataset) + assert dataset.metadata.data_lines == 2 + assert dataset.metadata.sequences == 1 + + +def test_fastqsanger_set_meta(): + b = FastqSanger() + with get_dataset("1.fastqsanger") as dataset: + b.set_meta(dataset=dataset) + assert dataset.metadata.data_lines == 8 + assert dataset.metadata.sequences == 2 + + +def test_fastqsangergz_set_meta(): + b = FastqSanger() + with get_dataset("1.fastqsanger.gz") as dataset: + b.set_meta(dataset=dataset) + assert dataset.metadata.data_lines == 8 + assert dataset.metadata.sequences == 2 + + +def test_fastqsangerbz2_set_meta(): + b = FastqSanger() + with get_dataset("1.fastqsanger.bz2") as dataset: + b.set_meta(dataset=dataset) + assert dataset.metadata.data_lines == 8 + assert dataset.metadata.sequences == 2 + + +def test_fastqsolexa_set_meta(): + b = FastqSolexa() + with get_dataset("1.fastqsolexa") as dataset: + b.set_meta(dataset=dataset) + assert dataset.metadata.data_lines == 8 + assert dataset.metadata.sequences == 2 From 0dc23a65d1005cac9b41966654098a2294bd1d6b Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Tue, 13 Feb 2024 17:15:32 +0100 Subject: [PATCH 4/7] only check count every 4th line as sequence because @ can be in the qualities (like # .. which was handled wrong previously) --- lib/galaxy/datatypes/sequence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/galaxy/datatypes/sequence.py b/lib/galaxy/datatypes/sequence.py index b7cdcd07b0ad..9c84fc487e64 100644 --- a/lib/galaxy/datatypes/sequence.py +++ b/lib/galaxy/datatypes/sequence.py @@ -717,7 +717,7 @@ def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> N sequences = 0 with compression_utils.get_fileobj(dataset.get_file_name()) as in_file: for line in in_file: - if line.startswith("@"): + if line.startswith("@") and data_lines % 4 == 0: sequences += 1 data_lines += 1 dataset.metadata.data_lines = data_lines From e5ff24238f0b88c49a446c214da7418d1d18cf26 Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Tue, 13 Feb 2024 20:23:19 +0100 Subject: [PATCH 5/7] isort import fix --- test/unit/data/datatypes/test_sequence.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/test/unit/data/datatypes/test_sequence.py b/test/unit/data/datatypes/test_sequence.py index f83d0860f1c1..fefc6963121b 100644 --- a/test/unit/data/datatypes/test_sequence.py +++ b/test/unit/data/datatypes/test_sequence.py @@ -1,7 +1,9 @@ -from galaxy.datatypes.sequence import Fasta, FastqSanger, FastqSolexa -from .util import ( - get_dataset, +from galaxy.datatypes.sequence import ( + Fasta, + FastqSanger, + FastqSolexa, ) +from .util import get_dataset def test_fasta_set_meta(): From 6b52d6d8bb5ca057b81ec695f9ab5eac8c74e6d3 Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Tue, 13 Feb 2024 22:10:55 +0100 Subject: [PATCH 6/7] fix unit tests --- test/unit/data/datatypes/test_sequence.py | 62 ++++++++++------------- 1 file changed, 26 insertions(+), 36 deletions(-) diff --git a/test/unit/data/datatypes/test_sequence.py b/test/unit/data/datatypes/test_sequence.py index fefc6963121b..64982e89d637 100644 --- a/test/unit/data/datatypes/test_sequence.py +++ b/test/unit/data/datatypes/test_sequence.py @@ -1,12 +1,24 @@ +import pytest + from galaxy.datatypes.sequence import ( Fasta, FastqSanger, FastqSolexa, ) -from .util import get_dataset +from .util import ( + get_dataset, + MockDatasetDataset, +) -def test_fasta_set_meta(): +@pytest.mark.parametrize( + "input_file", + [ + "1.fasta", + "1.fasta.gz", + ], +) +def test_fasta_set_meta(input_file): b = Fasta() with get_dataset("1.fasta") as dataset: b.set_meta(dataset=dataset) @@ -14,41 +26,19 @@ def test_fasta_set_meta(): assert dataset.metadata.sequences == 1 -def test_fastagz_set_meta(): - b = Fasta() - with get_dataset("1.fasta.gz") as dataset: - b.set_meta(dataset=dataset) - assert dataset.metadata.data_lines == 2 - assert dataset.metadata.sequences == 1 - - -def test_fastqsanger_set_meta(): - b = FastqSanger() +@pytest.mark.parametrize( + "fastq_type,input_file", + [ + [FastqSanger, "1.fastqsanger"], + [FastqSanger, "1.fastqsanger.gz"], + [FastqSanger, "1.fastqsanger.bz2"], + [FastqSolexa, "1.fastqssolexa"], + ], +) +def test_fastqsanger_set_meta(fastq_type, input_file): + b = fastq_type() with get_dataset("1.fastqsanger") as dataset: - b.set_meta(dataset=dataset) - assert dataset.metadata.data_lines == 8 - assert dataset.metadata.sequences == 2 - - -def test_fastqsangergz_set_meta(): - b = FastqSanger() - with get_dataset("1.fastqsanger.gz") as dataset: - b.set_meta(dataset=dataset) - assert dataset.metadata.data_lines == 8 - assert dataset.metadata.sequences == 2 - - -def test_fastqsangerbz2_set_meta(): - b = FastqSanger() - with get_dataset("1.fastqsanger.bz2") as dataset: - b.set_meta(dataset=dataset) - assert dataset.metadata.data_lines == 8 - assert dataset.metadata.sequences == 2 - - -def test_fastqsolexa_set_meta(): - b = FastqSolexa() - with get_dataset("1.fastqsolexa") as dataset: + dataset.dataset = MockDatasetDataset(dataset.get_file_name()) b.set_meta(dataset=dataset) assert dataset.metadata.data_lines == 8 assert dataset.metadata.sequences == 2 From 24950a705ee0a44b0d589e3c0777fa68202df69f Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Wed, 14 Feb 2024 16:15:18 +0100 Subject: [PATCH 7/7] consider empty fasta lines --- lib/galaxy/datatypes/sequence.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/galaxy/datatypes/sequence.py b/lib/galaxy/datatypes/sequence.py index 9c84fc487e64..4b580530ade5 100644 --- a/lib/galaxy/datatypes/sequence.py +++ b/lib/galaxy/datatypes/sequence.py @@ -357,7 +357,9 @@ def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> N sequences = 0 with compression_utils.get_fileobj(dataset.get_file_name()) as fh: for line in fh: - if line.startswith(">"): + if not line: + continue + elif line[0] == ">": sequences += 1 data_lines += 1 else: