diff --git a/lib/galaxy/datatypes/sequence.py b/lib/galaxy/datatypes/sequence.py
index d7fc89a30363..4b580530ade5 100644
--- a/lib/galaxy/datatypes/sequence.py
+++ b/lib/galaxy/datatypes/sequence.py
@@ -349,6 +349,24 @@ class Fasta(Sequence):
     edam_format = "format_1929"
     file_ext = "fasta"
 
+    def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None:
+        """
+        Set the number of sequences and the number of data lines in a FASTA dataset.
+
+        Every line of the file is counted as a data line; every line that starts
+        with ">" is additionally counted as one sequence.
+        """
+        data_lines = 0
+        sequences = 0
+        with compression_utils.get_fileobj(dataset.get_file_name()) as fh:
+            for line in fh:
+                # Iterating a text file never yields an empty string, so no blank-line guard is needed.
+                if line.startswith(">"):
+                    sequences += 1
+                data_lines += 1
+        dataset.metadata.data_lines = data_lines
+        dataset.metadata.sequences = sequences
+
     def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
         """
         Determines whether the file is in fasta format
@@ -699,24 +717,11 @@ def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> N
             return
         data_lines = 0
         sequences = 0
-        seq_counter = 0  # blocks should be 4 lines long
         with compression_utils.get_fileobj(dataset.get_file_name()) as in_file:
             for line in in_file:
-                line = line.strip()
-                if line and line.startswith("#") and not data_lines:
-                    # We don't count comment lines for sequence data types
-                    continue
-                seq_counter += 1
+                if line.startswith("@") and data_lines % 4 == 0:
+                    sequences += 1
                 data_lines += 1
-                if line and line.startswith("@"):
-                    if seq_counter >= 4:
-                        # count previous block
-                        # blocks should be 4 lines long
-                        sequences += 1
-                        seq_counter = 1
-            if seq_counter >= 4:
-                # count final block
-                sequences += 1
         dataset.metadata.data_lines = data_lines
         dataset.metadata.sequences = sequences
 
diff --git a/lib/galaxy/datatypes/test/1.fasta b/lib/galaxy/datatypes/test/1.fasta
new file mode 120000
index 000000000000..b10b7978f162
--- /dev/null
+++ b/lib/galaxy/datatypes/test/1.fasta
@@ -0,0 +1 @@
+../../../../test-data/1.fasta
\ No newline at end of file
diff --git a/test/unit/data/datatypes/test_sequence.py b/test/unit/data/datatypes/test_sequence.py
new file mode 100644
index 000000000000..64982e89d637
--- /dev/null
+++ b/test/unit/data/datatypes/test_sequence.py
@@ -0,0 +1,46 @@
+import pytest
+
+from galaxy.datatypes.sequence import (
+    Fasta,
+    FastqSanger,
+    FastqSolexa,
+)
+from .util import (
+    get_dataset,
+    MockDatasetDataset,
+)
+
+
+@pytest.mark.parametrize(
+    "input_file",
+    [
+        "1.fasta",
+        "1.fasta.gz",
+    ],
+)
+def test_fasta_set_meta(input_file):
+    """Check the line and sequence counts Fasta.set_meta stores for each parametrized input file."""
+    b = Fasta()
+    with get_dataset(input_file) as dataset:
+        b.set_meta(dataset=dataset)
+        assert dataset.metadata.data_lines == 2
+        assert dataset.metadata.sequences == 1
+
+
+@pytest.mark.parametrize(
+    "fastq_type,input_file",
+    [
+        [FastqSanger, "1.fastqsanger"],
+        [FastqSanger, "1.fastqsanger.gz"],
+        [FastqSanger, "1.fastqsanger.bz2"],
+        [FastqSolexa, "1.fastqsolexa"],
+    ],
+)
+def test_fastqsanger_set_meta(fastq_type, input_file):
+    """Check the line and sequence counts set_meta stores for each parametrized FASTQ type and input file."""
+    b = fastq_type()
+    with get_dataset(input_file) as dataset:
+        dataset.dataset = MockDatasetDataset(dataset.get_file_name())
+        b.set_meta(dataset=dataset)
+        assert dataset.metadata.data_lines == 8
+        assert dataset.metadata.sequences == 2