Skip to content

Commit

Permalink
Merge pull request #17462 from bernt-matthias/topic/fasta-set-meta
Browse files Browse the repository at this point in the history
Faster FASTA and FASTQ metadata setting
  • Loading branch information
martenson authored Feb 19, 2024
2 parents a626b31 + 24950a7 commit 0d380c4
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 15 deletions.
35 changes: 20 additions & 15 deletions lib/galaxy/datatypes/sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,24 @@ class Fasta(Sequence):
edam_format = "format_1929"
file_ext = "fasta"

def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None:
    """
    Record line and sequence counts for a FASTA dataset.

    The dataset's file is opened through ``compression_utils.get_fileobj``
    and read line by line. Every non-empty line read adds one to
    ``dataset.metadata.data_lines``; lines whose first character is ``>``
    additionally add one to ``dataset.metadata.sequences``.

    ``overwrite`` and ``**kwd`` are accepted but not used here.
    """
    line_total = 0
    header_total = 0
    with compression_utils.get_fileobj(dataset.get_file_name()) as handle:
        for record in handle:
            # NOTE(review): lines yielded by a text file object normally keep
            # their newline, so this guard presumably never fires - confirm
            # against what get_fileobj returns.
            if not record:
                continue
            # Header lines and sequence lines both count as data lines.
            line_total += 1
            if record[0] == ">":
                header_total += 1
    dataset.metadata.data_lines = line_total
    dataset.metadata.sequences = header_total

def sniff_prefix(self, file_prefix: FilePrefix) -> bool:
"""
Determines whether the file is in fasta format
Expand Down Expand Up @@ -699,24 +717,11 @@ def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> N
return
data_lines = 0
sequences = 0
seq_counter = 0 # blocks should be 4 lines long
with compression_utils.get_fileobj(dataset.get_file_name()) as in_file:
for line in in_file:
line = line.strip()
if line and line.startswith("#") and not data_lines:
# We don't count comment lines for sequence data types
continue
seq_counter += 1
if line.startswith("@") and data_lines % 4 == 0:
sequences += 1
data_lines += 1
if line and line.startswith("@"):
if seq_counter >= 4:
# count previous block
# blocks should be 4 lines long
sequences += 1
seq_counter = 1
if seq_counter >= 4:
# count final block
sequences += 1
dataset.metadata.data_lines = data_lines
dataset.metadata.sequences = sequences

Expand Down
1 change: 1 addition & 0 deletions lib/galaxy/datatypes/test/1.fasta
44 changes: 44 additions & 0 deletions test/unit/data/datatypes/test_sequence.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import pytest

from galaxy.datatypes.sequence import (
Fasta,
FastqSanger,
FastqSolexa,
)
from .util import (
get_dataset,
MockDatasetDataset,
)


@pytest.mark.parametrize(
    "input_file",
    [
        "1.fasta",
        "1.fasta.gz",
    ],
)
def test_fasta_set_meta(input_file):
    """
    Check the counts ``Fasta.set_meta`` stores for each parametrized fixture.

    The expected values (2 data lines, 1 sequence) are the same for every
    parameter, so this assumes the compressed fixture holds the same content
    as the plain one - TODO confirm the fixture exists and matches.
    """
    datatype = Fasta()
    # Open the parametrized file; a hard-coded name would make every
    # parameter run the identical uncompressed case.
    with get_dataset(input_file) as dataset:
        datatype.set_meta(dataset=dataset)
        assert dataset.metadata.data_lines == 2
        assert dataset.metadata.sequences == 1


@pytest.mark.parametrize(
    "fastq_type,input_file",
    [
        [FastqSanger, "1.fastqsanger"],
        [FastqSanger, "1.fastqsanger.gz"],
        [FastqSanger, "1.fastqsanger.bz2"],
        [FastqSolexa, "1.fastqssolexa"],
    ],
)
def test_fastqsanger_set_meta(fastq_type, input_file):
    """
    Check the counts a FASTQ datatype's ``set_meta`` stores for each fixture.

    Every parameter is expected to yield 8 data lines and 2 sequences, so
    this assumes all listed fixtures hold two records of four lines each -
    TODO confirm, in particular for the solexa fixture.
    """
    datatype = fastq_type()
    # Open the parametrized file; a hard-coded name would leave the
    # compressed and solexa fixtures unread.
    with get_dataset(input_file) as dataset:
        # NOTE(review): presumably set_meta reads something from
        # ``dataset.dataset``, hence the mock - verify against the datatype.
        dataset.dataset = MockDatasetDataset(dataset.get_file_name())
        datatype.set_meta(dataset=dataset)
        assert dataset.metadata.data_lines == 8
        assert dataset.metadata.sequences == 2

0 comments on commit 0d380c4

Please sign in to comment.