Commit

Merge branch 'trs/curate/excel-input'

tsibley committed Jul 22, 2024
2 parents 63bbff2 + 981c740 commit be94e50
Showing 13 changed files with 171 additions and 43 deletions.
2 changes: 2 additions & 0 deletions CHANGES.md
@@ -6,8 +6,10 @@

* export v2: we now limit numerical precision on floats in the JSON. This should not change how a dataset is displayed / interpreted in Auspice but allows the gzipped & minimised JSON filesize to be reduced by around 30% (dataset-dependent). [#1512][] (@jameshadfield)
* traits, export v2: `augur traits` now reports all confidence values above 0.1% rather than limiting them to the top 4 results. There is no change in the eventual Auspice dataset as `augur export v2` will still only consider the top 4. [#1512][] (@jameshadfield)
* curate: Excel (`.xlsx` and `.xls`) and OpenOffice (`.ods`) spreadsheet files are now also supported as metadata inputs (`--metadata`). The first sheet in the workbook is read as tabular data. [#1550][] (@tsibley)

[#1512]: https://github.com/nextstrain/augur/pull/1512
[#1550]: https://github.com/nextstrain/augur/pull/1550


## 25.1.1 (15 July 2024)
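As a usage sketch of the new spreadsheet input (not part of the commit): assuming `read_table_to_dict` and `DEFAULT_DELIMITERS` are importable from `augur.io.metadata`, as the hunks below suggest, and a hypothetical `metadata.xlsx` on disk:

    # Minimal sketch, not committed code; metadata.xlsx is a hypothetical file.
    from augur.io.metadata import DEFAULT_DELIMITERS, read_table_to_dict

    # For workbook inputs the delimiters are effectively ignored: only the
    # first visible worksheet is read, and leading empty rows/columns are
    # skipped before the header row is taken from the first remaining row.
    for record in read_table_to_dict("metadata.xlsx", DEFAULT_DELIMITERS):
        print(record["strain"], record["date"])
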
6 changes: 3 additions & 3 deletions augur/curate/__init__.py
@@ -55,13 +55,13 @@ def create_shared_parser():
If no input options are provided, commands will try to read NDJSON records from stdin.
""")
shared_inputs.add_argument("--metadata",
help="Input metadata file. Accepts '-' to read metadata from stdin.")
help="Input metadata file. May be plain text (TSV, CSV) or an Excel or OpenOffice spreadsheet workbook file. When an Excel or OpenOffice workbook, only the first visible worksheet will be read and initial empty rows/columns will be ignored. Accepts '-' to read plain text from stdin.")
shared_inputs.add_argument("--id-column",
help="Name of the metadata column that contains the record identifier for reporting duplicate records. "
"Uses the first column of the metadata file if not provided. "
"Ignored if also providing a FASTA file input.")
shared_inputs.add_argument("--metadata-delimiters", default=DEFAULT_DELIMITERS, nargs="+", action=ExtendOverwriteDefault,
help="Delimiters to accept when reading a metadata file. Only one delimiter will be inferred.")
help="Delimiters to accept when reading a plain text metadata file. Only one delimiter will be inferred.")

shared_inputs.add_argument("--fasta",
help="Plain or gzipped FASTA file. Headers can only contain the sequence id used to match a metadata record. " +
@@ -181,7 +181,7 @@ def run(args):
# Read inputs
# Special case single hyphen as stdin
if args.metadata == '-':
args.metadata = sys.stdin
args.metadata = sys.stdin.buffer

if args.metadata and args.fasta:
try:
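The switch from `sys.stdin` to `sys.stdin.buffer` above hands the reader a binary stream so it can probe the input as a workbook before decoding it as text. A minimal illustration of the difference, using only the standard library:

    import sys

    # sys.stdin is a text-mode wrapper yielding str; sys.stdin.buffer is the
    # underlying binary stream yielding bytes. read_table_to_dict() (below)
    # asserts it receives bytes so it can first try the input as an
    # Excel/OpenOffice workbook, then fall back to delimiter sniffing on
    # decoded text.
    assert isinstance(sys.stdin.read(0), str)
    assert isinstance(sys.stdin.buffer.read(0), bytes)
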
20 changes: 18 additions & 2 deletions augur/io/json.py
@@ -32,7 +32,8 @@
SOFTWARE.
"""
import json
from datetime import date, datetime
from datetime import date, datetime, time, timedelta
from isodate import duration_isoformat
from typing import Iterable
from uuid import UUID

@@ -51,6 +52,16 @@ def as_json(value):
>>> as_json(datetime(year=2024, month=7, day=17, hour=11, minute=38))
'"2024-07-17T11:38:00"'
:class:`~datetime.time` objects:
>>> as_json(time(hour=11, minute=38))
'"11:38:00"'
:class:`~datetime.timedelta` objects:
>>> as_json(timedelta(days=42))
'"P42D"'
and :class:`~uuid.UUID` objects:
>>> as_json(UUID(int=147952133113722764103424939352979237618))
@@ -113,11 +124,16 @@ def default(self, value):
Serializes:
* :class:`~datetime.date` using :meth:`~datetime.date.isoformat()`
* :class:`~datetime.datetime` using :meth:`~datetime.datetime.isoformat()`
* :class:`~datetime.time` using :meth:`~datetime.time.isoformat()`
* :class:`~datetime.timedelta` using ``isodate.duration_isoformat()``
* :class:`~uuid.UUID` using ``str()``
"""
if isinstance(value, (date, datetime)):
if isinstance(value, (date, datetime, time)):
return value.isoformat()

elif isinstance(value, timedelta):
return duration_isoformat(value)

elif isinstance(value, UUID):
return str(value)

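Putting the new serialization rules together, a self-contained sketch of an encoder with the same `default()` behavior (the class name `AsJSONEncoder` is hypothetical; the committed class in `augur/io/json.py` is not fully shown in this hunk):

    import json
    from datetime import date, datetime, time, timedelta
    from uuid import UUID

    from isodate import duration_isoformat

    class AsJSONEncoder(json.JSONEncoder):
        def default(self, value):
            # Dates, datetimes, and times all serialize via their isoformat().
            if isinstance(value, (date, datetime, time)):
                return value.isoformat()
            # Timedeltas serialize as ISO 8601 durations, e.g. "P42D".
            elif isinstance(value, timedelta):
                return duration_isoformat(value)
            # UUIDs serialize via str().
            elif isinstance(value, UUID):
                return str(value)
            return super().default(value)

    print(json.dumps(timedelta(days=42), cls=AsJSONEncoder))       # "P42D"
    print(json.dumps(time(hour=11, minute=38), cls=AsJSONEncoder)) # "11:38:00"
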
111 changes: 88 additions & 23 deletions augur/io/metadata.py
@@ -3,9 +3,11 @@
from typing import Iterable, Sequence
import pandas as pd
import pyfastx
import python_calamine as calamine
import sys
from io import StringIO
from itertools import chain
from io import StringIO, TextIOWrapper
from itertools import chain, zip_longest
from textwrap import dedent

from augur.errors import AugurError
from augur.io.print import print_err
@@ -166,14 +168,19 @@ def read_table_to_dict(table, delimiters, duplicate_reporting=DataErrorMethod.ER
Will report duplicate records based on the *id_column* if requested via
*duplicate_reporting* after the generator has been exhausted.
When the *table* file is an Excel or OpenOffice workbook, only the first
visible worksheet will be read and initial empty rows/columns will be
ignored.
Parameters
----------
table: str
Path to a CSV or TSV file or IO buffer
Path to a CSV, TSV, Excel, or OpenOffice file or binary IO buffer
delimiters : list of str
List of possible delimiters to check for between columns in the metadata.
Only one delimiter will be inferred.
Ignored if *table* is an Excel or OpenOffice file.
duplicate_reporting: DataErrorMethod, optional
How should duplicate records be reported
@@ -197,34 +204,87 @@
"""
seen_ids = set()
duplicate_ids = set()
with open_file(table) as handle:
# Get sample to determine delimiter
table_sample = handle.readline()
with open_file(table, "rb") as handle:
# open_file(x, "rb") will return x as-is if it's already a file handle,
# and in that case the handle might be text mode even though we asked
# for bytes. This assertion guards against usage errors in our caller.
assert isinstance(handle.read(0), bytes)

columns = None
records = None

# Try binary handle as Excel/OpenOffice, as long as it's seekable so we
# can reset to the start on failure.
if handle.seekable():
handle.seek(0)
else:
table_sample_file = StringIO(table_sample)
handle = chain(table_sample_file, handle)
try:
workbook = calamine.load_workbook(handle)
except calamine.CalamineError:
handle.seek(0)
else:
def visible_worksheet(s: calamine.SheetMetadata) -> bool:
# Normally one would use "is" to compare to an enum, but
# these aren't actual Python enum.Enum classes.
return s.visible == calamine.SheetVisibleEnum.Visible \
and s.typ == calamine.SheetTypeEnum.WorkSheet

if not (sheet := next(filter(visible_worksheet, workbook.sheets_metadata), None)):
if not workbook.sheets_metadata:
error_msg = f"Excel/OpenOffice workbook {table!r} contains no sheets."
else:
error_msg = dedent(f"""\
Excel/OpenOffice workbook {table!r} contains no visible worksheets.
{len(workbook.sheets_metadata)} other sheets found:
""")

for sheet in workbook.sheets_metadata:
type = str(sheet.typ).replace('SheetTypeEnum.', '').lower()
visibility = str(sheet.visible).replace('SheetVisibleEnum.', '').lower()
error_msg += f" - {sheet.name!r} ({type=!s}, {visibility=!s})\n"

raise AugurError(error_msg)

rows = workbook.get_sheet_by_name(sheet.name).to_python(skip_empty_area=True)
columns = rows[0]
records = (
dict(zip_longest(columns, row[:len(columns)]))
for row
in rows[1:])

# Not Excel/OpenOffice, so convert handle to text and sniff the delimiter.
if records is None:
handle = TextIOWrapper(handle, encoding="utf-8", newline="")

# Get sample to determine delimiter
table_sample = handle.readline()

if handle.seekable():
handle.seek(0)
else:
table_sample_file = StringIO(table_sample)
handle = chain(table_sample_file, handle)

try:
# Note: this sort of duplicates _get_delimiter(), but it's easier if
# this is separate since it handles non-seekable buffers.
dialect = csv.Sniffer().sniff(table_sample, delimiters)
except csv.Error as error:
# This assumes all csv.Errors imply a delimiter issue. That might
# change in a future Python version.
raise InvalidDelimiter from error
try:
# Note: this sort of duplicates _get_delimiter(), but it's easier if
# this is separate since it handles non-seekable buffers.
dialect = csv.Sniffer().sniff(table_sample, delimiters)
except csv.Error as error:
# This assumes all csv.Errors imply a delimiter issue. That might
# change in a future Python version.
raise InvalidDelimiter from error

metadata_reader = csv.DictReader(handle, dialect=dialect)

columns, records = metadata_reader.fieldnames, iter(metadata_reader)

metadata_reader = csv.DictReader(handle, dialect=dialect)
if duplicate_reporting is DataErrorMethod.SILENT:
# Directly yield from metadata reader since we do not need to check for duplicate ids
yield from metadata_reader
yield from records
else:
if id_column is None:
id_column = metadata_reader.fieldnames[0]
id_column = columns[0]

for record in metadata_reader:
for record in records:
record_id = record.get(id_column)
if record_id is None:
raise AugurError(f"The provided id column {id_column!r} does not exist in {table!r}.")
@@ -281,13 +341,18 @@ def read_metadata_with_sequences(metadata, metadata_delimiters, fasta, seq_id_co
See pyfastx docs for more details:
https://pyfastx.readthedocs.io/en/latest/usage.html#fasta
When the *metadata* file is an Excel or OpenOffice workbook, only the first
visible worksheet will be read and initial empty rows/columns will be
ignored.
Parameters
----------
metadata: str
Path to a CSV or TSV metadata file
Path to a CSV, TSV, Excel, or OpenOffice metadata file or binary IO buffer
metadata_delimiters : list of str
List of possible delimiters to check for between columns in the metadata.
Ignored if *metadata* is an Excel or OpenOffice file.
fasta: str
Path to a plain or gzipped FASTA file
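The workbook-reading branch above condenses to the following standalone sketch (the path is hypothetical; the committed code operates on an already-open binary handle and falls back to delimiter sniffing when `calamine.CalamineError` is raised):

    from itertools import zip_longest
    import python_calamine as calamine

    workbook = calamine.load_workbook("metadata.xlsx")  # hypothetical path

    def visible_worksheet(s: calamine.SheetMetadata) -> bool:
        # These aren't Python enum.Enum classes, so compare with == not "is".
        return (s.visible == calamine.SheetVisibleEnum.Visible
                and s.typ == calamine.SheetTypeEnum.WorkSheet)

    # The first visible worksheet wins; the committed code raises AugurError
    # with a listing of the other sheets when no visible worksheet exists.
    sheet = next(filter(visible_worksheet, workbook.sheets_metadata))

    # skip_empty_area=True drops leading empty rows/columns.
    rows = workbook.get_sheet_by_name(sheet.name).to_python(skip_empty_area=True)

    # The first remaining row is the header; short rows are padded with None.
    columns = rows[0]
    records = (dict(zip_longest(columns, row[:len(columns)])) for row in rows[1:])
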
1 change: 1 addition & 0 deletions setup.py
@@ -64,6 +64,7 @@
"pandas >=1.0.0, ==1.*",
"phylo-treetime >=0.11.2, <0.12",
"pyfastx >=1.0.0, <3.0",
"python_calamine >=0.2.0",
"scipy ==1.*",
"xopen[zstd] >=1.7.0, <3" # TODO: Deprecated, remove v1 support around November 2024
],
52 changes: 52 additions & 0 deletions tests/functional/curate/cram/metadata-input.t
@@ -57,6 +57,58 @@ Test CSV metadata input from stdin
{"strain": "sequence_B", "country": "USA", "date": "2020-10-02"}
{"strain": "sequence_C", "country": "USA", "date": "2020-10-03"}

Test Excel (.xls) metadata input

$ ${AUGUR} curate passthru \
> --metadata "$TESTDIR/../data/metadata.xls"
{"strain": "sequence_A", "country": "USA", "date": "2020-10-01", "authors": "A,B,C,D,E,F,G,H,I,J,K"}
{"strain": "sequence_B", "country": "USA", "date": "2020-10-02", "authors": "A,B,C,D,E,F,G,H,I,J,K"}
{"strain": "sequence_C", "country": "USA", "date": "2020-10-03", "authors": "A,B,C,D,E,F,G,H,I,J,K"}

Test Excel (.xlsx) metadata input

$ ${AUGUR} curate passthru \
> --metadata "$TESTDIR/../data/metadata.xlsx"
{"strain": "sequence_A", "country": "USA", "date": "2020-10-01", "authors": "A,B,C,D,E,F,G,H,I,J,K"}
{"strain": "sequence_B", "country": "USA", "date": "2020-10-02", "authors": "A,B,C,D,E,F,G,H,I,J,K"}
{"strain": "sequence_C", "country": "USA", "date": "2020-10-03", "authors": "A,B,C,D,E,F,G,H,I,J,K"}

Test OpenOffice (.ods) metadata input

$ ${AUGUR} curate passthru \
> --metadata "$TESTDIR/../data/metadata.ods"
{"strain": "sequence_A", "country": "USA", "date": "2020-10-01", "authors": "A,B,C,D,E,F,G,H,I,J,K"}
{"strain": "sequence_B", "country": "USA", "date": "2020-10-02", "authors": "A,B,C,D,E,F,G,H,I,J,K"}
{"strain": "sequence_C", "country": "USA", "date": "2020-10-03", "authors": "A,B,C,D,E,F,G,H,I,J,K"}

Excel (.xlsx) workbook, skipped rows/cols

$ ${AUGUR} curate passthru \
> --metadata "$TESTDIR/../data/metadata-skipped-areas.xlsx"
{"strain": "sequence_A", "country": "USA", "date": "2020-10-01", "authors": "A,B,C,D,E,F,G,H,I,J,K"}
{"strain": "sequence_B", "country": "USA", "date": "2020-10-02", "authors": "A,B,C,D,E,F,G,H,I,J,K"}
{"strain": "sequence_C", "country": "USA", "date": "2020-10-03", "authors": "A,B,C,D,E,F,G,H,I,J,K"}

Excel (.xlsx) workbook, skipped hidden sheet

$ ${AUGUR} curate passthru \
> --metadata "$TESTDIR/../data/metadata-skipped-hidden-sheet.xlsx"
{"strain": "sequence_A", "country": "USA", "date": "2020-10-01", "authors": "A,B,C,D,E,F,G,H,I,J,K"}
{"strain": "sequence_B", "country": "USA", "date": "2020-10-02", "authors": "A,B,C,D,E,F,G,H,I,J,K"}
{"strain": "sequence_C", "country": "USA", "date": "2020-10-03", "authors": "A,B,C,D,E,F,G,H,I,J,K"}

Excel (.xlsx) workbook, no valid sheets

$ ${AUGUR} curate passthru \
> --metadata "$TESTDIR/../data/metadata-no-valid-sheet.xlsx"
ERROR: Excel/OpenOffice workbook '*/metadata-no-valid-sheet.xlsx' contains no visible worksheets. (glob)

3 other sheets found:
- 'Hidden' (type=worksheet, visibility=hidden)
- 'VeryHidden' (type=worksheet, visibility=veryhidden)
- 'Chart' (type=chartsheet, visibility=visible)

[2]

Create a metadata TSV file with duplicate records

Binary file added tests/functional/curate/data/metadata-no-valid-sheet.xlsx
Binary file added tests/functional/curate/data/metadata-skipped-areas.xlsx
Binary file added tests/functional/curate/data/metadata-skipped-hidden-sheet.xlsx
Binary file added tests/functional/curate/data/metadata.ods
Binary file added tests/functional/curate/data/metadata.xls
Binary file added tests/functional/curate/data/metadata.xlsx
22 changes: 7 additions & 15 deletions tests/io/test_metadata.py
@@ -1,7 +1,6 @@
import pytest
import shutil
import sys
from io import StringIO
from io import BytesIO

from augur.errors import AugurError
from augur.io.metadata import InvalidDelimiter, read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv, Metadata
@@ -28,11 +27,6 @@ def metadata_with_duplicate(tmpdir):
fh.write('SEQ_B\t2020-10-03\tUSA\n')
return path

@pytest.fixture
def mp_context(monkeypatch):
with monkeypatch.context() as mp:
yield mp

class TestReadMetadataToDict:
def test_read_table_to_dict_with_csv(self, tmpdir, expected_record):
path = str(tmpdir / 'metadata.csv')
@@ -43,10 +37,9 @@ def test_read_table_to_dict_with_csv(self, tmpdir, expected_record):
record = next(read_table_to_dict(path, (',')))
assert record == expected_record

def test_read_table_to_dict_with_csv_from_stdin(self, mp_context, expected_record):
stdin = StringIO('strain,date,country,lab\nSEQ_A,2020-10-03,USA,A Virology Lab "Vector"\n')
mp_context.setattr('sys.stdin', stdin)
record = next(read_table_to_dict(sys.stdin, (',')))
def test_read_table_to_dict_with_csv_from_handle(self, expected_record):
handle = BytesIO(b'strain,date,country,lab\nSEQ_A,2020-10-03,USA,A Virology Lab "Vector"\n')
record = next(read_table_to_dict(handle, (',')))
assert record == expected_record

def test_read_table_to_dict_with_tsv(self, tmpdir, expected_record):
@@ -58,10 +51,9 @@ def test_read_table_to_dict_with_tsv(self, tmpdir, expected_record):
record = next(read_table_to_dict(path, ('\t')))
assert record == expected_record

def test_read_table_to_dict_with_tsv_from_stdin(self, mp_context, expected_record):
stdin = StringIO('strain\tdate\tcountry\tlab\nSEQ_A\t2020-10-03\tUSA\tA Virology Lab "Vector"\n')
mp_context.setattr('sys.stdin', stdin)
record = next(read_table_to_dict(sys.stdin, ('\t')))
def test_read_table_to_dict_with_tsv_from_handle(self, expected_record):
handle = BytesIO(b'strain\tdate\tcountry\tlab\nSEQ_A\t2020-10-03\tUSA\tA Virology Lab "Vector"\n')
record = next(read_table_to_dict(handle, ('\t')))
assert record == expected_record

def test_read_table_to_dict_with_bad_delimiter(self, tmpdir):
