Commit

Merge branch 'trs/curate/excel-input'

tsibley committed Jul 22, 2024
2 parents 63bbff2 + 981c740 commit be94e50
Showing 13 changed files with 171 additions and 43 deletions.
2 changes: 2 additions & 0 deletions CHANGES.md
@@ -6,8 +6,10 @@

* export v2: we now limit numerical precision on floats in the JSON. This should not change how a dataset is displayed / interpreted in Auspice but allows the gzipped & minimised JSON filesize to be reduced by around 30% (dataset-dependent). [#1512][] (@jameshadfield)
* traits, export v2: `augur traits` now reports all confidence values above 0.1% rather than limiting them to the top 4 results. There is no change in the eventual Auspice dataset as `augur export v2` will still only consider the top 4. [#1512][] (@jameshadfield)
* curate: Excel (`.xlsx` and `.xls`) and OpenOffice (`.ods`) spreadsheet files are now also supported as metadata inputs (`--metadata`). The first sheet in the workbook is read as tabular data. [#1550][] (@tsibley)

[#1512]: https://github.com/nextstrain/augur/pull/1512
[#1550]: https://github.com/nextstrain/augur/pull/1550


## 25.1.1 (15 July 2024)
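As a usage sketch of the new spreadsheet input (not part of the commit): assuming `read_table_to_dict` and `DEFAULT_DELIMITERS` are importable from `augur.io.metadata`, as the hunks below suggest, and a hypothetical `metadata.xlsx` on disk:

    # Minimal sketch, not committed code; metadata.xlsx is a hypothetical file.
    from augur.io.metadata import DEFAULT_DELIMITERS, read_table_to_dict

    # For workbook inputs the delimiters are effectively ignored: only the
    # first visible worksheet is read, and leading empty rows/columns are
    # skipped before the header row is taken from the first remaining row.
    for record in read_table_to_dict("metadata.xlsx", DEFAULT_DELIMITERS):
        print(record["strain"], record["date"])
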
6 changes: 3 additions & 3 deletions augur/curate/__init__.py
@@ -55,13 +55,13 @@ def create_shared_parser():
If no input options are provided, commands will try to read NDJSON records from stdin.
""")
shared_inputs.add_argument("--metadata",
help="Input metadata file. Accepts '-' to read metadata from stdin.")
help="Input metadata file. May be plain text (TSV, CSV) or an Excel or OpenOffice spreadsheet workbook file. When an Excel or OpenOffice workbook, only the first visible worksheet will be read and initial empty rows/columns will be ignored. Accepts '-' to read plain text from stdin.")
shared_inputs.add_argument("--id-column",
help="Name of the metadata column that contains the record identifier for reporting duplicate records. "
"Uses the first column of the metadata file if not provided. "
"Ignored if also providing a FASTA file input.")
shared_inputs.add_argument("--metadata-delimiters", default=DEFAULT_DELIMITERS, nargs="+", action=ExtendOverwriteDefault,
help="Delimiters to accept when reading a metadata file. Only one delimiter will be inferred.")
help="Delimiters to accept when reading a plain text metadata file. Only one delimiter will be inferred.")

shared_inputs.add_argument("--fasta",
help="Plain or gzipped FASTA file. Headers can only contain the sequence id used to match a metadata record. " +
@@ -181,7 +181,7 @@ def run(args):
# Read inputs
# Special case single hyphen as stdin
if args.metadata == '-':
args.metadata = sys.stdin
args.metadata = sys.stdin.buffer

if args.metadata and args.fasta:
try:
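The switch from `sys.stdin` to `sys.stdin.buffer` above hands the reader a binary stream so it can probe the input as a workbook before decoding it as text. A minimal illustration of the difference, using only the standard library:

    import sys

    # sys.stdin is a text-mode wrapper yielding str; sys.stdin.buffer is the
    # underlying binary stream yielding bytes. read_table_to_dict() (below)
    # asserts it receives bytes so it can first try the input as an
    # Excel/OpenOffice workbook, then fall back to delimiter sniffing on
    # decoded text.
    assert isinstance(sys.stdin.read(0), str)
    assert isinstance(sys.stdin.buffer.read(0), bytes)
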
20 changes: 18 additions & 2 deletions augur/io/json.py
@@ -32,7 +32,8 @@
SOFTWARE.
"""
import json
from datetime import date, datetime
from datetime import date, datetime, time, timedelta
from isodate import duration_isoformat
from typing import Iterable
from uuid import UUID

@@ -51,6 +52,16 @@ def as_json(value):
>>> as_json(datetime(year=2024, month=7, day=17, hour=11, minute=38))
'"2024-07-17T11:38:00"'
:class:`~datetime.time` objects:
>>> as_json(time(hour=11, minute=38))
'"11:38:00"'
:class:`~datetime.timedelta` objects:
>>> as_json(timedelta(days=42))
'"P42D"'
and :class:`~uuid.UUID` objects:
>>> as_json(UUID(int=147952133113722764103424939352979237618))
@@ -113,11 +124,16 @@ def default(self, value):
Serializes:
* :class:`~datetime.date` using :meth:`~datetime.date.isoformat()`
* :class:`~datetime.datetime` using :meth:`~datetime.datetime.isoformat()`
* :class:`~datetime.time` using :meth:`~datetime.time.isoformat()`
* :class:`~datetime.timedelta` using ``isodate.duration_isoformat()``
* :class:`~uuid.UUID` using ``str()``
"""
if isinstance(value, (date, datetime)):
if isinstance(value, (date, datetime, time)):
return value.isoformat()

elif isinstance(value, timedelta):
return duration_isoformat(value)

elif isinstance(value, UUID):
return str(value)

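Putting the new serialization rules together, a self-contained sketch of an encoder with the same `default()` behavior (the class name `AsJSONEncoder` is hypothetical; the committed class in `augur/io/json.py` is not fully shown in this hunk):

    import json
    from datetime import date, datetime, time, timedelta
    from uuid import UUID

    from isodate import duration_isoformat

    class AsJSONEncoder(json.JSONEncoder):
        def default(self, value):
            # Dates, datetimes, and times all serialize via their isoformat().
            if isinstance(value, (date, datetime, time)):
                return value.isoformat()
            # Timedeltas serialize as ISO 8601 durations, e.g. "P42D".
            elif isinstance(value, timedelta):
                return duration_isoformat(value)
            # UUIDs serialize via str().
            elif isinstance(value, UUID):
                return str(value)
            return super().default(value)

    print(json.dumps(timedelta(days=42), cls=AsJSONEncoder))       # "P42D"
    print(json.dumps(time(hour=11, minute=38), cls=AsJSONEncoder)) # "11:38:00"
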
111 changes: 88 additions & 23 deletions augur/io/metadata.py
@@ -3,9 +3,11 @@
from typing import Iterable, Sequence
import pandas as pd
import pyfastx
import python_calamine as calamine
import sys
from io import StringIO
from itertools import chain
from io import StringIO, TextIOWrapper
from itertools import chain, zip_longest
from textwrap import dedent

from augur.errors import AugurError
from augur.io.print import print_err
@@ -166,14 +168,19 @@ def read_table_to_dict(table, delimiters, duplicate_reporting=DataErrorMethod.ER
Will report duplicate records based on the *id_column* if requested via
*duplicate_reporting* after the generator has been exhausted.
When the *table* file is an Excel or OpenOffice workbook, only the first
visible worksheet will be read and initial empty rows/columns will be
ignored.
Parameters
----------
table: str
Path to a CSV or TSV file or IO buffer
Path to a CSV, TSV, Excel, or OpenOffice file or binary IO buffer
delimiters : list of str
List of possible delimiters to check for between columns in the metadata.
Only one delimiter will be inferred.
Ignored if *table* is an Excel or OpenOffice file.
duplicate_reporting: DataErrorMethod, optional
How should duplicate records be reported
@@ -197,34 +204,87 @@
"""
seen_ids = set()
duplicate_ids = set()
with open_file(table) as handle:
# Get sample to determine delimiter
table_sample = handle.readline()
with open_file(table, "rb") as handle:
# open_file(x, "rb") will return x as-is if it's already a file handle,
# and in that case the handle might be text mode even though we asked
# for bytes. This assertion guards against usage errors in our caller.
assert isinstance(handle.read(0), bytes)

columns = None
records = None

# Try binary handle as Excel/OpenOffice, as long as it's seekable so we
# can reset to the start on failure.
if handle.seekable():
handle.seek(0)
else:
table_sample_file = StringIO(table_sample)
handle = chain(table_sample_file, handle)
try:
workbook = calamine.load_workbook(handle)
except calamine.CalamineError:
handle.seek(0)
else:
def visible_worksheet(s: calamine.SheetMetadata) -> bool:
# Normally one would use "is" to compare to an enum, but
# these aren't actual Python enum.Enum classes.
return s.visible == calamine.SheetVisibleEnum.Visible \
and s.typ == calamine.SheetTypeEnum.WorkSheet

if not (sheet := next(filter(visible_worksheet, workbook.sheets_metadata), None)):
if not workbook.sheets_metadata:
error_msg = f"Excel/OpenOffice workbook {table!r} contains no sheets."
else:
error_msg = dedent(f"""\
Excel/OpenOffice workbook {table!r} contains no visible worksheets.
{len(workbook.sheets_metadata)} other sheets found:
""")

for sheet in workbook.sheets_metadata:
type = str(sheet.typ).replace('SheetTypeEnum.', '').lower()
visibility = str(sheet.visible).replace('SheetVisibleEnum.', '').lower()
error_msg += f" - {sheet.name!r} ({type=!s}, {visibility=!s})\n"

raise AugurError(error_msg)

rows = workbook.get_sheet_by_name(sheet.name).to_python(skip_empty_area=True)
columns = rows[0]
records = (
dict(zip_longest(columns, row[:len(columns)]))
for row
in rows[1:])

# Not Excel/OpenOffice, so convert handle to text and sniff the delimiter.
if records is None:
handle = TextIOWrapper(handle, encoding="utf-8", newline="")

# Get sample to determine delimiter
table_sample = handle.readline()

if handle.seekable():
handle.seek(0)
else:
table_sample_file = StringIO(table_sample)
handle = chain(table_sample_file, handle)

try:
# Note: this sort of duplicates _get_delimiter(), but it's easier if
# this is separate since it handles non-seekable buffers.
dialect = csv.Sniffer().sniff(table_sample, delimiters)
except csv.Error as error:
# This assumes all csv.Errors imply a delimiter issue. That might
# change in a future Python version.
raise InvalidDelimiter from error
try:
# Note: this sort of duplicates _get_delimiter(), but it's easier if
# this is separate since it handles non-seekable buffers.
dialect = csv.Sniffer().sniff(table_sample, delimiters)
except csv.Error as error:
# This assumes all csv.Errors imply a delimiter issue. That might
# change in a future Python version.
raise InvalidDelimiter from error

metadata_reader = csv.DictReader(handle, dialect=dialect)

columns, records = metadata_reader.fieldnames, iter(metadata_reader)

metadata_reader = csv.DictReader(handle, dialect=dialect)
if duplicate_reporting is DataErrorMethod.SILENT:
# Directly yield from metadata reader since we do not need to check for duplicate ids
yield from metadata_reader
yield from records
else:
if id_column is None:
id_column = metadata_reader.fieldnames[0]
id_column = columns[0]

for record in metadata_reader:
for record in records:
record_id = record.get(id_column)
if record_id is None:
raise AugurError(f"The provided id column {id_column!r} does not exist in {table!r}.")
@@ -281,13 +341,18 @@ def read_metadata_with_sequences(metadata, metadata_delimiters, fasta, seq_id_co
See pyfastx docs for more details:
https://pyfastx.readthedocs.io/en/latest/usage.html#fasta
When the *metadata* file is an Excel or OpenOffice workbook, only the first
visible worksheet will be read and initial empty rows/columns will be
ignored.
Parameters
----------
metadata: str
Path to a CSV or TSV metadata file
Path to a CSV, TSV, Excel, or OpenOffice metadata file or binary IO buffer
metadata_delimiters : list of str
List of possible delimiters to check for between columns in the metadata.
Ignored if *metadata* is an Excel or OpenOffice file.
fasta: str
Path to a plain or gzipped FASTA file
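The workbook-reading branch above condenses to the following standalone sketch (the path is hypothetical; the committed code operates on an already-open binary handle and falls back to delimiter sniffing when `calamine.CalamineError` is raised):

    from itertools import zip_longest
    import python_calamine as calamine

    workbook = calamine.load_workbook("metadata.xlsx")  # hypothetical path

    def visible_worksheet(s: calamine.SheetMetadata) -> bool:
        # These aren't Python enum.Enum classes, so compare with == not "is".
        return (s.visible == calamine.SheetVisibleEnum.Visible
                and s.typ == calamine.SheetTypeEnum.WorkSheet)

    # The first visible worksheet wins; the committed code raises AugurError
    # with a listing of the other sheets when no visible worksheet exists.
    sheet = next(filter(visible_worksheet, workbook.sheets_metadata))

    # skip_empty_area=True drops leading empty rows/columns.
    rows = workbook.get_sheet_by_name(sheet.name).to_python(skip_empty_area=True)

    # The first remaining row is the header; short rows are padded with None.
    columns = rows[0]
    records = (dict(zip_longest(columns, row[:len(columns)])) for row in rows[1:])
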
1 change: 1 addition & 0 deletions setup.py
@@ -64,6 +64,7 @@
"pandas >=1.0.0, ==1.*",
"phylo-treetime >=0.11.2, <0.12",
"pyfastx >=1.0.0, <3.0",
"python_calamine >=0.2.0",
"scipy ==1.*",
"xopen[zstd] >=1.7.0, <3" # TODO: Deprecated, remove v1 support around November 2024
],
52 changes: 52 additions & 0 deletions tests/functional/curate/cram/metadata-input.t
@@ -57,6 +57,58 @@ Test CSV metadata input from stdin
{"strain": "sequence_B", "country": "USA", "date": "2020-10-02"}
{"strain": "sequence_C", "country": "USA", "date": "2020-10-03"}

Test Excel (.xls) metadata input

$ ${AUGUR} curate passthru \
> --metadata "$TESTDIR/../data/metadata.xls"
{"strain": "sequence_A", "country": "USA", "date": "2020-10-01", "authors": "A,B,C,D,E,F,G,H,I,J,K"}
{"strain": "sequence_B", "country": "USA", "date": "2020-10-02", "authors": "A,B,C,D,E,F,G,H,I,J,K"}
{"strain": "sequence_C", "country": "USA", "date": "2020-10-03", "authors": "A,B,C,D,E,F,G,H,I,J,K"}

Test Excel (.xlsx) metadata input

$ ${AUGUR} curate passthru \
> --metadata "$TESTDIR/../data/metadata.xlsx"
{"strain": "sequence_A", "country": "USA", "date": "2020-10-01", "authors": "A,B,C,D,E,F,G,H,I,J,K"}
{"strain": "sequence_B", "country": "USA", "date": "2020-10-02", "authors": "A,B,C,D,E,F,G,H,I,J,K"}
{"strain": "sequence_C", "country": "USA", "date": "2020-10-03", "authors": "A,B,C,D,E,F,G,H,I,J,K"}

Test OpenOffice (.ods) metadata input

$ ${AUGUR} curate passthru \
> --metadata "$TESTDIR/../data/metadata.ods"
{"strain": "sequence_A", "country": "USA", "date": "2020-10-01", "authors": "A,B,C,D,E,F,G,H,I,J,K"}
{"strain": "sequence_B", "country": "USA", "date": "2020-10-02", "authors": "A,B,C,D,E,F,G,H,I,J,K"}
{"strain": "sequence_C", "country": "USA", "date": "2020-10-03", "authors": "A,B,C,D,E,F,G,H,I,J,K"}

Excel (.xlsx) workbook, skipped rows/cols

$ ${AUGUR} curate passthru \
> --metadata "$TESTDIR/../data/metadata-skipped-areas.xlsx"
{"strain": "sequence_A", "country": "USA", "date": "2020-10-01", "authors": "A,B,C,D,E,F,G,H,I,J,K"}
{"strain": "sequence_B", "country": "USA", "date": "2020-10-02", "authors": "A,B,C,D,E,F,G,H,I,J,K"}
{"strain": "sequence_C", "country": "USA", "date": "2020-10-03", "authors": "A,B,C,D,E,F,G,H,I,J,K"}

Excel (.xlsx) workbook, skipped hidden sheet

$ ${AUGUR} curate passthru \
> --metadata "$TESTDIR/../data/metadata-skipped-hidden-sheet.xlsx"
{"strain": "sequence_A", "country": "USA", "date": "2020-10-01", "authors": "A,B,C,D,E,F,G,H,I,J,K"}
{"strain": "sequence_B", "country": "USA", "date": "2020-10-02", "authors": "A,B,C,D,E,F,G,H,I,J,K"}
{"strain": "sequence_C", "country": "USA", "date": "2020-10-03", "authors": "A,B,C,D,E,F,G,H,I,J,K"}

Excel (.xlsx) workbook, no valid sheets

$ ${AUGUR} curate passthru \
> --metadata "$TESTDIR/../data/metadata-no-valid-sheet.xlsx"
ERROR: Excel/OpenOffice workbook '*/metadata-no-valid-sheet.xlsx' contains no visible worksheets. (glob)

3 other sheets found:
- 'Hidden' (type=worksheet, visibility=hidden)
- 'VeryHidden' (type=worksheet, visibility=veryhidden)
- 'Chart' (type=chartsheet, visibility=visible)

[2]

Create a metadata TSV file with duplicate records

Binary file added tests/functional/curate/data/metadata-no-valid-sheet.xlsx
Binary file added tests/functional/curate/data/metadata-skipped-areas.xlsx
Binary file added tests/functional/curate/data/metadata-skipped-hidden-sheet.xlsx
Binary file added tests/functional/curate/data/metadata.ods
Binary file added tests/functional/curate/data/metadata.xls
Binary file added tests/functional/curate/data/metadata.xlsx
22 changes: 7 additions & 15 deletions tests/io/test_metadata.py
@@ -1,7 +1,6 @@
import pytest
import shutil
import sys
from io import StringIO
from io import BytesIO

from augur.errors import AugurError
from augur.io.metadata import InvalidDelimiter, read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv, Metadata
@@ -28,11 +27,6 @@ def metadata_with_duplicate(tmpdir):
fh.write('SEQ_B\t2020-10-03\tUSA\n')
return path

@pytest.fixture
def mp_context(monkeypatch):
with monkeypatch.context() as mp:
yield mp

class TestReadMetadataToDict:
def test_read_table_to_dict_with_csv(self, tmpdir, expected_record):
path = str(tmpdir / 'metadata.csv')
@@ -43,10 +37,9 @@ def test_read_table_to_dict_with_csv(self, tmpdir, expected_record):
record = next(read_table_to_dict(path, (',')))
assert record == expected_record

def test_read_table_to_dict_with_csv_from_stdin(self, mp_context, expected_record):
stdin = StringIO('strain,date,country,lab\nSEQ_A,2020-10-03,USA,A Virology Lab "Vector"\n')
mp_context.setattr('sys.stdin', stdin)
record = next(read_table_to_dict(sys.stdin, (',')))
def test_read_table_to_dict_with_csv_from_handle(self, expected_record):
handle = BytesIO(b'strain,date,country,lab\nSEQ_A,2020-10-03,USA,A Virology Lab "Vector"\n')
record = next(read_table_to_dict(handle, (',')))
assert record == expected_record

def test_read_table_to_dict_with_tsv(self, tmpdir, expected_record):
@@ -58,10 +51,9 @@ def test_read_table_to_dict_with_tsv(self, tmpdir, expected_record):
record = next(read_table_to_dict(path, ('\t')))
assert record == expected_record

def test_read_table_to_dict_with_tsv_from_stdin(self, mp_context, expected_record):
stdin = StringIO('strain\tdate\tcountry\tlab\nSEQ_A\t2020-10-03\tUSA\tA Virology Lab "Vector"\n')
mp_context.setattr('sys.stdin', stdin)
record = next(read_table_to_dict(sys.stdin, ('\t')))
def test_read_table_to_dict_with_tsv_from_handle(self, expected_record):
handle = BytesIO(b'strain\tdate\tcountry\tlab\nSEQ_A\t2020-10-03\tUSA\tA Virology Lab "Vector"\n')
record = next(read_table_to_dict(handle, ('\t')))
assert record == expected_record

def test_read_table_to_dict_with_bad_delimiter(self, tmpdir):
