From ca06be4ffea0e5fb6c104fb03e24bfaddd644241 Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Wed, 17 Jul 2024 17:46:43 -0700 Subject: [PATCH 1/4] Be more precise about what we're testing in read_table_to_dict() These tests have nothing directly to do with stdin per se: they're testing whether read_table_to_dict() accepts a handle or not. There's no need to override sys.stdin, as it's not used within read_table_to_dict() or the nested call to open_file(). Note that open_file("-") _will_ use sys.stdin and read_table_to_dict(x) calls open_file(x), so you might think that we do sometimes use sys.stdin and should instead (or additionally) actually test that. However, our only real calls to read_table_to_dict() originate from augur/curate/__init__.py, which translates a filename of "-" to sys.stdin itself before open_file() ever sees it. --- tests/io/test_metadata.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/tests/io/test_metadata.py b/tests/io/test_metadata.py index 45ec13f10..f9d724243 100644 --- a/tests/io/test_metadata.py +++ b/tests/io/test_metadata.py @@ -1,6 +1,5 @@ import pytest import shutil -import sys from io import StringIO from augur.errors import AugurError @@ -28,11 +27,6 @@ def metadata_with_duplicate(tmpdir): fh.write('SEQ_B\t2020-10-03\tUSA\n') return path -@pytest.fixture -def mp_context(monkeypatch): - with monkeypatch.context() as mp: - yield mp - class TestReadMetadataToDict: def test_read_table_to_dict_with_csv(self, tmpdir, expected_record): path = str(tmpdir / 'metadata.csv') @@ -43,10 +37,9 @@ def test_read_table_to_dict_with_csv(self, tmpdir, expected_record): record = next(read_table_to_dict(path, (','))) assert record == expected_record - def test_read_table_to_dict_with_csv_from_stdin(self, mp_context, expected_record): - stdin = StringIO('strain,date,country,lab\nSEQ_A,2020-10-03,USA,A Virology Lab "Vector"\n') - mp_context.setattr('sys.stdin', stdin) - record = 
next(read_table_to_dict(sys.stdin, (','))) + def test_read_table_to_dict_with_csv_from_handle(self, expected_record): + handle = StringIO('strain,date,country,lab\nSEQ_A,2020-10-03,USA,A Virology Lab "Vector"\n') + record = next(read_table_to_dict(handle, (','))) assert record == expected_record def test_read_table_to_dict_with_tsv(self, tmpdir, expected_record): @@ -58,10 +51,9 @@ def test_read_table_to_dict_with_tsv(self, tmpdir, expected_record): record = next(read_table_to_dict(path, ('\t'))) assert record == expected_record - def test_read_table_to_dict_with_tsv_from_stdin(self, mp_context, expected_record): - stdin = StringIO('strain\tdate\tcountry\tlab\nSEQ_A\t2020-10-03\tUSA\tA Virology Lab "Vector"\n') - mp_context.setattr('sys.stdin', stdin) - record = next(read_table_to_dict(sys.stdin, ('\t'))) + def test_read_table_to_dict_with_tsv_from_handle(self, expected_record): + handle = StringIO('strain\tdate\tcountry\tlab\nSEQ_A\t2020-10-03\tUSA\tA Virology Lab "Vector"\n') + record = next(read_table_to_dict(handle, ('\t'))) assert record == expected_record def test_read_table_to_dict_with_bad_delimiter(self, tmpdir): From 6eff3e4d149892c498e1e796b8d8f28ea19a0dff Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Wed, 17 Jul 2024 11:49:25 -0700 Subject: [PATCH 2/4] curate: Accept Excel and OpenOffice metadata files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reading Excel spreadsheets directly will be helpful for users (and ourselves!) to avoid repeated manual conversions and ad-hoc Excel → TSV programs. OpenOffice support comes along for the ride with Calamine, the Rust library we're using (via unofficial Python bindings) to read the files. Only the first sheet in the workbook is read, and how it's read is not configurable. In review of previous Excel files we've used, this is sufficient functionality. We can extend support across alternate sheets or multiple sheets or parts of sheets in the future, if needed.
Using Calamine means that we can support .xls and .xlsx with a single library instead of using both xlrd and openpyxl. The latter is certainly doable—I've done it that way many times in the past; it's also what Pandas does—and our needs are pretty basic so they'd suffice, but still, a zero deps, single library, Rust-underneath solution is nice, and I was pleased to find it. If we find out Calamine is lacking in some way, we can still fall back to the tried and true xlrd + openpyxl combo. Resolves: --- CHANGES.md | 2 + augur/curate/__init__.py | 6 +- augur/io/metadata.py | 85 +++++++++++++----- setup.py | 1 + tests/functional/curate/cram/metadata-input.t | 23 +++++ tests/functional/curate/data/metadata.ods | Bin 0 -> 10816 bytes tests/functional/curate/data/metadata.xls | Bin 0 -> 7168 bytes tests/functional/curate/data/metadata.xlsx | Bin 0 -> 6878 bytes tests/io/test_metadata.py | 6 +- 9 files changed, 94 insertions(+), 29 deletions(-) create mode 100644 tests/functional/curate/data/metadata.ods create mode 100644 tests/functional/curate/data/metadata.xls create mode 100644 tests/functional/curate/data/metadata.xlsx diff --git a/CHANGES.md b/CHANGES.md index e732cc89b..89ea757d2 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -6,8 +6,10 @@ * export v2: we now limit numerical precision on floats in the JSON. This should not change how a dataset is displayed / interpreted in Auspice but allows the gzipped & minimised JSON filesize to be reduced by around 30% (dataset-dependent). [#1512][] (@jameshadfield) * traits, export v2: `augur traits` now reports all confidence values above 0.1% rather than limiting them to the top 4 results. There is no change in the eventual Auspice dataset as `augur export v2` will still only consider the top 4. [#1512][] (@jameshadfield) +* curate: Excel (`.xlsx` and `.xls`) and OpenOffice (`.ods`) spreadsheet files are now also supported as metadata inputs (`--metadata`). The first sheet in the workbook is read as tabular data.
[#1550][] (@tsibley) [#1512]: https://github.com/nextstrain/augur/pull/1512 +[#1550]: https://github.com/nextstrain/augur/pull/1550 ## 25.1.1 (15 July 2024) diff --git a/augur/curate/__init__.py b/augur/curate/__init__.py index 369844354..e0cd7c4de 100644 --- a/augur/curate/__init__.py +++ b/augur/curate/__init__.py @@ -55,13 +55,13 @@ def create_shared_parser(): If no input options are provided, commands will try to read NDJSON records from stdin. """) shared_inputs.add_argument("--metadata", - help="Input metadata file. Accepts '-' to read metadata from stdin.") + help="Input metadata file. May be plain text (TSV, CSV) or an Excel or OpenOffice spreadsheet. Accepts '-' to read plain text from stdin.") shared_inputs.add_argument("--id-column", help="Name of the metadata column that contains the record identifier for reporting duplicate records. " "Uses the first column of the metadata file if not provided. " "Ignored if also providing a FASTA file input.") shared_inputs.add_argument("--metadata-delimiters", default=DEFAULT_DELIMITERS, nargs="+", action=ExtendOverwriteDefault, - help="Delimiters to accept when reading a metadata file. Only one delimiter will be inferred.") + help="Delimiters to accept when reading a plain text metadata file. Only one delimiter will be inferred.") shared_inputs.add_argument("--fasta", help="Plain or gzipped FASTA file. Headers can only contain the sequence id used to match a metadata record. 
" + @@ -181,7 +181,7 @@ def run(args): # Read inputs # Special case single hyphen as stdin if args.metadata == '-': - args.metadata = sys.stdin + args.metadata = sys.stdin.buffer if args.metadata and args.fasta: try: diff --git a/augur/io/metadata.py b/augur/io/metadata.py index bcaef5b79..429f8e5d2 100644 --- a/augur/io/metadata.py +++ b/augur/io/metadata.py @@ -3,9 +3,10 @@ from typing import Iterable, Sequence import pandas as pd import pyfastx +import python_calamine as calamine import sys -from io import StringIO -from itertools import chain +from io import StringIO, TextIOWrapper +from itertools import chain, zip_longest from augur.errors import AugurError from augur.io.print import print_err @@ -166,14 +167,18 @@ def read_table_to_dict(table, delimiters, duplicate_reporting=DataErrorMethod.ER Will report duplicate records based on the *id_column* if requested via *duplicate_reporting* after the generator has been exhausted. + When the *table* file is an Excel or OpenOffice workbook, only the first + sheet will be read. + Parameters ---------- table: str - Path to a CSV or TSV file or IO buffer + Path to a CSV, TSV, Excel, or OpenOffice file or binary IO buffer delimiters : list of str List of possible delimiters to check for between columns in the metadata. Only one delimiter will be inferred. + Ignored if *table* is an Excel or OpenOffice file. duplicate_reporting: DataErrorMethod, optional How should duplicate records be reported @@ -197,34 +202,64 @@ def read_table_to_dict(table, delimiters, duplicate_reporting=DataErrorMethod.ER """ seen_ids = set() duplicate_ids = set() - with open_file(table) as handle: - # Get sample to determine delimiter - table_sample = handle.readline() + with open_file(table, "rb") as handle: + # open_file(x, "rb") will return x as-is if it's already a file handle, + # and in that case the handle might be text mode even though we asked + # for bytes. This assertion guards against usage errors in our caller. 
+ assert isinstance(handle.read(0), bytes) + + columns = None + records = None + # Try binary handle as Excel/OpenOffice, as long as it's seekable so we + # can reset to the start on failure. if handle.seekable(): - handle.seek(0) - else: - table_sample_file = StringIO(table_sample) - handle = chain(table_sample_file, handle) + try: + workbook = calamine.load_workbook(handle) + except calamine.CalamineError: + handle.seek(0) + else: + rows = workbook.get_sheet_by_index(0).to_python() + columns = rows[0] + records = ( + dict(zip_longest(columns, row[:len(columns)])) + for row + in rows[1:]) + + # Not Excel/OpenOffice, so convert handle to text and sniff the delimiter. + if records is None: + handle = TextIOWrapper(handle, encoding="utf-8", newline="") + + # Get sample to determine delimiter + table_sample = handle.readline() + + if handle.seekable(): + handle.seek(0) + else: + table_sample_file = StringIO(table_sample) + handle = chain(table_sample_file, handle) - try: - # Note: this sort of duplicates _get_delimiter(), but it's easier if - # this is separate since it handles non-seekable buffers. - dialect = csv.Sniffer().sniff(table_sample, delimiters) - except csv.Error as error: - # This assumes all csv.Errors imply a delimiter issue. That might - # change in a future Python version. - raise InvalidDelimiter from error + try: + # Note: this sort of duplicates _get_delimiter(), but it's easier if + # this is separate since it handles non-seekable buffers. + dialect = csv.Sniffer().sniff(table_sample, delimiters) + except csv.Error as error: + # This assumes all csv.Errors imply a delimiter issue. That might + # change in a future Python version. 
+ raise InvalidDelimiter from error + + metadata_reader = csv.DictReader(handle, dialect=dialect) + + columns, records = metadata_reader.fieldnames, iter(metadata_reader) - metadata_reader = csv.DictReader(handle, dialect=dialect) if duplicate_reporting is DataErrorMethod.SILENT: # Directly yield from metadata reader since we do not need to check for duplicate ids - yield from metadata_reader + yield from records else: if id_column is None: - id_column = metadata_reader.fieldnames[0] + id_column = columns[0] - for record in metadata_reader: + for record in records: record_id = record.get(id_column) if record_id is None: raise AugurError(f"The provided id column {id_column!r} does not exist in {table!r}.") @@ -281,13 +316,17 @@ def read_metadata_with_sequences(metadata, metadata_delimiters, fasta, seq_id_co See pyfastx docs for more details: https://pyfastx.readthedocs.io/en/latest/usage.html#fasta + When the *metadata* file is an Excel or OpenOffice workbook, only the first + sheet will be read. + Parameters ---------- metadata: str - Path to a CSV or TSV metadata file + Path to a CSV, TSV, Excel, or OpenOffice metadata file or binary IO buffer metadata_delimiters : list of str List of possible delimiters to check for between columns in the metadata. + Ignored if *metadata* is an Excel or OpenOffice file. 
fasta: str Path to a plain or gzipped FASTA file diff --git a/setup.py b/setup.py index 73c58eab5..3abfcb7bd 100644 --- a/setup.py +++ b/setup.py @@ -64,6 +64,7 @@ "pandas >=1.0.0, ==1.*", "phylo-treetime >=0.11.2, <0.12", "pyfastx >=1.0.0, <3.0", + "python_calamine >=0.2.0", "scipy ==1.*", "xopen[zstd] >=1.7.0, <3" # TODO: Deprecated, remove v1 support around November 2024 ], diff --git a/tests/functional/curate/cram/metadata-input.t b/tests/functional/curate/cram/metadata-input.t index 901b4fdff..08d966374 100644 --- a/tests/functional/curate/cram/metadata-input.t +++ b/tests/functional/curate/cram/metadata-input.t @@ -57,6 +57,29 @@ Test CSV metadata input from stdin {"strain": "sequence_B", "country": "USA", "date": "2020-10-02"} {"strain": "sequence_C", "country": "USA", "date": "2020-10-03"} +Test Excel (.xls) metadata input + + $ ${AUGUR} curate passthru \ + > --metadata "$TESTDIR/../data/metadata.xls" + {"strain": "sequence_A", "country": "USA", "date": "2020-10-01", "authors": "A,B,C,D,E,F,G,H,I,J,K"} + {"strain": "sequence_B", "country": "USA", "date": "2020-10-02", "authors": "A,B,C,D,E,F,G,H,I,J,K"} + {"strain": "sequence_C", "country": "USA", "date": "2020-10-03", "authors": "A,B,C,D,E,F,G,H,I,J,K"} + +Test Excel (.xlsx) metadata input + + $ ${AUGUR} curate passthru \ + > --metadata "$TESTDIR/../data/metadata.xlsx" + {"strain": "sequence_A", "country": "USA", "date": "2020-10-01", "authors": "A,B,C,D,E,F,G,H,I,J,K"} + {"strain": "sequence_B", "country": "USA", "date": "2020-10-02", "authors": "A,B,C,D,E,F,G,H,I,J,K"} + {"strain": "sequence_C", "country": "USA", "date": "2020-10-03", "authors": "A,B,C,D,E,F,G,H,I,J,K"} + +Test OpenOffice (.ods) metadata input + + $ ${AUGUR} curate passthru \ + > --metadata "$TESTDIR/../data/metadata.ods" + {"strain": "sequence_A", "country": "USA", "date": "2020-10-01", "authors": "A,B,C,D,E,F,G,H,I,J,K"} + {"strain": "sequence_B", "country": "USA", "date": "2020-10-02", "authors": "A,B,C,D,E,F,G,H,I,J,K"} + {"strain": 
"sequence_C", "country": "USA", "date": "2020-10-03", "authors": "A,B,C,D,E,F,G,H,I,J,K"} Create a metadata TSV file with duplicate records diff --git a/tests/functional/curate/data/metadata.ods b/tests/functional/curate/data/metadata.ods new file mode 100644 index 0000000000000000000000000000000000000000..03d6fe9b196869dd8b4a94316019c1664077143d GIT binary patch literal 10816 zcmdsdWmua_*DluLg;LyFtT+@XP>L0IiU;?gfdIj^xD+VvuEihXGb3(Cyc z3~23O0o1Xu02%A)SQ~@Q8Ewt=8NfOaV+aG-0%)!e*0V7Knp-nKEUbV!`Vb=^&|2m% zoSG5{+BnakpdK&W2Tlbe8#7&V9b*uL(fZFKgN3=Fzl^j9DiS`@!!4*{qJnY{uk{bt zJp$~*b9gKz5DE(BN=8yah=iDumX(K-g`Sm}nT?&7iZ>T|sjHf(D?qg5bhY&KwKdFi z6wUP1?F>}(boKOrrUphJW1y~)fw8GE5Cj4NLAC}KE@qb2X4Z}n6J3b8v7?!;1K0>+ zVdiY1?*uW0Kp7Q!#Kqsy)ziu~)a+BZ zqldqhM}(DkEW|I--Y3#6Aj&x))-gEUEhNnotl@5{=V@c??O^6-qaWk|401H}a<&b1 zHV$#K^7n9zbT^Cgu}ckh&I)t!d@vt>Z@(ZvAMek8UO|DMJ^jPogA)RR!u-PGLcG1h zgZ$!xJQ9KfqC$gWV`DwSGCU%4ykZLjqEf!ZWd+9P`XzjeO3VyPF8z{P5S(5WmzEQi zRrxjACp{thTcUqTa!P7SQhrKAURr!bYEXGrWJyL$+1JGC%+Q9Mh^qX=%(T?B%$%&; z!ko;sf}E_Jyu!5niuA(joPv_HlFF>&+Pvb5ywbYz{M6c#oQjIllFGWWs)oAq;)be{ z`r7K4g64$cwyes|in^A9=CRt=o|3kaZyi(R-)E}Y2J5?rYkFrJ2Is5$mgt^as-7RLTkNY|8LDe(ZENc6YwhamZg1}S-qP3C-`YFUH`v=V zGCkJYHa*fkJ~7ccy3#SV-Z8b=JHFUEz1lOoJvz5Ku(;E^bTGNHJ-L3gHQl)~GqSQc zv$Hs|y*hiiHgfo5>U3-FYIpYXV18w7ZE0;|d1H6`$I{Nm^7hWo>dx`n!P&~u)$Y;B z&dKG@)!p&V>e24z*};#K!|m&%wX2iuyzB6am-`A=_V^}gopIJ>vFH@Y`;U6nfu*X>B)p- zPd+%_?7W)so{X=%EhAsYS%a*WpJT=9z6dbS`<_S%4(G2nX~{k>LMW%m=tg~kyf)ow zo(jOMXhzv5FREW0>Kro+|1e~ix8f#)gwvDviu4N7S=I{)-c(6$s~j()<#i6PG)zM0 zr+X4`8hRU=Tjm7LW5>QZ(m;&B=|m(QjF$ABP;jezO|x0;J#Xf4Q8!C_EhNq`;pKxcAa_piOR`h-^qO)HtE(>|4H(ktA-{H2PBVzpJ$6Aq!B^`8sefq z!S@8G-q+inP2d%YlCWofAar3PTD}HXlSe{6tBaedCY}sfPOTGJu8BcrNX`+fF1w#I zN@%cBIDHeU)?V2?@!+qdEOr?wKb@-qGL6)Lba!o zxbHNYtb$l~Zn$kHkuqVPluPj*P!Tv;3qKgJ=4efny|(0^U} zpk!KW0szhBix@n%rwkWj*#dCl3e;;+-9u4-;n`kkg*3g)iwZ8ZmjK}{kek=+XHwF& zUy68X35^;gf3d%Z%DeRa4i#{g`r$cIJ_=JitD^YH*kY)oNKmD3nR1WTd{m*^@HC!c zyd~VCm4j9GO<)?_k=AJ0G`HFE^dQvz4-%9YW?jD1kduZ?eV7)bX 
z)>C`?PJQazg7Lz}QdMRBLLoMrr8-lxWBv2foSl)Ng?)~x-Ft^&*e5j}+`J%yvMa6k zx255&yy=MJp_~N~O1!AIugxnV_8s}aygAJZG(cS{V+c#E#B@{f0BzmrRf?%^QDm&z z;!*IAR2%U5y~4^&v!&F|=@CO+2h@ZM>h?2x;7%KLP2e$Wi)Xw^?0i3~r>O`kt`@I- zCLaD}O`^2Kh?bf8g5ox33}u(A_LX0m>NKG)E1eJH+Gp6N8NI&dD^h_H*Oi4Z%<|ZM z-RGq22uCa?S_A$@;jTAtG;Mn_==h7*y_S76ytv^Sgebbgc^_a6F|baR+M~* zkuolr%$|Ss?JlE@DHlWhsG~}f&qFU9bR~E0eDnUO)oOBAPpO=>_G&?;)^81mCSM6O z6Z~!3ftAiw%Kt0QS@SR?{7TYibMJaU8UhalRG*iY#m$x$*jOdUp1%IrwYZj&+P{a` zppW)mZDS|lqPd1Pad>EjsDzlOavH;qCUJd@+riH>Tw*qC9wk3FgZ(Zc&AXFg6vA{| zFAW}l&Qo-fb}Vq1DZSR7LzrtZzDP z$S`H9K;m?kSCu9aSh0?t=E^VLur{;!P@7f&W2YF%Q_zcFFn+n*-O%^0U@|&-F^}(X3hcgnyYU!%{=JLn@42YCV@%bNihk7Zi(<9 z!Wg-60YY5Qr#e>94!Y5q(O!{>y&>A|QMyNUI_^pgV&90lL4DTF>Ez@fPCL91d-sW1 zm@Oi2GF~T#{ykGJk{3LTam+%0LcDG$b*|KfNRh5@sW14G`WRi^K`HCEI#sgP%~!@0bcQNcSV>UorB&Q*gTGXk0|TIcTMiw%9U?sp2Bxfa1(D)NfSV3aD_%kZD|9 z#Pj0JYKa=QLqX=#QboS>T?%#J;M;AOPF7zpa><;??!+^yI@4*9*>#92Hz_D#1P31l ziaLF8f~n<7mF|JGy#T^rvJX#gEZo&+VfEzFc7Q>%H67x zAY9S(@W1u(cb<9d-oJm--}Ufu2rYEXfuR5E83Y7`KeUO5PW_J|Ka}*q=GKqGgS{DO zS49dk!-ige4ltUGO@iZ#m<=uv?Gixh3C!%XVLeAxWdMy z4P~rm1p2k@VX?h3UB@v_trr@b0?q4T@AWSo^PjTZ5mfLQ>(KVK(k1rWkK(KSNU;Z1 zs64n*bA;VRS#LO@_7(b&wp)6D?_UTqu}X;Xk4eI_gbtM*#Z_;_Cg$PFIu1D1sugb+ zY)YTouub|fs+=ZId1&m*gqdj>dU|Kn?1gS@*U1wK!IUKF3zlMkCNkXY^sXF#AFoq4 zXD%|cDcXg54FCWpRZUy?f_kjJ)6<~5SsRtiD&{@1^p^Vi+T{{2UxKH#aC68((5_ZhQT8qmweBgnW$}nRZB6|yLVszqB zJe==28PNTmpFUXYk!!sq4M!MTdeiJ({WPzb<>aMZ<#}`5>s2U&z*VQ*~+Nbq=WM$3iFvJdMIxsL| z^~Qy2H!|zH-n+Q~))7yF8WL*%=4?(a`o+>>C(5$oYFzb32hOU-J^hKNJb0Y5nax%S ztEx@q-7xwqqswKDLFI5QMU=GFGjUHeUBT`IurGj3p2K2kWIx;%E#(!q83P-g56*OIy{ zIddOIOXzk1Q2P3KeD&?V$s2du5b=o6n6xbvVe<8wawq-6=?WyFvq8gwGN zxFAAkm7t@IWch}&6s`CW(~?~@j%Lu5JSqIzeNQwpjxk6#Dm_b`VTiCcIg%b)sE~Jo zoBaBjp>vrROInsjT2=xeDp2%zwDe_x8P)T%e!Jb*ANw~sd4QQZ6fL()%bzL|y-cL2 z=}IR>AoT0AQ4Sej*=7J{ha?7V!|0YJX@pl5eilzSHMIl9DS}8m1T9Ef@|tD=*)nP! 
zt)IGNh{)A(cfp8sbUUJ-uP3$k$#op1j<5k=6AkRci>ig@MtsM7 zV{>6}aVGJiFRw`EZ8*HY!BL|TF-_d|Lv7@tcGd3RL3AnT_RgW^?qFdj|9~u- z*u!g2zS*JeZ4zbG;isI2X*X=OJ6IVBc!WH%QGSXiP*4B@D5(ETk-v+se<;5|Yn{i? z8Whe4PI!avy?=|<)rSRwRcV4DJsp(KRq7VrGHaiHW@)+;Y1Z;%0S5&%WlMXy!M<=& zk2kf}S{>d@z^-JZ2u(3b!RqooPA=4Cu>r(RvWQi#~01pKa^ z`vHtO6oeJ^HESj6o%o4=Ub#2Ocsa;mjn4Z@#T~FN4cDTBn?~YaW=z5m0CTk!!0fmr zT_2jHi)ryJI^7Bwo7mHFZVxk^!38(f>qQVs&Qj1w)_mhu;kV}Scj9-NE%Cjxn;%!^mW+C~ zaodgkO3>Gx6hf|J$W^?UBfeP0(VeT?MFfVa6(EZ3r`I)6cQ4~cTeQEvdr8e6$w6SK zr#t48&p!X{uA)=xjiai^jbg)I+P;F7+6wQ6P|{Qst$E!_`aAr)F-Gh?QcR$%)j5@d z8;|pVov#Ky=u_+Wjh6}U{bYRicux6CEav+62@HLEaEi@Qj1yDbCp1`|5y{Qb0^H$x zN!R2G@#pE&c%(J+0-^FH>`!493l@~_VIButCSU$_B{URN-h+Vt&x8G_w?nKQKtRaj zaL=i#foIq-T^EYb6|^TmeGl7x8&u!cJYHB^-aJ?~qt`4N(Hn&EB%mVv?uuw$UxR2W zC)`|mP+`?Bxc{=>J7e{@XC~skX)kh139=m*k~+L0$!F0h^{SWijc5I{J?(&Xx?nNC z_2V7YPA7=nEyjQ7iAtmE1*1!)Gtc(~#s`hquFG$o%FH&FLMs-ajv23?b0Ih|h z>Y^Hxg0ik@$%es2B32a*k!2Yk4c)gUV?{0l0dFw!R(mh9N*oyYZ%9KY#z*!FV3fJLh>>ZvaUgO+8kjO>P9z*--Y8yEgHcD$~;&ot#X#qbhEN6$-b zlaLPuUw;tdKtbC9HJ`~VMCPnbyvIw>*v|0%Vc~zV$R)q%m@ApSl*1g9TSAS|lXQ`E z3D%LjJwEGWE3$uoqKToFPP0n>awkg|u!g6XnIrcBr+tbU>r&U|m0FEKaD+!je_!wwl+ot{uy0J`AW4D}c6=W%U zOdI~@gd8*=vd~Ha;yhR0Y02Epfb)~wv!XB3pwSqJ%Z8zzAg@*Xkfa%%XBe z64-GVj&@6~K~_r63r(AR6FE6i_KeMV_NeXhAW_*-bdKjFWznq5f@?mFGi87a@X5MK zr3zLrdiWla`-t!4i4VQMh2|1#w+syPtuyPVFS>JBFC;CO>TY)J)~|p?Hd2d*rTzOo zd@P0Rknzt5okl`^*aIePKDlR0-Z63$Es!?Z7TJaqtFI)zPuyKnlVIK&=~Ux{@~23e zKhMstp2T>KoK2tAyF=|2(?{JMh9$0zZD{h!ONr#2^7+!VS?)oKKze>`1(M#Wa#wa{ zA`{YXTTfnvYPCqpg>B_BMj5%(*xG0jfQFxFl(GMmt$QcSQPqGj}cb2RxiejdOOsdMjBGY`C-Pv|rP-_D7!%y_^oN7_Koz zLHHWh5*wHkbvSYBKLtRLgvD05IFVOB-pi>WE-SdG@}UId zwoL^3?;MBB?MJ6-`)R>R!;;^iWIDjxZcs9aSWAu*b~ruBPkS-^amtTg7$}LRFF<;E z09^pL&3A%lL*+dI3G>lM#byf_FgF& zUbbXm{>t|a+{fe}T^81rwh6dz7n@)`Np?fj-$-)FG3=-jt4%uwxYn0>k3 z4IgHk6Pr2tX8d_Hd)IHl9@B_nt$|UiU*^VW-Js zIg<{Cqnq2Z6!iy&bV^^gyaAn0IP6UhC(TPc`iHE2WoUKOi%p2jojO1-24Txny_F!H zT@vAPd83^wW|K0~wNb1iit>$oab0Ui4Yw;T5^Ng#C9hDCdNwJO 
zxPRU!G<{?L91e1-hCdf6Y;&E&P)WDqs`#P)O10NO5t`dJL;a*SZ(;wAVP;wDaeCAC z+lkA53I&Bs`ES!3%ER<#rekhw0EAdGSm_(|h4-0zzCrie=kf;jPkvy0^2Ca&yV@kV ziPcB>3FA>5#(YC?xRPu}?pE^XN}({zLU399t`1?>=aR3@3UBqODY`|C$~S4>*D_RR z&b*(Vte|}vM|5c7nVC#eV`omop%vv#&yE5bC{3fXk2~i2SV!7C_RdlirK2nu0g*U} z-}rRZ8K0wn8PNbBYrUNk;&A?o`}5rUoRktIoeH^n--=hE?jM2>CA+<>Jqc^1CE8h{ zj%7KTQ+Jm;wd+Sb)3~$`XoCkP=lxK!)-)IEi}%z$7n~!F9QX7bZARJ;7sw*C6PWit zZq;%++^^p;4BX=_7nRuxfj6E{%;Ihn!2aQlBXYNy@0JrQ%UuSD&9rO(GkSnsSE)rZzT|peDAJ zzre}wH|zX}5@l^$&NIy&=6q+nQrdyHMp(NE8Vw~LvON%)xOqx^1amY8_UCsp6@X~;H&y2ETn36#Fvb9N- z8f&sZ2~&ttb|0?2WNh>Nx*)@5VN9qYrKcooc>aqN$+Dfm;iu8qHs#7W9<$8*f{y<0 zBB`6Itq;oR$jhXClnh>}+@>#jBa__08U)JI8TDD{E??pz+4nYMoM<)^nnj^vb+mU0 z#@GgfB4|w~dBC>_NAgV*#WDsV_o+#$YkcQ3_5DK|FH(9G3)d6bcS}nSq>|yJy16Gm zMYBGUz2g+`FtYVG!}BGnIbzlnr+5pUuOsO@>@J$$k%Gr-CDrflQzLGzVOX#kMe-#nw3ags57ZF!6dC(Jd0iX(g^$tNuLw*Bq{~S8k%AMzI z0qU+x(Ha%;D&hdhXJ6y%zJFWr&UjZRTZnp1bk-#NocI(blE0CEkDEM%V{ zRyzM$`jYZjlGwOC4j0bGjuph#+m135o;AdyrzwT-Hr#`!DS%(97cAR+vJk5d1zRG@ z?2k`6)i#+%h8+5_x+W9+T2|Wnjf=EiTq*N+K4pLl@l}I9Y+i--qSgC>j39XvJyw*2 zd(T26KWGr;9Qa|gcbiZT03 zvpyXxT9x{WyS50r{W06b@u@SX8k3XywoZm8NF9c`FD%QCH52Mdj(9Jqag)n(0?O`v1 znR6FJ4puMHmC>e%2_e)-dC;ySmY9 zBq)nN3#dO|U)0zNawR!??ixSGfvc!n-r?$h+Nd9{C)x(cE=A5X-S+*0sV0;Re%1*6 z7)df*+?`Q-ZG0RNKKn zbGgXnoieYTK0zvQ7qn7USR>AwsnFp_JE55(mpu(3FCT{gusbsg8+J3pD3U#6{wb>yp&EH`-8X$f?hR_e4MkL-X1`agi>&Fz%-p#G~qMh$qQ`Xr-)TF<@m9jDU z<}hTN7{(~iqz29?2B~?<77l}Q)VcT5fXF(vpv-$BXg_}nDqp=lHyM>^I4KF@Vy92K zPGj!4Tc6d>h=X>m2n|I|4_8yD$rw3~K@OY%WdqH^;SgC_$3=;?=j}3@88fgs(Vt(} zyb~=ltWP@p+V^59y0CI>;eIFB?GLYGaX=TaJj8Fh2hYn$K*L}{{qr%d z2iN~%e;(@kJL>NTwSGbr9*X~TZ0qkVKT-cV?)9^t`fvd3pY{5`_TVQ93hGg@|7jDy zVfiop{-4Pnb?TpniT2-L>8GCm&n%C+^-mN34a=|k{{NQ8<2O9N>iz%B^Qid$w3^@W zSo}r*|KBp5{D$eT`vL#V^jA+ap#Qh0f9?V*xymVCZvz5%1>Lz`Bh{2JMPy^>@nZ^ zX&uCWO5BDE{nLNbkGp}X_CNhT_y0U#K2Q$e2#^Y(5?BB%1Qr2{ zfk%KPz@xxYU>WcjupFoY9tU(_1yBt<0Xzw;1cE>funMRJ>VSH{02+YRz*E2)fOBcG zQ|6yR{#jtHOr ze=4-9F@!bR<^+s%gI0#Sh6{Fj0`YwLRMnhS@djmqXc!WG9TMJ3ZfYU46FH{hY8B5% 
ztjQOO=dz+S!ZyD}tMr@i*c-vCwZI1sNm!ziRf~2rRa@nmE8{$4H=847JRGg^RLCjr zuU8Oy8#+6C8`|5is0}qPzom7E&V&h_bdFZhZx%+UDsr{YM`y3O+W&v)DeKp z=Xy&Iscd>}9=ef-UYCdN&q)`UgBlNgvI-OWaTf|nNiz*3-72j}mZ`Kd=~3x|5r z8-oixVmZ#heU5W3=DiC#qMNBuItXC4w?De^V`pF2#+{0uRM%?NkF4a1LQw}%bIwzz zIOhHI%>=sYaG5nt;v}jI?;VwKEw`_KxU%hhVgI{lL8w`B>u21S~KD$K!dKSp|-7Uz& zjlwHiv_Yi2t|S5a!lMBHo8Jd`pBM-D|9l#t#A|n~e_@`BJ%c#bNA>QZzNpC;82%&Y zqm#E02mw;_C4Z5vJTnL(VBMx~er~#SSz~_5zVq{%kw^AVA>aHd!*$`i3nx}D3mktB z^EYmN(hiz8i`EO;5`di(Jafp)x#gpyx)lB+8kouVKehf@J|NdLd80^LmqB?Mua9xr zBYo-}a!;=2Q_?D7ZL~M;!^}NoN(Q^KiW?F^lQu{m#tX+bjA6G$EFz|S&WExY3Ss5m z-D)?~AU7&zA#6F@@CIs0w&I+PvWU$n^R14-E8Q}N`48j8gKxiE@e16p)`XM*1OhvH&7J?l^6*V6BB7T>Z<|LFLodP*~gK`%-!12 zgU8#&-NDku#ev($$&nlJO-E{vMjrqt>c~HK*gLzK60@kR?LmpjkWUD!SB||C(Gqeq z#Ebw_)UHxkP0X-)>-H8exbn<`svm<0w$8@-h6+`5D`(vcYyByYcT!yq**j7UYUsV@ zK=bF$R2+4@Q7mjr22_GiBnnk?=Ao~^_7$npIUR?<4Bwjvc9qCyS!Ig?Shi5&2TzAW zZfDM&Rwx6`KLyH>@fav24k3%M+3y0_FAf_@P`Kk-1|ls==b35N)MQ>=^rs|csH5@wkIbe^{}v5r4ht3l5` zkEre5B`m^iGpZGwI}}7cz}@v-gBsvQSrJiXh>XS7-&z`Yw+(Xek&Bg7v+0Q5;;y2V zK`ZtcCp(1b#dHh~Hdt6-(La9xnQbGWf6jkos8L9q@l;SKX4c!rlvtH zP^e%psaPF^?Y+qfnat-S6~9O>J|P(Nlyc={gBUYRX49VIKVmA`Vl?*eQJzkk?e9OSXj`yzE?sDyr3|cve3zEoWx1|4n>4l!q^Z4mJJNO3iHyMM6fjBh366r; zoxj3~{O>p+Qpv;C+S=2D=f@jA0;@b-br(t@f)1EIf%x{EJyTO+xK(L)a#dLfm?9$; zsRJ}-$YsGDP1mBeZ?-`P<9EiB_A+?xlEL36o{0KmjWOUGM6ycyQMZlVq{~DT$Q%cJ~EItxTOu!#EkqT<_F$7@g{~8q(0_ z!%W{jsbSAo^Gog$!FlgwDAxUa_JRg6GD_Ju@5hy)-!o#tYs5_JtW7%$$ScS6PE1$9 zaHR!lEtgKy4BHG4o)btcS>faRC`!3Z&il?&n|{^;MGRwk0BbJZ(PyWgHH0-3RA<$G zX)k4@8EAijL4_BhJrq0J7Rrai9WJ~tElVgB zL}s&qst!8WJVQR)l$K$;Pi=`ujBk=u!1|Fy=(&!%#LOT$<+P|t3d!+W7hCt9q_Dq2 zH7s&kZpKoiVBZCL{KA&;L_ypl)rW2`w;IHT(LaPK-*sZ}(lbG+h6JQ*LYjM5MiXGe zf4EXS9-NoPKxq#!fi*2LuK|n1m}4&R3G5Q=rj!<@ZKt-(-+A>eTatDJ2GE;X7mJx2iP*?QbzK#gnWYMMkMVV^B@^v+)GdM~z5O zF1OW}a9}&v<`3k+r-p8E+`>gl0~lPfLmUiUI_x;mNM#o0iO0!hLIy*;pTJ`Gd++0~yjVK(qp){# zdKY_3<1$NTA&1q-PAcf7sJig*d4CDJDtp>pw(d-A{G*bq69FvuyyCQjc%v?&ra*}D 
zl)f1y&Jm5RfZtQq)FT=Pi=d8Zh6IU)`%wQl>+fx&4KwPei_9mt;tE;@MjK3!yjt;` z_0;CR7k1Y?h_b`#pxNYwQ_~vye^GY4h_dU@>v7(C2roMhRO<6Nqh>IRwB9^@)1Gsv zBUS$r_cSF}A}2j@1rwWpYVA>S8k4+x+{@_g&6vLTIaxw!>-_~m#Nxssvud&O!W0R8 zNOJ6K_b%lx&g`zoyvS{3QJEfN8AR|D_Gyl29)l{MyMxbZ9j!M*(5hOu%HW-j$IF zdz(kOeFZp)ttV7Fk7*V%pA0;c=n>gEik%T-MtZHF+jI<~d65#V@G$w}aL}yK3Ji2C z07rQzXQ9s~uaOFupxeS$%lN3m9nE6%wsDan<8@XSi2=DfESJdqrV+ifql8)+5BrIP zA=XjS_JOxjS7V0c?J9SL>Zp zs(NRQiTrDKIeSA2&ybuR9xT)1p2Aw`oVqw&Vc+Jmo7zMcceC$qU-KL2IW04^FVzmi zMzr`jAEkS{A|>8%TP;MXeaCt!M3+Kt2R&RM8dk1winE_?oK+y*AQ$unk5QC9m>l$y z#{Sy(RVZ(Zq2Z`3kkHJLfwPpxo{wUpJQ0I-t+bIoim^UrNUB9!;;m+5!n*79SfBqb zHXHX%p@$I0_%!AJxA>y}5%$ zl^o|ef_He2(vKc7Ps;klFIlag7~#5cPieazPZB@IA!^$N0Zw;~LZ8HiX))8GuJ`iP<1nO+shp&N*;@=00T zy<)M7IO+=%JK)u%KHd;!-|TyZZ7&!$$UiEyr$mTP09^xpZb+&rZ>~)10i0Z=}kuihjsWN0$G(X^XXxbF5CYSNQ5-X{xX6 zA1=}7uazv7D`<^U@Jm=Fxz~=OkJ1Hb3tS)1_B=|INqY0(!)L|CEzXjwvXdOOwXPLm zdkfB&nxAgzj5O}Rt~Sjbs)cvwpN-iVso$U@=%JIVif0K<564*l1wHWnuPgtrEB~)6 z|G#x*{GY?{KliEx?G@)008ZeBKDnP}z=*XnLHTgXeXcsn?3%y~C((I&)u?^}<>>26 zTaiP)M%MFWCE^iTny)@VUezTq)IqG*mZaw>pF&4Hx#u3*NFJuHNBEMqjJC{SSw6ip zRzphr3B&1W#k^S3*YpI#X9iON8#YWjtLI+0vEq>rNfh}e<{?eU`71<79DgWE1I{_P zi5&=SJ_Q3jK`}Vxdk%k79YTqmvEZR@$ z)Zd^t{E?a8Xf(5Esh|o}%=!i*o?`=qi8rln9fMvyrnOYy5H(rFKm5+>&S@9A)nm2T zu@7UVJv| z>oJ+v--Qzgt44%(<;Ps7vHSKwM^JO&U8V(P8CB!g`p|75<>jKu9f1fR z-jLeZm&Zg^LcZy?hQCXD>jOT5!Fw1Abn)Oj)$U*JpSb_nKRtaNt$!?@2L>jZD*)oa zjoP{^JY`j5bCs|QVEH3J5vW8RjL+;wgdHr`jE&nxBc?H1g2USyO~MvfALxG6GPJNz zk+*3z!0OYjMc5-NybPo37AL*?-VoD;^rV9UGfhEUTE~^Dmx#lMC=e@+o2_$_cfK3o zO49py??c1q3gID~ia?OLbAE{oU&#c}CZC;A;n^FFct7w)fiDgrUU3M|Qm!4EW&O5j z#1`qDns3?UP7QZa6AvFVDvXki*UJ(61qdBe`bwMfz}6=n_EB zf6Ytn42cAd>vM04ZA-tPt6tV=p|%R&-T>BRSkT9-3PsL)>VqP8S&j{sND#&;9U zVfD=Qk*fi8(>|qL*a=i|GpDSedVqP3TyH9Soo|`D^2N2xxbo#GDmv2 zueCK4#7b@6Xxk6Am3+F8j5}@}^7hU=%HM)K8mygtG1}&L1NyEg23^qKqjPd8o2cTb z^Qd*KazhtmtrI#W2x{2iux0V^R+SNaf5vLc&i|Z0+RM^uHWzyk*vFTBZi~0Bj1u&% z%ec}o$^IO#vNSDo$a>6k5uDv#P*J}3YUt?&%u(N9&}P8+;#1Wr(Mebi*fCt;7>~PSZ 
zV3^cXVZ;ZIR?hW1o<2x%Mt(yHSbPlXH82(ZU|veq5^c3+Vuo^SwR0X*alUT5 zewm;b4U2eDSFxuM?Y(V!M?qxfgZ|58E*Pr<4}UwM=9HYaZsC26Hb)-sZ~hvXtn~tGrPRRTcX|bB=3EWsJZbFd5fu#bkm-q{>PZ@yY3klTS>Uwgn?;O83x&e zU-nBzTBif2+S%IxQFSMjT#;HR+r8V zQh25PL1PWV64H1uJ~1`*eJLnFOw5>?&iY<}Vg@b)i4RHR@nvHB+0*`Vf zgAZ>IHYzaPCI9VT~OoH+OQJ@*i&I>M$n=r#aV5{{-2%NP1&VK>nf4FnKL77mM+xihPx zG3bQLgZ+fIv%k!C=dcNqvvez%DjD{kVLIhPQNd$EKtPU9)Yr{Q0W=Q@L5FIJTN{V& zJb{iOG~o>XAy)?Sy~3Ep6cPLsKL;z0WDgu7Ea2n^&XF%X3!!ZL6Bu0XgPajnf~zI@ z-QDZPgfY6hOuCTL@89^ZcmwmA^uR^gx^7aQK2#B|DC5#JzR2#j-ZJZLQ#>bkU*jvq#B?n;tPd>q24CZv{L%*{)5CvkX*TQQAx0*Tc)8EXunDE{6qUNxV>d z4GFp=)@jddN(0*8=<@BA-Q3Z;^R@i@n-xJiAFjlv+nj?G)7$ZV3fLU@QWWN!cny_F0PzycAWV&47ck1jf)j~KKAQ{WLID1+|dfAnw;h8c4MdKpeRN<4`dz?N~u2w1Z&uC{ha-Ffn^O8018(3PXaA)IX!MJICXm zu{Cyl4*1-yK6YH`^o9wYZ&J?^cq$*IC*ri~L>ASE86Qs+6NH?m~W` z3+AYBrQr(BYMCA#PO`u=t&GfNz`4BU08T12@+OVQ#*bnQq80Pz3={IGe7@#<$|l290!sk#xj!~Ax<<jdoxBvVw;P*Jc>cI$g z{HL+O{}~?V|7hjENBC9RLg>an4Ft~`c!WPyk$;c!>jwPqQHJ4H|0hv?h+@A-`OoL_ z`R<=a`LR3y9_80r4l(@yG*kEh_m6Y==PK~K^RF!uG4B1e3V198p#K{Fes}$~i6DCD zPfH^Dw~qR|N7E|61e-;{9ne@QnDE_djX)clTej6u|^PEtvFA Yc>$=323J9mkg(yeF?fnxlOc}&1|cJ%!2kdN literal 0 HcmV?d00001 diff --git a/tests/io/test_metadata.py b/tests/io/test_metadata.py index f9d724243..8812667a1 100644 --- a/tests/io/test_metadata.py +++ b/tests/io/test_metadata.py @@ -1,6 +1,6 @@ import pytest import shutil -from io import StringIO +from io import BytesIO from augur.errors import AugurError from augur.io.metadata import InvalidDelimiter, read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv, Metadata @@ -38,7 +38,7 @@ def test_read_table_to_dict_with_csv(self, tmpdir, expected_record): assert record == expected_record def test_read_table_to_dict_with_csv_from_handle(self, expected_record): - handle = StringIO('strain,date,country,lab\nSEQ_A,2020-10-03,USA,A Virology Lab "Vector"\n') + handle = BytesIO(b'strain,date,country,lab\nSEQ_A,2020-10-03,USA,A Virology Lab "Vector"\n') record = next(read_table_to_dict(handle, 
(','))) assert record == expected_record @@ -52,7 +52,7 @@ def test_read_table_to_dict_with_tsv(self, tmpdir, expected_record): assert record == expected_record def test_read_table_to_dict_with_tsv_from_handle(self, expected_record): - handle = StringIO('strain\tdate\tcountry\tlab\nSEQ_A\t2020-10-03\tUSA\tA Virology Lab "Vector"\n') + handle = BytesIO(b'strain\tdate\tcountry\tlab\nSEQ_A\t2020-10-03\tUSA\tA Virology Lab "Vector"\n') record = next(read_table_to_dict(handle, ('\t'))) assert record == expected_record From cee0e1dec349401575b516679cebcae2e2ebc372 Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Thu, 18 Jul 2024 16:01:56 -0700 Subject: [PATCH 3/4] curate: Read the first *visible* *worksheet* from Excel/OpenOffice files Sheets may be hidden (or even "very hidden"), and we'll skip those. Sheets may also not be worksheets but be chart sheets, macro sheets, and more, so we'll skip those as well. Hidden sheets seem more likely to crop up in practice, but we're filtering anyway so it's not much more effort to be precise about the sheet type too. Additionally, clarify in documentation (and code) the behaviour of skipping initial empty rows/columns in the sheet, as @joverlee521 made the good suggestion to do in review. 
--- augur/curate/__init__.py | 2 +- augur/io/metadata.py | 32 ++++++++++++++++-- tests/functional/curate/cram/metadata-input.t | 29 ++++++++++++++++ .../curate/data/metadata-no-valid-sheet.xlsx | Bin 0 -> 14442 bytes .../curate/data/metadata-skipped-areas.xlsx | Bin 0 -> 6897 bytes .../data/metadata-skipped-hidden-sheet.xlsx | Bin 0 -> 7018 bytes 6 files changed, 59 insertions(+), 4 deletions(-) create mode 100644 tests/functional/curate/data/metadata-no-valid-sheet.xlsx create mode 100644 tests/functional/curate/data/metadata-skipped-areas.xlsx create mode 100644 tests/functional/curate/data/metadata-skipped-hidden-sheet.xlsx diff --git a/augur/curate/__init__.py b/augur/curate/__init__.py index e0cd7c4de..f2af6b7d9 100644 --- a/augur/curate/__init__.py +++ b/augur/curate/__init__.py @@ -55,7 +55,7 @@ def create_shared_parser(): If no input options are provided, commands will try to read NDJSON records from stdin. """) shared_inputs.add_argument("--metadata", - help="Input metadata file. May be plain text (TSV, CSV) or an Excel or OpenOffice spreadsheet. Accepts '-' to read plain text from stdin.") + help="Input metadata file. May be plain text (TSV, CSV) or an Excel or OpenOffice spreadsheet workbook file. When an Excel or OpenOffice workbook, only the first visible worksheet will be read and initial empty rows/columns will be ignored. Accepts '-' to read plain text from stdin.") shared_inputs.add_argument("--id-column", help="Name of the metadata column that contains the record identifier for reporting duplicate records. " "Uses the first column of the metadata file if not provided. 
" diff --git a/augur/io/metadata.py b/augur/io/metadata.py index 429f8e5d2..f2e646aae 100644 --- a/augur/io/metadata.py +++ b/augur/io/metadata.py @@ -7,6 +7,7 @@ import sys from io import StringIO, TextIOWrapper from itertools import chain, zip_longest +from textwrap import dedent from augur.errors import AugurError from augur.io.print import print_err @@ -168,7 +169,8 @@ def read_table_to_dict(table, delimiters, duplicate_reporting=DataErrorMethod.ER *duplicate_reporting* after the generator has been exhausted. When the *table* file is an Excel or OpenOffice workbook, only the first - sheet will be read. + visible worksheet will be read and initial empty rows/columns will be + ignored. Parameters ---------- @@ -219,7 +221,30 @@ def read_table_to_dict(table, delimiters, duplicate_reporting=DataErrorMethod.ER except calamine.CalamineError: handle.seek(0) else: - rows = workbook.get_sheet_by_index(0).to_python() + def visible_worksheet(s: calamine.SheetMetadata) -> bool: + # Normally one would use "is" to compare to an enum, but + # these aren't actual Python enum.Enum classes. + return s.visible == calamine.SheetVisibleEnum.Visible \ + and s.typ == calamine.SheetTypeEnum.WorkSheet + + if not (sheet := next(filter(visible_worksheet, workbook.sheets_metadata), None)): + if not workbook.sheets_metadata: + error_msg = f"Excel/OpenOffice workbook {table!r} contains no sheets." + else: + error_msg = dedent(f"""\ + Excel/OpenOffice workbook {table!r} contains no visible worksheets. 
+ + {len(workbook.sheets_metadata)} other sheets found: + """) + + for sheet in workbook.sheets_metadata: + type = str(sheet.typ).replace('SheetTypeEnum.', '').lower() + visibility = str(sheet.visible).replace('SheetVisibleEnum.', '').lower() + error_msg += f" - {sheet.name!r} ({type=!s}, {visibility=!s})\n" + + raise AugurError(error_msg) + + rows = workbook.get_sheet_by_name(sheet.name).to_python(skip_empty_area=True) columns = rows[0] records = ( dict(zip_longest(columns, row[:len(columns)])) @@ -317,7 +342,8 @@ def read_metadata_with_sequences(metadata, metadata_delimiters, fasta, seq_id_co https://pyfastx.readthedocs.io/en/latest/usage.html#fasta When the *metadata* file is an Excel or OpenOffice workbook, only the first - sheet will be read. + visible worksheet will be read and initial empty rows/columns will be + ignored. Parameters ---------- diff --git a/tests/functional/curate/cram/metadata-input.t b/tests/functional/curate/cram/metadata-input.t index 08d966374..dbc2e3319 100644 --- a/tests/functional/curate/cram/metadata-input.t +++ b/tests/functional/curate/cram/metadata-input.t @@ -81,6 +81,35 @@ Test OpenOffice (.ods) metadata input {"strain": "sequence_B", "country": "USA", "date": "2020-10-02", "authors": "A,B,C,D,E,F,G,H,I,J,K"} {"strain": "sequence_C", "country": "USA", "date": "2020-10-03", "authors": "A,B,C,D,E,F,G,H,I,J,K"} +Excel (.xlsx) workbook, skipped rows/cols + + $ ${AUGUR} curate passthru \ + > --metadata "$TESTDIR/../data/metadata-skipped-areas.xlsx" + {"strain": "sequence_A", "country": "USA", "date": "2020-10-01", "authors": "A,B,C,D,E,F,G,H,I,J,K"} + {"strain": "sequence_B", "country": "USA", "date": "2020-10-02", "authors": "A,B,C,D,E,F,G,H,I,J,K"} + {"strain": "sequence_C", "country": "USA", "date": "2020-10-03", "authors": "A,B,C,D,E,F,G,H,I,J,K"} + +Excel (.xlsx) workbook, skipped hidden sheet + + $ ${AUGUR} curate passthru \ + > --metadata "$TESTDIR/../data/metadata-skipped-hidden-sheet.xlsx" + {"strain": "sequence_A", 
"country": "USA", "date": "2020-10-01", "authors": "A,B,C,D,E,F,G,H,I,J,K"} + {"strain": "sequence_B", "country": "USA", "date": "2020-10-02", "authors": "A,B,C,D,E,F,G,H,I,J,K"} + {"strain": "sequence_C", "country": "USA", "date": "2020-10-03", "authors": "A,B,C,D,E,F,G,H,I,J,K"} + +Excel (.xlsx) workbook, no valid sheets + + $ ${AUGUR} curate passthru \ + > --metadata "$TESTDIR/../data/metadata-no-valid-sheet.xlsx" + ERROR: Excel/OpenOffice workbook '*/metadata-no-valid-sheet.xlsx' contains no visible worksheets. (glob) + + 3 other sheets found: + - 'Hidden' (type=worksheet, visibility=hidden) + - 'VeryHidden' (type=worksheet, visibility=veryhidden) + - 'Chart' (type=chartsheet, visibility=visible) + + [2] + Create a metadata TSV file with duplicate records $ cat >metadata.tsv <<~~ diff --git a/tests/functional/curate/data/metadata-no-valid-sheet.xlsx b/tests/functional/curate/data/metadata-no-valid-sheet.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..b672fd53ab6d5a0c66611bf69950b95713e3040d GIT binary patch literal 14442 zcmb7K1yqz<*B(+Dq)WORL6DFZ=|)<*yHjb9k_M$)>CTbv776K2X{03oLGE_}^b!2`e` zS0Mm^A73(mzufO~_hqE(mmMu_Ol^(y_3do0aWbRd85Dt@-{7Ewj)eg|#DG{GXC`Yf z03Z?^0KonQ=Lg(F1}Af~ACh@s_M^uAiWm~nF9ptw(zZ=eCpgFz#hWJJKuqD&!LJ+a z!Z2D{mbrIhVP zqdQcH!=o=Z>)d%>@tmp$M@mFC8qoIeC+z_(Y_VrP8Y`c=Gn+my=2K_WNy_{_l#LoQhj|a8@_cO@9C5YRPK=TX$cka%<01% z$iNheF@3Hqh8Ab%;3S0*Rf5sv3R7{@qZwGYv+xNY-+Q9H+NZ6RWC|s(AP&K{L5xI$ z^TA#7>o}p2aH2QMXZjd>@ziYC@ULS2l0G&DLsl=8?H=ZL9M!ne4|_gng`GB9`lL%R z?sICoKlF_c0W!QT-RW=x8xb!lBNcaId$WJb_GoxXhr5dv$E>0We-XAm)PJ`Z=ofv7 zRce4iJ)d^}8{J$Yg<`^DUz%mkTYiREMW@DLh}ATNvLhZ9QwhYx4m4 z`eEo_N<5ye!I?kFB??ukOsIZB(FV4Teg9!A$|rmUQE$MP543$YEmkO!mF?TC7f?6B ztoln4eSQZ}*`*_X0W3gQ5rnQ0Fs zIHWg@yqAvZ!`#aOQ`9X3DzM~?9^r~op(7$ycJF%$lqHNFN%cke_>!<5xF{wXH${nC zW53i>DVA*ti{OR1Kg`D>yv041u+Px*b`Lvv=R<*!h*^*hWz&1GgV;`0dg8Jg$PxOa zi(-wL(7-oRQwAk#?XaoP2-M!#2;{H{^=#WIi#U+7O7rX0C$M*8%wfVL$q%FpFHkoA zD^31wq1&4HHzg#>_liL{X&MqT?2pJU+-I 
zPP?;$L+mycKw8sOB65_6yx$F$Pb22+jcqCvR?ACadAUL!%RjGCm~d`NCZMELO85?7 z^PNWpdL?u4(P7uOVA|VJ}`{^eN ztXitC@R})JLxy6$)(_?o3Gh7(_|$mD9iaQCmx59(@_O(`>YALNQzTHRD*->$<{(H! zxPHPzC^*4N5Dh#`@22{iSjl1QLo#UNrG?HyG(i55S}%4`YA2PNd^;Ao!!B-zz%T|J zcIKlw5Bqj}=(f^$y-{sxDQ5C#N7-5F0WL~S?ox|Dr~m~GG}oZ{#nTHSvLMg$O?}crgndBgL3b8~6O|#pi7=lYuh54fe zQ;=s_G@SaF3Zj}P$_!5WTQq>FA7~ZQ4GUm_VwT2}FGJJLdD01QB5(1hYZUg{&u@iF zH|VZ|`Tb3N9V%B49)%;HAwjM&3UUqH+Yndxv5uvs>D>tV2h7??Nt0$K6yGD?Y5tzN zn1wQ9YS|KMU$cpQfB=CSMTH*gAmL116TP*%*IJVv`Er2sw_RaX%lbG~H?5Q|E~p?f zD9^s*k2aZJmFM4jAz*YiZ8Ti{;W)`gEeZ-p7?n-mNb2Rr2-#U@G(4;fm5V4IH4<|R zVwgtfVU5$p5gRkH{Y2hHC#m!>!KJ}qk=mi!tEfAc=uji*a`*75ERNgWaxH(mp>T&Z zA(GyGj<~NfB(#sk$WC?N2`PoGXm73f5a&WH7vk%TumVCBso!oib?V78<*G!^6|@bC zoVb{rO(iUF9L_VQ zr`lOwpRxEd2B8dNb`m?<0|-C8lLj_eQMdi=Ug6@|rllVJKC7Uw93dN0JP0Eq9nR3f z!6hzCi9X`C;~Sp7uZ0(k5Jlfo^C=c6wZ0P$UZdLghNq*Z5Lwl=yGQKx)wro4pENRJ zvQ-X3AEttw<>lF*HcTN4R_AkCs`Icz~); zi0_H>_%85+!@U%)KG26d3K|{Zx$V3^+SVja3ndUEzWMoGX3aMP}R&^Phl;f!& ze7e}cC>HXY>1p3VSO!N{OdK7SPL&bPSnj!ZP9%6~pmWf2b7eDcJ@0?;3sfSgNtrd9=))@wPiQcMm!HW(b zBF>2%SJ4&t(H;3DY}iaWg66}c8Jv=iM%?}1o&jz`rRJwB26tQk*NuQXD2@H5Y`<$m zv5=IKBq#uY3kLu|zk~53>VEk0znO1S#oTg&4dsGI>yjVs#9Zb;N6jI4+?SsHX>kT( zL>a_kNDNgx%hMdP=u!HU8Ao(-wUR+rM!XUG&U^9PNmA`ANn8YpSd3qA$_t9Zd`iPN zd5_HCOW#uZ&X5@{GBn0VcB9NslqQ z%fEnjfxIuwt`d4~RPtmMo| zyHy=b@@=mh(5dv1mOK#Q3b!>iOIAVeb`hK=}&0Gm=~xeh76 zJxiK-=Ax%7=4l{=%K_+VrJSh2q#_*K)AfX`pl{JZ5oC(`KpBoY{&rU}2Od#@qq7@% zLDFl`c1QRX_;A6!iyrYpk?$(OX3#2mEUIA`Bc(w$dTO*D=mfCbrM_}6-;iYq8P*z_ zPDIE@7}<3NWI??7z*Lk|uKD#73D;6UxbKtxZ#3>f8ya!A*6%Tah_nVHZk~N22HBEt zhtGU=gB9MkCd92^hMjU`n+iz4!y>c?sC*RAL9^~s zdjw|mS{yS%Zwl2mT+5nt&@zZ1P*;Sk9!ZQA;RuwSS>MlOSnh()JNl{>G$^|llW?_X z6J#zR3q^3W)^o>mfAilN7-;(}Mk_FDkgXI``kK<|3|G)us7zr(^wBXBO$saS8$~c# zOn}}od?DfKNpbGyIrT>vXJV<}9{Hxot(IdUyObkm zNWzvLlPyJU(zXx(Xd?d6>4h2Qt71vz#+4J`q%V+sn&n)I}^GhG{s-N^um(qb#HbxbiK<=keOP#dZcgQBv|R$z@&(W zUHiKdhc%{>TsqGeURZ!>SZ&$_GjRdGi^chfmRxxw$&l<4M&=-&moH^K@jp$uG2&B&<=T|g5l`hlQopZxld%lk2+IUCp(0KzM{fvM 
z9*!-h$;9EM|3E^igl?0o(Osbrm2vMNb8}`!Q`0D2{UavX0DgF{!sY$FiCSat8C%hp z`}E(l9nO(Wlz>N7nGVt7@_7Z!1%a)T5;-Qt^xUE3=|D$*M%T6}qxxLu6Rr^3sek+f^L4o-yq%G z9e?Z*^TQg&T9`m12(N9FWsAWLej&CgZ=wywFZW>9$`IrAnj}z853{g}zYZ+$HebGM z#+N3f&T-U(jiP^BaL;#a5PQ#TZgbD=2q2czogNf5&=d&%e(rGj*lS?;c?3DSY)C|F zlbosZkm{LNr|2^%+X;!5&S6aFk=F0I@v?*`7cag!LL+cxafV_Za})4sjCj?LQae$k zW;-NBWEsP~O2;jIZ&LrbKpH(ja~-#M0KZ?E{@9!%%k-Yt#f14q^SL0c+F8lhDLjAa zyB8}y`uh*-AQf4F`hz>=C!?;VnWc^GA8*#oNPdfEB9y0vI@TV+n`9RHtz&i?_5*0a zZ78H*Xke1@6IfimP3B$-{!=|GwP0ip=&Cx+7f+Hx=<4#Uv+r{X)Bfg^s_QC5*m zg2=5Rsb~;0j`-wyHnat>R~hDmIC-qn5(<5(d`zX$h3*=}9DQMBuN+CI4}n>Z<{NFj z3YlmewN{6wNPWmIM+ORb^h9%U1Ua6zSf#Z$2Of8?2+JP6B+qa_1B0bK_r-? zz13m`ubE@Tnkn{4Fv2uoLoQ^PTMp|IBUAQISKZAsItkrSi=rc&{{;OB{D=C@wBYeN+G9vPzd6~ zn#>s{c;ts9XbH7KqQIIlyhY_c1Bc_0>#ES~;tz>3#WQ*osm&(dArT<4le5-$9y3v- zhzlPZCySno;uQdwx~v&Ju{lAQ^#~cK%b1j5EW%IP)JV=U-+VQca+5q|NyW0M!M<1p zn^ldE{=on<8YA9Y!&gF_(tRS2EHo5Lu{AjQ6UPSE2rf9E%Z)F4H;q(%FYx8D=6Yq_ zPq6S&x?hw@JFf{!;!_uAS-7YdqdFc}THCZ7Vsv%LeZ2?ka?+3iF!ORdkm=cqm4^vpMwKBRj+m;v=pc`nqA->OcnNZ3E z>f3OXhEbVsdA}nty-+^nd-hRLou`lRTTD|GTP72>?}H${x4k@4UgUUtnCV;ggNyy- z3~dEr=3k2hC+!NPdlzD>@?O|O*OuxxettNq@-Bkflil`oBWH_oPvhDU?WW{%?ZNx= z!IwH5?It%^#!#i+u`Mt5vKFh2S4Z~>Jc_-Pk)G!;z&h$dYX`$^IBD*(?WyWtzF?wN zAswRG#TvxoL?Gazs(HnF>Iw~|(vFHWCYC(3qKQ3vuA@g=gh4k~l=1>OC)(}>vC|_J zSl$6{^jSLg{d%PgDM4}6U2s(yeC%y#9p0uDf1#=d+pcF{H55ZBr@fv)P0_>z`&rdE zNq-_JW-3hYq>%8F=6Z7Xf#l3kw@^IJVrZ?WpMv}#S}wTmLc02 zZ{`Dv%=}d2na`02&sL-t_Nh}B6}MD~Gefeq3lKYfDKHN5{mLkjoVeULdCih5UGfDd zi^<1FO--`KSWB6{bj@=<%K?8L{Gu^vq4-=QH$9F`SOQpde&M<@hlwof z;cdn?OJGrMiJ(&dAK!ifdu2f`^K5kSaZ-x|wfl(2^lx2>&!*O~D zSGScl13vv?HW9-OxXI}D7DE52(p+T85*O4*V1p~|afz%Y=Mwk}F9ENGM!0dSClkpb z4oGXVHlnbFHBvw`$vOC_qE=yv&x76n&m2hHrScSP)?ObRf)2h9;GNW{K_VUjs@n2!55<-~Y%zuaHlKb~9L zWqg0MolJ1~c+*vJ8#VaUxprmd<>42lqlRSOm*;1VD<<6C3(4m@yQP}$?%~caQVv=> zOZ_RO>(3TLw+X3c4k7S|tkWcmyKJP&z*fIY*iIf<}On12fw#0(^Yv zC**6Ik_K-{L1O#C1M68Oy!ug-UQ02!6Gt#ygNB%>H$~T+y3}bGvLX7>a3WG-R|a7kc(19D5Z<9|@MDpQRRFQ?IDIhutEpCg0h~>1D|m0{7O~ik 
z1tcX}ji&I|-O5j8d}?Onr{Yui1G?PM9-xQosPLsd3=nAy!J2#BEYgWBOcpxn$*mAR ziATxbI2Rlwa&$j6W%y}uwp{_m0{B!nN%M#1I`KBy`|qr{Wr!V#GM)7zkdPaSRqnXYPOS<=#Jn>)9^3>(KIouv>@xHB&w?fwx)G7WbKDmv0bjfU8 zsMZ}X{Urz?f2DuxvxMEUw;M8ANqalwCeyq1ci0w{^gbmdAw9{YeyLPaK*re<*eLxl znTz|&b1z{0)70b-KN-pozJ&IR3YZd5GgaGSOE-#jV97S_Bhc^#Uxcq7H8-#+X5W?g z8jaO&Wc#gT#LyYMfYrqTwYo2m%nT}^9G-}Jo~K~(vYFCb*x912gBA{1^z_uC_RrjKqPzvhV9A^($$Bx`O-^VfOx~SY zW@q=8slXtg=@hIq^UXuC57OXSGFIp89y&{DB0cEnLbvu1^0f3>ETq~ zc7;jQEFgkAA_R?`h;P{T1 zB5<04qgOan;hQIZR3$Z!??fZ-nTaiKW1>fpx*52`DwKe zDh(|rKrPe}l*>I&*JsGx*`&%ys1r}Y(VwC$7Vr9H31eypDu~il$nGOO5o6cDsPzsV zx{m=g`6%jXcD(%lGI_wtu8!LG2y)ztTpl$EvIU}3p>63Q`H>@`tKdA2xW2G&L|6np zwhhm#ON+ASF6A;NOT>I}QRpMj1Ge#=^NIEbkA{wW2i_3V6zVlQ z#7G^RK~8q^2qAhUp;4i@2GUsVH8ekc7dup8Ptx;}FSBAfO7_FLIe93>vtASV-l&Co zodl33v8mQ9UG!Of(0o$sCtFC9Y4_LGF3`FRrh4lULm!irp^DceYE6!9Lzi^uD~vk;*i*-TS>YieIlNt35HD zktn@HR2g#p5*MmtNaKi7V3+`q*r<87C0m&2_>>KsP zU{wANG0^uc0+)5&q~Tej`MSdCsoL>}^B>jI&F(|rL4~XV)XN@%p1+2~wxFGizMj0D z&DFO0Pc(x^Qj*djnn7*+F(HMma1E{w!deVO!0tVwqml>4%?>qo28tK8Q1hF$AEklc zdr95oqZO*!SM>e)xTC_3M;BJM&est6hirFrwZHBPc;Ou#Ydu9*jmd2Pz?mNsPAqp= zLWgE8Unkc40x$m=sXRF$8LBZsulSeRxD7N?1bqW@APmddF_a;)WxHCke zSeIA48CXy;hG79vT<pBtr(KMU>;j`pM5_#)&bo<&XBM z%IMDZsFH6uhm;*>6*Kw6TNj}(JnxVnHE$25N%3~rwJY@*hhq>9GTyrgf5L8u*4jDX zljT{%Vop=_ddgrUlN%$?fvIuldsC$pV*sHYk^_e{t<;LD>|FRM4iHd8jy6A{ITCsR z>#9Lo9^q@puIM*6emcA@$W#WeCNZ&lrmNchiWxJ1UH$@W3Y{~RP+<|7t9fF3f-n?0 zbf%)7fUts>3q3Bs%1Zsb@o`bE{Yo{C{Y7iSa|q#9lNwYD7wjG@ta@Dx&CcVngQOYL z%I>f8l@~2(b%l*1Tlw~qLC^gRkGl_!d#!9VS{jM)k-erl#J#j1CP9jBn{dBFZ)QR& ztRr|bg~dHNWeU;J0oU;m*t*p-h& zr(Z2ZU{dvgL>W^kG=y5Hl_BG8vZyJiu1#S4narCF%x})m%$$MA?`3OkC8R$3*(-YM zShjG^7&{pgC^@FcQCJ$I_v5}v(5eU4jq`jHsYcJ7_RWB6r9^z$H%GwyE{Res8uD=I zee`2lhW=4)A{LqUs_b>O;Ch@QHKyT4lh^lLA#ymN>wM_V*AaMMJI9Z%pwWk|^h}p% z_$q(ZbU6Z_q_?u(M22yklGxi}6&3-RXD2qHFI=z^ z`)peTHLXYt+mM(P4`wl;{$+F4bo6YxUs#F{B92O{@*65d8z^5G2nn^G#Xc;O`C-QF zZd>}frcNNZSw_3Di?H^CMjDxC`jjFLf$21L&lNbvj^Ts8X9+#&n~u#KkQ3U7a3A^% 
zoZWltrYz1Bd>Dz~Ye2O9@eN+m!}0o(cj(YvPf&|m2zj(g3DeA#7lm&a0mXwPane=}`cJe^?N7~569*VZn|LZi52$%9U|}vq z&*W#Hoy%P2Y*%XTxaARE=6>3-A|AI7KK@R(%!vz=yqCSR+h8Phl5YGYEa{6c(n%ZN zjFvxzDs|C4`2`G2*MZ^b8O=9s1ysJRN0|?5J|D8F$J#m)lWb_7I*0WZK~t%RS>V+G z;5b(Vp9$iVgn4=v4F@O{)XQBWRernlkM0qPfX6(@7_j|J~?arAp7fsV($KJ z`|DcT=>N$9LLz=F-~En6CW-fWtO?|@^T?tB_D{*l0{J%RT*rl5kjMPWDrc$8l!>>( zNar%l#{(IZ$1mBF4TbwCO=a(GYnf&0c}7qPgNbB0k;g*0BuFAnXpdAvPcG2TxPh*#&k$+HHUq(EZ2MqX_!1xLpANjL)ZCOleYcRlZsGf=++w-1 z_JfC}rBp|~aUGt8DS+TOtHZ(PH8QO;k!)brcL_}0rzdid5SJ8Nb zHKvBSt!L8KlUc$5C;*Aa#HRF+BHc&?JSrnIPzgSm16QwyH-xpXygw>us93(vmy;e z##F0j_5G7+DWQGH8|5)pRhYF^gSfSk+Si2l2&0N9gx;#OjnaK_@Bs5kB0CPB0kfax zVhHKvfV4H3#G_>j+=85=%V{34te*Stjg}17w zeTw(H$ioc+Je&iwMZ8b*E!t252Op;esRzN!EEpROxu?X+LV``f&zf=0lw8@Z@}}m! z@hT3qd%F+TU%=bb)a0y%j18%>J5)<+j6QA7r5#(bPe?-T8s%lmX<b{7<+yTE0t)y{(6Y-{RgThW=Ii{F_Q{4!X-Wy?sy~ zXee-7CAW|Gp`L#<=e2r%+n`3jg5J`{6+`g1&^N}r%MZM*kIH|7 zz9GonXyzXcd2`ZT+THDwx8 zsDDt*U543hao^m*yWtG{7fFBRGu)Qc1@*S1w+wm3Zuo<|cexF>W$R@FB{%v z2i!hn1?{%zcen!oEsNY;pWlWi#<&fAdzt=k$k);HvuD8G@eF$qdl2M!|68g3H^;cU zP`fQ?)or-j>$QKA^EyC&9i`p|#v=M5=k28Q!`Xf_RK32xZ=&w*4D2>&DajAe-%P~* zEbg!Kq}#}iq}Rw-w1>Z@OaBIWZK+=y`D@7QswJKR@ON7e!um}&e*1_2+SK0`Z{lyP z8{hv&!2jY#2vmTZ{`J4+m%9z{ZTX~c!~NY3|C(8*ib_lm& jCujk`?-xM8SN*~-Sqlo(WL&YY2mv2Jy3M1%`q%#fUVw_4 literal 0 HcmV?d00001 diff --git a/tests/functional/curate/data/metadata-skipped-areas.xlsx b/tests/functional/curate/data/metadata-skipped-areas.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..979bb9299760c79cadcccbdfa1e1820f51d1ecbb GIT binary patch literal 6897 zcmeHMWmJ^w+9m~th8YlskWz+_R8qQokP-xB2x$R{B*3Ka2LY(a}6ekGwN)oa(+1l~m_MpPYGB#1=_b2$};;9AP!OTeNthf7ph3Hc-$7WFu3 zwdb=pwp6TTT!Hr>vpQ4{o{MBErc8|&7+V&`h^I8}sl~fp-LWV_JxR)+;RTqF1MfI@ z`&^Hn+RTIDryHJ9{Q5X&!|EN6Qax#(ZZJ>8VgRaHHzgAb#YU1FD;6u7V| z8n~1Muqx`f1{?@Bu8JVM=6Xj@bB{$VJMmQl<*N#?i!<>N2~lu)L2JH z44bhR87ACNd9{Ap!ZvTZZVc>MTJv38^vk`=V#6Ulb5EMms}H@+`%S1fJRNfmfi{pV 
z2%-HIPq+Wh(@iKjnwy$BIdc4XwEqda(fuI0TJ;^nB{~F;ErIw;oi}#zJnejTW=Du6%R!ap28oK*u-Q}|PD};U zNY<;*M(6Y3Avlo?AmWzR1&*+H^A(e5j>j=b?AdzD8K_jE%&ua995m53JmN>E7*ay! zTfLX)wk_7D4e~VQ29zi8`Y*T6M2|VpBwo7E3 z0+u~3RDCXQgr>)=llYWKWY&a`&{bB7DFLe9o^W{%||-fP7!h?AgHE3St^c%DyJFG z-h9r0cUmk1*pgEpQ8jyaK`l$@Ztxi)kwt{Xu-x>plxvie6HM_FeyZ5`IbSLE5Hbgvq|PF^m?QTqpoIeU}zRXlm6#(N%Put=wbl!Bgi zSmzEb$<;jPZET4&5yq3ZN;9()ne`2lk49 z=jz|*vf|p7rf21qjN4Bb7p4+I3u}};<+|O30Ci~Wb-S6Mq3roI#rI6%?LY?Pg_Y)?mRKenc`FuLvTpPwhlx>uR^9i#j?i7|U zY$ijmY3#%T}luA8)L)KYewe|VF2_O3mJAo(Zd)IqSW_`H0c z$WQN6&GPwzQKZAgeNe?9r=|O7%L(#T9B8f>;*)GVlFNS+L4xv5I<4Cg~p)6ajZ0aUxi)W6d0_BvY z2R5EGJ`$;<6x<&PD_~~njaCX87_iz}mI8O?edM*p5AI5lpxg9aQKC_>#92V`leHAWR)snR?Q8@( z6Axlu*o>fE@?I!!9>@*WTNreZk7G*~`iExs!Y$qGM{N{p9}SGUBw-zwWDn3Fet zdiFZ#=!#t3`T`oFR*U0;ei(b=)4pLU%xFYKrQ}!D?XW}Kc=CbTLutvqxQpibJ6V*t znok*xj^Cc=beQ>n?WQ>r|2D)>PV>=dAlrVqigGPABmddy5O6>yL?4g&wg(<0F)eo) zF4lT`la@KLnFsR-3QIr-bkRrxHts=gtJPAj;X$z7mjcw)OZf6YiX2$qP~-O3Etljc zN6?~YgJthCdV-!ZRAXS?El0%pb4iRlH4yVJT!yZvOFT2fWKVti)L_6WZaY2`-5D=Q zm19nHYgonHyr(ejW05)5Yn&{P?aaxGB8);v$j345)u#NaVv3Ny0*$%gX;00afx{k` zBk{{Y+xU0zvvl7DIWRbk!MhJ<2?gC=l@SwdN3Q%pk*-P#7+}+__8mTWh)Dh7ynaw7 z^~g|$EBJm`sBo$@xh z@^14s|JF0SY{iXhI!YDc^NVLjR>k~V(>VRky2@9#ZmLAN>QcfovPv8R{);NX{lDh? 
zzvleE=KTNGoC$x<#Q$8cA|B7%&U4~s<*K)vKNt&;&UN4Jt5g1*7cQCS3^bOxWr~4BR%|!QN76aVKYWr#@ z05E5?{t`0U4J;ey!YQGsuW=DiLUe}VOJ+|&BYZ@91svSdb zW5EuvO-F@R&!@XQx_yaRvl)LtbC}C(gj3Aa5W-apYlqN+r?fB`RzdxbgnQqa9oQ^< z*IG?x8W((K;%`^T^G75G2{&5Z0#!i8EM9Sv=@SRFh4hlDHFnWUTwyZQRO}tcf(!cb zQnWrd3>KD;<3w-E>%39;GBbc7FknOTAh@(VCv0acZKpM5)<0~gRcKbSJ3mJ(XUH6Z z)~a@)un!s~OmVx-+_aT?C%$8S$D6oo*1%s%1^o+__)JS1v8bNY=95nK@yodAJURTz zxtH8#i;KaDm*4p#yZ-3q|03OQKZ4u74c#Aq#{Y$BT3$j?@16GeI-mSp*3eds<$R6# z)_H?J%#r>HwVoS?zM_b9)pZF^^W(Ku!kY@-%8;Q+fLyJ<{N);o|Bp4)$<5mI$NssS zsINND2@Kpvu(B;lj*XEeM&A*cupn152U?8Jypm7tXrq0zE74xs9T~$5kqqC1Nh{b{ zHw$fbFXE9N>NxOvjvRAiLxF`+at_aQqfi1C%PbTEwc9!8iK1&W0v<4> z@Vpaai+>tq|52gkfg$Od%|qXol_6xE>2HgU zoh+Q+s#7p5Q8vIIAS++oE@7BN*8ob37#E5lf8&tHkH#a-R!bpWn?hGxTzI;w*M?rFjtl|MxvoWDwp zIW31g9-Twc=s@sWC~YuuJr;Vi$g)rn+gvhkJ})>a!iPHjr2t0;8+i*{|6V%rYE24aBO$Qjaw= zDOKvNIb6PZs9-Zs>AhbLev$S;qar-9`K54ekVBd%>1II5m7~B1Y?-L5hG^x_{pOP` zqxZ~3k@6E3wpo#1%`)<~|5|1tLjJ}1K`AZ3qzx6aULgwD}G(%^E=-dY$MMM3kbe!HeD=cz*cY;{lm2Au~j%sY1QTy z`HA|+L~yT&avmtakU zk~;R21<8~tMI?%SO%^!(06&kX{#lq0fHi(iPN7@OPdM*E_=>(@00Oi8C<8|MsRGUt zti&e2Z}<}k7*BnqV*Hq!F4nzhwHI8!U|SX@RFo##L3@-S{d{sWTOWiT%hWB7{a6ei zm=3z*Sx@?vM)c7pPpq^!@3H@TT9vQMzTxyFQKF>q!zR$qnvChUy7wL@Wbh}gS$TrF zrl8EE)v+1ELBRvn)(sHoQkK zYc8?6HLxR}36^Y)uhCAD)_M6{)T^ZB!8_+=C@)Z5%z|=MMTpW;x zb@G=2BC;N!=ty;>9NAzjRI!~C_-YBks$h`UIFCVWQG?YgOU<9b zbl1g7Cnew`e~tIP^5!iT+i?67NM++De6vNl|FFoWoaq+#kQPksSUONkz-Fg`M0LFW zLN{Et(V^~Cj$&B1NQyeA8TF2)F>9ec4ZC+z?MPowlo5eJ(W_Jjyz@&|wIP{GuBcwg z@II_A%p9(i9-@8Ius9!0^kG!6A0Igx#}GTfnQaXGX2JFtBEgi4h=6iCJk`;?#7y`0 zi78LXnscpNMnx6%D~v}yF~<9tM1tVYO&JWA*qyNIkM@I$6&A{be_W11_)-EDx_jJUo>!ts5!+{|uS~o{p?==F44te`@p9s~G z5LPYAiQ4Cic%wUZwZ^RE2d-z;R_fmOP8GJiQgd8x*q39XG<}3*|2gJOx}c%~QGPxV z_*DYGc_Q%d_Me{*{EqXhM0_Kd|1=2lpOHBKM?U`@;a9cmMoRu^P-N5~5&l$I{vGAl zZTR0&dXRVjCn!I3vfoku^R?{Z{3*(hJ^FW)U+21;Mc}8IB8RzuT+2V#g5RBg?UFa6 z-%l$;V%-q>uR-v4*I)a{O&k4b86^MKRDXB;wXWW@nVD6kO!19~mMbhD`%PQYTa9jRobY|$sD96kCMt~ll2+Ox<36uImdBa)PDHcFP0gkR 
z7@~F+!fIj$P3za!cmWm1=F~j^BE%AqrH&d^bUkzFG1ig?muFl}75O9?fM$ZY>4{l_ zGc{WcPdGDhR-anHMj}r&b84c@!oEC7I*-#75z6dQ7}70WE1&ZV^ zpy0OVZdR6B?rwI@Ht<`VQpQI#Q0NE%2dyA!VG9ER$V3hQuKlH`{b=}3Y?vjvuC?t# zX7%d!86(tU?Pb2=O5a5Ue&Z>HPgBTK&eM$C5!Uye&G~SDqr;J*A4eH{jO$_(0DOU5 zq%)aZ7CET*i@pq_0#{C^CNlJpdhXy+HW(dV_;+q+rdUF?2(rL%lN`OtV(OT~tVTNT z!CJj6ru`*i{s3a~M{MqfyRx5!GX!lmSutDzk3e)lWy1kecxOIt`ThlBlRU*ckX~`h z+(0_&EKlA&fZUa}E<3+YL7kooYF-_Gzq9$^HTzoTCT}wd+t^boSGK7&jfH5yH5LoACs<6o2nTHH-dLTZBy8E}|$M$olr?wBsqA^4o}q#x@iz*70T zg<#f)h-Lc`jg>bt^=r|yS$y1RDps*W9fmKX#3N2&PkwtRf3)+Dr4^mg;veo0@f#2*{q4xx3`!we9~7kXp>T zLK$6xPHf1G*MO1SUY&gGmtBhC8JnU4aJdiBoGbotuiv_t@Pt5O))F7zOHm5;e#(1_ z#$;0y655Xi2Und=(q|+c*9X_gID z^oji`fEq7Qt1oK2HHa67GemetT9#1CpWJ#HRUL8yIz~QTk(L2c(m?S@@QvegSw4{p zC1|TljJ>0v`XFkYNcwHD1K7DODeS9Ii3s~3HwG2S-GNPfdv42ks32~hyf30jfc%+RW@n@=E$zh-}=N zcx;znB-ryvrTjy=4ZrjkTmh7>v*_=P9~rnta|-7x^#VAe`q&scv{`W`!jzdG zj68}j5z_DD`D`IZ*+q#z|9tk?=dQi0(`3~3J=i$8xg1slE1AG!Mb&A+gq|W+Rn}xo zU}u^Z{(jNLAwQN|c46{vj9~{+COU7@kQ?Q%?9s zTm5JqtRGW9nt5<|Ev}%cZ@A3VoLw!Rxs=q@{oL-7>y~!p1%dlKH$`g^{r^Ebo?F_r z=|W)hf&~7{)dWW$N;&GXi-DR3@?VFy1NrgR8qiJ@qM4&HdbN}Xoxg8;Dh?#tllRs{ ze4-=VkM+FYA?C04W6!9#kVe1s!|?jnEMsiv220B9`*l#<(DG7G!eN(E{#d8DKRS8G zkiRm*Hun+c3WJK-f;eYeb#-FTCnoK#Y;)XjakM!VFT**fHG6ECaP<@o>!gA0BBH?_ zr*O7&n41}-jM|wS#>(BU4wc3R9T=C}liS#pxxfl#?C?(*RDz8$b_4wyXSygaN<~jE z8J3b(S2XJQw*+mbXpG5j1=y``YYcg4O#4 zPWJidztS(d~!65of3BVHR{CyiH%rc=75Yf*@4<;X^x-GSLTVMk}u2!Vl zPGVDX9R;J}@7R_oHMR|cXq4ID_rn%hSt~>Q)!uV1KQeJI;AP*MLYwSJGd_{**+zdz zc*X#_E9amDru{J{y2aE~z6G5&oBoP6H@hnrik1(a@=lFvEtw15b!F>Y#7*}Yx~h9e zSZX{AHdfr{gMsRKht@*o21zA~GFeMV9ld(B#D zPwhR*Isv@YzAt$2X~bHI?%`<`$j!FPY@k+m;Uh3p8;-BH+(k7_#an85A>S{f}_9r z+h#7>A)Xv7>*;vhgZzEM94xgn|E&OD!)uQ{UloQ}LgFm;OY(w?(E+Y+kbavK;(~`r zEB0HPhR>f%=Z$BPD5=XS3-4IUkSTR*?xE-580|X z%Nde}IsoV~Whuw?L^x7%djz{ej(7+86HL*<8~A!ig4&372U5SliWjFrYsfew9`QZW zPlO|R!Z5-~L5YrArXViZ=DTju3*`aGan8e(=yl5Pl@yw|Sprv_!1ltdK%JoqL5Zk! zl^U*2bb{sO(`cnJO2);__*I$#ckJ>pXH^Pifw!0NOcg$ix+g%7;6|i0%QEO(5z=a? 
z;|`$TYYQ&RhN1+?AYkj78SjYl;}U2OK)RY8Mwo=LGy$Pfn*6xF+rGYjwgT=;`Q#7- zLo=aJW4NA&j5t-&TU_-dN7C>mqI6#!n^j+oO2ArOOZ%7VMF{B91-IqP539iQQFVu@ zlh0Bw93BKBxO!^r$8$PI{L$K}R7YxDw3iZ4%vWHtCr+J3FKE4p=_7*#tpad^a8d>v z%Fxx>isip$L$jNkvY=|G1Fr;agZwG4O5k2apBMSL)}rk~^FK<N%FqHMQVJPf*A`0=k~S1 zvtA!AK*qFqPKQ~a#E7&1HWGX-)pe1roEFEIiKj%-ildyC%V^F+yDC>khbg_^4OSUE zE@_C``jjV4dz@{(E7nG7C?!ONHV{@+9qiDfOWQ4XBJr48)ixtm=niwcX~KdQ#&Bgt zSW@p9)I`;|8I`A|zTfY)?RB24SbjcJsfQa|F^$xN(7l%-x~DIQj!GEaR81q#i>BtR z-B}AY?F!8hI<J;M8ic$%z3wVVgW-Tu&hU!Dj%W^p6YPfGsG&MH9iqAc|42$e7C`=0qd z9}={+4vEBvICA4oh8?!{*T%il01E zzj{+9PO$!|OyK{Tw*S4-J=2PYpA>nK*^SnwsWFk-A?Gn6B}{+ zg)t}AXP)N3$%>d4*M{&L`^fz{X(mf*+RBYNeLYl2PhtNCT8+uMjLa1&Lv#JrQ4v*{ ztb)@EuO+_cD376IId3X-yvt6v&vrQ z+MN^9UEe%I`t&)pLe%4{$zzA^vc2WUpApbUSLnC-BuM^9gZO51+q%iA|Gxj?{%ikr z_ja`Ux%lp;8H47zNy7ILZ0yUjAf%%&m~aP`snGzu~$c+?VhMX<4>8NV#Uhp zL(xD5prCQ4-f59{loWUL=!P3PC46(D;KT8AnE|-BOvxr4PU_0l0 zadchgBnM2Hd@rTgQy)dREU0t}m=do$2?lkpzL$qq`n{!AmCHx;t=LJPmYyx_8JjHa z89$o2Ym&{2rTKO`o%sVMBrm5Er67Dm50jwV*^f1HD4kxxcB`kBbd6fs>g#HWjxx19 zc3ubH#EW;Sk>ELBDcz^RckG@qO{CM#k{k4i|mAVP;Fj?Kwb``@UiZ)3@7B=>nTcpEkCDxX5X|iS3>a$KZ zAb;%RMjbX0EHE*K8{I&X^gJMK(x`jSQ60+GavmnNo^r*>??$q|1u1}<_cT2XW!8bh z)juU3oQI_)ozwxf#^#W;dl15QYM&W-w8RpYSQpEZJE|sZ=S3$ags>-5T2t&(-}dqF z-5Y)bwcc(MNV_|XV~x%ilI}FDhUc;M+>Lh*D3p}##<#>I)OS4_GySl=wc8It-LcE# zpVIEWbbJ^$B{`Sg?mv5P@WgudjSrf~65ITwa0m_QjL;E*fBm)D2aQjAsiF2=+mJWf zpB3I`O${_gijc?Thx2Bfa0B*x0!^>>$pp6Zt@oukihIO4jmFL6_`M~|H@C|a#V&m3 zNK}BGp z3JMQF#Qu}0cwf&ZQaQ`|wx=I5m7WBq4PTiHrE}kX{KPi|!?3&<5U1>aI3gsQu}EZn z9F?UQr2wliWQ&Qtpztls#Y1dUwK?dGFud1QVZ^r>toYdDcto4%j9f>>J@W|CrEem- zW>!qy5@ES$=^N@~Y>IMi`Q_wk`N@*)()lyp2t?@fn(}Rh2+s|ZI|?FWYkCFo90(Q# zF1|N}piwz3ojgj7R!1(+GhYo1mMNp!m8Y)Rue9r9(mEcCw?w#QOA>E}R$akGUSTN2 zUA;+E-x{)=>>OjZmHesVSXTr$uiV_W*gvW`1UJ91gjs;yU@mYja~GGN>PCm}0=Ur&6X)HD~2R7^ukBpA~nDzG)6EmWrvm*9W zbf!{ZJz%w6N6>h0=WYv>^Qu%A3s!4d3-2f)pvw<>L>fFwBSQNQx7dKDb6|mF zp4T<9w1Qb@Ee zJ%$FC)qU{Xk}$#0xCHhUL7{*OR9S9k*e3Y-GHHX8K76^7Ol-Z>`&7SK)>qFM7w1zL 
z7g*^o-0_?6-51-WrPIP1TrH8X+YIWQYG2sCfx+)a^7`)ulkj(-EMRU{w@5wHeCyoB zjnlTzRZ5#<&gE9Q7U z942krn0F9@RnsbDovl`U(T!jOS9rY3b02gX-|7|yL3Lt&SH;Fd#Z!wD(f!IFj~ZS; z1M4>UG0d^bf)?uUY|QM%yy+X|iUtn)m{;#d$VyUjbn)ChQWC#VJD5DzJ)oFoHFC6X z)q}m)W1=>!kTulB%K&mdyZlgza!ulBq6u-_J>es`a$ABtD!-ggwCAfQulXD=NmJFj zNR&1GRr@@5whV-BK@`U|oP+k7hh)dgH^rdC;p_NR@KRNKV0xkC^O+eJt*%->JQ%0I z`A^Ezat_P5-5l2OKW6q#JwP&&g*m%hIlG%`d%IY{jc<=^BpIUG%}o-vj9^nSDQ=rb zCA4b>yj7$T&SiSw>7bt(x**&fu&=&}&gvY4dkk!7+w|M)R3G|Q;Z(y*^+~poeKzhrE!TvaNJd=vBEC!tO zOE&O(g$ACu0oj;Az&o@;p3HuNeXHn{Kpm7(WU62xax$*>99SOg<5UTY&PPBQrXs{M z2(Q~CeZxz%oPfZjy3G7Juh%Qc$dbY3smR9`I1A}S2;rj4krT;h_hA**0d^WRZ_tPy7KyEnK+_@G3pVltuhS+6%>d>Tl`sBTg`T%`QDUOBdU%7r z4caYTknfNn{dyGfyY_zjDB?fuzdxAxGtTb{^sQ3=%YZjmzKQdHbo)Oe{4S>5D%HQt z;U;TtBK)mx{b!WlSNQ*ovUYR#e-q`W4EJZ0-zWV4j8cdBw^4pB|9?jLeXhIR27cM> z%`o?`|MKtM;7{k@yX5WY_sf7c9pDzw{~ZMXbp5@L+_uqQMsY*2- Date: Thu, 18 Jul 2024 15:56:53 -0700 Subject: [PATCH 4/4] io.json: Serialize time and timedelta objects as ISO 8601 too These object types may be produced by our Excel/OpenOffice spreadsheet reader, Calamine, and we want to be able to serialize them without error. I missed adding these in "io.json: Serialize date objects as ISO 8601 too" (e20f920b). --- augur/io/json.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/augur/io/json.py b/augur/io/json.py index 9e41a7ab8..60d98c277 100644 --- a/augur/io/json.py +++ b/augur/io/json.py @@ -32,7 +32,8 @@ SOFTWARE. 
""" import json -from datetime import date, datetime +from datetime import date, datetime, time, timedelta +from isodate import duration_isoformat from typing import Iterable from uuid import UUID @@ -51,6 +52,16 @@ def as_json(value): >>> as_json(datetime(year=2024, month=7, day=17, hour=11, minute=38)) '"2024-07-17T11:38:00"' + :class:`~datetime.time` objects: + + >>> as_json(time(hour=11, minute=38)) + '"11:38:00"' + + :class:`~datetime.timedelta` objects: + + >>> as_json(timedelta(days=42)) + '"P42D"' + and :class:`~uuid.UUID` objects: >>> as_json(UUID(int=147952133113722764103424939352979237618)) @@ -113,11 +124,16 @@ def default(self, value): Serializes: * :class:`~datetime.date` using :meth:`~datetime.date.isoformat()` * :class:`~datetime.datetime` using :meth:`~datetime.datetime.isoformat()` + * :class:`~datetime.time` using :meth:`~datetime.time.isoformat()` + * :class:`~datetime.timedelta` using ``isodate.duration_isoformat()`` * :class:`~uuid.UUID` using ``str()`` """ - if isinstance(value, (date, datetime)): + if isinstance(value, (date, datetime, time)): return value.isoformat() + elif isinstance(value, timedelta): + return duration_isoformat(value) + elif isinstance(value, UUID): return str(value)