diff --git a/pdtable/io/parsers/blocks.py b/pdtable/io/parsers/blocks.py index 9e952e0..174fd5e 100644 --- a/pdtable/io/parsers/blocks.py +++ b/pdtable/io/parsers/blocks.py @@ -24,8 +24,9 @@ - The original, raw cell grid, in case the user wants to do some low-level processing. """ +import itertools import re -from typing import Sequence, Optional, Tuple, Any, Iterable +from typing import Sequence, Optional, Tuple, Any, Iterable, List, Union import pandas as pd @@ -78,6 +79,17 @@ def default_fixer(**kwargs): return fixer +def parse_column_names(column_names_raw: Sequence[Union[str, None]]) -> List[str]: + """Parses column names from the sequence read from file + + Rejects everything after first blank cell, since there can be comments there. + Strips column names. + """ + return [ + c.strip() for c in itertools.takewhile(lambda x: not _is_cell_blank(x), column_names_raw) + ] + + def make_table_json_precursor(cells: CellGrid, **kwargs) -> JsonDataPrecursor: """Parses cell grid into a JSON-like data structure but with some non-JSON-native values @@ -103,11 +115,12 @@ def make_table_json_precursor(cells: CellGrid, **kwargs) -> JsonDataPrecursor: destinations = {dest: None for dest in cells[1][0].strip().split(" ")} if transposed: - col_names_raw = [line[0] for line in cells[2:]] + # Column names are in lines' first cell + column_names = parse_column_names([line[0] for line in cells[2:]]) else: - col_names_raw = cells[2] - # handle multiple columns w. same name - column_names = preprocess_column_names(col_names_raw, fixer) + # Column names are on line 2 (zero-based) + column_names = parse_column_names(cells[2]) + column_names = _fix_duplicate_column_names(column_names, fixer) n_col = len(column_names) if transposed: @@ -348,29 +361,17 @@ def parse_blocks(cell_rows: Iterable[Sequence], **kwargs) -> BlockIterator: yield block_type, block -def preprocess_column_names(col_names_raw: Sequence[str], fixer: ParseFixer): - """ - handle known issues in column_names - """ - n_names_col = len(col_names_raw) - for el in reversed(col_names_raw): - if el is not None and len(el) > 0: - break - n_names_col -= 1 - - # handle multiple columns w. same name +def _fix_duplicate_column_names(col_names_raw: Sequence[str], fixer: ParseFixer): + """Finds duplicate column names and sends them to ParseFixer for fixing.""" column_names = [] - cnames_all = [el.strip() for el in col_names_raw[:n_names_col]] names = {} - for col, cname in enumerate(cnames_all): + for col, cname in enumerate(col_names_raw): if cname not in names and len(cname) > 0: names[cname] = 0 column_names.append(cname) else: fixer.column_name = col - if len(cname) == 0: - cname = fixer.fix_missing_column_name(input_columns=column_names) - elif cname in names: + if cname in names: cname = fixer.fix_duplicate_column_name(cname, input_columns=column_names) assert cname not in names names[cname] = 0 diff --git a/pdtable/io/parsers/fixer.py b/pdtable/io/parsers/fixer.py index ef97eaf..c2b3466 100644 --- a/pdtable/io/parsers/fixer.py +++ b/pdtable/io/parsers/fixer.py @@ -85,18 +85,6 @@ def fix_duplicate_column_name(self, column_name: str, input_columns: List[str]) return "{column_name}-fixed" - def fix_missing_column_name(self, input_columns: List[str]) -> str: - """ - The column_name: self.TableColumn is empty - This method should provide a unique replacement name - """ - if self.verbose: - print( - f"ParseFixer: fix missing column ({self.column_name}) {input_columns} " - f"in table: {self.table_name}" - ) - return self.fix_duplicate_column_name("missing", input_columns) - def fix_missing_rows_in_column_data( self, row: int, row_data: List[str], num_columns: int ) -> List[str]: diff --git a/pdtable/test/io/input/with_errors/all.csv b/pdtable/test/io/input/with_errors/all.csv index c9686e1..a4e249b 100644 --- a/pdtable/test/io/input/with_errors/all.csv +++ b/pdtable/test/io/input/with_errors/all.csv @@ -13,20 +13,6 @@ goose; 2; 9; -; 9.1; # error/handle: multiple columns w. same name -**farm_cols2;;; -your_farm my_farm farms_galore;;; -species; num; ; dt; flt; -text; -; kg; datetime; kg; -chicken; 2; 3; 2020-07-01; 3.21; -pig; 4; 39; 2020-07-02; 39.1; -goat; 4; -; -; 1.1; -zybra; 4; -; -; 2.1; -cow; NaN; 200; -; 200.2; -goose; 2; 9; -; 9.1; -1234; -; -; -; 7.11; - -# error/handle: missing column name - Author:;JEACO ;;; ***RevisionHistory;;; diff --git a/pdtable/test/io/input/with_errors/all.json b/pdtable/test/io/input/with_errors/all.json index 3aecb50..dc0cf10 100644 --- a/pdtable/test/io/input/with_errors/all.json +++ b/pdtable/test/io/input/with_errors/all.json @@ -81,76 +81,6 @@ "farms_galore": null } }, - "cols2.csv": { - "name": "farm_cols2", - "columns": { - "species": { - "unit": "text", - "values": [ - "chicken", - "pig", - "goat", - "zybra", - "cow", - "goose", - "1234" - ] - }, - "num": { - "unit": "-", - "values": [ - 2.0, - 4.0, - 4.0, - 4.0, - null, - 2.0, - null - ] - }, - "missing_fixed_000": { - "unit": "kg", - "values": [ - 3.0, - 39.0, - null, - null, - 200.0, - 9.0, - null - ] - }, - "dt": { - "unit": "datetime", - "values": [ - "2020-07-01 00:00:00", - "2020-07-02 00:00:00", - null, - null, - null, - null, - null - ] - }, - "flt": { - "unit": "kg", - "values": [ - 3.21, - 39.1, - 1.1, - 2.1, - 200.2, - 9.1, - 7.11 - ] - } - }, - "destinations": { - "your_farm": null, - "my_farm": null, - "farms_galore": null - } - }, "ex0.csv": { "name": "farm_animals0", "columns": { diff --git a/pdtable/test/io/input/with_errors/auto_fixed.py b/pdtable/test/io/input/with_errors/auto_fixed.py index eda6640..7e265a0 100644 --- a/pdtable/test/io/input/with_errors/auto_fixed.py +++ b/pdtable/test/io/input/with_errors/auto_fixed.py @@ -34,19 +34,6 @@ goose;2.0;9.0;-;9.1;6.5 1234;-;-;-;7.11;7.6 """, - "cols2.csv": """ - **farm_cols2; - your_farm my_farm farms_galore - species;num;missing_fixed_000;dt;flt - text;-;kg;datetime;kg - chicken;2.0;3.0;2020-07-01 00:00:00;3.21 - pig;4.0;39.0;2020-07-02 00:00:00;39.1 - goat;4.0;-;-;1.1 - zybra;4.0;-;-;2.1 - cow;-;200.0;-;200.2 - goose;2.0;9.0;-;9.1 - 1234;-;-;-;7.11 - """, "ex0.csv": """ **farm_animals0; your_farm my_farm farms_galore diff --git a/pdtable/test/io/input/with_errors/cols2.csv b/pdtable/test/io/input/with_errors/cols2.csv deleted file mode 100644 index a8f9fbf..0000000 --- a/pdtable/test/io/input/with_errors/cols2.csv +++ /dev/null @@ -1,14 +0,0 @@ - -**farm_cols2;;; -your_farm my_farm farms_galore;;; -species; num; ; dt; flt; -text; -; kg; datetime; kg; -chicken; 2; 3; 2020-07-01; 3.21; -pig; 4; 39; 2020-07-02; 39.1; -goat; 4; -; -; 1.1; -zybra; 4; -; -; 2.1; -cow; NaN; 200; -; 200.2; -goose; 2; 9; -; 9.1; -1234; -; -; -; 7.11; - -# error/handle: missing column name diff --git a/pdtable/test/io/test_read_csv.py b/pdtable/test/io/test_read_csv.py index d49441b..136b116 100644 --- a/pdtable/test/io/test_read_csv.py +++ b/pdtable/test/io/test_read_csv.py @@ -122,3 +122,21 @@ def test_read_csv__reads_transposed_tables_with_arbitrary_trailing_csv_delimiter assert len(t0.df) == 1 for t in tables: assert t.equals(t0) + + +def test_read_csv__successfully_ignores_comments_on_column_name_row(): + csv_data_transposed_tables = dedent( + """\ + **places; + all + place;distance;ETA;is_hot;;;; --> this is a perfectly legal comment <-- ; + text;km;datetime;onoff + home;0.0;2020-08-04 08:00:00;1 + work;1.0;2020-08-04 09:00:00;0 + beach;2.0;2020-08-04 17:00:00;1 + """ + ) + bl = list(read_csv(io.StringIO(csv_data_transposed_tables))) + tables: List[Table] = [b for t, b in bl if t == BlockType.TABLE] + t0: Table = tables[0] + assert t0.column_names == ["place", "distance", "ETA", "is_hot"] diff --git a/pdtable/test/io/test_read_csv_fixer.py b/pdtable/test/io/test_read_csv_fixer.py index f4834b4..1d332d7 100644 --- a/pdtable/test/io/test_read_csv_fixer.py +++ b/pdtable/test/io/test_read_csv_fixer.py @@ -42,24 +42,6 @@ def test_columns_duplicate(): assert tab.df["flt"][0] == 3.0 -def test_columns_missing(): - """ - Verify that default ParseFixer corrects missing column name - - """ - tab = None - with open(input_dir() / "cols2.csv", "r") as fh: - g = read_csv(fh, fixer=custom_test_fixer) - for tp, tt in g: - if True: - if tp == BlockType.TABLE: - tab = tt - break - assert tab is not None - assert tab.df["missing_fixed_000"] is not None - assert tab.df["flt"][6] == 7.11 - - def test_custom_fixer(): """ Test custom ParseFixer Verify that read_csv uses custom ParseFixer