Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix/parser fails on comments #73

Merged
merged 5 commits into from
Nov 11, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 22 additions & 21 deletions pdtable/io/parsers/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,9 @@
- The original, raw cell grid, in case the user wants to do some low-level processing.

"""
import itertools
import re
from typing import Sequence, Optional, Tuple, Any, Iterable
from typing import Sequence, Optional, Tuple, Any, Iterable, List, Union

import pandas as pd

Expand Down Expand Up @@ -78,6 +79,17 @@ def default_fixer(**kwargs):
return fixer


def parse_column_names(column_names_raw: Sequence[Union[str, None]]) -> List[str]:
    """Parses column names from the sequence read from file

    Rejects everything after first blank cell, since there can be comments there.
    Strips column names.
    """
    # Collect names until the first blank cell; anything after it is a comment.
    stripped_names: List[str] = []
    for raw_name in column_names_raw:
        if _is_cell_blank(raw_name):
            break
        stripped_names.append(raw_name.strip())
    return stripped_names


def make_table_json_precursor(cells: CellGrid, **kwargs) -> JsonDataPrecursor:
"""Parses cell grid into a JSON-like data structure but with some non-JSON-native values

Expand All @@ -103,11 +115,12 @@ def make_table_json_precursor(cells: CellGrid, **kwargs) -> JsonDataPrecursor:
destinations = {dest: None for dest in cells[1][0].strip().split(" ")}

if transposed:
col_names_raw = [line[0] for line in cells[2:]]
# Column names are in lines' first cell
column_names = parse_column_names([line[0] for line in cells[2:]])
else:
col_names_raw = cells[2]
# handle multiple columns w. same name
column_names = preprocess_column_names(col_names_raw, fixer)
# Column names are on line 2 (zero-based)
column_names = parse_column_names(cells[2])
column_names = _fix_duplicate_column_names(column_names, fixer)

n_col = len(column_names)
if transposed:
Expand Down Expand Up @@ -348,29 +361,17 @@ def parse_blocks(cell_rows: Iterable[Sequence], **kwargs) -> BlockIterator:
yield block_type, block


def preprocess_column_names(col_names_raw: Sequence[str], fixer: ParseFixer):
"""
handle known issues in column_names
"""
n_names_col = len(col_names_raw)
for el in reversed(col_names_raw):
if el is not None and len(el) > 0:
break
n_names_col -= 1

# handle multiple columns w. same name
def _fix_duplicate_column_names(col_names_raw: Sequence[str], fixer: ParseFixer):
"""Finds duplicate column names and sends them to ParseFixer for fixing."""
column_names = []
cnames_all = [el.strip() for el in col_names_raw[:n_names_col]]
names = {}
for col, cname in enumerate(cnames_all):
for col, cname in enumerate(col_names_raw):
if cname not in names and len(cname) > 0:
names[cname] = 0
column_names.append(cname)
else:
fixer.column_name = col
if len(cname) == 0:
cname = fixer.fix_missing_column_name(input_columns=column_names)
elif cname in names:
if cname in names:
cname = fixer.fix_duplicate_column_name(cname, input_columns=column_names)
assert cname not in names
names[cname] = 0
Expand Down
12 changes: 0 additions & 12 deletions pdtable/io/parsers/fixer.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,18 +85,6 @@ def fix_duplicate_column_name(self, column_name: str, input_columns: List[str])

return "{column_name}-fixed"

def fix_missing_column_name(self, input_columns: List[str]) -> str:
"""
The column_name: self.TableColumn is empty
This method should provide a unique replacement name
"""
if self.verbose:
print(
f"ParseFixer: fix missing column ({self.column_name}) {input_columns} "
f"in table: {self.table_name}"
)
return self.fix_duplicate_column_name("missing", input_columns)

def fix_missing_rows_in_column_data(
self, row: int, row_data: List[str], num_columns: int
) -> List[str]:
Expand Down
14 changes: 0 additions & 14 deletions pdtable/test/io/input/with_errors/all.csv
Original file line number Diff line number Diff line change
Expand Up @@ -13,20 +13,6 @@ goose; 2; 9; -; 9.1;

# error/handle: multiple columns w. same name

**farm_cols2;;;
your_farm my_farm farms_galore;;;
species; num; ; dt; flt;
text; -; kg; datetime; kg;
chicken; 2; 3; 2020-07-01; 3.21;
pig; 4; 39; 2020-07-02; 39.1;
goat; 4; -; -; 1.1;
zybra; 4; -; -; 2.1;
cow; NaN; 200; -; 200.2;
goose; 2; 9; -; 9.1;
1234; -; -; -; 7.11;

# error/handle: missing column name

Author:;JEACO
;;;
***RevisionHistory;;;
Expand Down
70 changes: 0 additions & 70 deletions pdtable/test/io/input/with_errors/all.json
Original file line number Diff line number Diff line change
Expand Up @@ -81,76 +81,6 @@
"farms_galore": null
}
},
"cols2.csv": {
"name": "farm_cols2",
"columns": {
"species": {
"unit": "text",
"values": [
"chicken",
"pig",
"goat",
"zybra",
"cow",
"goose",
"1234"
]
},
"num": {
"unit": "-",
"values": [
2.0,
4.0,
4.0,
4.0,
null,
2.0,
null
]
},
"missing_fixed_000": {
"unit": "kg",
"values": [
3.0,
39.0,
null,
null,
200.0,
9.0,
null
]
},
"dt": {
"unit": "datetime",
"values": [
"2020-07-01 00:00:00",
"2020-07-02 00:00:00",
null,
null,
null,
null,
null
]
},
"flt": {
"unit": "kg",
"values": [
3.21,
39.1,
1.1,
2.1,
200.2,
9.1,
7.11
]
}
},
"destinations": {
"your_farm": null,
"my_farm": null,
"farms_galore": null
}
},
"ex0.csv": {
"name": "farm_animals0",
"columns": {
Expand Down
13 changes: 0 additions & 13 deletions pdtable/test/io/input/with_errors/auto_fixed.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,19 +34,6 @@
goose;2.0;9.0;-;9.1;6.5
1234;-;-;-;7.11;7.6
""",
"cols2.csv": """
**farm_cols2;
your_farm my_farm farms_galore
species;num;missing_fixed_000;dt;flt
text;-;kg;datetime;kg
chicken;2.0;3.0;2020-07-01 00:00:00;3.21
pig;4.0;39.0;2020-07-02 00:00:00;39.1
goat;4.0;-;-;1.1
zybra;4.0;-;-;2.1
cow;-;200.0;-;200.2
goose;2.0;9.0;-;9.1
1234;-;-;-;7.11
""",
"ex0.csv": """
**farm_animals0;
your_farm my_farm farms_galore
Expand Down
14 changes: 0 additions & 14 deletions pdtable/test/io/input/with_errors/cols2.csv

This file was deleted.

18 changes: 18 additions & 0 deletions pdtable/test/io/test_read_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,3 +122,21 @@ def test_read_csv__reads_transposed_tables_with_arbitrary_trailing_csv_delimiter
assert len(t0.df) == 1
for t in tables:
assert t.equals(t0)


def test_read_csv__successfully_ignores_comments_on_column_name_row():
    # Column-name row carries trailing junk after a blank cell; the parser
    # must keep only the four real names.
    csv_text = dedent(
        """\
        **places;
        all
        place;distance;ETA;is_hot;;;; --> this is a perfectly legal comment <-- ;
        text;km;datetime;onoff
        home;0.0;2020-08-04 08:00:00;1
        work;1.0;2020-08-04 09:00:00;0
        beach;2.0;2020-08-04 17:00:00;1
        """
    )
    blocks = list(read_csv(io.StringIO(csv_text)))
    tables = [block for block_type, block in blocks if block_type == BlockType.TABLE]
    first_table: Table = tables[0]
    assert first_table.column_names == ["place", "distance", "ETA", "is_hot"]
18 changes: 0 additions & 18 deletions pdtable/test/io/test_read_csv_fixer.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,24 +42,6 @@ def test_columns_duplicate():
assert tab.df["flt"][0] == 3.0


def test_columns_missing():
"""
Verify that default ParseFixer corrects missing column name

"""
tab = None
with open(input_dir() / "cols2.csv", "r") as fh:
g = read_csv(fh, fixer=custom_test_fixer)
for tp, tt in g:
if True:
if tp == BlockType.TABLE:
tab = tt
break
assert tab is not None
assert tab.df["missing_fixed_000"] is not None
assert tab.df["flt"][6] == 7.11


def test_custom_fixer():
""" Test custom ParseFixer
Verify that read_csv uses custom ParseFixer
Expand Down