Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix/parser fails on comments #73

Merged
merged 5 commits into from
Nov 11, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 22 additions & 21 deletions pdtable/io/parsers/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,9 @@
- The original, raw cell grid, in case the user wants to do some low-level processing.

"""
import itertools
import re
from typing import Sequence, Optional, Tuple, Any, Iterable
from typing import Sequence, Optional, Tuple, Any, Iterable, List, Union

import pandas as pd

Expand Down Expand Up @@ -78,6 +79,17 @@ def default_fixer(**kwargs):
return fixer


def parse_column_names(column_names_raw: Sequence[Union[str, None]]) -> List[str]:
    """Parses column names from the sequence read from file

    Rejects everything after first blank cell, since there can be comments there.
    Strips column names.
    """
    # Collect names until the first blank cell; anything after it is a comment.
    stripped_names: List[str] = []
    for raw_name in column_names_raw:
        if _is_cell_blank(raw_name):
            break
        stripped_names.append(raw_name.strip())
    return stripped_names


def make_table_json_precursor(cells: CellGrid, **kwargs) -> JsonDataPrecursor:
"""Parses cell grid into a JSON-like data structure but with some non-JSON-native values

Expand All @@ -103,11 +115,12 @@ def make_table_json_precursor(cells: CellGrid, **kwargs) -> JsonDataPrecursor:
destinations = {dest: None for dest in cells[1][0].strip().split(" ")}

if transposed:
col_names_raw = [line[0] for line in cells[2:]]
# Column names are in lines' first cell
column_names = parse_column_names([line[0] for line in cells[2:]])
else:
col_names_raw = cells[2]
# handle multiple columns w. same name
column_names = preprocess_column_names(col_names_raw, fixer)
# Column names are on line 2 (zero-based)
column_names = parse_column_names(cells[2])
column_names = _fix_duplicate_column_names(column_names, fixer)

n_col = len(column_names)
if transposed:
Expand Down Expand Up @@ -348,29 +361,17 @@ def parse_blocks(cell_rows: Iterable[Sequence], **kwargs) -> BlockIterator:
yield block_type, block


def preprocess_column_names(col_names_raw: Sequence[str], fixer: ParseFixer):
"""
handle known issues in column_names
"""
n_names_col = len(col_names_raw)
for el in reversed(col_names_raw):
if el is not None and len(el) > 0:
break
n_names_col -= 1

# handle multiple columns w. same name
def _fix_duplicate_column_names(col_names_raw: Sequence[str], fixer: ParseFixer):
"""Finds duplicate column names and sends them to ParseFixer for fixing."""
column_names = []
cnames_all = [el.strip() for el in col_names_raw[:n_names_col]]
names = {}
for col, cname in enumerate(cnames_all):
for col, cname in enumerate(col_names_raw):
if cname not in names and len(cname) > 0:
names[cname] = 0
column_names.append(cname)
else:
fixer.column_name = col
if len(cname) == 0:
cname = fixer.fix_missing_column_name(input_columns=column_names)
elif cname in names:
if cname in names:
cname = fixer.fix_duplicate_column_name(cname, input_columns=column_names)
assert cname not in names
names[cname] = 0
Expand Down
12 changes: 0 additions & 12 deletions pdtable/io/parsers/fixer.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,18 +85,6 @@ def fix_duplicate_column_name(self, column_name: str, input_columns: List[str])

return "{column_name}-fixed"

def fix_missing_column_name(self, input_columns: List[str]) -> str:
"""
The column_name: self.TableColumn is empty
This method should provide a unique replacement name
"""
if self.verbose:
print(
f"ParseFixer: fix missing column ({self.column_name}) {input_columns} "
f"in table: {self.table_name}"
)
return self.fix_duplicate_column_name("missing", input_columns)

def fix_missing_rows_in_column_data(
self, row: int, row_data: List[str], num_columns: int
) -> List[str]:
Expand Down
14 changes: 0 additions & 14 deletions pdtable/test/io/input/with_errors/all.csv
Original file line number Diff line number Diff line change
Expand Up @@ -13,20 +13,6 @@ goose; 2; 9; -; 9.1;

# error/handle: multiple columns w. same name

**farm_cols2;;;
your_farm my_farm farms_galore;;;
species; num; ; dt; flt;
text; -; kg; datetime; kg;
chicken; 2; 3; 2020-07-01; 3.21;
pig; 4; 39; 2020-07-02; 39.1;
goat; 4; -; -; 1.1;
zybra; 4; -; -; 2.1;
cow; NaN; 200; -; 200.2;
goose; 2; 9; -; 9.1;
1234; -; -; -; 7.11;

# error/handle: missing column name

Author:;JEACO
;;;
***RevisionHistory;;;
Expand Down
70 changes: 0 additions & 70 deletions pdtable/test/io/input/with_errors/all.json
Original file line number Diff line number Diff line change
Expand Up @@ -81,76 +81,6 @@
"farms_galore": null
}
},
"cols2.csv": {
"name": "farm_cols2",
"columns": {
"species": {
"unit": "text",
"values": [
"chicken",
"pig",
"goat",
"zybra",
"cow",
"goose",
"1234"
]
},
"num": {
"unit": "-",
"values": [
2.0,
4.0,
4.0,
4.0,
null,
2.0,
null
]
},
"missing_fixed_000": {
"unit": "kg",
"values": [
3.0,
39.0,
null,
null,
200.0,
9.0,
null
]
},
"dt": {
"unit": "datetime",
"values": [
"2020-07-01 00:00:00",
"2020-07-02 00:00:00",
null,
null,
null,
null,
null
]
},
"flt": {
"unit": "kg",
"values": [
3.21,
39.1,
1.1,
2.1,
200.2,
9.1,
7.11
]
}
},
"destinations": {
"your_farm": null,
"my_farm": null,
"farms_galore": null
}
},
"ex0.csv": {
"name": "farm_animals0",
"columns": {
Expand Down
13 changes: 0 additions & 13 deletions pdtable/test/io/input/with_errors/auto_fixed.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,19 +34,6 @@
goose;2.0;9.0;-;9.1;6.5
1234;-;-;-;7.11;7.6
""",
"cols2.csv": """
**farm_cols2;
your_farm my_farm farms_galore
species;num;missing_fixed_000;dt;flt
text;-;kg;datetime;kg
chicken;2.0;3.0;2020-07-01 00:00:00;3.21
pig;4.0;39.0;2020-07-02 00:00:00;39.1
goat;4.0;-;-;1.1
zybra;4.0;-;-;2.1
cow;-;200.0;-;200.2
goose;2.0;9.0;-;9.1
1234;-;-;-;7.11
""",
"ex0.csv": """
**farm_animals0;
your_farm my_farm farms_galore
Expand Down
14 changes: 0 additions & 14 deletions pdtable/test/io/input/with_errors/cols2.csv

This file was deleted.

18 changes: 18 additions & 0 deletions pdtable/test/io/test_read_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,3 +122,21 @@ def test_read_csv__reads_transposed_tables_with_arbitrary_trailing_csv_delimiter
assert len(t0.df) == 1
for t in tables:
assert t.equals(t0)


def test_read_csv__successfully_ignores_comments_on_column_name_row():
    # Column-name row carries trailing junk after a blank cell; the parser
    # must keep only the four real names.
    csv_text = dedent(
        """\
        **places;
        all
        place;distance;ETA;is_hot;;;; --> this is a perfectly legal comment <-- ;
        text;km;datetime;onoff
        home;0.0;2020-08-04 08:00:00;1
        work;1.0;2020-08-04 09:00:00;0
        beach;2.0;2020-08-04 17:00:00;1
        """
    )
    blocks = list(read_csv(io.StringIO(csv_text)))
    tables = [block for block_type, block in blocks if block_type == BlockType.TABLE]
    first_table: Table = tables[0]
    assert first_table.column_names == ["place", "distance", "ETA", "is_hot"]
18 changes: 0 additions & 18 deletions pdtable/test/io/test_read_csv_fixer.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,24 +42,6 @@ def test_columns_duplicate():
assert tab.df["flt"][0] == 3.0


def test_columns_missing():
"""
Verify that default ParseFixer corrects missing column name

"""
tab = None
with open(input_dir() / "cols2.csv", "r") as fh:
g = read_csv(fh, fixer=custom_test_fixer)
for tp, tt in g:
if True:
if tp == BlockType.TABLE:
tab = tt
break
assert tab is not None
assert tab.df["missing_fixed_000"] is not None
assert tab.df["flt"][6] == 7.11


def test_custom_fixer():
""" Test custom ParseFixer
Verify that read_csv uses custom ParseFixer
Expand Down