Skip to content

Commit

Permalink
init
Browse files Browse the repository at this point in the history
  • Loading branch information
Jan Bielecki committed Mar 29, 2024
1 parent ac7b391 commit de8678c
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 4 deletions.
37 changes: 33 additions & 4 deletions pdtable/io/parsers/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,9 @@
- The original, raw cell grid, in case the user wants to do some low-level processing.
"""
from abc import abstractmethod
import itertools
import re
from typing import Sequence, Optional, Tuple, Any, Iterable, List, Union, Dict
from collections import defaultdict
import pandas as pd
import warnings

Expand All @@ -39,7 +37,6 @@
LocationSheet,
NullLocationFile,
TableOrigin,
InputIssue,
InputIssueTracker,
NullInputIssueTracker,
)
Expand All @@ -49,6 +46,37 @@
from ...auxiliary import MetadataBlock, Directive
from ...table_metadata import TableMetadata


class EncodingException(Exception):
    """Raised when file content appears to have been read with the wrong encoding."""


def check_encoding(cell_rows: Iterable[Sequence]) -> Iterable[Sequence]:
    """Pass rows through unchanged, verifying the file was read with a correct encoding.

    A CSV file can start with a BOM character. Reading such a file with the
    default encoding does not raise an error, but leaves the BOM in the first
    cell, so the first line (and the whole table, if the file starts with a
    table block) would be silently ignored by the block parser. This generator
    inspects the very first character of the first cell: if it is not plain
    ASCII, the file was most likely decoded with the wrong encoding.

    Args:
        cell_rows: Iterable of cell rows (each row a sequence of cells, or None).

    Yields:
        The rows of ``cell_rows``, unchanged.

    Raises:
        EncodingException: If the first character of the first cell is not ASCII.
    """
    rows = iter(cell_rows)  # accept any iterable, not only iterators

    # Sentinel distinguishes "no rows at all" from a first row that is None
    # (blank rows are represented as None/empty by upstream readers).
    _missing = object()
    first_cell_row = next(rows, _missing)
    if first_cell_row is _missing:
        # Empty input: nothing to check, nothing to yield.
        # (A bare next() here would raise StopIteration, which PEP 479 turns
        # into a RuntimeError inside a generator.)
        return

    if first_cell_row is not None and len(first_cell_row) > 0 and len(first_cell_row[0]) > 0:
        first_char = first_cell_row[0][0]
        if not first_char.isascii():
            raise EncodingException(
                f'File starts with non-ASCII character "{first_char}". '
                'Please verify the file encoding.'
            )

    yield first_cell_row
    yield from rows


# Typing alias: 2D grid of cells with rows and cols. Intended indexing: cell_grid[row][col]
CellGrid = Sequence[Sequence]

Expand Down Expand Up @@ -451,7 +479,8 @@ def block_output(block_type, cell_grid, row: int):
state = BlockType.METADATA
next_state = None
this_block_1st_row = 0
for row_number_0based, row in enumerate(cell_rows):

for row_number_0based, row in enumerate(check_encoding(cell_rows)):
if row is None or len(row) == 0 or _is_cell_blank(row[0]):
if state != BlockType.BLANK:
next_state = BlockType.BLANK
Expand Down
13 changes: 13 additions & 0 deletions pdtable/test/io/input/only_tables.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
**generic_inf;;;;;;;;;;;;;
all;;;;;;;;;;;;;
FATIMA_alias;node;constraint_alias;symmetry;sn_curve;sectional_force_modification;pristrco;signco;alpha;cutpoint_tol;file_name;transformation;IO;detail_type
text;text;text;text;text;text;text;-;-;mm;text;text;text;-
C00001;B0C066;C00001;rotate;F3;-;0;3;0.8;2000;..\..\..\inputs\INF\J_tube\CHW2204_INF_Swan_Neck_a30_root_V2.txt;;I;1
;;;;;;;;;;;;;
;;;;;;;;;;;;;
**generic_inf_constraints;;;;;;;;;;;;;
all;;;;;;;;;;;;;
constraint_alias;element;symmetry;cut_point_name;node;cut_distance;;;;;;;;
text;text;text;text;text;m;;;;;;;;
C00001;C660L;rotate;BRACE1;B0C066;3.091;;;;;;;;
C00001;CJT1V;rotate;BRACE2;B0C066;1.5319;;;;;;;;
14 changes: 14 additions & 0 deletions pdtable/test/io/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@

from pytest import fixture, raises
import pandas as pd
import pytest

import pdtable
from pdtable import Table, BlockType, read_csv, write_csv
from pdtable.io.csv import _table_to_csv
from pdtable.io.parsers.blocks import EncodingException
from pdtable.table_metadata import ColumnFormat


Expand Down Expand Up @@ -417,3 +419,15 @@ def test__table_is_preserved_when_written_to_and_read_from_csv():
assert table_read.column_names == table_write.column_names
assert table_read.units == table_write.units
assert table_read.destinations == table_write.destinations


def test_read_csv_starting_with_bom():
    """Reading a BOM-prefixed CSV with the default encoding must fail loudly;
    reading it with 'utf-8-sig' must parse the leading table correctly."""
    only_tables_path = Path(__file__).parent / "input" / "only_tables.csv"

    # Default encoding leaves the BOM in the first cell -> detected and rejected.
    with pytest.raises(EncodingException):
        list(read_csv(source=only_tables_path))

    # 'utf-8-sig' strips the BOM, so the first table block is parsed intact.
    # Use a context manager so the file handle is always closed (the original
    # left it open, which e.g. locks the fixture file on Windows).
    with open(only_tables_path, mode='r', encoding='utf-8-sig') as source:
        bls = list(read_csv(source=source))
    tables = [bl for ty, bl in bls if ty == BlockType.TABLE]
    assert tables[0].name == "generic_inf"

0 comments on commit de8678c

Please sign in to comment.