class EncodingException(Exception):
    """Raised when the input starts with a non-ASCII character (e.g. an undecoded BOM)."""


def check_encoding(cell_rows: Iterable[Sequence]) -> Iterable[Sequence]:
    """
    Yield the rows of ``cell_rows`` unchanged, after checking the very first cell.

    A CSV file can have a BOM character at the start. Reading such a file with
    a default encoding does not raise an error by itself, but the first line
    is then ignored (and with it the whole table, if the file starts with a
    table block). To surface that situation, the first character of the first
    cell is inspected and an exception is raised if it is not ASCII.

    This is a generator, so the check runs when the first row is requested,
    not when ``check_encoding`` is called.

    Args:
        cell_rows: Any iterable of rows (iterator, generator, list, ...).
            Each row is a sequence of cells, or ``None``.

    Yields:
        Every row of ``cell_rows``, in order and unmodified. Nothing is
        yielded for empty input.

    Raises:
        EncodingException: If the first cell of the first row is a non-empty
            string whose first character is outside the ASCII range.
    """
    # Accept any iterable, not only iterators: next() on e.g. a list raises TypeError.
    rows = iter(cell_rows)

    try:
        first_cell_row = next(rows)
    except StopIteration:
        # Empty input. A StopIteration escaping a generator body would be
        # converted into RuntimeError (PEP 479), so end the generator cleanly.
        return

    if first_cell_row is not None and len(first_cell_row) > 0:
        first_cell = first_cell_row[0]

        # Only text cells can carry a BOM; cells may also be None or numbers,
        # for which len() / indexing would fail.
        if isinstance(first_cell, str) and len(first_cell) > 0:
            first_sign = first_cell[0]

            # Code points above 127 are exactly those that cannot be encoded as ASCII.
            if ord(first_sign) > 127:
                raise EncodingException(
                    f'File starts with no ascii character "{first_sign}". '
                    'Please verify the file encoding.'
                )

    yield first_cell_row
    yield from rows
Intended indexing: cell_grid[row][col] CellGrid = Sequence[Sequence] @@ -451,7 +479,8 @@ def block_output(block_type, cell_grid, row: int): state = BlockType.METADATA next_state = None this_block_1st_row = 0 - for row_number_0based, row in enumerate(cell_rows): + + for row_number_0based, row in enumerate(check_encoding(cell_rows)): if row is None or len(row) == 0 or _is_cell_blank(row[0]): if state != BlockType.BLANK: next_state = BlockType.BLANK diff --git a/pdtable/test/io/input/only_tables.csv b/pdtable/test/io/input/only_tables.csv new file mode 100644 index 0000000..1e485e6 --- /dev/null +++ b/pdtable/test/io/input/only_tables.csv @@ -0,0 +1,13 @@ +**generic_inf;;;;;;;;;;;;; +all;;;;;;;;;;;;; +FATIMA_alias;node;constraint_alias;symmetry;sn_curve;sectional_force_modification;pristrco;signco;alpha;cutpoint_tol;file_name;transformation;IO;detail_type +text;text;text;text;text;text;text;-;-;mm;text;text;text;- +C00001;B0C066;C00001;rotate;F3;-;0;3;0.8;2000;..\..\..\inputs\INF\J_tube\CHW2204_INF_Swan_Neck_a30_root_V2.txt;;I;1 +;;;;;;;;;;;;; +;;;;;;;;;;;;; +**generic_inf_constraints;;;;;;;;;;;;; +all;;;;;;;;;;;;; +constraint_alias;element;symmetry;cut_point_name;node;cut_distance;;;;;;;; +text;text;text;text;text;m;;;;;;;; +C00001;C660L;rotate;BRACE1;B0C066;3.091;;;;;;;; +C00001;CJT1V;rotate;BRACE2;B0C066;1.5319;;;;;;;; diff --git a/pdtable/test/io/test_csv.py b/pdtable/test/io/test_csv.py index d0969d8..a2fcc09 100644 --- a/pdtable/test/io/test_csv.py +++ b/pdtable/test/io/test_csv.py @@ -5,10 +5,12 @@ from pytest import fixture, raises import pandas as pd +import pytest import pdtable from pdtable import Table, BlockType, read_csv, write_csv from pdtable.io.csv import _table_to_csv +from pdtable.io.parsers.blocks import EncodingException from pdtable.table_metadata import ColumnFormat @@ -417,3 +419,15 @@ def test__table_is_preserved_when_written_to_and_read_from_csv(): assert table_read.column_names == table_write.column_names assert table_read.units == 
def test_read_csv_starting_with_bom():
    """Reading a BOM-prefixed CSV must fail loudly unless it is opened with a BOM-aware encoding."""
    only_tables_path = Path(__file__).parent / "input" / "only_tables.csv"

    # Passing the path directly is expected to raise instead of silently dropping the first table.
    with pytest.raises(EncodingException):
        list(read_csv(source=only_tables_path))

    # Opening with 'utf-8-sig' strips the BOM. The context manager guarantees the
    # handle is closed even if parsing or an assertion fails.
    with open(only_tables_path, mode='r', encoding='utf-8-sig') as source:
        bls = list(read_csv(source=source))

    tables = [bl for ty, bl in bls if ty == BlockType.TABLE]
    assert tables[0].name == "generic_inf"