Skip to content

Commit

Permalink
init
Browse files Browse the repository at this point in the history
  • Loading branch information
Jan Bielecki committed Mar 29, 2024
1 parent ac7b391 commit de8678c
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 4 deletions.
37 changes: 33 additions & 4 deletions pdtable/io/parsers/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,9 @@
- The original, raw cell grid, in case the user wants to do some low-level processing.
"""
from abc import abstractmethod
import itertools
import re
from typing import Sequence, Optional, Tuple, Any, Iterable, List, Union, Dict
from collections import defaultdict
import pandas as pd
import warnings

Expand All @@ -39,7 +37,6 @@
LocationSheet,
NullLocationFile,
TableOrigin,
InputIssue,
InputIssueTracker,
NullInputIssueTracker,
)
Expand All @@ -49,6 +46,37 @@
from ...auxiliary import MetadataBlock, Directive
from ...table_metadata import TableMetadata


class EncodingException(Exception):
    """Raised when file content appears to have been read with the wrong encoding."""


def check_encoding(cell_rows: Iterable[Sequence]) -> Iterable[Sequence]:
    """Pass rows through unchanged, verifying the file was read with a correct encoding.

    A CSV file can start with a BOM character. Reading such a file with the
    default encoding does not raise an error, but leaves the BOM in the first
    cell, so the first line (and the whole table, if the file starts with a
    table block) would be silently ignored by the block parser. This generator
    inspects the very first character of the first cell: if it is not plain
    ASCII, the file was most likely decoded with the wrong encoding.

    Args:
        cell_rows: Iterable of cell rows (each row a sequence of cells, or None).

    Yields:
        The rows of ``cell_rows``, unchanged.

    Raises:
        EncodingException: If the first character of the first cell is not ASCII.
    """
    rows = iter(cell_rows)  # accept any iterable, not only iterators

    # Sentinel distinguishes "no rows at all" from a first row that is None
    # (blank rows are represented as None/empty by upstream readers).
    _missing = object()
    first_cell_row = next(rows, _missing)
    if first_cell_row is _missing:
        # Empty input: nothing to check, nothing to yield.
        # (A bare next() here would raise StopIteration, which PEP 479 turns
        # into a RuntimeError inside a generator.)
        return

    if first_cell_row is not None and len(first_cell_row) > 0 and len(first_cell_row[0]) > 0:
        first_char = first_cell_row[0][0]
        if not first_char.isascii():
            raise EncodingException(
                f'File starts with non-ASCII character "{first_char}". '
                'Please verify the file encoding.'
            )

    yield first_cell_row
    yield from rows


# Typing alias: 2D grid of cells with rows and cols. Intended indexing: cell_grid[row][col]
CellGrid = Sequence[Sequence]

Expand Down Expand Up @@ -451,7 +479,8 @@ def block_output(block_type, cell_grid, row: int):
state = BlockType.METADATA
next_state = None
this_block_1st_row = 0
for row_number_0based, row in enumerate(cell_rows):

for row_number_0based, row in enumerate(check_encoding(cell_rows)):
if row is None or len(row) == 0 or _is_cell_blank(row[0]):
if state != BlockType.BLANK:
next_state = BlockType.BLANK
Expand Down
13 changes: 13 additions & 0 deletions pdtable/test/io/input/only_tables.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
**generic_inf;;;;;;;;;;;;;
all;;;;;;;;;;;;;
FATIMA_alias;node;constraint_alias;symmetry;sn_curve;sectional_force_modification;pristrco;signco;alpha;cutpoint_tol;file_name;transformation;IO;detail_type
text;text;text;text;text;text;text;-;-;mm;text;text;text;-
C00001;B0C066;C00001;rotate;F3;-;0;3;0.8;2000;..\..\..\inputs\INF\J_tube\CHW2204_INF_Swan_Neck_a30_root_V2.txt;;I;1
;;;;;;;;;;;;;
;;;;;;;;;;;;;
**generic_inf_constraints;;;;;;;;;;;;;
all;;;;;;;;;;;;;
constraint_alias;element;symmetry;cut_point_name;node;cut_distance;;;;;;;;
text;text;text;text;text;m;;;;;;;;
C00001;C660L;rotate;BRACE1;B0C066;3.091;;;;;;;;
C00001;CJT1V;rotate;BRACE2;B0C066;1.5319;;;;;;;;
14 changes: 14 additions & 0 deletions pdtable/test/io/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@

from pytest import fixture, raises
import pandas as pd
import pytest

import pdtable
from pdtable import Table, BlockType, read_csv, write_csv
from pdtable.io.csv import _table_to_csv
from pdtable.io.parsers.blocks import EncodingException
from pdtable.table_metadata import ColumnFormat


Expand Down Expand Up @@ -417,3 +419,15 @@ def test__table_is_preserved_when_written_to_and_read_from_csv():
assert table_read.column_names == table_write.column_names
assert table_read.units == table_write.units
assert table_read.destinations == table_write.destinations


def test_read_csv_starting_with_bom():
    """Reading a BOM-prefixed CSV with the default encoding must fail loudly;
    reading it with 'utf-8-sig' must parse the leading table correctly."""
    only_tables_path = Path(__file__).parent / "input" / "only_tables.csv"

    # Default encoding leaves the BOM in the first cell -> detected and rejected.
    with pytest.raises(EncodingException):
        list(read_csv(source=only_tables_path))

    # 'utf-8-sig' strips the BOM, so the first table block is parsed intact.
    # Use a context manager so the file handle is always closed (the original
    # left it open, which e.g. locks the fixture file on Windows).
    with open(only_tables_path, mode='r', encoding='utf-8-sig') as source:
        bls = list(read_csv(source=source))
    tables = [bl for ty, bl in bls if ty == BlockType.TABLE]
    assert tables[0].name == "generic_inf"

0 comments on commit de8678c

Please sign in to comment.