From 0e7f7956d6e74b52525a0a0ec649081645b0b601 Mon Sep 17 00:00:00 2001 From: Jan Bielecki Date: Wed, 12 Jun 2024 11:20:27 +0200 Subject: [PATCH 1/7] init --- pdtable/io/parsers/blocks.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/pdtable/io/parsers/blocks.py b/pdtable/io/parsers/blocks.py index b45ca34..e1a3632 100644 --- a/pdtable/io/parsers/blocks.py +++ b/pdtable/io/parsers/blocks.py @@ -24,11 +24,10 @@ - The original, raw cell grid, in case the user wants to do some low-level processing. """ -from abc import abstractmethod import itertools +import logging import re from typing import Sequence, Optional, Tuple, Any, Iterable, List, Union, Dict -from collections import defaultdict import pandas as pd import warnings @@ -39,7 +38,6 @@ LocationSheet, NullLocationFile, TableOrigin, - InputIssue, InputIssueTracker, NullInputIssueTracker, ) @@ -51,6 +49,7 @@ # Typing alias: 2D grid of cells with rows and cols. Intended indexing: cell_grid[row][col] CellGrid = Sequence[Sequence] +logger = logging.getLogger(__name__) def make_metadata_block(cells: CellGrid, origin: Optional[str] = None, **_) -> MetadataBlock: @@ -96,6 +95,18 @@ def parse_column_names(column_names_raw: Sequence[Union[str, None]]) -> List[str ] +def _safe_strip(input_data: Any) -> str: + """ + Save strip in cases where the input is not a string type. + """ + # Data Validation + if not isinstance(input_data, str): + # Data Conversion + input_data = str(input_data) + + return input_data.strip() + + def make_table_json_precursor(cells: CellGrid, origin, fixer:ParseFixer) -> Tuple[JsonDataPrecursor, bool]: """Parses cell grid into a JSON-like data structure but with some non-JSON-native values @@ -118,7 +129,7 @@ def make_table_json_precursor(cells: CellGrid, origin, fixer:ParseFixer) -> Tupl fixer.table_name = table_name # internally hold destinations as json-compatible dict - destinations = {dest: None for dest in cells[1][0].strip().split(" ")} + destinations = {dest: None for dest in _safe_strip(cells[1][0]).split(" ")} table_is_empty = len(cells) < 3 if table_is_empty: column_names = [] From eea44fd3d05bbf25a8c057186feb3a64a1c75cbe Mon Sep 17 00:00:00 2001 From: Jan Bielecki Date: Wed, 12 Jun 2024 11:32:27 +0200 Subject: [PATCH 2/7] add test --- pdtable/test/io/parsers/test_blocks.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 pdtable/test/io/parsers/test_blocks.py diff --git a/pdtable/test/io/parsers/test_blocks.py b/pdtable/test/io/parsers/test_blocks.py new file mode 100644 index 0000000..2268933 --- /dev/null +++ b/pdtable/test/io/parsers/test_blocks.py @@ -0,0 +1,15 @@ +from pdtable.io.parsers.blocks import _safe_strip + + +class TestSafeStrip: + def test_string_input(self) -> None: + assert _safe_strip(" hello ") == "hello" + + def test_int_input(self) -> None: + assert _safe_strip(123) == "123" + + def test_string_input_with_leading_trailing_spaces(self) -> None: + assert _safe_strip(" hello world ") == "hello world" + + def test_int_input_with_leading_trailing_spaces(self) -> None: + assert _safe_strip(" 123 ") == "123" From eee7a59cb77b8c674a8062084872285d1517860a Mon Sep 17 00:00:00 2001 From: Jan Bielecki Date: Mon, 17 Jun 2024 11:29:27 +0200 Subject: [PATCH 3/7] continue --- pdtable/io/parsers/blocks.py | 15 ++++++++++----- pdtable/test/io/parsers/test_blocks.py | 22 ++++++++++++++++------ 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/pdtable/io/parsers/blocks.py b/pdtable/io/parsers/blocks.py index e1a3632..c7a96df 100644 --- a/pdtable/io/parsers/blocks.py +++ b/pdtable/io/parsers/blocks.py @@ -24,8 +24,8 @@ - The original, raw cell grid, in case the user wants to do some low-level processing. """ +import datetime import itertools -import logging import re from typing import Sequence, Optional, Tuple, Any, Iterable, List, Union, Dict import pandas as pd @@ -49,7 +49,6 @@ # Typing alias: 2D grid of cells with rows and cols. Intended indexing: cell_grid[row][col] CellGrid = Sequence[Sequence] -logger = logging.getLogger(__name__) def make_metadata_block(cells: CellGrid, origin: Optional[str] = None, **_) -> MetadataBlock: @@ -95,12 +94,18 @@ def parse_column_names(column_names_raw: Sequence[Union[str, None]]) -> List[str ] -def _safe_strip(input_data: Any) -> str: +def _get_destinations_safely_stripped(input_data: Any) -> str: """ Save strip in cases where the input is not a string type. """ # Data Validation - if not isinstance(input_data, str): + if isinstance(input_data, datetime.datetime): + input_data = str(input_data).replace(' ', '_') + warnings.warn( + f"Found destination with a datetime format ({str(input_data)}). " \ + f"Converting to {input_data}." + ) + elif not isinstance(input_data, str): # Data Conversion input_data = str(input_data) @@ -129,7 +134,7 @@ def make_table_json_precursor(cells: CellGrid, origin, fixer:ParseFixer) -> Tupl fixer.table_name = table_name # internally hold destinations as json-compatible dict - destinations = {dest: None for dest in _safe_strip(cells[1][0]).split(" ")} + destinations = {dest: None for dest in _get_destinations_safely_stripped(cells[1][0]).split(" ")} table_is_empty = len(cells) < 3 if table_is_empty: column_names = [] diff --git a/pdtable/test/io/parsers/test_blocks.py b/pdtable/test/io/parsers/test_blocks.py index 2268933..f9b8594 100644 --- a/pdtable/test/io/parsers/test_blocks.py +++ b/pdtable/test/io/parsers/test_blocks.py @@ -1,15 +1,25 @@ -from pdtable.io.parsers.blocks import _safe_strip +import datetime +import warnings +from pdtable.io.parsers.blocks import _get_destinations_safely_stripped -class TestSafeStrip: +class TestGetDestinationsSafelyStripped: def test_string_input(self) -> None: - assert _safe_strip(" hello ") == "hello" + assert _get_destinations_safely_stripped(" hello ") == "hello" def test_int_input(self) -> None: - assert _safe_strip(123) == "123" + assert _get_destinations_safely_stripped(123) == "123" def test_string_input_with_leading_trailing_spaces(self) -> None: - assert _safe_strip(" hello world ") == "hello world" + assert _get_destinations_safely_stripped(" hello world ") == "hello world" def test_int_input_with_leading_trailing_spaces(self) -> None: - assert _safe_strip(" 123 ") == "123" + assert _get_destinations_safely_stripped(" 123 ") == "123" + + def test_datetime_input(self) -> None: + datetime_now = datetime.datetime.now() + + with warnings.catch_warnings(record=True) as w: + destinations = _get_destinations_safely_stripped(datetime_now) + assert len(w) == 1 + assert destinations.replace(' ', '') == destinations From aef00ebe78dce68ac7cc4c19f025e1a3a0975608 Mon Sep 17 00:00:00 2001 From: Jan Bielecki Date: Mon, 17 Jun 2024 11:33:30 +0200 Subject: [PATCH 4/7] older numpy --- requirements_pandas1.txt | 2 +- requirements_pandas2.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements_pandas1.txt b/requirements_pandas1.txt index 2f8b2ec..8126817 100644 --- a/requirements_pandas1.txt +++ b/requirements_pandas1.txt @@ -1,5 +1,5 @@ pandas<2.0 -numpy +numpy<2.0 typing-extensions # OPTIONAL diff --git a/requirements_pandas2.txt b/requirements_pandas2.txt index 6ba7e76..de121eb 100644 --- a/requirements_pandas2.txt +++ b/requirements_pandas2.txt @@ -1,6 +1,6 @@ pandas>=2.0 pyarrow -numpy +numpy<2.0 typing-extensions # OPTIONAL From e8fd5ddc266e93ad682e37dff0b0cc2d93dcb7d8 Mon Sep 17 00:00:00 2001 From: Jan Bielecki Date: Tue, 18 Jun 2024 09:07:18 +0200 Subject: [PATCH 5/7] numpy not pinned for pandas2 --- requirements_pandas2.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements_pandas2.txt b/requirements_pandas2.txt index de121eb..6b45edc 100644 --- a/requirements_pandas2.txt +++ b/requirements_pandas2.txt @@ -1,7 +1,7 @@ pandas>=2.0 -pyarrow -numpy<2.0 +numpy typing-extensions +pyarrow # OPTIONAL openpyxl From 05a568837227113f27129960d4f86eb1b1f550c5 Mon Sep 17 00:00:00 2001 From: Jan Bielecki Date: Tue, 18 Jun 2024 09:37:01 +0200 Subject: [PATCH 6/7] pinned numpy for all ... --- requirements_pandas2.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements_pandas2.txt b/requirements_pandas2.txt index 6b45edc..ea6cee8 100644 --- a/requirements_pandas2.txt +++ b/requirements_pandas2.txt @@ -1,5 +1,5 @@ pandas>=2.0 -numpy +numpy<2.0 # numpy 2 does not work for pint library on python3.9 typing-extensions pyarrow From 9b1cb5b1fa21ff9e1746967cfaf9153b40d88a5b Mon Sep 17 00:00:00 2001 From: Jan Bielecki Date: Tue, 18 Jun 2024 13:08:19 +0200 Subject: [PATCH 7/7] CR fix --- pdtable/io/parsers/blocks.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pdtable/io/parsers/blocks.py b/pdtable/io/parsers/blocks.py index c7a96df..2fcd05d 100644 --- a/pdtable/io/parsers/blocks.py +++ b/pdtable/io/parsers/blocks.py @@ -100,11 +100,12 @@ def _get_destinations_safely_stripped(input_data: Any) -> str: """ # Data Validation if isinstance(input_data, datetime.datetime): - input_data = str(input_data).replace(' ', '_') + fixed_input_data = str(input_data).replace(' ', '_') warnings.warn( f"Found destination with a datetime format ({str(input_data)}). " \ - f"Converting to {input_data}." + f"Converting to {fixed_input_data}." ) + input_data = fixed_input_data elif not isinstance(input_data, str): # Data Conversion input_data = str(input_data)