Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Faster dataframe validation #109

Merged
merged 39 commits into master from faster_validation
Jan 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
4916abd
add 3.11 classifier
fsoubelet Jan 26, 2023
e64095a
fix language in doc config
fsoubelet Jan 26, 2023
10953b9
bump version
fsoubelet Jan 26, 2023
45f3072
normalize subheader line
fsoubelet Jan 26, 2023
b299e4c
normalize subheader line
fsoubelet Jan 26, 2023
de4220e
options to skip validation
fsoubelet Jan 26, 2023
62e65be
conf and examples to functions
fsoubelet Jan 26, 2023
f2a6d99
add new dependencies for doc goodies
fsoubelet Jan 26, 2023
ec4645f
examples
fsoubelet Jan 26, 2023
6979bcd
add missing type hint, fix admonition
fsoubelet Jan 26, 2023
dc3c433
hints and returns
fsoubelet Jan 26, 2023
abf685b
tests for new reader argument
fsoubelet Jan 26, 2023
87ebc1b
tests for new writer argument
fsoubelet Jan 26, 2023
2ada3c8
we are only 3.7+ so this should go away, to confirm with Josch on the…
fsoubelet Jan 26, 2023
8644051
hint admonition for the methodology
fsoubelet Jan 27, 2023
65a76dd
rephrasing
fsoubelet Jan 27, 2023
93eac60
update changelog
fsoubelet Jan 27, 2023
3094cec
remove old commented out test
fsoubelet Jan 30, 2023
2e0e26a
remove and update this too
fsoubelet Jan 30, 2023
5a6e0e6
tests for space in column name
fsoubelet Jan 30, 2023
78cbb00
validation off by default when reading
fsoubelet Jan 30, 2023
d5115b7
another no validation test in writer
fsoubelet Jan 30, 2023
a889533
validate in new name at import
fsoubelet Jan 30, 2023
08dfb4e
adapt warning text in reader
fsoubelet Jan 30, 2023
d96520a
rename validate_after_reading to validate
fsoubelet Jan 30, 2023
8e1489d
rename validate_before_writing to validate
fsoubelet Jan 30, 2023
0cb54d1
adapt warning text in writer
fsoubelet Jan 30, 2023
5925a3e
named admonition in the docs
fsoubelet Jan 30, 2023
5715f8c
since change in validation logic
fsoubelet Jan 30, 2023
e1a748d
spec
fsoubelet Jan 30, 2023
e9a9933
new validation logic, remove old _is_not_finite, look for lists/tuple…
fsoubelet Jan 30, 2023
c4a12d4
changelog
fsoubelet Jan 30, 2023
ac38a51
methodology for validate function docstring
fsoubelet Jan 30, 2023
c951755
Merge branch 'master' into faster_validation
fsoubelet Jan 30, 2023
be3b54d
tiny bit faster
fsoubelet Jan 30, 2023
6cebd65
more verbose / explicit
fsoubelet Jan 30, 2023
23476c6
make that a TfsFormatError
fsoubelet Jan 30, 2023
5ffcb01
also test write crash
fsoubelet Jan 30, 2023
99cf99d
validation on by default
fsoubelet Jan 30, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
## Version 3.3.0

- Added:
- The option is now given to the user to skip DataFrame validation after reading from file / before writing to file. Validation is left "on" by default, but can be turned off with a boolean argument.
- The option is now given to the user to skip data frame validation after reading from file / before writing to file. Validation is left "on" by default, but can be turned off with a boolean argument.

- Changes:
- The `tfs.frame.validate` function has seen its internal logic reworked to be more efficient and users performing validation on large data frames should notice a significant performance improvement.
- The documentation has been expanded and improved, with notably the addition of example code snippets.

## Version 3.2.1
Expand Down
19 changes: 13 additions & 6 deletions tests/test_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ def test_fail_on_spaces_columns(self, caplog):

def test_messed_up_dataframe_fails_writes(self, _messed_up_dataframe: TfsDataFrame):
messed_tfs = _messed_up_dataframe
with pytest.raises(ValueError):
with pytest.raises(TfsFormatError): # raises in validate because of list elements
write_tfs("", messed_tfs)

def test_dict_column_dataframe_fails_writes(self, _dict_column_in_dataframe: TfsDataFrame, tmp_path):
Expand All @@ -224,13 +224,20 @@ def test_dict_column_dataframe_fails_writes(self, _dict_column_in_dataframe: Tfs
write_tfs(write_location, dict_col_tfs)
assert write_location.is_file()

def test_list_column_dataframe_fails_writes(self, _list_column_in_dataframe: TfsDataFrame, tmp_path):
def test_list_column_dataframe_fails_writes(self, _list_column_in_dataframe: TfsDataFrame, tmp_path, caplog):
list_col_tfs = _list_column_in_dataframe
with pytest.raises(ValueError): # truth value of nested can't be assesed in _validate
write_tfs("", list_col_tfs)

del list_col_tfs["d"] # should work without the column of lists
write_location = tmp_path / "test.tfs"
with pytest.raises(TfsFormatError): # we look for these and raise in validate
write_tfs(write_location, list_col_tfs)

for record in caplog.records:
assert record.levelname == "ERROR"
assert "contains list/tuple values at Index:" in caplog.text

with pytest.raises(TypeError): # this time crashes on writing
write_tfs(write_location, list_col_tfs, validate=False)

del list_col_tfs["d"] # should work now without the column of lists
write_tfs(write_location, list_col_tfs)
assert write_location.is_file()

Expand Down
41 changes: 30 additions & 11 deletions tfs/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,18 @@ def validate(
Check if a data frame contains finite values only, strings as column names and no empty headers
or column names.

.. admonition:: **Methodology**

This function performs several different checks on the provided dataframe:
1. Checking no single element is a `list` or `tuple`, which is done with a
custom vectorized function applied column-by-column on the dataframe.
2. Checking for non-physical values in the dataframe, which is done by
applying the ``isna`` function with the right option context.
3. Checking for duplicates in either indices or columns.
4. Checking for column names that are not strings.
5. Checking for column names including spaces.


Args:
data_frame (Union[TfsDataFrame, pd.DataFrame]): the dataframe to check on.
info_str (str): additional information to include in logging statements.
Expand All @@ -314,23 +326,30 @@ def validate(
if non_unique_behavior.lower() not in ("warn", "raise"):
raise KeyError("Invalid value for parameter 'non_unique_behavior'")

def is_not_finite(x):
try:
return ~np.isfinite(x)
except TypeError: # most likely string
try:
return np.zeros(x.shape, dtype=bool)
except AttributeError: # single entry
return np.zeros(1, dtype=bool)
# ----- Check that no element is a list / tuple in the dataframe ----- #
def _element_is_list(element):
return isinstance(element, (list, tuple))
_element_is_list = np.vectorize(_element_is_list)

list_or_tuple_bool_df = data_frame.apply(_element_is_list)
if list_or_tuple_bool_df.to_numpy().any():
LOGGER.error(
f"DataFrame {info_str} contains list/tuple values at Index: "
f"{list_or_tuple_bool_df.index[list_or_tuple_bool_df.any(axis='columns')].tolist()}"
)
raise TfsFormatError("Lists or tuple elements are not accepted in a TfsDataFrame")

boolean_df = data_frame.applymap(is_not_finite)
# ----- Check that no element is non-physical value in the dataframe ----- #
with pd.option_context('mode.use_inf_as_na', True):
inf_or_nan_bool_df = data_frame.isna()

if boolean_df.to_numpy().any():
if inf_or_nan_bool_df.to_numpy().any():
LOGGER.warning(
f"DataFrame {info_str} contains non-physical values at Index: "
f"{boolean_df.index[boolean_df.any(axis='columns')].tolist()}"
f"{inf_or_nan_bool_df.index[inf_or_nan_bool_df.any(axis='columns')].tolist()}"
)

# Other sanity checks
if data_frame.index.has_duplicates:
LOGGER.warning("Non-unique indices found.")
if non_unique_behavior.lower() == "raise":
Expand Down
10 changes: 6 additions & 4 deletions tfs/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,18 @@ def read_tfs(
tfs_file_path: Union[pathlib.Path, str],
index: str = None,
non_unique_behavior: str = "warn",
validate: bool = False,
validate: bool = True,
) -> TfsDataFrame:
"""
Parses the **TFS** table present in **tfs_file_path** and returns a ``TfsDataFrame``.

.. warning::
Through the *validate* argument, one can skip dataframe validation after
loading it from a file. This is the default behavior of this function.
The option, however, is left for the user to perform validation should
they not trust the file they are reading.
loading it from a file. While this can speed up the execution time of this
function, it is **not recommended** and is not the default behavior of this
function. The option, however, is left for the user to use at their own risk
should they wish to avoid lengthy validation of large `TfsDataFrames` (such
as, for instance, a sliced FCC lattice).

.. admonition:: **Methodology**

Expand Down