diff --git a/CHANGELOG.md b/CHANGELOG.md
index e43da252..eb5b9ccc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,13 @@
 # TFS-Pandas Changelog
 
+## Version 3.3.0
+
+- Added:
+  - The user is now given the option to skip DataFrame validation after reading from file / before writing to file, through a boolean `validate` argument. Validation stays on by default when writing, and is skipped by default when reading.
+
+- Changed:
+  - The documentation has been expanded and improved, notably with the addition of example code snippets.
+
 ## Version 3.2.1
 
 - Changed:
diff --git a/doc/conf.py b/doc/conf.py
index a7e16530..34c46eb6 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -11,14 +11,9 @@
 #
 # All configuration values have a default; values that are commented out
 # serve to show the default.
-import os
 import pathlib
 import sys
 
-# ignore numpy warnings, see:
-# https://stackoverflow.com/questions/40845304/runtimewarning-numpy-dtype-size-changed-may-indicate-binary-incompatibility
-import warnings
-
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
@@ -55,16 +50,36 @@ def about_package(init_posixpath: pathlib.Path) -> dict:
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
 extensions = [
-    "sphinx.ext.autodoc",
-    "sphinx.ext.doctest",
-    "sphinx.ext.todo",
-    "sphinx.ext.coverage",
-    "sphinx.ext.mathjax",
-    "sphinx.ext.viewcode",
-    "sphinx.ext.githubpages",
-    "sphinx.ext.napoleon",
+    "sphinx.ext.autodoc",  # Include documentation from docstrings
+    "sphinx.ext.coverage",  # Collect doc coverage stats
+    "sphinx.ext.doctest",  # Test snippets in the documentation
+    "sphinx.ext.githubpages",  # Publish HTML docs in GitHub Pages
+    "sphinx.ext.intersphinx",  # Link to other projects’ documentation
+    "sphinx.ext.mathjax",  # Render math via JavaScript
+    "sphinx.ext.napoleon",  # Support for NumPy and Google style docstrings
+    "sphinx.ext.todo",  # Support for todo items
+    "sphinx.ext.viewcode",  # Add links to highlighted source code
+    "sphinx_copybutton",  # Add a "copy" button to code blocks
+    "sphinx-prompt",  # Prompt symbols will not be copy-pastable
+    "sphinx_codeautolink",  # Automatically link example code to documentation source
 ]
+
+# Config for the autosectionlabel extension
+autosectionlabel_prefix_document = True
+autosectionlabel_maxdepth = 2
+
+# Config for the napoleon extension
+napoleon_numpy_docstring = False
+napoleon_include_init_with_doc = True
+napoleon_use_admonition_for_examples = True
+napoleon_use_admonition_for_notes = True
+napoleon_use_admonition_for_references = True
+napoleon_preprocess_types = True
+napoleon_attr_annotations = True
+
+# Configuration for sphinx.ext.todo
+todo_include_todos = True
+
 # Add any paths that contain templates here, relative to this directory.
 # templates_path = ['_templates']
 
@@ -101,7 +116,7 @@ def about_package(init_posixpath: pathlib.Path) -> dict:
 #
 # This is also used if you do content translation via gettext catalogs.
 # Usually you set "language" from the command line for these cases.
-language = None
+language = "en"
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
@@ -111,8 +126,9 @@ def about_package(init_posixpath: pathlib.Path) -> dict:
 # The name of the Pygments (syntax highlighting) style to use.
pygments_style = "sphinx" -# If true, `todo` and `todoList` produce output, else they produce nothing. -todo_include_todos = True +# The reST default role (used for this markup: `text`) to use for all +# documents. +default_role = "obj" # -- Options for HTML output ---------------------------------------------- @@ -215,3 +231,17 @@ def about_package(init_posixpath: pathlib.Path) -> dict: "Miscellaneous", ), ] + +# -- Instersphinx Configuration ---------------------------------------------- + +# Example configuration for intersphinx: refer to the Python standard library. +# use in refs e.g: +# :ref:`comparison manual ` +intersphinx_mapping = { + "python": ("https://docs.python.org/3/", None), + "numpy": ("https://numpy.org/doc/stable/", None), + "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), + "matplotlib": ("https://matplotlib.org/stable/", None), + "scipy": ("https://docs.scipy.org/doc/scipy/", None), + "cpymad": ("https://hibtc.github.io/cpymad/", None), +} \ No newline at end of file diff --git a/doc/modules/index.rst b/doc/modules/index.rst index b48c6f19..803c4838 100644 --- a/doc/modules/index.rst +++ b/doc/modules/index.rst @@ -1,5 +1,5 @@ TFS-Pandas Modules -************************** +================== .. automodule:: tfs.collection :members: @@ -31,4 +31,3 @@ TFS-Pandas Modules .. automodule:: tfs.writer :members: - diff --git a/setup.py b/setup.py index 179fc5cb..b8c0f5e2 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,7 @@ def about_package(init_posixpath: pathlib.Path) -> dict: EXTRA_DEPENDENCIES = { "test": ["pytest>=5.2", "pytest-cov>=2.9", "cpymad>=1.8.1"], "hdf5": ["h5py>=2.9.0", "tables>=3.6.0"], - "doc": ["sphinx", "sphinx_rtd_theme"], + "doc": ["sphinx", "sphinx_rtd_theme", "sphinx_copybutton", "sphinx-prompt", "sphinx_codeautolink"], } EXTRA_DEPENDENCIES.update({"all": [elem for list_ in EXTRA_DEPENDENCIES.values() for elem in list_]}) EXTRA_DEPENDENCIES["test"] += EXTRA_DEPENDENCIES["hdf5"] @@ -66,6 +66,7 @@ def about_package(init_posixpath: pathlib.Path) -> dict: "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", "Topic :: Scientific/Engineering", "Topic :: Software Development :: Libraries :: Python Modules", "Typing :: Typed", diff --git a/tests/inputs/space_in_colname.tfs b/tests/inputs/space_in_colname.tfs new file mode 100644 index 00000000..8ea0e777 --- /dev/null +++ b/tests/inputs/space_in_colname.tfs @@ -0,0 +1,19 @@ +@ TITLE %s "Title of your tfs file" +@ DPP %le 1.0 +@ Q1 %le 0.269974877952 +@ Q1RMS %le 1.75642567736e-07 +@ NATQ1 %le 0.280041400187 +@ NATQ1RMS %le 0.00102479265802 +@ BPMCOUNT %d 9 +# Some comment line here +* NAME S NUMBER CO CORMS "BPM RES" +$ %s %le %d %le %le %le + BPMYB.5L2.B1 28.288 1 -0.280727353099 0.00404721900879 0.121264541395 + BPMYB.4L2.B1 48.858 2 0.601472827003 0.00301396244054 0.129738519811 + BPMWI.4L2.B1 73.3255 3 -0.610294990396 0.0039123010318 0.0952864848273 + BPMSX.4L2.B1 123.4825 3472136972 0.778206651453 0.00542543379504 0.0578581425476 + "BPMS.2L2.B1" 161.394 59055944 0.585105573645 0.00291016910226 0.1223625619 + "BPMSW.1L2.B1" 171.328 09202215 2.50235465023 0.00275350035218 0.148603785488 + BPMSW.1R2.B1 214.518 3117 1.81036167087 0.00282138482457 0.164954082556 + BPMS.2R2.B1 224.452 18943819309 0.0791371365672 0.00474290041487 0.122265653712 + BPMSX.4R2.B1 262.3635 105 -0.00665768479832 0.00350302654669 0.187320306406 diff --git a/tests/test_reader.py b/tests/test_reader.py 
index 5a90a3a0..656243a8 100644
--- a/tests/test_reader.py
+++ b/tests/test_reader.py
@@ -6,8 +6,8 @@
 import tfs
 from tfs import read_tfs, write_tfs
-from tfs.errors import TfsFormatError
 from tfs.constants import HEADER
+from tfs.errors import TfsFormatError
 
 CURRENT_DIR = pathlib.Path(__file__).parent
 
@@ -35,6 +35,26 @@ def test_tfs_read_str_input(self, _tfs_file_str: str):
         assert len(str(test_file)) > 0
         assert isinstance(test_file.index[0], str)
 
+    def test_tfs_read_no_validation(self, _tfs_file_pathlib: pathlib.Path):
+        test_file = read_tfs(_tfs_file_pathlib, index="NAME", validate=False)
+        assert len(test_file.headers) > 0
+        assert len(test_file.columns) > 0
+        assert len(test_file.index) > 0
+        assert len(str(test_file)) > 0
+        assert isinstance(test_file.index[0], str)
+
+    def test_tfs_read_wrong_file_no_validation(self, _space_in_colnames_tfs_path: pathlib.Path):
+        # The read file has a space in a column name, which should raise; we make sure reading
+        # goes through when explicitly skipping validation
+        df = read_tfs(_space_in_colnames_tfs_path, index="NAME", validate=False)
+        assert "BPM RES" in df.columns
+
+
+    def test_tfs_read_no_validation_doesnt_warn(self, caplog):
+        nan_tfs_path = pathlib.Path(__file__).parent / "inputs" / "has_nans.tfs"
+        _ = read_tfs(nan_tfs_path, index="NAME", validate=False)
+        assert "contains non-physical values at Index:" not in caplog.text
+
     def tfs_indx_pathlib_input(self, _tfs_file_pathlib: pathlib.Path):
         test_file = read_tfs(_tfs_file_pathlib)
         assert test_file.indx["BPMYB.5L2.B1"] == test_file.set_index("NAME")["BPMYB.5L2.B1"]
@@ -104,10 +124,16 @@ def test_id_to_type_handles_typo_str_id(self):
         with pytest.raises(TfsFormatError):
             _ = tfs.reader._id_to_type(typoed_str_id)
 
+    def test_fail_space_in_colname(self, _space_in_colnames_tfs_path: pathlib.Path):
+        # The read file has a space in a column name, which should raise
+        with pytest.raises(TfsFormatError):
+            read_tfs(_space_in_colnames_tfs_path, index="NAME", validate=True)
+
+
 class TestWarnings:
     def test_warn_unphysical_values(self, caplog):
         nan_tfs_path = pathlib.Path(__file__).parent / "inputs" / "has_nans.tfs"
-        _ = read_tfs(nan_tfs_path, index="NAME")
+        _ = read_tfs(nan_tfs_path, index="NAME", validate=True)
         for record in caplog.records:
             assert record.levelname == "WARNING"
         assert "contains non-physical values at Index:" in caplog.text
@@ -136,6 +162,11 @@ def _no_colnames_tfs_path() -> pathlib.Path:
     return pathlib.Path(__file__).parent / "inputs" / "no_colnames.tfs"
 
 
+@pytest.fixture()
+def _space_in_colnames_tfs_path() -> pathlib.Path:
+    return pathlib.Path(__file__).parent / "inputs" / "space_in_colname.tfs"
+
+
 @pytest.fixture()
 def _tfs_file_wise() -> pathlib.Path:
     return CURRENT_DIR / "inputs" / "wise_header.tfs"
diff --git a/tests/test_writer.py b/tests/test_writer.py
index 143d6f92..206d6178 100644
--- a/tests/test_writer.py
+++ b/tests/test_writer.py
@@ -9,7 +9,8 @@
 import pytest
 from cpymad.madx import Madx
 from pandas._testing import assert_dict_equal
-from pandas.testing import assert_frame_equal, assert_index_equal, assert_series_equal
+from pandas.testing import (assert_frame_equal, assert_index_equal,
+                            assert_series_equal)
 
 import tfs
 from tfs import TfsDataFrame, read_tfs, write_tfs
@@ -100,6 +101,15 @@ def test_tfs_write_read(self, _tfs_dataframe, tmp_path):
         assert_frame_equal(_tfs_dataframe, new, check_exact=False)  # float precision can be an issue
         assert_dict_equal(_tfs_dataframe.headers, new.headers, compare_keys=True)
 
+    def test_tfs_write_read_no_validate(self, _tfs_dataframe, tmp_path):
+        write_location = tmp_path / "test.tfs"
+        write_tfs(write_location, _tfs_dataframe, validate=False)
+        assert write_location.is_file()
+
+        new = read_tfs(write_location, validate=False)
+        assert_frame_equal(_tfs_dataframe, new, check_exact=False)  # float precision can be an issue
+        assert_dict_equal(_tfs_dataframe.headers, new.headers, compare_keys=True)
+
     def test_tfs_write_read_no_headers(self, _dataframe_empty_headers: TfsDataFrame, tmp_path):
         write_location = tmp_path / "test.tfs"
         write_tfs(write_location, _dataframe_empty_headers)
@@ -140,6 +150,16 @@ def test_tfs_write_read_autoindex(self, _tfs_dataframe, tmp_path):
         assert_index_equal(df.index, df_read.index, check_exact=False)
         assert_dict_equal(_tfs_dataframe.headers, df_read.headers, compare_keys=True)
 
+    def test_no_warning_on_non_unique_columns_if_no_validate(self, tmp_path, caplog):
+        df = TfsDataFrame(columns=["A", "B", "A"])
+        write_tfs(tmp_path / "temporary.tfs", df, validate=False)
+        assert "Non-unique column names found" not in caplog.text
+
+    def test_no_validation_non_unique_columns(self, tmp_path, caplog):
+        # Making sure this goes through if we skip validation
+        df = TfsDataFrame(columns=["A", "B", "A"])
+        write_tfs(tmp_path / "temporary.tfs", df, validate=False)
+        assert (tmp_path / "temporary.tfs").is_file()
 
 class TestFailures:
     def test_raising_on_non_unique_columns(self, caplog):
@@ -231,20 +251,6 @@ def test_header_line_raises_on_non_strings(self):
 
 
 class TestWarnings:
-    @pytest.mark.skipif(
-        sys.version_info >= (3, 7),
-        reason="Our workers on 3.7+ install pandas >= 1.3.0 which has fixed the .convert_dtypes() bug "
-        "we try...except in _autoset_pandas_types and test here",
-    )
-    def test_empty_df_warns_on_types_inference(self, caplog):
-        empty_df = pandas.DataFrame()
-        converted_df = tfs.writer._autoset_pandas_types(empty_df)
-        assert_frame_equal(converted_df, empty_df)
-
-        for record in caplog.records:
-            assert record.levelname == "WARNING"
-            assert "An empty dataframe was provided, no types were inferred" in caplog.text
-
     def test_warning_on_non_unique_columns(self, tmp_path, caplog):
         df = TfsDataFrame(columns=["A", "B", "A"])
         write_tfs(tmp_path / "temporary.tfs", df)
diff --git a/tfs/__init__.py b/tfs/__init__.py
index 79500c6e..73f2df09 100644
--- a/tfs/__init__.py
+++ b/tfs/__init__.py
@@ -3,14 +3,14 @@
 """
 from tfs.errors import TfsFormatError
 from tfs.frame import TfsDataFrame, concat
+from tfs.hdf import read_hdf, write_hdf
 from tfs.reader import read_tfs
 from tfs.writer import write_tfs
-from tfs.hdf import read_hdf, write_hdf
 
 __title__ = "tfs-pandas"
 __description__ = "Read and write tfs files."
 __url__ = "https://github.com/pylhc/tfs"
-__version__ = "3.2.1"
+__version__ = "3.3.0"
 __author__ = "pylhc"
 __author_email__ = "pylhc@github.com"
 __license__ = "MIT"
diff --git a/tfs/collection.py b/tfs/collection.py
index 79500785..25aed417 100644
--- a/tfs/collection.py
+++ b/tfs/collection.py
@@ -1,6 +1,6 @@
 """
 Collection
-----------------------
+----------
 
 Advanced **TFS** files reading and writing functionality.
 """
@@ -48,47 +48,51 @@ class TfsCollection(metaclass=_MetaTfsCollection):
 
     Classes inheriting from this abstract class will be able to define **TFS** files
     as readable or writable, and read or write them just as attribute access or
-    assignments. All attributes will be read and written as ``TfsDataFrame`` objects.
+    assignments. All attributes will be read and written as `~tfs.TfsDataFrame` objects.
     Example:
         If **./example** is a directory that contains two **TFS** files **beta_phase_x.tfs**
         and **beta_phase_y.tfs** with `BETX` and `BETY` columns respectively:
 
-        .. sourcecode:: python
+        .. code-block:: python
 
-            class ExampleCollection(TfsCollection)
-                # All TFS attributes must be marked with the Tfs(...) class, and generated attribute
-                # names will be appended with _x / _y depending on files found in "./example"
+            >>> # All TFS attributes must be marked with the Tfs(...) class,
+            ... # and generated attribute names will be appended with _x / _y
+            ... # depending on files found in "./example"
+            ... class ExampleCollection(TfsCollection):
+            ...     beta = Tfs("beta_phase_{}.tfs")  # A TFS attribute
+            ...     other_value = 7  # A traditional attribute.
 
-                beta = Tfs("beta_phase_{}.tfs")  # A TFS attribute
-                other_value = 7  # A traditional attribute.
+            ...     def get_filename(self, template: str, plane: str) -> str:
+            ...         return template.format(plane)
 
-                def get_filename(template: str, plane: str) -> str:
-                    return template.format(plane)
+            >>> example = ExampleCollection("./example")
 
-            example = ExampleCollection("./example")
+            >>> # Get the BETX / BETY column from "beta_phase_x.tfs":
+            >>> beta_x_column = example.beta_x.BETX  # / example.beta_x.BETY
 
-            # Get the BETX / BETY column from "beta_phase_x.tfs":
-            beta_x_column = example.beta_x.BETX  # / example.beta_x.BETY
+            >>> # Get the BETY column from "beta_phase_y.tfs":
+            >>> beta_y_column = example.beta_y.BETY
 
-            # Get the BETY column from "beta_phase_y.tfs":
-            beta_y_column = example.beta_y.BETY
+            >>> # The planes can also be accessed as items (both examples below work):
+            >>> beta_y_column = example.beta["y"].BETY
+            >>> beta_y_column = example.beta["Y"].BETY
 
-            # The planes can also be accessed as items (both examples below work):
-            beta_y_column = example.beta["y"].BETY
-            beta_y_column = example.beta["Y"].BETY
+            >>> # This will write an empty DataFrame to "beta_phase_y.tfs":
+            >>> example.allow_write = True
+            >>> example.beta["y"] = DataFrame()
 
-            # This will write an empty DataFrame to "beta_phase_y.tfs":
-            example.allow_write = True
-            example.beta["y"] = DataFrame()
+    If the file to be loaded is not defined for two planes then the attribute can be declared
+    and accessed as:
+
+    .. code-block:: python
 
-    If the file to be loaded is not defined for two planes then the attribute can be declared as:
-    ``coupling = Tfs("getcouple.tfs", two_planes=False)`` and then accessed as
-    ``f1001w_column = example.coupling.F1001W``.
+        >>> coupling = Tfs("getcouple.tfs", two_planes=False)  # declaration
+        >>> f1001w_column = example.coupling.F1001W  # access
 
     No file will be loaded until the corresponding attribute is accessed and the loaded
-    ``TfsDataFrame`` will be buffered, thus the user should expect an ``IOError`` if the requested
+    `~tfs.TfsDataFrame` will be buffered, thus the user should expect an ``IOError`` if the requested
     file is not in the provided directory (only the first time, but is better to always take it
     into account!).
diff --git a/tfs/constants.py b/tfs/constants.py
index 9ad5a906..5e08769f 100644
--- a/tfs/constants.py
+++ b/tfs/constants.py
@@ -1,6 +1,6 @@
 """
 Constants
--------------------
+---------
 
 General constants used throughout ``tfs-pandas``, relating to the standard of **TFS** files.
 """
diff --git a/tfs/errors.py b/tfs/errors.py
index db922935..c516990c 100644
--- a/tfs/errors.py
+++ b/tfs/errors.py
@@ -1,6 +1,6 @@
 """
 Errors
-------------------
+------
 
 Errors that can be raised during the handling of **TFS** files.
""" diff --git a/tfs/frame.py b/tfs/frame.py index c0f88fb9..74f1b75f 100644 --- a/tfs/frame.py +++ b/tfs/frame.py @@ -1,6 +1,6 @@ """ Frame -------------------- +----- Contains the class definition of a ``TfsDataFrame``, inherited from the ``pandas`` ``DataFrame``, as well as a utility function to validate the correctness of a ``TfsDataFrame``. @@ -9,7 +9,7 @@ from collections import OrderedDict from contextlib import suppress from functools import partial, reduce -from typing import Sequence, Union +from typing import Sequence, Set, Union import numpy as np import pandas as pd @@ -260,7 +260,7 @@ def concat( axes. Data manipulation is done by the ``pandas.concat`` function. Resulting headers are either merged according to the provided **how_headers** method or as given via **new_headers**. - ..warning:: + .. warning:: Please note that when using this function on many ``TfsDataFrames``, leaving the contents of the final headers dictionary to the automatic merger can become unpredictable. In this case it is recommended to provide the **new_headers** argument to ensure the final result, or leave both diff --git a/tfs/hdf.py b/tfs/hdf.py index 100f33df..ed07d8ea 100644 --- a/tfs/hdf.py +++ b/tfs/hdf.py @@ -4,10 +4,11 @@ Additional tools for reading and writing ``TfsDataFrames`` into ``hdf5`` files. """ -import pandas as pd from pathlib import Path - from typing import Union + +import pandas as pd + from tfs import TfsDataFrame try: @@ -25,10 +26,10 @@ LOGGER = logging.getLogger(__name__) -def write_hdf(path: Union[Path, str], df: TfsDataFrame, **kwargs): +def write_hdf(path: Union[Path, str], df: TfsDataFrame, **kwargs) -> None: """Write TfsDataFrame to hdf5 file. The dataframe will be written into the group ``data``, the headers into the group ``headers``. - Only one frame per file is allowed. + Only one dataframe per file is allowed. Args: path (Path, str): Path of the output file. @@ -68,6 +69,9 @@ def read_hdf(path: Union[Path, str]) -> TfsDataFrame: Args: path (Path, str): Path of the file to read. + + Returns: + A ``TfsDataFrame`` object with the loaded data from the file. """ _check_imports() df = pd.read_hdf(path, key="data") @@ -84,6 +88,7 @@ def read_hdf(path: Union[Path, str]) -> TfsDataFrame: def _check_imports(): + """Checks if required packages for HDF5 functionality are installed. Raises ImportError if not.""" not_imported = [name for name, package in (('tables', tables), ('h5py', h5py)) if package is None] if len(not_imported): names = ", ".join(f"`{name}`" for name in not_imported) diff --git a/tfs/reader.py b/tfs/reader.py index e8fdf812..482fada0 100644 --- a/tfs/reader.py +++ b/tfs/reader.py @@ -1,6 +1,6 @@ """ Reader -------------------- +------ Reading functionalty for **TFS** files. """ @@ -15,23 +15,35 @@ from tfs.constants import COMMENTS, HEADER, ID_TO_TYPE, INDEX_ID, NAMES, TYPES from tfs.errors import TfsFormatError -from tfs.frame import TfsDataFrame, validate +from tfs.frame import TfsDataFrame +from tfs.frame import validate as validate_frame LOGGER = logging.getLogger(__name__) def read_tfs( - tfs_file_path: Union[pathlib.Path, str], index: str = None, non_unique_behavior: str = "warn" + tfs_file_path: Union[pathlib.Path, str], + index: str = None, + non_unique_behavior: str = "warn", + validate: bool = False, ) -> TfsDataFrame: """ Parses the **TFS** table present in **tfs_file_path** and returns a ``TfsDataFrame``. - Methodology: This function parses the first lines of the file until it gets to the `types` line. 
-    While parsed, the appropriate information is gathered (headers content, column names & types,
-    number of lines parsed). After reaching the `types` line, the rest of the file is given to parse
-    to ``pandas.read_csv`` with the right options to make use of its C engine's speed. After this,
-    conversion to ``TfsDataDrame`` is made, proper types are applied to columns, the index is set and
-    the frame is validated before being returned.
+    .. warning::
+        Through the *validate* argument, one can skip dataframe validation after
+        loading it from a file. Skipping validation is the default behavior of this
+        function; the option is left for the user to request validation should
+        they not trust the file they are reading.
+
+    .. admonition:: **Methodology**
+
+        This function parses the first lines of the file until it gets to the `types` line.
+        While parsed, the appropriate information is gathered (headers content, column names & types,
+        number of lines parsed). After reaching the `types` line, the rest of the file is given to parse
+        to ``pandas.read_csv`` with the right options to make use of its C engine's speed. After this,
+        conversion to ``TfsDataFrame`` is made, proper types are applied to columns, the index is set and
+        the frame is validated before being returned (if validation was requested).
 
     Args:
         tfs_file_path (Union[pathlib.Path, str]): Path object to the **TFS** file to read. Can be
@@ -41,9 +53,44 @@ def read_tfs(
         non_unique_behavior (str): behavior to adopt if non-unique indices or columns are found in the
            dataframe. Accepts `warn` and `raise` as values, case-insensitively, which dictates to
            respectively issue a warning or raise an error if non-unique elements are found.
+        validate (bool): Whether to validate the dataframe after reading it. Defaults to ``False``.
 
     Returns:
         A ``TfsDataFrame`` object with the loaded data from the file.
+
+    Examples:
+        Reading from a file is simple, as most arguments have sane default values.
+        The simplest usage goes as follows:
+
+        .. code-block:: python
+
+            >>> tfs.read("filename.tfs")
+
+        One can also pass a `~pathlib.Path` object to the function:
+
+        .. code-block:: python
+
+            >>> tfs.read(pathlib.Path("filename.tfs"))
+
+        If one wants to set a specific column as index, this is done as:
+
+        .. code-block:: python
+
+            >>> tfs.read("filename.tfs", index="COLUMN_NAME")
+
+        If one wants to, for instance, raise an error on non-unique indices or columns,
+        one can do so as:
+
+        .. code-block:: python
+
+            >>> tfs.read("filename.tfs", non_unique_behavior="raise")
+
+        Since validation is skipped by default, one can explicitly ask for the dataframe
+        to be validated after reading, should the file not be trusted. This is done as:
+
+        .. code-block:: python
+
+            >>> tfs.read("filename.tfs", validate=True)
     """
     tfs_file_path = pathlib.Path(tfs_file_path)
     headers = OrderedDict()
@@ -100,7 +147,9 @@ def read_tfs(
     LOGGER.debug("Attempting to find index identifier in columns")
     tfs_data_frame = _find_and_set_index(tfs_data_frame)
 
-    validate(tfs_data_frame, f"from file {tfs_file_path.absolute()}", non_unique_behavior)
+    if validate:
+        validate_frame(tfs_data_frame, f"from file {tfs_file_path.absolute()}", non_unique_behavior)
+
     return tfs_data_frame
 
 
diff --git a/tfs/tools.py b/tfs/tools.py
index b5777ab9..5c54aa8b 100644
--- a/tfs/tools.py
+++ b/tfs/tools.py
@@ -1,6 +1,6 @@
 """
 Tools
------------------
+-----
 
 Additional functions to modify **TFS** files.
""" diff --git a/tfs/writer.py b/tfs/writer.py index 358d3492..80a3390e 100644 --- a/tfs/writer.py +++ b/tfs/writer.py @@ -1,6 +1,6 @@ """ Writer -------------------- +------ Writing functionalty for **TFS** files. """ @@ -14,7 +14,8 @@ from pandas.api import types as pdtypes from tfs.constants import DEFAULT_COLUMN_WIDTH, INDEX_ID, MIN_COLUMN_WIDTH -from tfs.frame import TfsDataFrame, validate +from tfs.frame import TfsDataFrame +from tfs.frame import validate as validate_frame LOGGER = logging.getLogger(__name__) @@ -27,11 +28,19 @@ def write_tfs( colwidth: int = DEFAULT_COLUMN_WIDTH, headerswidth: int = DEFAULT_COLUMN_WIDTH, non_unique_behavior: str = "warn", + validate: bool = True, ) -> None: """ Writes the provided ``DataFrame`` to disk at **tfs_file_path**, eventually with the `headers_dict` as headers dictionary. + .. warning:: + Through the *validate* argument, one can skip dataframe validation before writing it to file. + While this can speed-up the execution time of this function , it is **not recommended** and + is not the default behavior of this function. The option, however, is left for the user to + use at their own risk should they wish to avoid lengthy validation of large `TfsDataFrames` + (such as for instance a sliced FCC lattice). + Args: tfs_file_path (Union[pathlib.Path, str]): Path object to the output **TFS** file. Can be a string, in which case it will be cast to a Path object. @@ -47,10 +56,35 @@ def write_tfs( non_unique_behavior (str): behavior to adopt if non-unique indices or columns are found in the dataframe. Accepts `warn` and `raise` as values, case-insensitively, which dictates to respectively issue a warning or raise an error if non-unique elements are found. + validate (bool): Whether to validate the dataframe before writing it to file. Defaults to ``True``. + + Examples: + Writing to file is simple, as most arguments have sane default values. + The simplest usage goes as follows: + + .. code-block:: python + + >>> tfs.write("filename.tfs", dataframe) + + If one wants to, for instance, raise and error on non-unique indices or columns, + one can do so as: + + .. code-block:: python + + >>> tfs.write("filename.tfs", dataframe, non_unique_behavior="raise") + + One can choose to skip dataframe validation **at one's own risk** before writing + it to file. This is done as: + + .. code-block:: python + + >>> tfs.write("filename.tfs", dataframe, validate=False) """ left_align_first_column = False tfs_file_path = pathlib.Path(tfs_file_path) - validate(data_frame, f"to be written in {tfs_file_path.absolute()}", non_unique_behavior) + + if validate: + validate_frame(data_frame, f"to be written in {tfs_file_path.absolute()}", non_unique_behavior) if headers_dict is None: # tries to get headers from TfsDataFrame try: @@ -85,14 +119,12 @@ def _autoset_pandas_types(data_frame: Union[TfsDataFrame, pd.DataFrame]) -> Unio dataframe. Otherwise, raise the exception given by ``pandas``. NOTE: Starting with pandas 1.3.0, this behavior which was a bug has been fixed. This means no - ``ValueError`` is raised by calling ``.convert_dtypes()`` on an empty ``DataFrame``, and from this - function a warning is logged. Testing of this behavior is disabled for Python 3.7+ workers, but the - function is kept as to not force a new min version requirement on ``pandas`` or Python for users. + ``ValueError`` is raised by calling ``.convert_dtypes()`` on an empty ``DataFrame``, and from + this function a warning is logged. 
The function is kept as to not force a new min version + requirement on ``pandas`` or Python for users. When one day we make ``pandas >= 1.3.0`` the + minimum requirement, we can remove the checks altogether and just call ``.convert_dtypes()``. See my comment at https://github.com/pylhc/tfs/pull/83#issuecomment-874208869 - TODO: remove the aforementioned check when we make Python 3.7 the minimum version for tfs-pandas, - aka when Python 3.6 reaches EOL (end of 2021). - Args: data_frame (Union[TfsDataFrame, pd.DataFrame]): ``TfsDataFrame`` or ``pandas.DataFrame`` to determine the types of.