Skip to content

Commit

Permalink
Version 0.0.11 (#65)
Browse files Browse the repository at this point in the history
* adding version

* formatting

* added versions to metadata of generated schemas

* testing generated_with

* test for optional columns

* renamed na_limit to na_pct_below

* adding to changelog

* bumping version in pyproject.toml

* formatted

* import version only when needed

* using importlib.metadata

* fixing self.na_limit

* fix

* fixing version test

* update cli

* generate metadata upon initialisation

* fixing package name

* raising

* substitute importlib_metadata if python=3.7

* test cli update

* updating changelog

* stdout

* stdout

* small refactor of extension check

* out

* removing qoute

* cli
  • Loading branch information
Casyfill authored Jun 14, 2023
1 parent e376b13 commit 1b76edc
Show file tree
Hide file tree
Showing 19 changed files with 225 additions and 51 deletions.
9 changes: 8 additions & 1 deletion changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
# Changelog

v0.0.11:
- Version in metadata
- adds `dfschema` and `pandas` version in metadata upon generation (later will warn if a Schema is initialized from json generated by a later version)
- Renamed `na_limit` to `na_pct_below` to make it unambiguous (with backward support)
- Added `optional=True` flag for columns. If true, does not raise exception if column is not present
- added `dfschema update {existing_schema} {output_schema}` command to upgrade schemas

v0.0.10:
- relaxed Pydantic requirement to `>=1.9`

Expand Down Expand Up @@ -31,7 +38,7 @@ v0.0.6:
- added pre-commit install to the repo
- Some benchmarking
- renamed `dfs.validate_df` to `dfs.validate`

v0.0.5: fix column dtype generation/validation bug

## Pre-Publication
Expand Down
2 changes: 2 additions & 0 deletions dfschema/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
DataFrameSummaryError,
)

__version__ = "0.0.11"

__all__ = [
"validate",
Expand All @@ -16,4 +17,5 @@
"DataFrameSchemaError",
"DataFrameValidationError",
"DataFrameSummaryError",
"__version__",
]
19 changes: 19 additions & 0 deletions dfschema/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,3 +100,22 @@ def generate(
raise ValueError(
f"Unsupported extension: {format}, should be one of [json, yaml]"
)


@app.command()
def update(
    input: Path = typer.Argument(..., help="input schema file"),
    output: Path = typer.Argument(..., help="output schema file"),
):
    # Both arguments must be schema files in a supported serialization format.
    allowed = (".json", ".yaml", ".yml")
    for arg_name, path in (("input", input), ("output", output)):
        if path.suffix not in allowed:
            raise ValueError(
                f"Argument `{arg_name}` should end with one of {allowed}, got {path}"
            )

    # Loading migrates old-protocol schemas; writing re-serializes them
    # under the current protocol version.
    schema = DfSchema.from_file(input)
    protocol_version = schema.metadata.protocol_version
    print(f"Writing with `{protocol_version}` to `{output}`")

    schema.to_file(output)
34 changes: 25 additions & 9 deletions dfschema/core/column.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import sys
from datetime import date, datetime
from typing import List, Optional, FrozenSet, Union, Tuple # , Pattern
from typing import List, Optional, FrozenSet, Union, Tuple, Set # , Pattern
from warnings import warn

import pandas as pd
Expand All @@ -21,6 +21,7 @@
def _validate_column_presence(
df: pd.DataFrame,
column_names: Tuple[str],
optional_columns: Set[str] = set(),
additionalColumns: bool = True,
exactColumnOrder: bool = False,
) -> None:
Expand All @@ -33,7 +34,11 @@ def _validate_column_presence(
text = f"Some columns should not be in dataframe: {other_cols}"
raise DataFrameValidationError(text)

lac_cols = [col for col in column_names if col not in df.columns]
lac_cols = [
col
for col in column_names
if (col not in df.columns) and (col not in optional_columns)
]
if len(lac_cols) != 0:
text = f"Some columns are not in dataframe: {lac_cols}"
raise DataFrameValidationError(text)
Expand Down Expand Up @@ -140,7 +145,13 @@ def validate_column(self, series: pd.Series, root, col_name: Optional[str] = Non


class Categorical(BaseModel): # type: ignore
value_set: Optional[Union[FrozenSet[int], FrozenSet[float], FrozenSet[str],]] = None
value_set: Optional[
Union[
FrozenSet[int],
FrozenSet[float],
FrozenSet[str],
]
] = None
mode: Optional[Literal["oneof", "exact_set", "include"]] = None
unique: bool = Field(
False, description="if true, the column must contain only unique values"
Expand Down Expand Up @@ -188,16 +199,21 @@ def validate_column(self, series: pd.Series, col_name: str, root) -> None:
class ColSchema(BaseModel):
name: str = Field(..., description="Name of the column")
dtype: Optional[DtypeLiteral] = Field(None, description="Data type of the column") # type: ignore

optional: Optional[bool] = Field(
None,
description="If true, will not raise exception if columns is not present in dataframe",
)
# accepted for value limitation checks
_val_accepted_types = {None, "int", "float", "datetime64[ns]"}

na_limit: Optional[float] = Field(
na_pct_below: Optional[float] = Field(
None,
ge=0,
lt=1.0,
description="limit of missing values. If set to true, will raise if all values are empty. If set to a number, will raise if more than that fraction of values are empty (Nan)",
description="limit of missing values. If set to true, will raise if all values are empty. If set to a number, will raise if more than given perecnt of values are empty (Nan)",
alias="na_limit",
)

value_limits: Optional[ValueLimits] = Field(
None, description="Value limits for the column"
)
Expand Down Expand Up @@ -257,8 +273,8 @@ def _validate_dtype(self, series: pd.Series) -> None:
def _validate_na_limit(self, series: pd.Series) -> None:
    """Raise if the fraction of missing values exceeds ``na_pct_below``."""
    observed_na = series.isnull().mean()
    limit = self.na_pct_below

    if observed_na > limit:  # type: ignore
        raise DataFrameValidationError(
            f"Column `{self.name}` has too many NAs: {observed_na}, should be <= {limit}"
        )

@exception_collector
Expand Down Expand Up @@ -294,7 +310,7 @@ def validate_column(self, series: pd.Series, root) -> None:
if self.dtype:
self._validate_dtype(series, root=root)

if self.na_limit:
if self.na_pct_below:
self._validate_na_limit(series, root=root)

if self.value_limits:
Expand Down
8 changes: 8 additions & 0 deletions dfschema/core/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import sys

# `typing.Final` only exists from Python 3.8 on; older interpreters
# fall back to the `typing_extensions` backport.
if sys.version_info < (3, 8):
    from typing_extensions import Final
else:
    from typing import Final

# Version of the schema serialization protocol this package reads and writes.
CURRENT_PROTOCOL_VERSION: Final = 2.0
46 changes: 17 additions & 29 deletions dfschema/core/core.py
Original file line number Diff line number Diff line change
@@ -1,46 +1,23 @@
from datetime import date
from typing import Callable, Optional, Union, List
import json
from pathlib import Path


import pandas as pd
from pydantic import BaseModel, Extra, Field, PrivateAttr
import sys

from .column import ColSchema, _validate_column_presence
from .exceptions import DataFrameSchemaError, DataFrameSummaryError, SubsetSummaryError
from .shape import ShapeSchema
from .legacy import infer_protocol_version, LegacySchemaRegistry
from .generate import generate_schema_dict_from_df
from .metadata import MetaData
from .config import CURRENT_PROTOCOL_VERSION

# from .utils import SchemaEncoder
# from .base_config import BaseConfig


if sys.version_info >= (3, 8):
from typing import Final
else:
from typing_extensions import Final

CURRENT_PROTOCOL_VERSION: Final = 2.0


class MetaData(BaseModel):
protocol_version: float = Field(
CURRENT_PROTOCOL_VERSION, description="protocol version of the schema"
)
version: Optional[str] = Field(
date.today().strftime("%Y-%m-%d"),
description="version of the schema",
example="2022-06-12",
)

custom_settings: Optional[dict] = Field(
None, description="custom settings. does not affect any logic"
)


class DfSchema(BaseModel): # type: ignore
"""Main class of the package
Expand Down Expand Up @@ -104,8 +81,14 @@ def _summary_error(self) -> DataFrameSummaryError:

def validate_column_presence(self, df: pd.DataFrame) -> None:
    """Check that every non-optional schema column is present in `df`."""
    # Build both name sets in a single pass over the schema columns.
    required, optional = set(), set()
    for col in self.columns:  # type: ignore
        required.add(col.name)
        if col.optional:
            optional.add(col.name)

    _validate_column_presence(
        df,
        required,
        optional_columns=optional,
        additionalColumns=self.additionalColumns,
        root=self,
    )

def validate_df(self, df: pd.DataFrame, summary: bool = True) -> None:
Expand Down Expand Up @@ -230,7 +213,6 @@ def to_file(self, path: Union[str, Path]) -> None:
path = Path(path)

try:

if path.suffix == ".json":
schema_json = self.json(exclude_none=True, indent=4)
with path.open("w") as f:
Expand All @@ -254,7 +236,10 @@ def to_file(self, path: Union[str, Path]) -> None:
raise DataFrameSchemaError(f"Error wriging schema to file {path}") from e

@classmethod
def from_dict(cls, dict_: dict,) -> "DfSchema":
def from_dict(
cls,
dict_: dict,
) -> "DfSchema":
"""create DfSchema from dict.
same as `DfSchema(**dict_)`, but will also migrate old protocol schemas if necessary.
Expand Down Expand Up @@ -329,7 +314,10 @@ class SubsetSchema(BaseModel, extra=Extra.forbid, arbitrary_types_allowed=True):
predicate to select subset.
- If string, will be interpreted as query for `df.query()`.
- If dict, keys should be column names, values should be values to exactly match"""
predicate: Union[dict, str,] = Field(..., description=_predicate_description)
predicate: Union[
dict,
str,
] = Field(..., description=_predicate_description)

shape: Optional[ShapeSchema] = Field(None, description="shape expectations")
columns: Optional[List[ColSchema]] = Field([], description="columns expectations")
Expand Down
4 changes: 3 additions & 1 deletion dfschema/core/legacy/v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,9 @@ class Config:
allow_population_by_field_name = True

version: Optional[str] = Field(
None, description="version of the schema", example="2022-06-12",
None,
description="version of the schema",
example="2022-06-12",
)

protocol_version: float = Field(1.0, description="version of the protocol")
Expand Down
42 changes: 42 additions & 0 deletions dfschema/core/metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import sys
from datetime import date
from typing import Optional

from pydantic import BaseModel, Field

from .config import CURRENT_PROTOCOL_VERSION


class Generated_With(BaseModel):
    """Versions of the packages a schema was generated with.

    Both values are computed lazily on access rather than stored,
    so they always reflect the currently installed packages.
    """

    @property
    def dfschema(self) -> str:
        """Version string of the installed `dfschema` distribution."""
        # `importlib.metadata` is stdlib from 3.8; use the backport before that.
        if sys.version_info < (3, 8):
            from importlib_metadata import version
        else:
            from importlib.metadata import version
        return version("dfschema")

    @property
    def pandas(self) -> str:
        """Version string of the installed `pandas` package."""
        import pandas

        return pandas.__version__


class MetaData(BaseModel):
    """Schema metadata: protocol version, schema version, and generation provenance."""

    protocol_version: float = Field(
        CURRENT_PROTOCOL_VERSION, description="protocol version of the schema"
    )
    # default_factory (not a plain default) so the date reflects the moment the
    # metadata is created, not the moment this module was first imported —
    # a plain `date.today()` default would be frozen at import time.
    version: Optional[str] = Field(
        default_factory=lambda: date.today().strftime("%Y-%m-%d"),
        description="version of the schema",
        example="2022-06-12",
    )

    # Factory instead of a shared `Generated_With()` instance for the same reason.
    generated_with: Generated_With = Field(
        default_factory=Generated_With,
        description="version of packages schema was generated with",
    )
    custom_settings: Optional[dict] = Field(
        None, description="custom settings. does not affect any logic"
    )
6 changes: 2 additions & 4 deletions dfschema/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import pandas as pd
from datetime import date

from .core.exceptions import DataFrameValidationError

Expand Down Expand Up @@ -51,13 +50,12 @@ def generate_scheme(
exactColumnOrder: bool = False,
na_thlds: bool = True,
minmax: bool = True,
version: str = f"{date.today():%Y-%m-%d}",
) -> dict:
"""generates dummy scheme over given dataframe"""
"""generates dummy schema over given dataframe"""

schema: dict = {
"additionalColumns": additionalColumns,
"exactColumnOrder": exactColumnOrder,
"version": version,
}

cols: dict = {"dtype": df.dtypes.astype(str).to_dict()}
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "dfschema"
version = "0.0.10"
version = "0.0.11"
description = "lightweight pandas.DataFrame schema"
authors = ["Philipp <[email protected]>"]
readme = "README.md"
Expand Down
9 changes: 8 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,14 @@ def df3():
@pytest.fixture()
def df4():
    """Four-row frame: int column `x`, nullable str `y`, datetime `z`."""
    frame = pd.DataFrame(
        {
            "x": [1, 2, 3, 4],
            "y": ["foo", "bar", "baz", None],
        }
    )
    # Same constant date repeated, parsed into a datetime64 column.
    frame["z"] = pd.to_datetime(["2022-10-23"] * 4)
    return frame
Expand Down
20 changes: 20 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,23 @@ def test_cli_validate_error():
)
# assert result.exit_code == 1
assert "File violates schema:" in result.stderr


def test_cli_update():
    # `dfschema update` should rewrite a v1 schema file and report the
    # current protocol version on stdout.
    from dfschema.cli import app
    from dfschema.core.config import CURRENT_PROTOCOL_VERSION

    output_path = "active_sales_v2.json"
    args = [
        "update",
        "tests/test_schemas/v1/good/active_sales.json",
        output_path,
    ]
    result = runner.invoke(app, args)

    assert result.exit_code == 0, result.stdout

    expected_line = f"Writing with `{CURRENT_PROTOCOL_VERSION}` to `{output_path}`"
    assert expected_line in result.stdout
Loading

0 comments on commit 1b76edc

Please sign in to comment.