Skip to content

Commit

Permalink
Version 0.0.11 (#65)
Browse files Browse the repository at this point in the history
* adding version

* formatting

* added versions to metadata of generated schemas

* testing generated_with

* test for optional columns

* renamed na_limit to na_pct_below

* adding to changelog

* bumping version in pyproject.toml

* formatted

* import version only when needed

* using importlib.metadata

* fixing self.na_limit

* fix

* fixing version test

* update cli

* generate metadata upon initialisation

* fixing package name

* raising

* substitute importlib_metadata if python=3.7

* test cli update

* updating changelog

* stdout

* stdout

* small refactor of extension check

* out

* removing qoute

* cli
  • Loading branch information
Casyfill authored Jun 14, 2023
1 parent e376b13 commit 1b76edc
Show file tree
Hide file tree
Showing 19 changed files with 225 additions and 51 deletions.
9 changes: 8 additions & 1 deletion changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
# Changelog

v0.0.11:
- Version in metadata
- adds `dfschema` and `pandas` version in metadata upon generation (later will warn if a Schema is initialized from json generated by a later version)
- Renamed `na_limit` to `na_pct_below` to make it unambiguous (with backward support)
- Added `optional=True` flag for columns. If true, does not raise exception if column is not present
- added `dfschema update {existing_schema} {output_schema}` command to upgrade schemas

v0.0.10:
- relaxed Pydantic requirement to `>=1.9`

Expand Down Expand Up @@ -31,7 +38,7 @@ v0.0.6:
- added pre-commit install to the repo
- Some benchmarking
- renamed `dfs.validate_df` to `dfs.validate`

v0.0.5: fix column dtype generation/validation bug

## Pre-Publication
Expand Down
2 changes: 2 additions & 0 deletions dfschema/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
DataFrameSummaryError,
)

__version__ = "0.0.11"

__all__ = [
"validate",
Expand All @@ -16,4 +17,5 @@
"DataFrameSchemaError",
"DataFrameValidationError",
"DataFrameSummaryError",
"__version__",
]
19 changes: 19 additions & 0 deletions dfschema/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,3 +100,22 @@ def generate(
raise ValueError(
f"Unsupported extension: {format}, should be one of [json, yaml]"
)


@app.command()
def update(
    input: Path = typer.Argument(..., help="input schema file"),
    output: Path = typer.Argument(..., help="output schema file"),
):
    # Both arguments must be schema files in a supported serialization format.
    allowed = (".json", ".yaml", ".yml")
    for arg_name, path in (("input", input), ("output", output)):
        if path.suffix not in allowed:
            raise ValueError(
                f"Argument `{arg_name}` should end with one of {allowed}, got {path}"
            )

    # Loading migrates old-protocol schemas; writing re-serializes them
    # under the current protocol version.
    schema = DfSchema.from_file(input)
    protocol_version = schema.metadata.protocol_version
    print(f"Writing with `{protocol_version}` to `{output}`")

    schema.to_file(output)
34 changes: 25 additions & 9 deletions dfschema/core/column.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import sys
from datetime import date, datetime
from typing import List, Optional, FrozenSet, Union, Tuple # , Pattern
from typing import List, Optional, FrozenSet, Union, Tuple, Set # , Pattern
from warnings import warn

import pandas as pd
Expand All @@ -21,6 +21,7 @@
def _validate_column_presence(
df: pd.DataFrame,
column_names: Tuple[str],
optional_columns: Set[str] = set(),
additionalColumns: bool = True,
exactColumnOrder: bool = False,
) -> None:
Expand All @@ -33,7 +34,11 @@ def _validate_column_presence(
text = f"Some columns should not be in dataframe: {other_cols}"
raise DataFrameValidationError(text)

lac_cols = [col for col in column_names if col not in df.columns]
lac_cols = [
col
for col in column_names
if (col not in df.columns) and (col not in optional_columns)
]
if len(lac_cols) != 0:
text = f"Some columns are not in dataframe: {lac_cols}"
raise DataFrameValidationError(text)
Expand Down Expand Up @@ -140,7 +145,13 @@ def validate_column(self, series: pd.Series, root, col_name: Optional[str] = Non


class Categorical(BaseModel): # type: ignore
value_set: Optional[Union[FrozenSet[int], FrozenSet[float], FrozenSet[str],]] = None
value_set: Optional[
Union[
FrozenSet[int],
FrozenSet[float],
FrozenSet[str],
]
] = None
mode: Optional[Literal["oneof", "exact_set", "include"]] = None
unique: bool = Field(
False, description="if true, the column must contain only unique values"
Expand Down Expand Up @@ -188,16 +199,21 @@ def validate_column(self, series: pd.Series, col_name: str, root) -> None:
class ColSchema(BaseModel):
name: str = Field(..., description="Name of the column")
dtype: Optional[DtypeLiteral] = Field(None, description="Data type of the column") # type: ignore

optional: Optional[bool] = Field(
None,
description="If true, will not raise exception if columns is not present in dataframe",
)
# accepted for value limitation checks
_val_accepted_types = {None, "int", "float", "datetime64[ns]"}

na_limit: Optional[float] = Field(
na_pct_below: Optional[float] = Field(
None,
ge=0,
lt=1.0,
description="limit of missing values. If set to true, will raise if all values are empty. If set to a number, will raise if more than that fraction of values are empty (Nan)",
description="limit of missing values. If set to true, will raise if all values are empty. If set to a number, will raise if more than given perecnt of values are empty (Nan)",
alias="na_limit",
)

value_limits: Optional[ValueLimits] = Field(
None, description="Value limits for the column"
)
Expand Down Expand Up @@ -257,8 +273,8 @@ def _validate_dtype(self, series: pd.Series) -> None:
def _validate_na_limit(self, series: pd.Series) -> None:
    """Raise if the fraction of missing values exceeds ``na_pct_below``."""
    observed_na = series.isnull().mean()
    limit = self.na_pct_below

    if observed_na > limit:  # type: ignore
        raise DataFrameValidationError(
            f"Column `{self.name}` has too many NAs: {observed_na}, should be <= {limit}"
        )

@exception_collector
Expand Down Expand Up @@ -294,7 +310,7 @@ def validate_column(self, series: pd.Series, root) -> None:
if self.dtype:
self._validate_dtype(series, root=root)

if self.na_limit:
if self.na_pct_below:
self._validate_na_limit(series, root=root)

if self.value_limits:
Expand Down
8 changes: 8 additions & 0 deletions dfschema/core/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import sys

# `typing.Final` only exists from Python 3.8 on; older interpreters
# fall back to the `typing_extensions` backport.
if sys.version_info < (3, 8):
    from typing_extensions import Final
else:
    from typing import Final

# Version of the schema serialization protocol this package reads and writes.
CURRENT_PROTOCOL_VERSION: Final = 2.0
46 changes: 17 additions & 29 deletions dfschema/core/core.py
Original file line number Diff line number Diff line change
@@ -1,46 +1,23 @@
from datetime import date
from typing import Callable, Optional, Union, List
import json
from pathlib import Path


import pandas as pd
from pydantic import BaseModel, Extra, Field, PrivateAttr
import sys

from .column import ColSchema, _validate_column_presence
from .exceptions import DataFrameSchemaError, DataFrameSummaryError, SubsetSummaryError
from .shape import ShapeSchema
from .legacy import infer_protocol_version, LegacySchemaRegistry
from .generate import generate_schema_dict_from_df
from .metadata import MetaData
from .config import CURRENT_PROTOCOL_VERSION

# from .utils import SchemaEncoder
# from .base_config import BaseConfig


if sys.version_info >= (3, 8):
from typing import Final
else:
from typing_extensions import Final

CURRENT_PROTOCOL_VERSION: Final = 2.0


class MetaData(BaseModel):
protocol_version: float = Field(
CURRENT_PROTOCOL_VERSION, description="protocol version of the schema"
)
version: Optional[str] = Field(
date.today().strftime("%Y-%m-%d"),
description="version of the schema",
example="2022-06-12",
)

custom_settings: Optional[dict] = Field(
None, description="custom settings. does not affect any logic"
)


class DfSchema(BaseModel): # type: ignore
"""Main class of the package
Expand Down Expand Up @@ -104,8 +81,14 @@ def _summary_error(self) -> DataFrameSummaryError:

def validate_column_presence(self, df: pd.DataFrame) -> None:
    """Check that every non-optional schema column is present in `df`."""
    # Build both name sets in a single pass over the schema columns.
    required, optional = set(), set()
    for col in self.columns:  # type: ignore
        required.add(col.name)
        if col.optional:
            optional.add(col.name)

    _validate_column_presence(
        df,
        required,
        optional_columns=optional,
        additionalColumns=self.additionalColumns,
        root=self,
    )

def validate_df(self, df: pd.DataFrame, summary: bool = True) -> None:
Expand Down Expand Up @@ -230,7 +213,6 @@ def to_file(self, path: Union[str, Path]) -> None:
path = Path(path)

try:

if path.suffix == ".json":
schema_json = self.json(exclude_none=True, indent=4)
with path.open("w") as f:
Expand All @@ -254,7 +236,10 @@ def to_file(self, path: Union[str, Path]) -> None:
raise DataFrameSchemaError(f"Error wriging schema to file {path}") from e

@classmethod
def from_dict(cls, dict_: dict,) -> "DfSchema":
def from_dict(
cls,
dict_: dict,
) -> "DfSchema":
"""create DfSchema from dict.
same as `DfSchema(**dict_)`, but will also migrate old protocol schemas if necessary.
Expand Down Expand Up @@ -329,7 +314,10 @@ class SubsetSchema(BaseModel, extra=Extra.forbid, arbitrary_types_allowed=True):
predicate to select subset.
- If string, will be interpreted as query for `df.query()`.
- If dict, keys should be column names, values should be values to exactly match"""
predicate: Union[dict, str,] = Field(..., description=_predicate_description)
predicate: Union[
dict,
str,
] = Field(..., description=_predicate_description)

shape: Optional[ShapeSchema] = Field(None, description="shape expectations")
columns: Optional[List[ColSchema]] = Field([], description="columns expectations")
Expand Down
4 changes: 3 additions & 1 deletion dfschema/core/legacy/v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,9 @@ class Config:
allow_population_by_field_name = True

version: Optional[str] = Field(
None, description="version of the schema", example="2022-06-12",
None,
description="version of the schema",
example="2022-06-12",
)

protocol_version: float = Field(1.0, description="version of the protocol")
Expand Down
42 changes: 42 additions & 0 deletions dfschema/core/metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import sys
from datetime import date
from typing import Optional

from pydantic import BaseModel, Field

from .config import CURRENT_PROTOCOL_VERSION


class Generated_With(BaseModel):
    """Versions of the packages a schema was generated with.

    Both values are computed lazily on access rather than stored,
    so they always reflect the currently installed packages.
    """

    @property
    def dfschema(self) -> str:
        """Version string of the installed `dfschema` distribution."""
        # `importlib.metadata` is stdlib from 3.8; use the backport before that.
        if sys.version_info < (3, 8):
            from importlib_metadata import version
        else:
            from importlib.metadata import version
        return version("dfschema")

    @property
    def pandas(self) -> str:
        """Version string of the installed `pandas` package."""
        import pandas

        return pandas.__version__


class MetaData(BaseModel):
    """Schema metadata: protocol version, schema version, and generation provenance."""

    protocol_version: float = Field(
        CURRENT_PROTOCOL_VERSION, description="protocol version of the schema"
    )
    # default_factory (not a plain default) so the date reflects the moment the
    # metadata is created, not the moment this module was first imported —
    # a plain `date.today()` default would be frozen at import time.
    version: Optional[str] = Field(
        default_factory=lambda: date.today().strftime("%Y-%m-%d"),
        description="version of the schema",
        example="2022-06-12",
    )

    # Factory instead of a shared `Generated_With()` instance for the same reason.
    generated_with: Generated_With = Field(
        default_factory=Generated_With,
        description="version of packages schema was generated with",
    )
    custom_settings: Optional[dict] = Field(
        None, description="custom settings. does not affect any logic"
    )
6 changes: 2 additions & 4 deletions dfschema/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import pandas as pd
from datetime import date

from .core.exceptions import DataFrameValidationError

Expand Down Expand Up @@ -51,13 +50,12 @@ def generate_scheme(
exactColumnOrder: bool = False,
na_thlds: bool = True,
minmax: bool = True,
version: str = f"{date.today():%Y-%m-%d}",
) -> dict:
"""generates dummy scheme over given dataframe"""
"""generates dummy schema over given dataframe"""

schema: dict = {
"additionalColumns": additionalColumns,
"exactColumnOrder": exactColumnOrder,
"version": version,
}

cols: dict = {"dtype": df.dtypes.astype(str).to_dict()}
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "dfschema"
version = "0.0.10"
version = "0.0.11"
description = "lightweight pandas.DataFrame schema"
authors = ["Philipp <[email protected]>"]
readme = "README.md"
Expand Down
9 changes: 8 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,14 @@ def df3():
@pytest.fixture()
def df4():
    """Four-row frame: int column `x`, nullable str `y`, datetime `z`."""
    frame = pd.DataFrame(
        {
            "x": [1, 2, 3, 4],
            "y": ["foo", "bar", "baz", None],
        }
    )
    # Same constant date repeated, parsed into a datetime64 column.
    frame["z"] = pd.to_datetime(["2022-10-23"] * 4)
    return frame
Expand Down
20 changes: 20 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,23 @@ def test_cli_validate_error():
)
# assert result.exit_code == 1
assert "File violates schema:" in result.stderr


def test_cli_update():
    # `dfschema update` should rewrite a v1 schema file and report the
    # current protocol version on stdout.
    from dfschema.cli import app
    from dfschema.core.config import CURRENT_PROTOCOL_VERSION

    output_path = "active_sales_v2.json"
    args = [
        "update",
        "tests/test_schemas/v1/good/active_sales.json",
        output_path,
    ]
    result = runner.invoke(app, args)

    assert result.exit_code == 0, result.stdout

    expected_line = f"Writing with `{CURRENT_PROTOCOL_VERSION}` to `{output_path}`"
    assert expected_line in result.stdout
Loading

0 comments on commit 1b76edc

Please sign in to comment.