try pandera: add jupyterlite notebooks, add support for py3.7 (#951)

* add jupyterlite to docs
* handle test errors
* add jupyterlite_sphinx to deps
* exclude 3.7 from docs build
unionai-oss · Oct 4, 2022 · 1bcfe01 · 1bcfe01
1 parent dc6b39c
commit 1bcfe01
Show file tree

Hide file tree

Showing 17 changed files with 217 additions and 67 deletions.
diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml
@@ -86,7 +86,7 @@ jobs:
       fail-fast: false
       matrix:
         os: ["ubuntu-latest", "macos-latest", "windows-latest"]
-        python-version: ["3.8", "3.9", "3.10"]
+        python-version: ["3.7", "3.8", "3.9", "3.10"]
         pandas-version: ["1.2.0", "1.3.0", "latest"]
         exclude:
         - python-version: "3.10"
@@ -196,9 +196,9 @@ jobs:
         uses: codecov/codecov-action@v3
 
       - name: Check Docstrings
-        if: ${{ matrix.os != 'windows-latest' && matrix.python-version != '3.10' }}
+        if: ${{ matrix.os != 'windows-latest' && matrix.python-version != '3.10' && matrix.python-version != '3.7' }}
         run: nox ${{ env.NOX_FLAGS }} --session doctests
 
       - name: Check Docs
-        if: ${{ matrix.os != 'windows-latest' && matrix.python-version != '3.10' }}
+        if: ${{ matrix.os != 'windows-latest' && matrix.python-version != '3.10' && matrix.python-version != '3.7' }}
         run: nox ${{ env.NOX_FLAGS }} --session docs
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -48,6 +48,7 @@
     "sphinx_copybutton",
     "recommonmark",
     "sphinx_panels",
+    "jupyterlite_sphinx",
 ]
 
 doctest_global_setup = """
@@ -192,15 +193,17 @@ def filter(self, record: pylogging.LogRecord) -> bool:
         # that dataclass name is in the message, so that you don't filter out
         # other meaningful warnings
         return not (
-            record.getMessage().startswith(
-                "Cannot resolve forward reference in type annotations of "
-                '"pandera.typing.DataFrame"'
-            )
             # NOTE: forward reference false positive needs to be handled
             # correctly
-            or record.getMessage().startswith(
-                "Cannot resolve forward reference in type annotations of "
-                '"pandera.schemas.DataFrameSchema'
+            record.getMessage().startswith(
+                (
+                    "Cannot resolve forward reference in type annotations of "
+                    '"pandera.typing.DataFrame"',
+                    "Cannot resolve forward reference in type annotations of "
+                    '"pandera.schemas.DataFrameSchema',
+                    "Cannot resolve forward reference in type annotations of "
+                    '"pandera.typing.DataFrame.style"',
+                )
             )
         )
 
@@ -259,3 +262,8 @@ def linkcode_resolve(domain, info):
         )
 
     return f"https://github.com/pandera-dev/pandera/blob/{tag}/pandera/{fn}{linespec}"
+
+
+# jupyterlite config
+jupyterlite_contents = ["notebooks/try_pandera.ipynb"]
+jupyterlite_bind_ipynb_suffix = False
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -345,6 +345,7 @@ page or reach out to the maintainers and pandera community on
     :hidden:
 
     self
+    Try Pandera ▶️ <try_pandera>
 
 .. toctree::
    :maxdepth: 6

diff --git a/docs/source/jupyterlite_config.json b/docs/source/jupyterlite_config.json
@@ -0,0 +1,11 @@
+{
+    "LiteBuildConfig": {
+        "federated_extensions": [
+            "https://conda.anaconda.org/conda-forge/noarch/pandera-0.12.0-hd8ed1ab_0.tar.bz2"
+        ],
+        "ignore_sys_prefix": true,
+        "piplite_urls": [
+            "https://files.pythonhosted.org/packages/95/cc/e058935b0b34d50214596297f0a9edb0781fc5201bf2c6eb8cf1a026d710/pandera-0.12.0-py3-none-any.whl"
+        ]
+    }
+}
diff --git a/docs/source/notebooks/try_pandera.ipynb b/docs/source/notebooks/try_pandera.ipynb
@@ -0,0 +1,80 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ac4294bb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import piplite\n",
+    "\n",
+    "\n",
+    "for package in [\n",
+    "    \"wrapt\",\n",
+    "    \"typing_extensions\",\n",
+    "    \"mypy_extensions\",\n",
+    "    \"typing_inspect\",\n",
+    "    \"pydantic\",\n",
+    "    \"pandera\",\n",
+    "]:\n",
+    "    await piplite.install(package, deps=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c9a4eef5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import pandera as pa\n",
+    "from pandera.typing import DataFrame, Series\n",
+    "\n",
+    "\n",
+    "class Schema(pa.SchemaModel):\n",
+    "    item: Series[str] = pa.Field(isin=[\"apple\", \"orange\"], coerce=True)\n",
+    "    price: Series[float] = pa.Field(gt=0)\n",
+    "\n",
+    "\n",
+    "@pa.check_types(lazy=True)\n",
+    "def transform_data(data: DataFrame[Schema]):\n",
+    "    ...\n",
+    "\n",
+    "\n",
+    "data = pd.DataFrame.from_records([\n",
+    "    {\"item\": \"applee\", \"price\": 0.5},\n",
+    "    {\"item\": \"orange\", \"price\": -1000}\n",
+    "])\n",
+    "\n",
+    "\n",
+    "try:\n",
+    "    transform_data(data)\n",
+    "except pa.errors.SchemaErrors as exc:\n",
+    "    display(exc.failure_cases)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/source/try_pandera.rst b/docs/source/try_pandera.rst
@@ -0,0 +1,7 @@
+Try Pandera
+===============
+
+In the notebook below, you can get a sense of how to use pandera right in the
+browser without having to install anything locally!
+
+.. retrolite:: notebooks/try_pandera.ipynb
diff --git a/environment.yml b/environment.yml
@@ -63,6 +63,7 @@ dependencies:
   - python-multipart
 
   # documentation
+  - jupyterlite_sphinx
   - sphinx
   - sphinx-panels
   - sphinx-autodoc-typehints <= 1.14.1

diff --git a/pandera/dtypes.py b/pandera/dtypes.py
@@ -9,14 +9,18 @@
     Any,
     Callable,
     Iterable,
-    Literal,
     Optional,
     Tuple,
     Type,
     TypeVar,
     Union,
 )
 
+try:
+    from typing import Literal
+except ImportError:
+    from typing_extensions import Literal  # type: ignore[misc]
+
 
 class DataType(ABC):
     """Base class of all Pandera data types."""

diff --git a/pandera/engines/pandas_engine.py b/pandera/engines/pandas_engine.py
@@ -25,6 +25,13 @@
 from . import engine, numpy_engine, utils
 from .type_aliases import PandasDataType, PandasExtensionType, PandasObject
 
+try:
+    import pyarrow  # pylint:disable=unused-import
+
+    PYARROW_INSTALLED = True
+except ImportError:
+    PYARROW_INSTALLED = False
+
 
 def pandas_version():
     """Return the pandas version."""
@@ -605,6 +612,13 @@ class STRING(DataType, dtypes.String):
         storage: Optional[Literal["python", "pyarrow"]] = "python"
 
         def __post_init__(self):
+            if self.storage == "pyarrow" and not PYARROW_INSTALLED:
+                raise ModuleNotFoundError(
+                    "pyarrow needs to be installed when using the "
+                    "string[pyarrow] pandas data type. Please "
+                    "`pip install pyarrow` or "
+                    "`conda install -c conda-forge pyarrow` before proceeding."
+                )
             type_ = pd.StringDtype(self.storage)
             object.__setattr__(self, "type", type_)
 

diff --git a/pandera/schemas.py b/pandera/schemas.py
@@ -15,7 +15,6 @@
     Any,
     Dict,
     List,
-    Literal,
     Optional,
     Type,
     TypeVar,
@@ -41,6 +40,12 @@
 from .error_handlers import SchemaErrorHandler
 from .hypotheses import Hypothesis
 
+try:
+    from typing import Literal
+except ImportError:
+    from typing_extensions import Literal  # type: ignore[misc]
+
+
 if TYPE_CHECKING:
     from pandera.schema_components import Column
 

diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -38,6 +38,7 @@ nox
 importlib_metadata
 uvicorn
 python-multipart
+jupyterlite_sphinx
 sphinx
 sphinx-panels
 sphinx-autodoc-typehints <= 1.14.1

diff --git a/setup.py b/setup.py
@@ -52,10 +52,9 @@
         "typing_extensions >= 3.7.4.3 ; python_version<'3.8'",
         "typing_inspect >= 0.6.0",
         "wrapt",
-        "pyarrow",
     ],
     extras_require=extras_require,
-    python_requires=">=3.8",
+    python_requires=">=3.7",
     platforms="any",
     classifiers=[
         "Development Status :: 5 - Production/Stable",
@@ -64,6 +63,7 @@
         "Intended Audience :: Science/Research",
         "Programming Language :: Python",
         "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",

diff --git a/tests/core/test_dtypes.py b/tests/core/test_dtypes.py
@@ -113,7 +113,7 @@
 }
 
 nullable_string_dtypes = {pd.StringDtype: "string"}
-if pa.PANDAS_1_3_0_PLUS:
+if pa.PANDAS_1_3_0_PLUS and pandas_engine.PYARROW_INSTALLED:
     nullable_string_dtypes.update(
         {pd.StringDtype(storage="pyarrow"): "string[pyarrow]"}
     )

diff --git a/tests/core/test_from_to_format_conversions.py b/tests/core/test_from_to_format_conversions.py
@@ -7,6 +7,7 @@
 import pytest
 
 import pandera as pa
+from pandera.engines import pandas_engine
 
 
 class InSchema(pa.SchemaModel):
@@ -92,6 +93,19 @@ def invalid_input_dataframe() -> pd.DataFrame:
     return pd.DataFrame({"str_col": ["a"]})
 
 
+def _needs_pyarrow(schema) -> bool:
+    """Return True if ``schema`` is parquet/feather and pyarrow is missing."""
+    pyarrow_backed = {
+        InSchemaParquet,
+        InSchemaFeather,
+        OutSchemaParquet,
+        OutSchemaFeather,
+    }
+    if schema not in pyarrow_backed:
+        return False
+    return not pandas_engine.PYARROW_INSTALLED
+
+
 @pytest.mark.parametrize(
     "schema,to_fn,buf_cls",
     [
@@ -122,24 +136,30 @@ def fn(df: pa.typing.DataFrame[schema]):
         (mock_dataframe(), False),
         (invalid_input_dataframe(), True),
     ]:
+
         buf = None if buf_cls is None else buf_cls()
-        arg = to_fn(df, *([buf] if buf else []))
-        if buf:
-            if buf.closed:
-                pytest.skip(
-                    "skip test for older pandas versions where to_pickle "
-                    "closes user-provided buffers: "
-                    "https://github.com/pandas-dev/pandas/issues/35679"
-                )
-            buf.seek(0)
-            arg = buf
-        if invalid:
-            with pytest.raises(pa.errors.SchemaError):
-                fn(arg)
-            return
-
-        out = fn(arg)
-        assert df.equals(out)
+
+        if _needs_pyarrow(schema):
+            with pytest.raises(ImportError):
+                to_fn(df, *([buf] if buf else []))
+        else:
+            arg = to_fn(df, *([buf] if buf else []))
+            if buf:
+                if buf.closed:
+                    pytest.skip(
+                        "skip test for older pandas versions where to_pickle "
+                        "closes user-provided buffers: "
+                        "https://github.com/pandas-dev/pandas/issues/35679"
+                    )
+                buf.seek(0)
+                arg = buf
+            if invalid:
+                with pytest.raises(pa.errors.SchemaError):
+                    fn(arg)
+                return
+
+            out = fn(arg)
+            assert df.equals(out)
 
 
 @pytest.mark.parametrize(
@@ -170,6 +190,12 @@ def invalid_fn(
         return df
 
     df = mock_dataframe()
+
+    if _needs_pyarrow(schema):
+        with pytest.raises((ImportError)):
+            fn(df)
+        return
+
     try:
         out = fn(df)
     except IOError: