From 2baa480e8e0c546ce2bcff9dd4dda6e949974e29 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Tue, 5 Nov 2024 21:47:22 +0100 Subject: [PATCH] Dependencies: Minimize dependencies of core installation - Defer `polars` to `cratedb-toolkit[io]` and `cratedb-toolkit[cfr]` - Add dedicated testing slot for CrateDB CFR --- .github/workflows/main.yml | 65 +++++++++++++++++++++++++++++++++ CHANGES.md | 2 + cratedb_toolkit/cfr/systable.py | 16 +++++--- pyproject.toml | 4 +- tests/cfr/test_cli.py | 7 ++++ 5 files changed, 88 insertions(+), 6 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 4a56c923..d03f343e 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -92,6 +92,71 @@ jobs: fail_ci_if_error: true + tests-cfr: + + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: ["ubuntu-latest"] + python-version: ["3.9", "3.12"] + + env: + OS: ${{ matrix.os }} + PYTHON: ${{ matrix.python-version }} + # Do not tear down Testcontainers + TC_KEEPALIVE: true + + # https://docs.github.com/en/actions/using-containerized-services/about-service-containers + services: + cratedb: + image: crate/crate:nightly + ports: + - 4200:4200 + - 5432:5432 + + name: " + CFR: + Python ${{ matrix.python-version }} on OS ${{ matrix.os }}" + steps: + + - name: Acquire sources + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + architecture: x64 + cache: 'pip' + cache-dependency-path: 'pyproject.toml' + + - name: Set up project + run: | + + # `setuptools 64.0.0` adds support for editable install hooks (PEP 660). + # https://github.com/pypa/setuptools/blob/main/CHANGES.rst#v6400 + pip install "setuptools>=64" --upgrade + + # Install package in editable mode.
+ pip install --use-pep517 --prefer-binary --editable=.[cfr,test,develop] + + - name: Run linter and software tests + run: | + pytest -m cfr + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v4 + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} + with: + files: ./coverage.xml + flags: cfr + env_vars: OS,PYTHON + name: codecov-umbrella + fail_ci_if_error: true + + tests-pymongo: runs-on: ${{ matrix.os }} diff --git a/CHANGES.md b/CHANGES.md index b3c7b41b..8dcf4cbb 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,6 +1,8 @@ # Changelog ## Unreleased +- Dependencies: Minimize dependencies of core installation, + defer `polars` to `cratedb-toolkit[io]` and `cratedb-toolkit[cfr]`. ## 2024/10/13 v0.0.29 - MongoDB: Added Zyp transformations to the CDC subsystem, diff --git a/cratedb_toolkit/cfr/systable.py b/cratedb_toolkit/cfr/systable.py index 55193fb1..ea1b2846 100644 --- a/cratedb_toolkit/cfr/systable.py +++ b/cratedb_toolkit/cfr/systable.py @@ -23,7 +23,9 @@ import typing as t from pathlib import Path -import polars as pl +if t.TYPE_CHECKING: + import polars as pl + import sqlalchemy as sa from tqdm import tqdm @@ -125,7 +127,9 @@ def __init__(self, dburi: str, target: t.Union[Path], data_format: DataFormat = self.info = InfoContainer(adapter=self.adapter) self.inspector = SystemTableInspector(dburi=self.dburi) - def read_table(self, tablename: str) -> pl.DataFrame: + def read_table(self, tablename: str) -> "pl.DataFrame": + import polars as pl + sql = f'SELECT * FROM "{SystemTableKnowledge.SYS_SCHEMA}"."{tablename}"' # noqa: S608 logger.debug(f"Running SQL: {sql}") return pl.read_database( @@ -134,7 +138,7 @@ def read_table(self, tablename: str) -> pl.DataFrame: infer_schema_length=1000, ) - def dump_table(self, frame: pl.DataFrame, file: t.Union[t.TextIO, None] = None): + def dump_table(self, frame: "pl.DataFrame", file: t.Union[t.TextIO, None] = None): if self.data_format == "csv": # polars.exceptions.ComputeError: CSV format does not support nested data # return
df.write_csv() # noqa: ERA001 @@ -235,7 +239,7 @@ def load(self): # Load data. try: - df: pl.DataFrame = self.load_table(path_table_data) + df: "pl.DataFrame" = self.load_table(path_table_data) df.write_database(table_name=tablename_restored, connection=self.dburi, if_table_exists="append") except Exception as ex: error_logger(self.debug)(f"Importing table failed: {tablename}. Reason: {ex}") @@ -243,7 +247,9 @@ def load(self): logger.info(f"Successfully imported {table_count} system tables") # df.to_pandas().to_sql(name=tablename, con=self.adapter.engine, if_exists="append", index=False) # noqa: ERA001, E501 - def load_table(self, path: Path) -> pl.DataFrame: + def load_table(self, path: Path) -> "pl.DataFrame": + import polars as pl + if path.suffix in [".jsonl"]: return pl.read_ndjson(path) elif path.suffix in [".parquet", ".pq"]: diff --git a/pyproject.toml b/pyproject.toml index 0e98a795..b31b62c9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -95,7 +95,6 @@ dependencies = [ 'importlib-metadata; python_version < "3.8"', 'importlib-resources; python_version < "3.9"', "orjsonl<2", - "polars<1.11", "pympler<1.2", "python-dateutil<3", "python-dotenv<2", @@ -115,6 +114,7 @@ all = [ ] cfr = [ "pandas<2.3", + "polars<1.11", "pyarrow<18.1", ] cloud = [ @@ -157,6 +157,7 @@ io = [ "dask[dataframe]>=2020", "fsspec[s3,http]", "pandas<2.3,>=1", + "polars<1.11", "sqlalchemy>=2", "universal-pathlib<0.3", ] @@ -268,6 +269,7 @@ testpaths = [ ] xfail_strict = true markers = [ + "cfr", "examples", "dynamodb", "influxdb", diff --git a/tests/cfr/test_cli.py b/tests/cfr/test_cli.py index cbdfe163..5eabcb63 100644 --- a/tests/cfr/test_cli.py +++ b/tests/cfr/test_cli.py @@ -1,3 +1,4 @@ +# ruff: noqa: E402 import json import os.path import re @@ -5,6 +6,10 @@ import sys import tarfile +import pytest + +polars = pytest.importorskip("polars", reason="Skipping tests because polars is not installed") + import tests if sys.version_info < (3, 9): @@ -17,6 +22,8 @@ from
cratedb_toolkit.cfr.cli import cli +pytestmark = pytest.mark.cfr + def filenames(path: Path): return sorted([item.name for item in path.iterdir()])