From e9b6e3fab21def91326e5f73e5844529a2453c3d Mon Sep 17 00:00:00 2001 From: Matt Green Date: Mon, 23 Sep 2024 09:56:38 -0700 Subject: [PATCH] add Python udf example (#42) * include datafusion-python as sub module * copy datafusion-python code in * find/replace datafusion._internal -> denormalized._internal.datafusion * update imports to make things work * add example udf * fix cargo * rename example file * update cargo lock --- Cargo.lock | 34 +- py-denormalized/pyproject.toml | 1 + .../python/denormalized/context.py | 3 +- .../denormalized/datafusion/__init__.py | 112 + .../python/denormalized/datafusion/catalog.py | 76 + .../python/denormalized/datafusion/common.py | 62 + .../python/denormalized/datafusion/context.py | 1029 +++++++ .../denormalized/datafusion/dataframe.py | 572 ++++ .../python/denormalized/datafusion/expr.py | 718 +++++ .../denormalized/datafusion/functions.py | 2659 +++++++++++++++++ .../denormalized/datafusion/input/__init__.py | 27 + .../denormalized/datafusion/input/base.py | 48 + .../denormalized/datafusion/input/location.py | 89 + .../denormalized/datafusion/object_store.py | 35 + .../python/denormalized/datafusion/py.typed | 16 + .../denormalized/datafusion/record_batch.py | 76 + .../python/denormalized/datafusion/udf.py | 248 ++ .../python/denormalized/datastream.py | 37 +- py-denormalized/python/denormalized/utils.py | 13 + .../python/examples/stream_aggregate.py | 23 +- .../python/examples/udf_example.py | 60 + py-denormalized/src/datastream.rs | 5 + py-denormalized/src/lib.rs | 20 +- 23 files changed, 5906 insertions(+), 57 deletions(-) create mode 100644 py-denormalized/python/denormalized/datafusion/__init__.py create mode 100644 py-denormalized/python/denormalized/datafusion/catalog.py create mode 100644 py-denormalized/python/denormalized/datafusion/common.py create mode 100644 py-denormalized/python/denormalized/datafusion/context.py create mode 100644 py-denormalized/python/denormalized/datafusion/dataframe.py create mode 100644 py-denormalized/python/denormalized/datafusion/expr.py create mode 100644 py-denormalized/python/denormalized/datafusion/functions.py create mode 100644 py-denormalized/python/denormalized/datafusion/input/__init__.py create mode 100644 py-denormalized/python/denormalized/datafusion/input/base.py create mode 100644 py-denormalized/python/denormalized/datafusion/input/location.py create mode 100644 py-denormalized/python/denormalized/datafusion/object_store.py create mode 100644 py-denormalized/python/denormalized/datafusion/py.typed create mode 100644 py-denormalized/python/denormalized/datafusion/record_batch.py create mode 100644 py-denormalized/python/denormalized/datafusion/udf.py create mode 100644 py-denormalized/python/denormalized/utils.py create mode 100644 py-denormalized/python/examples/udf_example.py diff --git a/Cargo.lock b/Cargo.lock index c76efd5..c482824 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1236,7 +1236,7 @@ dependencies = [ [[package]] name = "datafusion-python" version = "41.0.0" -source = "git+https://github.com/probably-nothing-labs/datafusion-python?branch=denormalized-tweaks#b6d50fff2e0a8b4a5c01b20877c8b1d120a257e1" +source = "git+https://github.com/probably-nothing-labs/datafusion-python?branch=denormalized-tweaks#91642ad59f0c98ee1450dc80883b64b5ef1edfbb" dependencies = [ "arrow", "async-trait", @@ -2524,9 +2524,9 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pkg-config" -version = "0.3.30" +version = "0.3.31" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" +checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2" [[package]] name = "portable-atomic" @@ -2564,9 +2564,9 @@ dependencies = [ [[package]] name = "prost" -version = "0.13.2" +version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b2ecbe40f08db5c006b5764a2645f7f3f141ce756412ac9e1dd6087e6d32995" +checksum = "7b0487d90e047de87f984913713b85c601c05609aad5b0df4b4573fbf69aa13f" dependencies = [ "bytes", "prost-derive", @@ -2574,9 +2574,9 @@ dependencies = [ [[package]] name = "prost-derive" -version = "0.13.2" +version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acf0c195eebb4af52c752bec4f52f645da98b6e92077a04110c7f349477ae5ac" +checksum = "e9552f850d5f0964a4e4d0bf306459ac29323ddfbae05e35a7c0d35cb0803cc5" dependencies = [ "anyhow", "itertools 0.13.0", @@ -2587,9 +2587,9 @@ dependencies = [ [[package]] name = "prost-types" -version = "0.13.2" +version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60caa6738c7369b940c3d49246a8d1749323674c65cb13010134f5c9bad5b519" +checksum = "4759aa0d3a6232fb8dbdb97b61de2c20047c68aca932c7ed76da9d788508d670" dependencies = [ "prost", ] @@ -3190,18 +3190,18 @@ checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" [[package]] name = "snafu" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b835cb902660db3415a672d862905e791e54d306c6e8189168c7f3d9ae1c79d" +checksum = "223891c85e2a29c3fe8fb900c1fae5e69c2e42415e3177752e8718475efa5019" dependencies = [ "snafu-derive", ] [[package]] name = "snafu-derive" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38d1e02fca405f6280643174a50c942219f0bbf4dbf7d480f1dd864d6f211ae5" +checksum = "03c3c6b7927ffe7ecaa769ee0e3994da3b8cafc8f444578982c83ecb161af917" dependencies = [ "heck 0.5.0", "proc-macro2", @@ -3357,18 +3357,18 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.63" +version = "1.0.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724" +checksum = "d50af8abc119fb8bb6dbabcfa89656f46f84aa0ac7688088608076ad2b459a84" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.63" +version = "1.0.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" +checksum = "08904e7672f5eb876eaaf87e0ce17857500934f4981c4a0ab2b4aa98baac7fc3" dependencies = [ "proc-macro2", "quote", diff --git a/py-denormalized/pyproject.toml b/py-denormalized/pyproject.toml index a630d1c..4eb7bb8 100644 --- a/py-denormalized/pyproject.toml +++ b/py-denormalized/pyproject.toml @@ -30,6 +30,7 @@ dev-dependencies = ["pip>=24.2", "ipython>=8.26.0", "pytest>=8.3.2"] # Enable docstring linting using the google style guide [tool.ruff.lint] select = ["E4", "E7", "E9", "F", "D", "W"] +ignore = ["D103"] [tool.ruff.lint.pydocstyle] convention = "google" diff --git a/py-denormalized/python/denormalized/context.py b/py-denormalized/python/denormalized/context.py index ce444b5..90ca5d2 100644 --- a/py-denormalized/python/denormalized/context.py +++ b/py-denormalized/python/denormalized/context.py @@ -1,6 
+1,5 @@ from denormalized._internal import PyContext - -from denormalized.datastream import DataStream as DataStream +from .datastream import DataStream class Context: """Context.""" diff --git a/py-denormalized/python/denormalized/datafusion/__init__.py b/py-denormalized/python/denormalized/datafusion/__init__.py new file mode 100644 index 0000000..7419ad7 --- /dev/null +++ b/py-denormalized/python/denormalized/datafusion/__init__.py @@ -0,0 +1,112 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""DataFusion python package. + +This is a Python library that binds to Apache Arrow in-memory query engine DataFusion. +See https://datafusion.apache.org/python for more information. +""" + +try: + import importlib.metadata as importlib_metadata +except ImportError: + import importlib_metadata + +from .context import ( + SessionContext, + SessionConfig, + RuntimeConfig, + SQLOptions, +) + +from .catalog import Catalog, Database, Table + +# The following imports are okay to remain as opaque to the user. +from denormalized._internal import Config, LogicalPlan, ExecutionPlan, runtime + +from .record_batch import RecordBatchStream, RecordBatch + +from .udf import ScalarUDF, AggregateUDF, Accumulator + +from .common import ( + DFSchema, +) + +from .dataframe import DataFrame + +from .expr import ( + Expr, + WindowFrame, +) + +from . import functions, object_store + +__all__ = [ + "Accumulator", + "Config", + "DataFrame", + "SessionContext", + "SessionConfig", + "SQLOptions", + "RuntimeConfig", + "Expr", + "ScalarUDF", + "WindowFrame", + "column", + "col", + "literal", + "lit", + "DFSchema", + "runtime", + "Catalog", + "Database", + "Table", + "AggregateUDF", + "LogicalPlan", + "ExecutionPlan", + "RecordBatch", + "RecordBatchStream", + "common", + "expr", + "functions", + "object_store", +] + + +def column(value: str): + """Create a column expression.""" + return Expr.column(value) + + +def col(value: str): + """Create a column expression.""" + return Expr.column(value) + + +def literal(value): + """Create a literal expression.""" + return Expr.literal(value) + + +def lit(value): + """Create a literal expression.""" + return Expr.literal(value) + + +udf = ScalarUDF.udf + +udaf = AggregateUDF.udaf diff --git a/py-denormalized/python/denormalized/datafusion/catalog.py b/py-denormalized/python/denormalized/datafusion/catalog.py new file mode 100644 index 0000000..d8c9092 --- /dev/null +++ b/py-denormalized/python/denormalized/datafusion/catalog.py @@ -0,0 +1,76 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Data catalog providers.""" + +from __future__ import annotations + +import denormalized._internal as df_internal + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import pyarrow + + +class Catalog: + """DataFusion data catalog.""" + + def __init__(self, catalog: df_internal.Catalog) -> None: + """This constructor is not typically called by the end user.""" + self.catalog = catalog + + def names(self) -> list[str]: + """Returns the list of databases in this catalog.""" + return self.catalog.names() + + def database(self, name: str = "public") -> Database: + """Returns the database with the given ``name`` from this catalog.""" + return Database(self.catalog.database(name)) + + +class Database: + """DataFusion Database.""" + + def __init__(self, db: df_internal.Database) -> None: + """This constructor is not typically called by the end user.""" + self.db = db + + def names(self) -> set[str]: + """Returns the list of all tables in this database.""" + return self.db.names() + + def table(self, name: str) -> Table: + """Return the table with the given ``name`` from this database.""" + return Table(self.db.table(name)) + + +class Table: + """DataFusion table.""" + + def __init__(self, table: df_internal.Table) -> None: + """This constructor is not typically called by the end user.""" + self.table = table + + def schema(self) -> pyarrow.Schema: + """Returns the schema associated with this table.""" + return self.table.schema() + + @property + def kind(self) -> str: + """Returns the kind of table.""" + return self.table.kind() diff --git a/py-denormalized/python/denormalized/datafusion/common.py b/py-denormalized/python/denormalized/datafusion/common.py new file mode 100644 index 0000000..73ed7c4 --- /dev/null +++ b/py-denormalized/python/denormalized/datafusion/common.py @@ -0,0 +1,62 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
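The new package ``__init__`` above re-exports the datafusion-python surface (``SessionContext``, ``col``, ``lit``, ``udf``, ``udaf``) from the vendored ``denormalized.datafusion`` module, so downstream code only has to swap the import path. A minimal sketch of the re-exported helpers in use; the table name, column names and values are placeholders::

    from denormalized.datafusion import SessionContext, col, lit

    ctx = SessionContext()
    # Register a tiny in-memory table; from_pydict infers the schema from the lists.
    df = ctx.from_pydict({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]}, name="t")
    # col/lit build expressions exactly as in upstream datafusion-python.
    df = df.select(col("a"), (col("b") * lit(2.0)).alias("b_doubled"))
    df.show()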
+"""Common data types used throughout the DataFusion project.""" + +from denormalized._internal import common as common_internal +from enum import Enum + +# TODO these should all have proper wrapper classes + +DFSchema = common_internal.DFSchema +DataType = common_internal.DataType +DataTypeMap = common_internal.DataTypeMap +PythonType = common_internal.PythonType +RexType = common_internal.RexType +SqlFunction = common_internal.SqlFunction +SqlSchema = common_internal.SqlSchema +SqlStatistics = common_internal.SqlStatistics +SqlTable = common_internal.SqlTable +SqlType = common_internal.SqlType +SqlView = common_internal.SqlView + +__all__ = [ + "DFSchema", + "DataType", + "DataTypeMap", + "RexType", + "PythonType", + "SqlType", + "NullTreatment", + "SqlTable", + "SqlSchema", + "SqlView", + "SqlStatistics", + "SqlFunction", +] + + +class NullTreatment(Enum): + """Describe how null values are to be treated by functions. + + This is used primarily by aggregate and window functions. It can be set on + these functions using the builder approach described in + ref:`_window_functions` and ref:`_aggregation` in the online documentation. + + """ + + RESPECT_NULLS = common_internal.NullTreatment.RESPECT_NULLS + IGNORE_NULLS = common_internal.NullTreatment.IGNORE_NULLS diff --git a/py-denormalized/python/denormalized/datafusion/context.py b/py-denormalized/python/denormalized/datafusion/context.py new file mode 100644 index 0000000..19c0760 --- /dev/null +++ b/py-denormalized/python/denormalized/datafusion/context.py @@ -0,0 +1,1029 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +"""Session Context and it's associated configuration.""" + +from __future__ import annotations + +from denormalized._internal import SessionConfig as SessionConfigInternal +from denormalized._internal import RuntimeConfig as RuntimeConfigInternal +from denormalized._internal import SQLOptions as SQLOptionsInternal +from denormalized._internal import SessionContext as SessionContextInternal +from denormalized._internal import LogicalPlan, ExecutionPlan + +from denormalized._internal import AggregateUDF +from denormalized.datafusion.catalog import Catalog, Table +from denormalized.datafusion.dataframe import DataFrame +from denormalized.datafusion.expr import Expr, SortExpr, sort_list_to_raw_sort_list +from denormalized.datafusion.record_batch import RecordBatchStream +from denormalized.datafusion.udf import ScalarUDF + +from typing import Any, TYPE_CHECKING +from typing_extensions import deprecated + +if TYPE_CHECKING: + import pyarrow + import pandas + import polars + import pathlib + + +class SessionConfig: + """Session configuration options.""" + + def __init__(self, config_options: dict[str, str] | None = None) -> None: + """Create a new :py:class:`SessionConfig` with the given configuration options. + + Args: + config_options: Configuration options. + """ + self.config_internal = SessionConfigInternal(config_options) + + def with_create_default_catalog_and_schema( + self, enabled: bool = True + ) -> SessionConfig: + """Control if the default catalog and schema will be automatically created. + + Args: + enabled: Whether the default catalog and schema will be + automatically created. + + Returns: + A new :py:class:`SessionConfig` object with the updated setting. + """ + self.config_internal = ( + self.config_internal.with_create_default_catalog_and_schema(enabled) + ) + return self + + def with_default_catalog_and_schema( + self, catalog: str, schema: str + ) -> SessionConfig: + """Select a name for the default catalog and schema. + + Args: + catalog: Catalog name. + schema: Schema name. + + Returns: + A new :py:class:`SessionConfig` object with the updated setting. + """ + self.config_internal = self.config_internal.with_default_catalog_and_schema( + catalog, schema + ) + return self + + def with_information_schema(self, enabled: bool = True) -> SessionConfig: + """Enable or disable the inclusion of ``information_schema`` virtual tables. + + Args: + enabled: Whether to include ``information_schema`` virtual tables. + + Returns: + A new :py:class:`SessionConfig` object with the updated setting. + """ + self.config_internal = self.config_internal.with_information_schema(enabled) + return self + + def with_batch_size(self, batch_size: int) -> SessionConfig: + """Customize batch size. + + Args: + batch_size: Batch size. + + Returns: + A new :py:class:`SessionConfig` object with the updated setting. + """ + self.config_internal = self.config_internal.with_batch_size(batch_size) + return self + + def with_target_partitions(self, target_partitions: int) -> SessionConfig: + """Customize the number of target partitions for query execution. + + Increasing partitions can increase concurrency. + + Args: + target_partitions: Number of target partitions. + + Returns: + A new :py:class:`SessionConfig` object with the updated setting. + """ + self.config_internal = self.config_internal.with_target_partitions( + target_partitions + ) + return self + + def with_repartition_aggregations(self, enabled: bool = True) -> SessionConfig: + """Enable or disable the use of repartitioning for aggregations. 
+ + Enabling this improves parallelism. + + Args: + enabled: Whether to use repartitioning for aggregations. + + Returns: + A new :py:class:`SessionConfig` object with the updated setting. + """ + self.config_internal = self.config_internal.with_repartition_aggregations( + enabled + ) + return self + + def with_repartition_joins(self, enabled: bool = True) -> SessionConfig: + """Enable or disable the use of repartitioning for joins to improve parallelism. + + Args: + enabled: Whether to use repartitioning for joins. + + Returns: + A new :py:class:`SessionConfig` object with the updated setting. + """ + self.config_internal = self.config_internal.with_repartition_joins(enabled) + return self + + def with_repartition_windows(self, enabled: bool = True) -> SessionConfig: + """Enable or disable the use of repartitioning for window functions. + + This may improve parallelism. + + Args: + enabled: Whether to use repartitioning for window functions. + + Returns: + A new :py:class:`SessionConfig` object with the updated setting. + """ + self.config_internal = self.config_internal.with_repartition_windows(enabled) + return self + + def with_repartition_sorts(self, enabled: bool = True) -> SessionConfig: + """Enable or disable the use of repartitioning for window functions. + + This may improve parallelism. + + Args: + enabled: Whether to use repartitioning for window functions. + + Returns: + A new :py:class:`SessionConfig` object with the updated setting. + """ + self.config_internal = self.config_internal.with_repartition_sorts(enabled) + return self + + def with_repartition_file_scans(self, enabled: bool = True) -> SessionConfig: + """Enable or disable the use of repartitioning for file scans. + + Args: + enabled: Whether to use repartitioning for file scans. + + Returns: + A new :py:class:`SessionConfig` object with the updated setting. + """ + self.config_internal = self.config_internal.with_repartition_file_scans(enabled) + return self + + def with_repartition_file_min_size(self, size: int) -> SessionConfig: + """Set minimum file range size for repartitioning scans. + + Args: + size: Minimum file range size. + + Returns: + A new :py:class:`SessionConfig` object with the updated setting. + """ + self.config_internal = self.config_internal.with_repartition_file_min_size(size) + return self + + def with_parquet_pruning(self, enabled: bool = True) -> SessionConfig: + """Enable or disable the use of pruning predicate for parquet readers. + + Pruning predicates will enable the reader to skip row groups. + + Args: + enabled: Whether to use pruning predicate for parquet readers. + + Returns: + A new :py:class:`SessionConfig` object with the updated setting. + """ + self.config_internal = self.config_internal.with_parquet_pruning(enabled) + return self + + def set(self, key: str, value: str) -> SessionConfig: + """Set a configuration option. + + Args: + key: Option key. + value: Option value. + + Returns: + A new :py:class:`SessionConfig` object with the updated setting. + """ + self.config_internal = self.config_internal.set(key, value) + return self + + +class RuntimeConfig: + """Runtime configuration options.""" + + def __init__(self) -> None: + """Create a new :py:class:`RuntimeConfig` with default values.""" + self.config_internal = RuntimeConfigInternal() + + def with_disk_manager_disabled(self) -> RuntimeConfig: + """Disable the disk manager, attempts to create temporary files will error. + + Returns: + A new :py:class:`RuntimeConfig` object with the updated setting. 
+ """ + self.config_internal = self.config_internal.with_disk_manager_disabled() + return self + + def with_disk_manager_os(self) -> RuntimeConfig: + """Use the operating system's temporary directory for disk manager. + + Returns: + A new :py:class:`RuntimeConfig` object with the updated setting. + """ + self.config_internal = self.config_internal.with_disk_manager_os() + return self + + def with_disk_manager_specified(self, *paths: str | pathlib.Path) -> RuntimeConfig: + """Use the specified paths for the disk manager's temporary files. + + Args: + paths: Paths to use for the disk manager's temporary files. + + Returns: + A new :py:class:`RuntimeConfig` object with the updated setting. + """ + paths = [str(p) for p in paths] + self.config_internal = self.config_internal.with_disk_manager_specified(paths) + return self + + def with_unbounded_memory_pool(self) -> RuntimeConfig: + """Use an unbounded memory pool. + + Returns: + A new :py:class:`RuntimeConfig` object with the updated setting. + """ + self.config_internal = self.config_internal.with_unbounded_memory_pool() + return self + + def with_fair_spill_pool(self, size: int) -> RuntimeConfig: + """Use a fair spill pool with the specified size. + + This pool works best when you know beforehand the query has multiple spillable + operators that will likely all need to spill. Sometimes it will cause spills + even when there was sufficient memory (reserved for other operators) to avoid + doing so:: + + ┌───────────────────────z──────────────────────z───────────────┐ + │ z z │ + │ z z │ + │ Spillable z Unspillable z Free │ + │ Memory z Memory z Memory │ + │ z z │ + │ z z │ + └───────────────────────z──────────────────────z───────────────┘ + + Args: + size: Size of the memory pool in bytes. + + Returns: + A new :py:class:`RuntimeConfig` object with the updated setting. + + Examples usage:: + + config = RuntimeConfig().with_fair_spill_pool(1024) + """ + self.config_internal = self.config_internal.with_fair_spill_pool(size) + return self + + def with_greedy_memory_pool(self, size: int) -> RuntimeConfig: + """Use a greedy memory pool with the specified size. + + This pool works well for queries that do not need to spill or have a single + spillable operator. See :py:func:`with_fair_spill_pool` if there are + multiple spillable operators that all will spill. + + Args: + size: Size of the memory pool in bytes. + + Returns: + A new :py:class:`RuntimeConfig` object with the updated setting. + + Example usage:: + + config = RuntimeConfig().with_greedy_memory_pool(1024) + """ + self.config_internal = self.config_internal.with_greedy_memory_pool(size) + return self + + def with_temp_file_path(self, path: str | pathlib.Path) -> RuntimeConfig: + """Use the specified path to create any needed temporary files. + + Args: + path: Path to use for temporary files. + + Returns: + A new :py:class:`RuntimeConfig` object with the updated setting. + + Example usage:: + + config = RuntimeConfig().with_temp_file_path("/tmp") + """ + self.config_internal = self.config_internal.with_temp_file_path(str(path)) + return self + + +class SQLOptions: + """Options to be used when performing SQL queries.""" + + def __init__(self) -> None: + """Create a new :py:class:`SQLOptions` with default values. 
+ + The default values are: + - DDL commands are allowed + - DML commands are allowed + - Statements are allowed + """ + self.options_internal = SQLOptionsInternal() + + def with_allow_ddl(self, allow: bool = True) -> SQLOptions: + """Should DDL (Data Definition Language) commands be run? + + Examples of DDL commands include ``CREATE TABLE`` and ``DROP TABLE``. + + Args: + allow: Allow DDL commands to be run. + + Returns: + A new :py:class:`SQLOptions` object with the updated setting. + + Example usage:: + + options = SQLOptions().with_allow_ddl(True) + """ + self.options_internal = self.options_internal.with_allow_ddl(allow) + return self + + def with_allow_dml(self, allow: bool = True) -> SQLOptions: + """Should DML (Data Manipulation Language) commands be run? + + Examples of DML commands include ``INSERT INTO`` and ``DELETE``. + + Args: + allow: Allow DML commands to be run. + + Returns: + A new :py:class:`SQLOptions` object with the updated setting. + + Example usage:: + + options = SQLOptions().with_allow_dml(True) + """ + self.options_internal = self.options_internal.with_allow_dml(allow) + return self + + def with_allow_statements(self, allow: bool = True) -> SQLOptions: + """Should statements such as ``SET VARIABLE`` and ``BEGIN TRANSACTION`` be run? + + Args: + allow: Allow statements to be run. + + Returns: + A new :py:class:SQLOptions` object with the updated setting. + + Example usage:: + + options = SQLOptions().with_allow_statements(True) + """ + self.options_internal = self.options_internal.with_allow_statements(allow) + return self + + +class SessionContext: + """This is the main interface for executing queries and creating DataFrames. + + See :ref:`user_guide_concepts` in the online documentation for more information. + """ + + def __init__( + self, config: SessionConfig | None = None, runtime: RuntimeConfig | None = None + ) -> None: + """Main interface for executing queries with DataFusion. + + Maintains the state of the connection between a user and an instance + of the connection between a user and an instance of the DataFusion + engine. + + Args: + config: Session configuration options. + runtime: Runtime configuration options. + + Example usage: + + The following example demonstrates how to use the context to execute + a query against a CSV data source using the :py:class:`DataFrame` API:: + + from datafusion import SessionContext + + ctx = SessionContext() + df = ctx.read_csv("data.csv") + """ + config = config.config_internal if config is not None else None + runtime = runtime.config_internal if runtime is not None else None + + self.ctx = SessionContextInternal(config, runtime) + + def register_object_store(self, schema: str, store: Any, host: str | None) -> None: + """Add a new object store into the session. + + Args: + schema: The data source schema. + store: The :py:class:`~datafusion.object_store.ObjectStore` to register. + host: URL for the host. + """ + self.ctx.register_object_store(schema, store, host) + + def register_listing_table( + self, + name: str, + path: str | pathlib.Path, + table_partition_cols: list[tuple[str, str]] | None = None, + file_extension: str = ".parquet", + schema: pyarrow.Schema | None = None, + file_sort_order: list[list[Expr | SortExpr]] | None = None, + ) -> None: + """Register multiple files as a single table. + + Registers a :py:class:`~datafusion.catalog.Table` that can assemble multiple + files from locations in an :py:class:`~datafusion.object_store.ObjectStore` + instance. + + Args: + name: Name of the resultant table. 
+ path: Path to the file to register. + table_partition_cols: Partition columns. + file_extension: File extension of the provided table. + schema: The data source schema. + file_sort_order: Sort order for the file. + """ + if table_partition_cols is None: + table_partition_cols = [] + file_sort_order_raw = ( + [sort_list_to_raw_sort_list(f) for f in file_sort_order] + if file_sort_order is not None + else None + ) + self.ctx.register_listing_table( + name, + str(path), + table_partition_cols, + file_extension, + schema, + file_sort_order_raw, + ) + + def sql(self, query: str, options: SQLOptions | None = None) -> DataFrame: + """Create a :py:class:`~datafusion.DataFrame` from SQL query text. + + Note: This API implements DDL statements such as ``CREATE TABLE`` and + ``CREATE VIEW`` and DML statements such as ``INSERT INTO`` with in-memory + default implementation.See + :py:func:`~datafusion.context.SessionContext.sql_with_options`. + + Args: + query: SQL query text. + options: If provided, the query will be validated against these options. + + Returns: + DataFrame representation of the SQL query. + """ + if options is None: + return DataFrame(self.ctx.sql(query)) + return DataFrame(self.ctx.sql_with_options(query, options.options_internal)) + + def sql_with_options(self, query: str, options: SQLOptions) -> DataFrame: + """Create a :py:class:`~datafusion.dataframe.DataFrame` from SQL query text. + + This function will first validate that the query is allowed by the + provided options. + + Args: + query: SQL query text. + options: SQL options. + + Returns: + DataFrame representation of the SQL query. + """ + return self.sql(query, options) + + def create_dataframe( + self, + partitions: list[list[pyarrow.RecordBatch]], + name: str | None = None, + schema: pyarrow.Schema | None = None, + ) -> DataFrame: + """Create and return a dataframe using the provided partitions. + + Args: + partitions: :py:class:`pyarrow.RecordBatch` partitions to register. + name: Resultant dataframe name. + schema: Schema for the partitions. + + Returns: + DataFrame representation of the SQL query. + """ + return DataFrame(self.ctx.create_dataframe(partitions, name, schema)) + + def create_dataframe_from_logical_plan(self, plan: LogicalPlan) -> DataFrame: + """Create a :py:class:`~datafusion.dataframe.DataFrame` from an existing plan. + + Args: + plan: Logical plan. + + Returns: + DataFrame representation of the logical plan. + """ + return DataFrame(self.ctx.create_dataframe_from_logical_plan(plan)) + + def from_pylist( + self, data: list[dict[str, Any]], name: str | None = None + ) -> DataFrame: + """Create a :py:class:`~datafusion.dataframe.DataFrame` from a list. + + Args: + data: List of dictionaries. + name: Name of the DataFrame. + + Returns: + DataFrame representation of the list of dictionaries. + """ + return DataFrame(self.ctx.from_pylist(data, name)) + + def from_pydict( + self, data: dict[str, list[Any]], name: str | None = None + ) -> DataFrame: + """Create a :py:class:`~datafusion.dataframe.DataFrame` from a dictionary. + + Args: + data: Dictionary of lists. + name: Name of the DataFrame. + + Returns: + DataFrame representation of the dictionary of lists. + """ + return DataFrame(self.ctx.from_pydict(data, name)) + + def from_arrow(self, data: Any, name: str | None = None) -> DataFrame: + """Create a :py:class:`~datafusion.dataframe.DataFrame` from an Arrow source. + + The Arrow data source can be any object that implements either + ``__arrow_c_stream__`` or ``__arrow_c_array__``. 
For the latter, it must return + a struct array. Common examples of sources from pyarrow include + + Args: + data: Arrow data source. + name: Name of the DataFrame. + + Returns: + DataFrame representation of the Arrow table. + """ + return DataFrame(self.ctx.from_arrow(data, name)) + + @deprecated("Use ``from_arrow`` instead.") + def from_arrow_table( + self, data: pyarrow.Table, name: str | None = None + ) -> DataFrame: + """Create a :py:class:`~datafusion.dataframe.DataFrame` from an Arrow table. + + This is an alias for :py:func:`from_arrow`. + """ + return self.from_arrow(data, name) + + def from_pandas(self, data: pandas.DataFrame, name: str | None = None) -> DataFrame: + """Create a :py:class:`~datafusion.dataframe.DataFrame` from a Pandas DataFrame. + + Args: + data: Pandas DataFrame. + name: Name of the DataFrame. + + Returns: + DataFrame representation of the Pandas DataFrame. + """ + return DataFrame(self.ctx.from_pandas(data, name)) + + def from_polars(self, data: polars.DataFrame, name: str | None = None) -> DataFrame: + """Create a :py:class:`~datafusion.dataframe.DataFrame` from a Polars DataFrame. + + Args: + data: Polars DataFrame. + name: Name of the DataFrame. + + Returns: + DataFrame representation of the Polars DataFrame. + """ + return DataFrame(self.ctx.from_polars(data, name)) + + def register_table(self, name: str, table: Table) -> None: + """Register a :py:class: `~datafusion.catalog.Table` as a table. + + The registered table can be referenced from SQL statement executed against. + + Args: + name: Name of the resultant table. + table: DataFusion table to add to the session context. + """ + self.ctx.register_table(name, table) + + def deregister_table(self, name: str) -> None: + """Remove a table from the session.""" + self.ctx.deregister_table(name) + + def register_record_batches( + self, name: str, partitions: list[list[pyarrow.RecordBatch]] + ) -> None: + """Register record batches as a table. + + This function will convert the provided partitions into a table and + register it into the session using the given name. + + Args: + name: Name of the resultant table. + partitions: Record batches to register as a table. + """ + self.ctx.register_record_batches(name, partitions) + + def register_parquet( + self, + name: str, + path: str | pathlib.Path, + table_partition_cols: list[tuple[str, str]] | None = None, + parquet_pruning: bool = True, + file_extension: str = ".parquet", + skip_metadata: bool = True, + schema: pyarrow.Schema | None = None, + file_sort_order: list[list[Expr]] | None = None, + ) -> None: + """Register a Parquet file as a table. + + The registered table can be referenced from SQL statement executed + against this context. + + Args: + name: Name of the table to register. + path: Path to the Parquet file. + table_partition_cols: Partition columns. + parquet_pruning: Whether the parquet reader should use the + predicate to prune row groups. + file_extension: File extension; only files with this extension are + selected for data input. + skip_metadata: Whether the parquet reader should skip any metadata + that may be in the file schema. This can help avoid schema + conflicts due to metadata. + schema: The data source schema. + file_sort_order: Sort order for the file. 
+ """ + if table_partition_cols is None: + table_partition_cols = [] + self.ctx.register_parquet( + name, + str(path), + table_partition_cols, + parquet_pruning, + file_extension, + skip_metadata, + schema, + file_sort_order, + ) + + def register_csv( + self, + name: str, + path: str | pathlib.Path, + schema: pyarrow.Schema | None = None, + has_header: bool = True, + delimiter: str = ",", + schema_infer_max_records: int = 1000, + file_extension: str = ".csv", + file_compression_type: str | None = None, + ) -> None: + """Register a CSV file as a table. + + The registered table can be referenced from SQL statement executed against. + + Args: + name: Name of the table to register. + path: Path to the CSV file. + schema: An optional schema representing the CSV file. If None, the + CSV reader will try to infer it based on data in file. + has_header: Whether the CSV file have a header. If schema inference + is run on a file with no headers, default column names are + created. + delimiter: An optional column delimiter. + schema_infer_max_records: Maximum number of rows to read from CSV + files for schema inference if needed. + file_extension: File extension; only files with this extension are + selected for data input. + file_compression_type: File compression type. + """ + self.ctx.register_csv( + name, + str(path), + schema, + has_header, + delimiter, + schema_infer_max_records, + file_extension, + file_compression_type, + ) + + def register_json( + self, + name: str, + path: str | pathlib.Path, + schema: pyarrow.Schema | None = None, + schema_infer_max_records: int = 1000, + file_extension: str = ".json", + table_partition_cols: list[tuple[str, str]] | None = None, + file_compression_type: str | None = None, + ) -> None: + """Register a JSON file as a table. + + The registered table can be referenced from SQL statement executed + against this context. + + Args: + name: Name of the table to register. + path: Path to the JSON file. + schema: The data source schema. + schema_infer_max_records: Maximum number of rows to read from JSON + files for schema inference if needed. + file_extension: File extension; only files with this extension are + selected for data input. + table_partition_cols: Partition columns. + file_compression_type: File compression type. + """ + if table_partition_cols is None: + table_partition_cols = [] + self.ctx.register_json( + name, + str(path), + schema, + schema_infer_max_records, + file_extension, + table_partition_cols, + file_compression_type, + ) + + def register_avro( + self, + name: str, + path: str | pathlib.Path, + schema: pyarrow.Schema | None = None, + file_extension: str = ".avro", + table_partition_cols: list[tuple[str, str]] | None = None, + ) -> None: + """Register an Avro file as a table. + + The registered table can be referenced from SQL statement executed against + this context. + + Args: + name: Name of the table to register. + path: Path to the Avro file. + schema: The data source schema. + file_extension: File extension to select. + table_partition_cols: Partition columns. + """ + if table_partition_cols is None: + table_partition_cols = [] + self.ctx.register_avro( + name, str(path), schema, file_extension, table_partition_cols + ) + + def register_dataset(self, name: str, dataset: pyarrow.dataset.Dataset) -> None: + """Register a :py:class:`pyarrow.dataset.Dataset` as a table. + + Args: + name: Name of the table to register. + dataset: PyArrow dataset. 
+ """ + self.ctx.register_dataset(name, dataset) + + def register_udf(self, udf: ScalarUDF) -> None: + """Register a user-defined function (UDF) with the context.""" + self.ctx.register_udf(udf._udf) + + def register_udaf(self, udaf: AggregateUDF) -> None: + """Register a user-defined aggregation function (UDAF) with the context.""" + self.ctx.register_udaf(udaf._udaf) + + def catalog(self, name: str = "datafusion") -> Catalog: + """Retrieve a catalog by name.""" + return self.ctx.catalog(name) + + @deprecated( + "Use the catalog provider interface ``SessionContext.Catalog`` to " + "examine available catalogs, schemas and tables" + ) + def tables(self) -> set[str]: + """Deprecated.""" + return self.ctx.tables() + + def table(self, name: str) -> DataFrame: + """Retrieve a previously registered table by name.""" + return DataFrame(self.ctx.table(name)) + + def table_exist(self, name: str) -> bool: + """Return whether a table with the given name exists.""" + return self.ctx.table_exist(name) + + def empty_table(self) -> DataFrame: + """Create an empty :py:class:`~datafusion.dataframe.DataFrame`.""" + return DataFrame(self.ctx.empty_table()) + + def session_id(self) -> str: + """Return an id that uniquely identifies this :py:class:`SessionContext`.""" + return self.ctx.session_id() + + def read_json( + self, + path: str | pathlib.Path, + schema: pyarrow.Schema | None = None, + schema_infer_max_records: int = 1000, + file_extension: str = ".json", + table_partition_cols: list[tuple[str, str]] | None = None, + file_compression_type: str | None = None, + ) -> DataFrame: + """Read a line-delimited JSON data source. + + Args: + path: Path to the JSON file. + schema: The data source schema. + schema_infer_max_records: Maximum number of rows to read from JSON + files for schema inference if needed. + file_extension: File extension; only files with this extension are + selected for data input. + table_partition_cols: Partition columns. + file_compression_type: File compression type. + + Returns: + DataFrame representation of the read JSON files. + """ + if table_partition_cols is None: + table_partition_cols = [] + return DataFrame( + self.ctx.read_json( + str(path), + schema, + schema_infer_max_records, + file_extension, + table_partition_cols, + file_compression_type, + ) + ) + + def read_csv( + self, + path: str | pathlib.Path | list[str] | list[pathlib.Path], + schema: pyarrow.Schema | None = None, + has_header: bool = True, + delimiter: str = ",", + schema_infer_max_records: int = 1000, + file_extension: str = ".csv", + table_partition_cols: list[tuple[str, str]] | None = None, + file_compression_type: str | None = None, + ) -> DataFrame: + """Read a CSV data source. + + Args: + path: Path to the CSV file + schema: An optional schema representing the CSV files. If None, the + CSV reader will try to infer it based on data in file. + has_header: Whether the CSV file have a header. If schema inference + is run on a file with no headers, default column names are + created. + delimiter: An optional column delimiter. + schema_infer_max_records: Maximum number of rows to read from CSV + files for schema inference if needed. + file_extension: File extension; only files with this extension are + selected for data input. + table_partition_cols: Partition columns. + file_compression_type: File compression type. 
+ + Returns: + DataFrame representation of the read CSV files + """ + if table_partition_cols is None: + table_partition_cols = [] + + path = [str(p) for p in path] if isinstance(path, list) else str(path) + + return DataFrame( + self.ctx.read_csv( + path, + schema, + has_header, + delimiter, + schema_infer_max_records, + file_extension, + table_partition_cols, + file_compression_type, + ) + ) + + def read_parquet( + self, + path: str | pathlib.Path, + table_partition_cols: list[tuple[str, str]] | None = None, + parquet_pruning: bool = True, + file_extension: str = ".parquet", + skip_metadata: bool = True, + schema: pyarrow.Schema | None = None, + file_sort_order: list[list[Expr]] | None = None, + ) -> DataFrame: + """Read a Parquet source into a :py:class:`~datafusion.dataframe.Dataframe`. + + Args: + path: Path to the Parquet file. + table_partition_cols: Partition columns. + parquet_pruning: Whether the parquet reader should use the predicate + to prune row groups. + file_extension: File extension; only files with this extension are + selected for data input. + skip_metadata: Whether the parquet reader should skip any metadata + that may be in the file schema. This can help avoid schema + conflicts due to metadata. + schema: An optional schema representing the parquet files. If None, + the parquet reader will try to infer it based on data in the + file. + file_sort_order: Sort order for the file. + + Returns: + DataFrame representation of the read Parquet files + """ + if table_partition_cols is None: + table_partition_cols = [] + return DataFrame( + self.ctx.read_parquet( + str(path), + table_partition_cols, + parquet_pruning, + file_extension, + skip_metadata, + schema, + file_sort_order, + ) + ) + + def read_avro( + self, + path: str | pathlib.Path, + schema: pyarrow.Schema | None = None, + file_partition_cols: list[tuple[str, str]] | None = None, + file_extension: str = ".avro", + ) -> DataFrame: + """Create a :py:class:`DataFrame` for reading Avro data source. + + Args: + path: Path to the Avro file. + schema: The data source schema. + file_partition_cols: Partition columns. + file_extension: File extension to select. + + Returns: + DataFrame representation of the read Avro file + """ + if file_partition_cols is None: + file_partition_cols = [] + return DataFrame( + self.ctx.read_avro(str(path), schema, file_partition_cols, file_extension) + ) + + def read_table(self, table: Table) -> DataFrame: + """Creates a :py:class:`~datafusion.dataframe.DataFrame` from a table. + + For a :py:class:`~datafusion.catalog.Table` such as a + :py:class:`~datafusion.catalog.ListingTable`, create a + :py:class:`~datafusion.dataframe.DataFrame`. + """ + return DataFrame(self.ctx.read_table(table)) + + def execute(self, plan: ExecutionPlan, partitions: int) -> RecordBatchStream: + """Execute the ``plan`` and return the results.""" + return RecordBatchStream(self.ctx.execute(plan, partitions)) diff --git a/py-denormalized/python/denormalized/datafusion/dataframe.py b/py-denormalized/python/denormalized/datafusion/dataframe.py new file mode 100644 index 0000000..4a50545 --- /dev/null +++ b/py-denormalized/python/denormalized/datafusion/dataframe.py @@ -0,0 +1,572 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""":py:class:`DataFrame` is one of the core concepts in DataFusion. + +See :ref:`user_guide_concepts` in the online documentation for more information. +""" + +from __future__ import annotations + +from typing import Any, List, TYPE_CHECKING +from datafusion.record_batch import RecordBatchStream +from typing_extensions import deprecated + +if TYPE_CHECKING: + import pyarrow as pa + import pandas as pd + import polars as pl + import pathlib + from typing import Callable + +from denormalized._internal import DataFrame as DataFrameInternal +from denormalized.datafusion.expr import Expr, SortExpr, sort_or_default +from denormalized._internal import ( + LogicalPlan, + ExecutionPlan, +) + + +class DataFrame: + """Two dimensional table representation of data. + + See :ref:`user_guide_concepts` in the online documentation for more information. + """ + + def __init__(self, df: DataFrameInternal) -> None: + """This constructor is not to be used by the end user. + + See :py:class:`~datafusion.context.SessionContext` for methods to + create a :py:class:`DataFrame`. + """ + self.df = df + + def __getitem__(self, key: str | List[str]) -> DataFrame: + """Return a new :py:class`DataFrame` with the specified column or columns. + + Args: + key: Column name or list of column names to select. + + Returns: + DataFrame with the specified column or columns. + """ + return DataFrame(self.df.__getitem__(key)) + + def __repr__(self) -> str: + """Return a string representation of the DataFrame. + + Returns: + String representation of the DataFrame. + """ + return self.df.__repr__() + + def _repr_html_(self) -> str: + return self.df._repr_html_() + + def describe(self) -> DataFrame: + """Return the statistics for this DataFrame. + + Only summarized numeric datatypes at the moments and returns nulls + for non-numeric datatypes. + + The output format is modeled after pandas. + + Returns: + A summary DataFrame containing statistics. + """ + return DataFrame(self.df.describe()) + + def schema(self) -> pa.Schema: + """Return the :py:class:`pyarrow.Schema` of this DataFrame. + + The output schema contains information on the name, data type, and + nullability for each column. + + Returns: + Describing schema of the DataFrame + """ + return self.df.schema() + + def select_columns(self, *args: str) -> DataFrame: + """Filter the DataFrame by columns. + + Returns: + DataFrame only containing the specified columns. + """ + return self.select(*args) + + def select(self, *exprs: Expr | str) -> DataFrame: + """Project arbitrary expressions into a new :py:class:`DataFrame`. + + Args: + exprs: Either column names or :py:class:`~datafusion.expr.Expr` to select. + + Returns: + DataFrame after projection. It has one column for each expression. + + Example usage: + + The following example will return 3 columns from the original dataframe. + The first two columns will be the original column ``a`` and ``b`` since the + string "a" is assumed to refer to column selection. 
Also a duplicate of + column ``a`` will be returned with the column name ``alternate_a``:: + + df = df.select("a", col("b"), col("a").alias("alternate_a")) + + """ + exprs_internal = [ + Expr.column(arg).expr if isinstance(arg, str) else arg.expr for arg in exprs + ] + return DataFrame(self.df.select(*exprs_internal)) + + def filter(self, *predicates: Expr) -> DataFrame: + """Return a DataFrame for which ``predicate`` evaluates to ``True``. + + Rows for which ``predicate`` evaluates to ``False`` or ``None`` are filtered + out. If more than one predicate is provided, these predicates will be + combined as a logical AND. If more complex logic is required, see the + logical operations in :py:mod:`~datafusion.functions`. + + Args: + predicates: Predicate expression(s) to filter the DataFrame. + + Returns: + DataFrame after filtering. + """ + df = self.df + for p in predicates: + df = df.filter(p.expr) + return DataFrame(df) + + def with_column(self, name: str, expr: Expr) -> DataFrame: + """Add an additional column to the DataFrame. + + Args: + name: Name of the column to add. + expr: Expression to compute the column. + + Returns: + DataFrame with the new column. + """ + return DataFrame(self.df.with_column(name, expr.expr)) + + def with_column_renamed(self, old_name: str, new_name: str) -> DataFrame: + r"""Rename one column by applying a new projection. + + This is a no-op if the column to be renamed does not exist. + + The method supports case sensitive rename with wrapping column name + into one the following symbols (" or ' or \`). + + Args: + old_name: Old column name. + new_name: New column name. + + Returns: + DataFrame with the column renamed. + """ + return DataFrame(self.df.with_column_renamed(old_name, new_name)) + + def aggregate( + self, group_by: list[Expr] | Expr, aggs: list[Expr] | Expr + ) -> DataFrame: + """Aggregates the rows of the current DataFrame. + + Args: + group_by: List of expressions to group by. + aggs: List of expressions to aggregate. + + Returns: + DataFrame after aggregation. + """ + group_by = group_by if isinstance(group_by, list) else [group_by] + aggs = aggs if isinstance(aggs, list) else [aggs] + + group_by = [e.expr for e in group_by] + aggs = [e.expr for e in aggs] + return DataFrame(self.df.aggregate(group_by, aggs)) + + def sort(self, *exprs: Expr | SortExpr) -> DataFrame: + """Sort the DataFrame by the specified sorting expressions. + + Note that any expression can be turned into a sort expression by + calling its` ``sort`` method. + + Args: + exprs: Sort expressions, applied in order. + + Returns: + DataFrame after sorting. + """ + exprs_raw = [sort_or_default(expr) for expr in exprs] + return DataFrame(self.df.sort(*exprs_raw)) + + def limit(self, count: int, offset: int = 0) -> DataFrame: + """Return a new :py:class:`DataFrame` with a limited number of rows. + + Args: + count: Number of rows to limit the DataFrame to. + offset: Number of rows to skip. + + Returns: + DataFrame after limiting. + """ + return DataFrame(self.df.limit(count, offset)) + + def collect(self) -> list[pa.RecordBatch]: + """Execute this :py:class:`DataFrame` and collect results into memory. + + Prior to calling ``collect``, modifying a DataFrme simply updates a plan + (no actual computation is performed). Calling ``collect`` triggers the + computation. + + Returns: + List of :py:class:`pyarrow.RecordBatch` collected from the DataFrame. + """ + return self.df.collect() + + def cache(self) -> DataFrame: + """Cache the DataFrame as a memory table. 
+ + Returns: + Cached DataFrame. + """ + return DataFrame(self.df.cache()) + + def collect_partitioned(self) -> list[list[pa.RecordBatch]]: + """Execute this DataFrame and collect all partitioned results. + + This operation returns :py:class:`pyarrow.RecordBatch` maintaining the input + partitioning. + + Returns: + List of list of :py:class:`RecordBatch` collected from the + DataFrame. + """ + return self.df.collect_partitioned() + + def show(self, num: int = 20) -> None: + """Execute the DataFrame and print the result to the console. + + Args: + num: Number of lines to show. + """ + self.df.show(num) + + def distinct(self) -> DataFrame: + """Return a new :py:class:`DataFrame` with all duplicated rows removed. + + Returns: + DataFrame after removing duplicates. + """ + return DataFrame(self.df.distinct()) + + def join( + self, + right: DataFrame, + join_keys: tuple[list[str], list[str]], + how: str, + ) -> DataFrame: + """Join this :py:class:`DataFrame` with another :py:class:`DataFrame`. + + Join keys are a pair of lists of column names in the left and right + dataframes, respectively. These lists must have the same length. + + Args: + right: Other DataFrame to join with. + join_keys: Tuple of two lists of column names to join on. + how: Type of join to perform. Supported types are "inner", "left", + "right", "full", "semi", "anti". + + Returns: + DataFrame after join. + """ + return DataFrame(self.df.join(right.df, join_keys, how)) + + def explain(self, verbose: bool = False, analyze: bool = False) -> DataFrame: + """Return a DataFrame with the explanation of its plan so far. + + If ``analyze`` is specified, runs the plan and reports metrics. + + Args: + verbose: If ``True``, more details will be included. + analyze: If ``Tru`e``, the plan will run and metrics reported. + + Returns: + DataFrame with the explanation of its plan. + """ + return DataFrame(self.df.explain(verbose, analyze)) + + def logical_plan(self) -> LogicalPlan: + """Return the unoptimized ``LogicalPlan``. + + Returns: + Unoptimized logical plan. + """ + return self.df.logical_plan() + + def optimized_logical_plan(self) -> LogicalPlan: + """Return the optimized ``LogicalPlan``. + + Returns: + Optimized logical plan. + """ + return self.df.optimized_logical_plan() + + def execution_plan(self) -> ExecutionPlan: + """Return the execution/physical plan. + + Returns: + Execution plan. + """ + return self.df.execution_plan() + + def repartition(self, num: int) -> DataFrame: + """Repartition a DataFrame into ``num`` partitions. + + The batches allocation uses a round-robin algorithm. + + Args: + num: Number of partitions to repartition the DataFrame into. + + Returns: + Repartitioned DataFrame. + """ + return DataFrame(self.df.repartition(num)) + + def repartition_by_hash(self, *exprs: Expr, num: int) -> DataFrame: + """Repartition a DataFrame using a hash partitioning scheme. + + Args: + exprs: Expressions to evaluate and perform hashing on. + num: Number of partitions to repartition the DataFrame into. + + Returns: + Repartitioned DataFrame. + """ + exprs = [expr.expr for expr in exprs] + return DataFrame(self.df.repartition_by_hash(*exprs, num=num)) + + def union(self, other: DataFrame, distinct: bool = False) -> DataFrame: + """Calculate the union of two :py:class:`DataFrame`. + + The two :py:class:`DataFrame` must have exactly the same schema. + + Args: + other: DataFrame to union with. + distinct: If ``True``, duplicate rows will be removed. + + Returns: + DataFrame after union. 
+ """ + return DataFrame(self.df.union(other.df, distinct)) + + def union_distinct(self, other: DataFrame) -> DataFrame: + """Calculate the distinct union of two :py:class:`DataFrame`. + + The two :py:class:`DataFrame` must have exactly the same schema. + Any duplicate rows are discarded. + + Args: + other: DataFrame to union with. + + Returns: + DataFrame after union. + """ + return DataFrame(self.df.union_distinct(other.df)) + + def intersect(self, other: DataFrame) -> DataFrame: + """Calculate the intersection of two :py:class:`DataFrame`. + + The two :py:class:`DataFrame` must have exactly the same schema. + + Args: + other: DataFrame to intersect with. + + Returns: + DataFrame after intersection. + """ + return DataFrame(self.df.intersect(other.df)) + + def except_all(self, other: DataFrame) -> DataFrame: + """Calculate the exception of two :py:class:`DataFrame`. + + The two :py:class:`DataFrame` must have exactly the same schema. + + Args: + other: DataFrame to calculate exception with. + + Returns: + DataFrame after exception. + """ + return DataFrame(self.df.except_all(other.df)) + + def write_csv(self, path: str | pathlib.Path, with_header: bool = False) -> None: + """Execute the :py:class:`DataFrame` and write the results to a CSV file. + + Args: + path: Path of the CSV file to write. + with_header: If true, output the CSV header row. + """ + self.df.write_csv(str(path), with_header) + + def write_parquet( + self, + path: str | pathlib.Path, + compression: str = "uncompressed", + compression_level: int | None = None, + ) -> None: + """Execute the :py:class:`DataFrame` and write the results to a Parquet file. + + Args: + path: Path of the Parquet file to write. + compression: Compression type to use. + compression_level: Compression level to use. + """ + self.df.write_parquet(str(path), compression, compression_level) + + def write_json(self, path: str | pathlib.Path) -> None: + """Execute the :py:class:`DataFrame` and write the results to a JSON file. + + Args: + path: Path of the JSON file to write. + """ + self.df.write_json(str(path)) + + def to_arrow_table(self) -> pa.Table: + """Execute the :py:class:`DataFrame` and convert it into an Arrow Table. + + Returns: + Arrow Table. + """ + return self.df.to_arrow_table() + + def execute_stream(self) -> RecordBatchStream: + """Executes this DataFrame and returns a stream over a single partition. + + Returns: + Record Batch Stream over a single partition. + """ + return RecordBatchStream(self.df.execute_stream()) + + def execute_stream_partitioned(self) -> list[RecordBatchStream]: + """Executes this DataFrame and returns a stream for each partition. + + Returns: + One record batch stream per partition. + """ + streams = self.df.execute_stream_partitioned() + return [RecordBatchStream(rbs) for rbs in streams] + + def to_pandas(self) -> pd.DataFrame: + """Execute the :py:class:`DataFrame` and convert it into a Pandas DataFrame. + + Returns: + Pandas DataFrame. + """ + return self.df.to_pandas() + + def to_pylist(self) -> list[dict[str, Any]]: + """Execute the :py:class:`DataFrame` and convert it into a list of dictionaries. + + Returns: + List of dictionaries. + """ + return self.df.to_pylist() + + def to_pydict(self) -> dict[str, list[Any]]: + """Execute the :py:class:`DataFrame` and convert it into a dictionary of lists. + + Returns: + Dictionary of lists. + """ + return self.df.to_pydict() + + def to_polars(self) -> pl.DataFrame: + """Execute the :py:class:`DataFrame` and convert it into a Polars DataFrame. 
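+
+        Requires the ``polars`` package to be installed. For example::
+
+            pl_df = df.to_polars()
+            print(pl_df.shape)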
+ + Returns: + Polars DataFrame. + """ + return self.df.to_polars() + + def count(self) -> int: + """Return the total number of rows in this :py:class:`DataFrame`. + + Note that this method will actually run a plan to calculate the + count, which may be slow for large or complicated DataFrames. + + Returns: + Number of rows in the DataFrame. + """ + return self.df.count() + + @deprecated("Use :py:func:`unnest_columns` instead.") + def unnest_column(self, column: str, preserve_nulls: bool = True) -> DataFrame: + """See :py:func:`unnest_columns`.""" + return DataFrame(self.df.unnest_column(column, preserve_nulls=preserve_nulls)) + + def unnest_columns(self, *columns: str, preserve_nulls: bool = True) -> DataFrame: + """Expand columns of arrays into a single row per array element. + + Args: + columns: Column names to perform unnest operation on. + preserve_nulls: If False, rows with null entries will not be + returned. + + Returns: + A DataFrame with the columns expanded. + """ + columns = [c for c in columns] + return DataFrame(self.df.unnest_columns(columns, preserve_nulls=preserve_nulls)) + + def __arrow_c_stream__(self, requested_schema: pa.Schema) -> Any: + """Export an Arrow PyCapsule Stream. + + This will execute and collect the DataFrame. We will attempt to respect the + requested schema, but only trivial transformations will be applied such as only + returning the fields listed in the requested schema if their data types match + those in the DataFrame. + + Args: + requested_schema: Attempt to provide the DataFrame using this schema. + + Returns: + Arrow PyCapsule object. + """ + return self.df.__arrow_c_stream__(requested_schema) + + def transform(self, func: Callable[..., DataFrame], *args: Any) -> DataFrame: + """Apply a function to the current DataFrame which returns another DataFrame. + + This is useful for chaining together multiple functions. For example:: + + def add_3(df: DataFrame) -> DataFrame: + return df.with_column("modified", lit(3)) + + def within_limit(df: DataFrame, limit: int) -> DataFrame: + return df.filter(col("a") < lit(limit)).distinct() + + df = df.transform(modify_df).transform(within_limit, 4) + + Args: + func: A callable function that takes a DataFrame as it's first argument + args: Zero or more arguments to pass to `func` + + Returns: + DataFrame: After applying func to the original dataframe. + """ + return func(self, *args) diff --git a/py-denormalized/python/denormalized/datafusion/expr.py b/py-denormalized/python/denormalized/datafusion/expr.py new file mode 100644 index 0000000..a858a66 --- /dev/null +++ b/py-denormalized/python/denormalized/datafusion/expr.py @@ -0,0 +1,718 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""This module supports expressions, one of the core concepts in DataFusion. 
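+
+Expressions are typically built from column references and literal values and
+then combined with operators. A minimal sketch using only the classes defined
+in this module (column name is illustrative)::
+
+    from denormalized.datafusion.expr import Expr
+
+    expr = (Expr.column("a") + Expr.literal(1)).alias("a_plus_one")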
+ +See :ref:`Expressions` in the online documentation for more details. +""" + +from __future__ import annotations + +from typing import Any, Optional, Type + +import pyarrow as pa +from denormalized.datafusion.common import DataTypeMap, NullTreatment, RexType +from typing_extensions import deprecated + +from denormalized._internal import LogicalPlan +from denormalized._internal import expr as expr_internal +from denormalized._internal import functions as functions_internal + +# The following are imported from the internal representation. We may choose to +# give these all proper wrappers, or to simply leave as is. These were added +# in order to support passing the `test_imports` unit test. +# Tim Saucer note: It is not clear to me what the use case is for exposing +# these definitions to the end user. + +Alias = expr_internal.Alias +Analyze = expr_internal.Analyze +Aggregate = expr_internal.Aggregate +AggregateFunction = expr_internal.AggregateFunction +Between = expr_internal.Between +BinaryExpr = expr_internal.BinaryExpr +Case = expr_internal.Case +Cast = expr_internal.Cast +Column = expr_internal.Column +CreateMemoryTable = expr_internal.CreateMemoryTable +CreateView = expr_internal.CreateView +CrossJoin = expr_internal.CrossJoin +Distinct = expr_internal.Distinct +DropTable = expr_internal.DropTable +EmptyRelation = expr_internal.EmptyRelation +Exists = expr_internal.Exists +Explain = expr_internal.Explain +Extension = expr_internal.Extension +Filter = expr_internal.Filter +GroupingSet = expr_internal.GroupingSet +Join = expr_internal.Join +ILike = expr_internal.ILike +InList = expr_internal.InList +InSubquery = expr_internal.InSubquery +IsFalse = expr_internal.IsFalse +IsNotTrue = expr_internal.IsNotTrue +IsNull = expr_internal.IsNull +IsTrue = expr_internal.IsTrue +IsUnknown = expr_internal.IsUnknown +IsNotFalse = expr_internal.IsNotFalse +IsNotNull = expr_internal.IsNotNull +IsNotUnknown = expr_internal.IsNotUnknown +JoinConstraint = expr_internal.JoinConstraint +JoinType = expr_internal.JoinType +Like = expr_internal.Like +Limit = expr_internal.Limit +Literal = expr_internal.Literal +Negative = expr_internal.Negative +Not = expr_internal.Not +Partitioning = expr_internal.Partitioning +Placeholder = expr_internal.Placeholder +Projection = expr_internal.Projection +Repartition = expr_internal.Repartition +ScalarSubquery = expr_internal.ScalarSubquery +ScalarVariable = expr_internal.ScalarVariable +SimilarTo = expr_internal.SimilarTo +Sort = expr_internal.Sort +Subquery = expr_internal.Subquery +SubqueryAlias = expr_internal.SubqueryAlias +TableScan = expr_internal.TableScan +TryCast = expr_internal.TryCast +Union = expr_internal.Union +Unnest = expr_internal.Unnest +UnnestExpr = expr_internal.UnnestExpr +Window = expr_internal.Window + +__all__ = [ + "Expr", + "Column", + "Literal", + "BinaryExpr", + "Literal", + "AggregateFunction", + "Not", + "IsNotNull", + "IsNull", + "IsTrue", + "IsFalse", + "IsUnknown", + "IsNotTrue", + "IsNotFalse", + "IsNotUnknown", + "Negative", + "Like", + "ILike", + "SimilarTo", + "ScalarVariable", + "Alias", + "InList", + "Exists", + "Subquery", + "InSubquery", + "ScalarSubquery", + "Placeholder", + "GroupingSet", + "Case", + "CaseBuilder", + "Cast", + "TryCast", + "Between", + "Explain", + "Limit", + "Aggregate", + "Sort", + "SortExpr", + "Analyze", + "EmptyRelation", + "Join", + "JoinType", + "JoinConstraint", + "CrossJoin", + "Union", + "Unnest", + "UnnestExpr", + "Extension", + "Filter", + "Projection", + "TableScan", + "CreateMemoryTable", + 
"CreateView", + "Distinct", + "SubqueryAlias", + "DropTable", + "Partitioning", + "Repartition", + "Window", + "WindowFrame", + "WindowFrameBound", +] + + +def expr_list_to_raw_expr_list( + expr_list: Optional[list[Expr]], +) -> Optional[list[expr_internal.Expr]]: + """Helper function to convert an optional list to raw expressions.""" + return [e.expr for e in expr_list] if expr_list is not None else None + + +def sort_or_default(e: Expr | SortExpr) -> expr_internal.SortExpr: + """Helper function to return a default Sort if an Expr is provided.""" + if isinstance(e, SortExpr): + return e.raw_sort + return SortExpr(e.expr, True, True).raw_sort + + +def sort_list_to_raw_sort_list( + sort_list: Optional[list[Expr | SortExpr]], +) -> Optional[list[expr_internal.SortExpr]]: + """Helper function to return an optional sort list to raw variant.""" + return [sort_or_default(e) for e in sort_list] if sort_list is not None else None + + +class Expr: + """Expression object. + + Expressions are one of the core concepts in DataFusion. See + :ref:`Expressions` in the online documentation for more information. + """ + + def __init__(self, expr: expr_internal.Expr) -> None: + """This constructor should not be called by the end user.""" + self.expr = expr + + def to_variant(self) -> Any: + """Convert this expression into a python object if possible.""" + return self.expr.to_variant() + + @deprecated( + "display_name() is deprecated. Use :py:meth:`~Expr.schema_name` instead" + ) + def display_name(self) -> str: + """Returns the name of this expression as it should appear in a schema. + + This name will not include any CAST expressions. + """ + return self.schema_name() + + def schema_name(self) -> str: + """Returns the name of this expression as it should appear in a schema. + + This name will not include any CAST expressions. + """ + return self.expr.schema_name() + + def canonical_name(self) -> str: + """Returns a complete string representation of this expression.""" + return self.expr.canonical_name() + + def variant_name(self) -> str: + """Returns the name of the Expr variant. + + Ex: ``IsNotNull``, ``Literal``, ``BinaryExpr``, etc + """ + return self.expr.variant_name() + + def __richcmp__(self, other: Expr, op: int) -> Expr: + """Comparison operator.""" + return Expr(self.expr.__richcmp__(other, op)) + + def __repr__(self) -> str: + """Generate a string representation of this expression.""" + return self.expr.__repr__() + + def __add__(self, rhs: Any) -> Expr: + """Addition operator. + + Accepts either an expression or any valid PyArrow scalar literal value. + """ + if not isinstance(rhs, Expr): + rhs = Expr.literal(rhs) + return Expr(self.expr.__add__(rhs.expr)) + + def __sub__(self, rhs: Any) -> Expr: + """Subtraction operator. + + Accepts either an expression or any valid PyArrow scalar literal value. + """ + if not isinstance(rhs, Expr): + rhs = Expr.literal(rhs) + return Expr(self.expr.__sub__(rhs.expr)) + + def __truediv__(self, rhs: Any) -> Expr: + """Division operator. + + Accepts either an expression or any valid PyArrow scalar literal value. + """ + if not isinstance(rhs, Expr): + rhs = Expr.literal(rhs) + return Expr(self.expr.__truediv__(rhs.expr)) + + def __mul__(self, rhs: Any) -> Expr: + """Multiplication operator. + + Accepts either an expression or any valid PyArrow scalar literal value. + """ + if not isinstance(rhs, Expr): + rhs = Expr.literal(rhs) + return Expr(self.expr.__mul__(rhs.expr)) + + def __mod__(self, rhs: Any) -> Expr: + """Modulo operator (%). 
+ + Accepts either an expression or any valid PyArrow scalar literal value. + """ + if not isinstance(rhs, Expr): + rhs = Expr.literal(rhs) + return Expr(self.expr.__mod__(rhs.expr)) + + def __and__(self, rhs: Expr) -> Expr: + """Logical AND.""" + if not isinstance(rhs, Expr): + rhs = Expr.literal(rhs) + return Expr(self.expr.__and__(rhs.expr)) + + def __or__(self, rhs: Expr) -> Expr: + """Logical OR.""" + if not isinstance(rhs, Expr): + rhs = Expr.literal(rhs) + return Expr(self.expr.__or__(rhs.expr)) + + def __invert__(self) -> Expr: + """Binary not (~).""" + return Expr(self.expr.__invert__()) + + def __getitem__(self, key: str | int) -> Expr: + """Retrieve sub-object. + + If ``key`` is a string, returns the subfield of the struct. + If ``key`` is an integer, retrieves the element in the array. Note that the + element index begins at ``0``, unlike `array_element` which begins at ``1``. + """ + if isinstance(key, int): + return Expr( + functions_internal.array_element(self.expr, Expr.literal(key + 1).expr) + ) + return Expr(self.expr.__getitem__(key)) + + def __eq__(self, rhs: Any) -> Expr: + """Equal to. + + Accepts either an expression or any valid PyArrow scalar literal value. + """ + if not isinstance(rhs, Expr): + rhs = Expr.literal(rhs) + return Expr(self.expr.__eq__(rhs.expr)) + + def __ne__(self, rhs: Any) -> Expr: + """Not equal to. + + Accepts either an expression or any valid PyArrow scalar literal value. + """ + if not isinstance(rhs, Expr): + rhs = Expr.literal(rhs) + return Expr(self.expr.__ne__(rhs.expr)) + + def __ge__(self, rhs: Any) -> Expr: + """Greater than or equal to. + + Accepts either an expression or any valid PyArrow scalar literal value. + """ + if not isinstance(rhs, Expr): + rhs = Expr.literal(rhs) + return Expr(self.expr.__ge__(rhs.expr)) + + def __gt__(self, rhs: Any) -> Expr: + """Greater than. + + Accepts either an expression or any valid PyArrow scalar literal value. + """ + if not isinstance(rhs, Expr): + rhs = Expr.literal(rhs) + return Expr(self.expr.__gt__(rhs.expr)) + + def __le__(self, rhs: Any) -> Expr: + """Less than or equal to. + + Accepts either an expression or any valid PyArrow scalar literal value. + """ + if not isinstance(rhs, Expr): + rhs = Expr.literal(rhs) + return Expr(self.expr.__le__(rhs.expr)) + + def __lt__(self, rhs: Any) -> Expr: + """Less than. + + Accepts either an expression or any valid PyArrow scalar literal value. + """ + if not isinstance(rhs, Expr): + rhs = Expr.literal(rhs) + return Expr(self.expr.__lt__(rhs.expr)) + + __radd__ = __add__ + __rand__ = __and__ + __rmod__ = __mod__ + __rmul__ = __mul__ + __ror__ = __or__ + __rsub__ = __sub__ + __rtruediv__ = __truediv__ + + @staticmethod + def literal(value: Any) -> Expr: + """Creates a new expression representing a scalar value. + + ``value`` must be a valid PyArrow scalar value or easily castable to one. + """ + if not isinstance(value, pa.Scalar): + value = pa.scalar(value) + return Expr(expr_internal.Expr.literal(value)) + + @staticmethod + def column(value: str) -> Expr: + """Creates a new expression representing a column.""" + return Expr(expr_internal.Expr.column(value)) + + def alias(self, name: str) -> Expr: + """Assign a name to the expression.""" + return Expr(self.expr.alias(name)) + + def sort(self, ascending: bool = True, nulls_first: bool = True) -> SortExpr: + """Creates a sort :py:class:`Expr` from an existing :py:class:`Expr`. + + Args: + ascending: If true, sort in ascending order. + nulls_first: Return null values first. 
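+
+        A short sketch of sorting a DataFrame with this method (column name
+        is illustrative)::
+
+            df = df.sort(col("a").sort(ascending=False, nulls_first=False))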
+ """ + return SortExpr(self.expr, ascending=ascending, nulls_first=nulls_first) + + def is_null(self) -> Expr: + """Returns ``True`` if this expression is null.""" + return Expr(self.expr.is_null()) + + def is_not_null(self) -> Expr: + """Returns ``True`` if this expression is not null.""" + return Expr(self.expr.is_not_null()) + + _to_pyarrow_types = { + float: pa.float64(), + int: pa.int64(), + str: pa.string(), + bool: pa.bool_(), + } + + def cast( + self, to: pa.DataType[Any] | Type[float] | Type[int] | Type[str] | Type[bool] + ) -> Expr: + """Cast to a new data type.""" + if not isinstance(to, pa.DataType): + try: + to = self._to_pyarrow_types[to] + except KeyError: + raise TypeError( + "Expected instance of pyarrow.DataType or builtins.type" + ) + + return Expr(self.expr.cast(to)) + + def between(self, low: Any, high: Any, negated: bool = False) -> Expr: + """Returns ``True`` if this expression is between a given range. + + Args: + low: lower bound of the range (inclusive). + high: higher bound of the range (inclusive). + negated: negates whether the expression is between a given range + """ + if not isinstance(low, Expr): + low = Expr.literal(low) + + if not isinstance(high, Expr): + high = Expr.literal(high) + + return Expr(self.expr.between(low.expr, high.expr, negated=negated)) + + def rex_type(self) -> RexType: + """Return the Rex Type of this expression. + + A Rex (Row Expression) specifies a single row of data.That specification + could include user defined functions or types. RexType identifies the + row as one of the possible valid ``RexType``. + """ + return self.expr.rex_type() + + def types(self) -> DataTypeMap: + """Return the ``DataTypeMap``. + + Returns: + DataTypeMap which represents the PythonType, Arrow DataType, and + SqlType Enum which this expression represents. + """ + return self.expr.types() + + def python_value(self) -> Any: + """Extracts the Expr value into a PyObject. + + This is only valid for literal expressions. + + Returns: + Python object representing literal value of the expression. + """ + return self.expr.python_value() + + def rex_call_operands(self) -> list[Expr]: + """Return the operands of the expression based on it's variant type. + + Row expressions, Rex(s), operate on the concept of operands. Different + variants of Expressions, Expr(s), store those operands in different + datastructures. This function examines the Expr variant and returns + the operands to the calling logic. + """ + return [Expr(e) for e in self.expr.rex_call_operands()] + + def rex_call_operator(self) -> str: + """Extracts the operator associated with a row expression type call.""" + return self.expr.rex_call_operator() + + def column_name(self, plan: LogicalPlan) -> str: + """Compute the output column name based on the provided logical plan.""" + return self.expr.column_name(plan) + + def order_by(self, *exprs: Expr | SortExpr) -> ExprFuncBuilder: + """Set the ordering for a window or aggregate function. + + This function will create an :py:class:`ExprFuncBuilder` that can be used to + set parameters for either window or aggregate functions. If used on any other + type of expression, an error will be generated when ``build()`` is called. + """ + return ExprFuncBuilder(self.expr.order_by([sort_or_default(e) for e in exprs])) + + def filter(self, filter: Expr) -> ExprFuncBuilder: + """Filter an aggregate function. + + This function will create an :py:class:`ExprFuncBuilder` that can be used to + set parameters for either window or aggregate functions. 
If used on any other + type of expression, an error will be generated when ``build()`` is called. + """ + return ExprFuncBuilder(self.expr.filter(filter.expr)) + + def distinct(self) -> ExprFuncBuilder: + """Only evaluate distinct values for an aggregate function. + + This function will create an :py:class:`ExprFuncBuilder` that can be used to + set parameters for either window or aggregate functions. If used on any other + type of expression, an error will be generated when ``build()`` is called. + """ + return ExprFuncBuilder(self.expr.distinct()) + + def null_treatment(self, null_treatment: NullTreatment) -> ExprFuncBuilder: + """Set the treatment for ``null`` values for a window or aggregate function. + + This function will create an :py:class:`ExprFuncBuilder` that can be used to + set parameters for either window or aggregate functions. If used on any other + type of expression, an error will be generated when ``build()`` is called. + """ + return ExprFuncBuilder(self.expr.null_treatment(null_treatment.value)) + + def partition_by(self, *partition_by: Expr) -> ExprFuncBuilder: + """Set the partitioning for a window function. + + This function will create an :py:class:`ExprFuncBuilder` that can be used to + set parameters for either window or aggregate functions. If used on any other + type of expression, an error will be generated when ``build()`` is called. + """ + return ExprFuncBuilder( + self.expr.partition_by(list(e.expr for e in partition_by)) + ) + + def window_frame(self, window_frame: WindowFrame) -> ExprFuncBuilder: + """Set the frame fora window function. + + This function will create an :py:class:`ExprFuncBuilder` that can be used to + set parameters for either window or aggregate functions. If used on any other + type of expression, an error will be generated when ``build()`` is called. + """ + return ExprFuncBuilder(self.expr.window_frame(window_frame.window_frame)) + + +class ExprFuncBuilder: + def __init__(self, builder: expr_internal.ExprFuncBuilder): + self.builder = builder + + def order_by(self, *exprs: Expr) -> ExprFuncBuilder: + """Set the ordering for a window or aggregate function. + + Values given in ``exprs`` must be sort expressions. You can convert any other + expression to a sort expression using `.sort()`. 
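+
+        A hedged sketch, assuming ``functions`` is imported as ``f`` and the
+        column names are illustrative::
+
+            expr = f.first_value(col("a")).order_by(col("ts").sort()).build()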
+ """ + return ExprFuncBuilder( + self.builder.order_by([sort_or_default(e) for e in exprs]) + ) + + def filter(self, filter: Expr) -> ExprFuncBuilder: + """Filter values during aggregation.""" + return ExprFuncBuilder(self.builder.filter(filter.expr)) + + def distinct(self) -> ExprFuncBuilder: + """Only evaluate distinct values during aggregation.""" + return ExprFuncBuilder(self.builder.distinct()) + + def null_treatment(self, null_treatment: NullTreatment) -> ExprFuncBuilder: + """Set how nulls are treated for either window or aggregate functions.""" + return ExprFuncBuilder(self.builder.null_treatment(null_treatment.value)) + + def partition_by(self, *partition_by: Expr) -> ExprFuncBuilder: + """Set partitioning for window functions.""" + return ExprFuncBuilder( + self.builder.partition_by(list(e.expr for e in partition_by)) + ) + + def window_frame(self, window_frame: WindowFrame) -> ExprFuncBuilder: + """Set window frame for window functions.""" + return ExprFuncBuilder(self.builder.window_frame(window_frame.window_frame)) + + def build(self) -> Expr: + """Create an expression from a Function Builder.""" + return Expr(self.builder.build()) + + +class WindowFrame: + """Defines a window frame for performing window operations.""" + + def __init__( + self, units: str, start_bound: Optional[Any], end_bound: Optional[Any] + ) -> None: + """Construct a window frame using the given parameters. + + Args: + units: Should be one of ``rows``, ``range``, or ``groups``. + start_bound: Sets the preceding bound. Must be >= 0. If none, this + will be set to unbounded. If unit type is ``groups``, this + parameter must be set. + end_bound: Sets the following bound. Must be >= 0. If none, this + will be set to unbounded. If unit type is ``groups``, this + parameter must be set. + """ + if not isinstance(start_bound, pa.Scalar) and start_bound is not None: + start_bound = pa.scalar(start_bound) + if units == "rows" or units == "groups": + start_bound = start_bound.cast(pa.uint64()) + if not isinstance(end_bound, pa.Scalar) and end_bound is not None: + end_bound = pa.scalar(end_bound) + if units == "rows" or units == "groups": + end_bound = end_bound.cast(pa.uint64()) + self.window_frame = expr_internal.WindowFrame(units, start_bound, end_bound) + + def get_frame_units(self) -> str: + """Returns the window frame units for the bounds.""" + return self.window_frame.get_frame_units() + + def get_lower_bound(self) -> WindowFrameBound: + """Returns starting bound.""" + return WindowFrameBound(self.window_frame.get_lower_bound()) + + def get_upper_bound(self): + """Returns end bound.""" + return WindowFrameBound(self.window_frame.get_upper_bound()) + + +class WindowFrameBound: + """Defines a single window frame bound. + + :py:class:`WindowFrame` typically requires a start and end bound. 
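+
+    A brief sketch, with the expected results hedged against the bound
+    descriptions in :py:class:`WindowFrame`::
+
+        frame = WindowFrame("rows", 2, None)
+        frame.get_lower_bound().is_preceding()   # expected: True
+        frame.get_upper_bound().is_unbounded()   # expected: True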
+ """ + + def __init__(self, frame_bound: expr_internal.WindowFrameBound) -> None: + """Constructs a window frame bound.""" + self.frame_bound = frame_bound + + def get_offset(self) -> int | None: + """Returns the offset of the window frame.""" + return self.frame_bound.get_offset() + + def is_current_row(self) -> bool: + """Returns if the frame bound is current row.""" + return self.frame_bound.is_current_row() + + def is_following(self) -> bool: + """Returns if the frame bound is following.""" + return self.frame_bound.is_following() + + def is_preceding(self) -> bool: + """Returns if the frame bound is preceding.""" + return self.frame_bound.is_preceding() + + def is_unbounded(self) -> bool: + """Returns if the frame bound is unbounded.""" + return self.frame_bound.is_unbounded() + + +class CaseBuilder: + """Builder class for constructing case statements. + + An example usage would be as follows:: + + import datafusion.functions as f + from datafusion import lit, col + df.select( + f.case(col("column_a") + .when(lit(1), lit("One")) + .when(lit(2), lit("Two")) + .otherwise(lit("Unknown")) + ) + """ + + def __init__(self, case_builder: expr_internal.CaseBuilder) -> None: + """Constructs a case builder. + + This is not typically called by the end user directly. See + :py:func:`datafusion.functions.case` instead. + """ + self.case_builder = case_builder + + def when(self, when_expr: Expr, then_expr: Expr) -> CaseBuilder: + """Add a case to match against.""" + return CaseBuilder(self.case_builder.when(when_expr.expr, then_expr.expr)) + + def otherwise(self, else_expr: Expr) -> Expr: + """Set a default value for the case statement.""" + return Expr(self.case_builder.otherwise(else_expr.expr)) + + def end(self) -> Expr: + """Finish building a case statement. + + Any non-matching cases will end in a `null` value. + """ + return Expr(self.case_builder.end()) + + +class SortExpr: + """Used to specify sorting on either a DataFrame or function.""" + + def __init__(self, expr: Expr, ascending: bool, nulls_first: bool) -> None: + """This constructor should not be called by the end user.""" + self.raw_sort = expr_internal.SortExpr(expr, ascending, nulls_first) + + def expr(self) -> Expr: + """Return the raw expr backing the SortExpr.""" + return Expr(self.raw_sort.expr()) + + def ascending(self) -> bool: + """Return ascending property.""" + return self.raw_sort.ascending() + + def nulls_first(self) -> bool: + """Return nulls_first property.""" + return self.raw_sort.nulls_first() + + def __repr__(self) -> str: + """Generate a string representation of this expression.""" + return self.raw_sort.__repr__() diff --git a/py-denormalized/python/denormalized/datafusion/functions.py b/py-denormalized/python/denormalized/datafusion/functions.py new file mode 100644 index 0000000..291c578 --- /dev/null +++ b/py-denormalized/python/denormalized/datafusion/functions.py @@ -0,0 +1,2659 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""User functions for operating on :py:class:`~datafusion.expr.Expr`.""" + +from __future__ import annotations + +from denormalized._internal import functions as f +from denormalized.datafusion.expr import ( + CaseBuilder, + Expr, + WindowFrame, + SortExpr, + sort_list_to_raw_sort_list, + expr_list_to_raw_expr_list, +) +from datafusion.context import SessionContext +from datafusion.common import NullTreatment + +from typing import Any, Optional + +import pyarrow as pa + +__all__ = [ + "abs", + "acos", + "acosh", + "alias", + "approx_distinct", + "approx_median", + "approx_percentile_cont", + "approx_percentile_cont_with_weight", + "array", + "array_agg", + "array_append", + "array_cat", + "array_concat", + "array_dims", + "array_distinct", + "array_element", + "array_except", + "array_extract", + "array_has", + "array_has_all", + "array_has_any", + "array_indexof", + "array_intersect", + "array_join", + "array_length", + "array_ndims", + "array_pop_back", + "array_pop_front", + "array_position", + "array_positions", + "array_prepend", + "array_push_back", + "array_push_front", + "array_remove", + "array_remove_all", + "array_remove_n", + "array_repeat", + "array_replace", + "array_replace_all", + "array_replace_n", + "array_resize", + "array_slice", + "array_sort", + "array_to_string", + "array_union", + "arrow_typeof", + "ascii", + "asin", + "asinh", + "atan", + "atan2", + "atanh", + "avg", + "bit_and", + "bit_length", + "bit_or", + "bit_xor", + "bool_and", + "bool_or", + "btrim", + "case", + "cbrt", + "ceil", + "char_length", + "character_length", + "chr", + "coalesce", + "col", + "concat", + "concat_ws", + "corr", + "cos", + "cosh", + "cot", + "count", + "count_star", + "covar", + "covar_pop", + "covar_samp", + "current_date", + "current_time", + "date_bin", + "date_part", + "date_trunc", + "datepart", + "datetrunc", + "decode", + "degrees", + "digest", + "encode", + "ends_with", + "exp", + "factorial", + "find_in_set", + "first_value", + "flatten", + "floor", + "from_unixtime", + "gcd", + "in_list", + "initcap", + "isnan", + "iszero", + "last_value", + "lcm", + "left", + "length", + "levenshtein", + "list_append", + "list_dims", + "list_distinct", + "list_element", + "list_except", + "list_extract", + "list_indexof", + "list_intersect", + "list_join", + "list_length", + "list_ndims", + "list_position", + "list_positions", + "list_prepend", + "list_push_back", + "list_push_front", + "list_remove", + "list_remove_all", + "list_remove_n", + "list_replace", + "list_replace_all", + "list_replace_n", + "list_resize", + "list_slice", + "list_sort", + "list_to_string", + "list_union", + "ln", + "log", + "log10", + "log2", + "lower", + "lpad", + "ltrim", + "make_array", + "make_date", + "max", + "md5", + "mean", + "median", + "min", + "named_struct", + "nanvl", + "now", + "nth_value", + "nullif", + "octet_length", + "order_by", + "overlay", + "pi", + "pow", + "power", + "radians", + "random", + "range", + "regexp_like", + "regexp_match", + "regexp_replace", + "regr_avgx", + "regr_avgy", + "regr_count", + "regr_intercept", + "regr_r2", + "regr_slope", + "regr_sxx", + "regr_sxy", + 
"regr_syy", + "repeat", + "replace", + "reverse", + "right", + "round", + "rpad", + "rtrim", + "sha224", + "sha256", + "sha384", + "sha512", + "signum", + "sin", + "sinh", + "split_part", + "sqrt", + "starts_with", + "stddev", + "stddev_pop", + "stddev_samp", + "string_agg", + "strpos", + "struct", + "substr", + "substr_index", + "substring", + "sum", + "tan", + "tanh", + "to_hex", + "to_timestamp", + "to_timestamp_micros", + "to_timestamp_millis", + "to_timestamp_seconds", + "to_unixtime", + "translate", + "trim", + "trunc", + "upper", + "uuid", + "var", + "var_pop", + "var_samp", + "var_sample", + "when", + # Window Functions + "window", + "lead", + "lag", + "row_number", + "rank", + "dense_rank", + "percent_rank", + "cume_dist", + "ntile", +] + + +def isnan(expr: Expr) -> Expr: + """Returns true if a given number is +NaN or -NaN otherwise returns false.""" + return Expr(f.isnan(expr.expr)) + + +def nullif(expr1: Expr, expr2: Expr) -> Expr: + """Returns NULL if expr1 equals expr2; otherwise it returns expr1. + + This can be used to perform the inverse operation of the COALESCE expression. + """ + return Expr(f.nullif(expr1.expr, expr2.expr)) + + +def encode(input: Expr, encoding: Expr) -> Expr: + """Encode the ``input``, using the ``encoding``. encoding can be base64 or hex.""" + return Expr(f.encode(input.expr, encoding.expr)) + + +def decode(input: Expr, encoding: Expr) -> Expr: + """Decode the ``input``, using the ``encoding``. encoding can be base64 or hex.""" + return Expr(f.decode(input.expr, encoding.expr)) + + +def array_to_string(expr: Expr, delimiter: Expr) -> Expr: + """Converts each element to its text representation.""" + return Expr(f.array_to_string(expr.expr, delimiter.expr)) + + +def array_join(expr: Expr, delimiter: Expr) -> Expr: + """Converts each element to its text representation. + + This is an alias for :py:func:`array_to_string`. + """ + return array_to_string(expr, delimiter) + + +def list_to_string(expr: Expr, delimiter: Expr) -> Expr: + """Converts each element to its text representation. + + This is an alias for :py:func:`array_to_string`. + """ + return array_to_string(expr, delimiter) + + +def list_join(expr: Expr, delimiter: Expr) -> Expr: + """Converts each element to its text representation. + + This is an alias for :py:func:`array_to_string`. + """ + return array_to_string(expr, delimiter) + + +def in_list(arg: Expr, values: list[Expr], negated: bool = False) -> Expr: + """Returns whether the argument is contained within the list ``values``.""" + values = [v.expr for v in values] + return Expr(f.in_list(arg.expr, values, negated)) + + +def digest(value: Expr, method: Expr) -> Expr: + """Computes the binary hash of an expression using the specified algorithm. + + Standard algorithms are md5, sha224, sha256, sha384, sha512, blake2s, + blake2b, and blake3. + """ + return Expr(f.digest(value.expr, method.expr)) + + +def concat(*args: Expr) -> Expr: + """Concatenates the text representations of all the arguments. + + NULL arguments are ignored. + """ + args = [arg.expr for arg in args] + return Expr(f.concat(args)) + + +def concat_ws(separator: str, *args: Expr) -> Expr: + """Concatenates the list ``args`` with the separator. + + ``NULL`` arguments are ignored. ``separator`` should not be ``NULL``. 
+ """ + args = [arg.expr for arg in args] + return Expr(f.concat_ws(separator, args)) + + +def order_by(expr: Expr, ascending: bool = True, nulls_first: bool = True) -> SortExpr: + """Creates a new sort expression.""" + return SortExpr(expr.expr, ascending=ascending, nulls_first=nulls_first) + + +def alias(expr: Expr, name: str) -> Expr: + """Creates an alias expression.""" + return Expr(f.alias(expr.expr, name)) + + +def col(name: str) -> Expr: + """Creates a column reference expression.""" + return Expr(f.col(name)) + + +def count_star(filter: Optional[Expr] = None) -> Expr: + """Create a COUNT(1) aggregate expression. + + This aggregate function will count all of the rows in the partition. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``distinct``, and ``null_treatment``. + + Args: + filter: If provided, only count rows for which the filter is True + """ + return count(Expr.literal(1), filter=filter) + + +def case(expr: Expr) -> CaseBuilder: + """Create a case expression. + + Create a :py:class:`~datafusion.expr.CaseBuilder` to match cases for the + expression ``expr``. See :py:class:`~datafusion.expr.CaseBuilder` for + detailed usage. + """ + return CaseBuilder(f.case(expr.expr)) + + +def when(when: Expr, then: Expr) -> CaseBuilder: + """Create a case expression that has no base expression. + + Create a :py:class:`~datafusion.expr.CaseBuilder` to match cases for the + expression ``expr``. See :py:class:`~datafusion.expr.CaseBuilder` for + detailed usage. + """ + return CaseBuilder(f.when(when.expr, then.expr)) + + +def window( + name: str, + args: list[Expr], + partition_by: list[Expr] | None = None, + order_by: list[Expr | SortExpr] | None = None, + window_frame: WindowFrame | None = None, + ctx: SessionContext | None = None, +) -> Expr: + """Creates a new Window function expression. + + This interface will soon be deprecated. Instead of using this interface, + users should call the window functions directly. For example, to perform a + lag use:: + + df.select(functions.lag(col("a")).partition_by(col("b")).build()) + """ + args = [a.expr for a in args] + partition_by = expr_list_to_raw_expr_list(partition_by) + order_by_raw = sort_list_to_raw_sort_list(order_by) + window_frame = window_frame.window_frame if window_frame is not None else None + return Expr(f.window(name, args, partition_by, order_by_raw, window_frame, ctx)) + + +# scalar functions +def abs(arg: Expr) -> Expr: + """Return the absolute value of a given number. + + Returns: + -------- + Expr + A new expression representing the absolute value of the input expression. + """ + return Expr(f.abs(arg.expr)) + + +def acos(arg: Expr) -> Expr: + """Returns the arc cosine or inverse cosine of a number. + + Returns: + -------- + Expr + A new expression representing the arc cosine of the input expression. 
+ """ + return Expr(f.acos(arg.expr)) + + +def acosh(arg: Expr) -> Expr: + """Returns inverse hyperbolic cosine.""" + return Expr(f.acosh(arg.expr)) + + +def ascii(arg: Expr) -> Expr: + """Returns the numeric code of the first character of the argument.""" + return Expr(f.ascii(arg.expr)) + + +def asin(arg: Expr) -> Expr: + """Returns the arc sine or inverse sine of a number.""" + return Expr(f.asin(arg.expr)) + + +def asinh(arg: Expr) -> Expr: + """Returns inverse hyperbolic sine.""" + return Expr(f.asinh(arg.expr)) + + +def atan(arg: Expr) -> Expr: + """Returns inverse tangent of a number.""" + return Expr(f.atan(arg.expr)) + + +def atanh(arg: Expr) -> Expr: + """Returns inverse hyperbolic tangent.""" + return Expr(f.atanh(arg.expr)) + + +def atan2(y: Expr, x: Expr) -> Expr: + """Returns inverse tangent of a division given in the argument.""" + return Expr(f.atan2(y.expr, x.expr)) + + +def bit_length(arg: Expr) -> Expr: + """Returns the number of bits in the string argument.""" + return Expr(f.bit_length(arg.expr)) + + +def btrim(arg: Expr) -> Expr: + """Removes all characters, spaces by default, from both sides of a string.""" + return Expr(f.btrim(arg.expr)) + + +def cbrt(arg: Expr) -> Expr: + """Returns the cube root of a number.""" + return Expr(f.cbrt(arg.expr)) + + +def ceil(arg: Expr) -> Expr: + """Returns the nearest integer greater than or equal to argument.""" + return Expr(f.ceil(arg.expr)) + + +def character_length(arg: Expr) -> Expr: + """Returns the number of characters in the argument.""" + return Expr(f.character_length(arg.expr)) + + +def length(string: Expr) -> Expr: + """The number of characters in the ``string``.""" + return Expr(f.length(string.expr)) + + +def char_length(string: Expr) -> Expr: + """The number of characters in the ``string``.""" + return Expr(f.char_length(string.expr)) + + +def chr(arg: Expr) -> Expr: + """Converts the Unicode code point to a UTF8 character.""" + return Expr(f.chr(arg.expr)) + + +def coalesce(*args: Expr) -> Expr: + """Returns the value of the first expr in ``args`` which is not NULL.""" + args = [arg.expr for arg in args] + return Expr(f.coalesce(*args)) + + +def cos(arg: Expr) -> Expr: + """Returns the cosine of the argument.""" + return Expr(f.cos(arg.expr)) + + +def cosh(arg: Expr) -> Expr: + """Returns the hyperbolic cosine of the argument.""" + return Expr(f.cosh(arg.expr)) + + +def cot(arg: Expr) -> Expr: + """Returns the cotangent of the argument.""" + return Expr(f.cot(arg.expr)) + + +def degrees(arg: Expr) -> Expr: + """Converts the argument from radians to degrees.""" + return Expr(f.degrees(arg.expr)) + + +def ends_with(arg: Expr, suffix: Expr) -> Expr: + """Returns true if the ``string`` ends with the ``suffix``, false otherwise.""" + return Expr(f.ends_with(arg.expr, suffix.expr)) + + +def exp(arg: Expr) -> Expr: + """Returns the exponential of the argument.""" + return Expr(f.exp(arg.expr)) + + +def factorial(arg: Expr) -> Expr: + """Returns the factorial of the argument.""" + return Expr(f.factorial(arg.expr)) + + +def find_in_set(string: Expr, string_list: Expr) -> Expr: + """Find a string in a list of strings. + + Returns a value in the range of 1 to N if the string is in the string list + ``string_list`` consisting of N substrings. + + The string list is a string composed of substrings separated by ``,`` characters. 
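+
+    A small sketch (values are illustrative)::
+
+        # "b" is the second substring, so this is expected to evaluate to 2
+        idx = find_in_set(lit("b"), lit("a,b,c"))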
+ """ + return Expr(f.find_in_set(string.expr, string_list.expr)) + + +def floor(arg: Expr) -> Expr: + """Returns the nearest integer less than or equal to the argument.""" + return Expr(f.floor(arg.expr)) + + +def gcd(x: Expr, y: Expr) -> Expr: + """Returns the greatest common divisor.""" + return Expr(f.gcd(x.expr, y.expr)) + + +def initcap(string: Expr) -> Expr: + """Set the initial letter of each word to capital. + + Converts the first letter of each word in ``string`` to uppercase and the remaining + characters to lowercase. + """ + return Expr(f.initcap(string.expr)) + + +def instr(string: Expr, substring: Expr) -> Expr: + """Finds the position from where the ``substring`` matches the ``string``. + + This is an alias for :py:func:`strpos`. + """ + return strpos(string, substring) + + +def iszero(arg: Expr) -> Expr: + """Returns true if a given number is +0.0 or -0.0 otherwise returns false.""" + return Expr(f.iszero(arg.expr)) + + +def lcm(x: Expr, y: Expr) -> Expr: + """Returns the least common multiple.""" + return Expr(f.lcm(x.expr, y.expr)) + + +def left(string: Expr, n: Expr) -> Expr: + """Returns the first ``n`` characters in the ``string``.""" + return Expr(f.left(string.expr, n.expr)) + + +def levenshtein(string1: Expr, string2: Expr) -> Expr: + """Returns the Levenshtein distance between the two given strings.""" + return Expr(f.levenshtein(string1.expr, string2.expr)) + + +def ln(arg: Expr) -> Expr: + """Returns the natural logarithm (base e) of the argument.""" + return Expr(f.ln(arg.expr)) + + +def log(base: Expr, num: Expr) -> Expr: + """Returns the logarithm of a number for a particular ``base``.""" + return Expr(f.log(base.expr, num.expr)) + + +def log10(arg: Expr) -> Expr: + """Base 10 logarithm of the argument.""" + return Expr(f.log10(arg.expr)) + + +def log2(arg: Expr) -> Expr: + """Base 2 logarithm of the argument.""" + return Expr(f.log2(arg.expr)) + + +def lower(arg: Expr) -> Expr: + """Converts a string to lowercase.""" + return Expr(f.lower(arg.expr)) + + +def lpad(string: Expr, count: Expr, characters: Expr | None = None) -> Expr: + """Add left padding to a string. + + Extends the string to length length by prepending the characters fill (a + space by default). If the string is already longer than length then it is + truncated (on the right). + """ + characters = characters if characters is not None else Expr.literal(" ") + return Expr(f.lpad(string.expr, count.expr, characters.expr)) + + +def ltrim(arg: Expr) -> Expr: + """Removes all characters, spaces by default, from the beginning of a string.""" + return Expr(f.ltrim(arg.expr)) + + +def md5(arg: Expr) -> Expr: + """Computes an MD5 128-bit checksum for a string expression.""" + return Expr(f.md5(arg.expr)) + + +def nanvl(x: Expr, y: Expr) -> Expr: + """Returns ``x`` if ``x`` is not ``NaN``. Otherwise returns ``y``.""" + return Expr(f.nanvl(x.expr, y.expr)) + + +def octet_length(arg: Expr) -> Expr: + """Returns the number of bytes of a string.""" + return Expr(f.octet_length(arg.expr)) + + +def overlay( + string: Expr, substring: Expr, start: Expr, length: Expr | None = None +) -> Expr: + """Replace a substring with a new substring. + + Replace the substring of string that starts at the ``start``'th character and + extends for ``length`` characters with new substring. 
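+
+    A brief sketch (values are illustrative)::
+
+        # replace the 4 characters starting at position 2 with "hom";
+        # expected to yield "Thomas"
+        expr = overlay(lit("Txxxxas"), lit("hom"), lit(2), lit(4))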
+ """ + if length is None: + return Expr(f.overlay(string.expr, substring.expr, start.expr)) + return Expr(f.overlay(string.expr, substring.expr, start.expr, length.expr)) + + +def pi() -> Expr: + """Returns an approximate value of π.""" + return Expr(f.pi()) + + +def position(string: Expr, substring: Expr) -> Expr: + """Finds the position from where the ``substring`` matches the ``string``. + + This is an alias for :py:func:`strpos`. + """ + return strpos(string, substring) + + +def power(base: Expr, exponent: Expr) -> Expr: + """Returns ``base`` raised to the power of ``exponent``.""" + return Expr(f.power(base.expr, exponent.expr)) + + +def pow(base: Expr, exponent: Expr) -> Expr: + """Returns ``base`` raised to the power of ``exponent``. + + This is an alias of :py:func:`power`. + """ + return power(base, exponent) + + +def radians(arg: Expr) -> Expr: + """Converts the argument from degrees to radians.""" + return Expr(f.radians(arg.expr)) + + +def regexp_like(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr: + """Find if any regular expression (regex) matches exist. + + Tests a string using a regular expression returning true if at least one match, + false otherwise. + """ + if flags is not None: + flags = flags.expr + return Expr(f.regexp_like(string.expr, regex.expr, flags)) + + +def regexp_match(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr: + """Perform regular expression (regex) matching. + + Returns an array with each element containing the leftmost-first match of the + corresponding index in ``regex`` to string in ``string``. + """ + if flags is not None: + flags = flags.expr + return Expr(f.regexp_match(string.expr, regex.expr, flags)) + + +def regexp_replace( + string: Expr, pattern: Expr, replacement: Expr, flags: Expr | None = None +) -> Expr: + """Replaces substring(s) matching a PCRE-like regular expression. + + The full list of supported features and syntax can be found at + + + Supported flags with the addition of 'g' can be found at + + """ + if flags is not None: + flags = flags.expr + return Expr(f.regexp_replace(string.expr, pattern.expr, replacement.expr, flags)) + + +def repeat(string: Expr, n: Expr) -> Expr: + """Repeats the ``string`` to ``n`` times.""" + return Expr(f.repeat(string.expr, n.expr)) + + +def replace(string: Expr, from_val: Expr, to_val: Expr) -> Expr: + """Replaces all occurrences of ``from_val`` with ``to_val`` in the ``string``.""" + return Expr(f.replace(string.expr, from_val.expr, to_val.expr)) + + +def reverse(arg: Expr) -> Expr: + """Reverse the string argument.""" + return Expr(f.reverse(arg.expr)) + + +def right(string: Expr, n: Expr) -> Expr: + """Returns the last ``n`` characters in the ``string``.""" + return Expr(f.right(string.expr, n.expr)) + + +def round(value: Expr, decimal_places: Expr = Expr.literal(0)) -> Expr: + """Round the argument to the nearest integer. + + If the optional ``decimal_places`` is specified, round to the nearest number of + decimal places. You can specify a negative number of decimal places. For example + ``round(lit(125.2345), lit(-2))`` would yield a value of ``100.0``. + """ + return Expr(f.round(value.expr, decimal_places.expr)) + + +def rpad(string: Expr, count: Expr, characters: Expr | None = None) -> Expr: + """Add right padding to a string. + + Extends the string to length length by appending the characters fill (a space + by default). If the string is already longer than length then it is truncated. 
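+
+    For example (values are illustrative)::
+
+        # expected to yield "ab***", padding on the right to length 5
+        expr = rpad(lit("ab"), lit(5), lit("*"))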
+ """ + characters = characters if characters is not None else Expr.literal(" ") + return Expr(f.rpad(string.expr, count.expr, characters.expr)) + + +def rtrim(arg: Expr) -> Expr: + """Removes all characters, spaces by default, from the end of a string.""" + return Expr(f.rtrim(arg.expr)) + + +def sha224(arg: Expr) -> Expr: + """Computes the SHA-224 hash of a binary string.""" + return Expr(f.sha224(arg.expr)) + + +def sha256(arg: Expr) -> Expr: + """Computes the SHA-256 hash of a binary string.""" + return Expr(f.sha256(arg.expr)) + + +def sha384(arg: Expr) -> Expr: + """Computes the SHA-384 hash of a binary string.""" + return Expr(f.sha384(arg.expr)) + + +def sha512(arg: Expr) -> Expr: + """Computes the SHA-512 hash of a binary string.""" + return Expr(f.sha512(arg.expr)) + + +def signum(arg: Expr) -> Expr: + """Returns the sign of the argument (-1, 0, +1).""" + return Expr(f.signum(arg.expr)) + + +def sin(arg: Expr) -> Expr: + """Returns the sine of the argument.""" + return Expr(f.sin(arg.expr)) + + +def sinh(arg: Expr) -> Expr: + """Returns the hyperbolic sine of the argument.""" + return Expr(f.sinh(arg.expr)) + + +def split_part(string: Expr, delimiter: Expr, index: Expr) -> Expr: + """Split a string and return one part. + + Splits a string based on a delimiter and picks out the desired field based + on the index. + """ + return Expr(f.split_part(string.expr, delimiter.expr, index.expr)) + + +def sqrt(arg: Expr) -> Expr: + """Returns the square root of the argument.""" + return Expr(f.sqrt(arg.expr)) + + +def starts_with(string: Expr, prefix: Expr) -> Expr: + """Returns true if string starts with prefix.""" + return Expr(f.starts_with(string.expr, prefix.expr)) + + +def strpos(string: Expr, substring: Expr) -> Expr: + """Finds the position from where the ``substring`` matches the ``string``.""" + return Expr(f.strpos(string.expr, substring.expr)) + + +def substr(string: Expr, position: Expr) -> Expr: + """Substring from the ``position`` to the end.""" + return Expr(f.substr(string.expr, position.expr)) + + +def substr_index(string: Expr, delimiter: Expr, count: Expr) -> Expr: + """Returns an indexed substring. + + The return will be the ``string`` from before ``count`` occurrences of + ``delimiter``. + """ + return Expr(f.substr_index(string.expr, delimiter.expr, count.expr)) + + +def substring(string: Expr, position: Expr, length: Expr) -> Expr: + """Substring from the ``position`` with ``length`` characters.""" + return Expr(f.substring(string.expr, position.expr, length.expr)) + + +def tan(arg: Expr) -> Expr: + """Returns the tangent of the argument.""" + return Expr(f.tan(arg.expr)) + + +def tanh(arg: Expr) -> Expr: + """Returns the hyperbolic tangent of the argument.""" + return Expr(f.tanh(arg.expr)) + + +def to_hex(arg: Expr) -> Expr: + """Converts an integer to a hexadecimal string.""" + return Expr(f.to_hex(arg.expr)) + + +def now() -> Expr: + """Returns the current timestamp in nanoseconds. + + This will use the same value for all instances of now() in same statement. + """ + return Expr(f.now()) + + +def to_timestamp(arg: Expr, *formatters: Expr) -> Expr: + """Converts a string and optional formats to a ``Timestamp`` in nanoseconds. + + For usage of ``formatters`` see the rust chrono package ``strftime`` package. 
+ + [Documentation here.](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) + """ + if formatters is None: + return f.to_timestamp(arg.expr) + + formatters = [f.expr for f in formatters] + return Expr(f.to_timestamp(arg.expr, *formatters)) + + +def to_timestamp_millis(arg: Expr, *formatters: Expr) -> Expr: + """Converts a string and optional formats to a ``Timestamp`` in milliseconds. + + See :py:func:`to_timestamp` for a description on how to use formatters. + """ + return Expr(f.to_timestamp_millis(arg.expr, *formatters)) + + +def to_timestamp_micros(arg: Expr, *formatters: Expr) -> Expr: + """Converts a string and optional formats to a ``Timestamp`` in microseconds. + + See :py:func:`to_timestamp` for a description on how to use formatters. + """ + return Expr(f.to_timestamp_micros(arg.expr, *formatters)) + + +def to_timestamp_nanos(arg: Expr, *formatters: Expr) -> Expr: + """Converts a string and optional formats to a ``Timestamp`` in nanoseconds. + + See :py:func:`to_timestamp` for a description on how to use formatters. + """ + return Expr(f.to_timestamp_nanos(arg.expr, *formatters)) + + +def to_timestamp_seconds(arg: Expr, *formatters: Expr) -> Expr: + """Converts a string and optional formats to a ``Timestamp`` in seconds. + + See :py:func:`to_timestamp` for a description on how to use formatters. + """ + return Expr(f.to_timestamp_seconds(arg.expr, *formatters)) + + +def to_unixtime(string: Expr, *format_arguments: Expr) -> Expr: + """Converts a string and optional formats to a Unixtime.""" + args = [f.expr for f in format_arguments] + return Expr(f.to_unixtime(string.expr, *args)) + + +def current_date() -> Expr: + """Returns current UTC date as a Date32 value.""" + return Expr(f.current_date()) + + +def current_time() -> Expr: + """Returns current UTC time as a Time64 value.""" + return Expr(f.current_time()) + + +def datepart(part: Expr, date: Expr) -> Expr: + """Return a specified part of a date. + + This is an alias for :py:func:`date_part`. + """ + return date_part(part, date) + + +def date_part(part: Expr, date: Expr) -> Expr: + """Extracts a subfield from the date.""" + return Expr(f.date_part(part.expr, date.expr)) + + +def date_trunc(part: Expr, date: Expr) -> Expr: + """Truncates the date to a specified level of precision.""" + return Expr(f.date_trunc(part.expr, date.expr)) + + +def datetrunc(part: Expr, date: Expr) -> Expr: + """Truncates the date to a specified level of precision. + + This is an alias for :py:func:`date_trunc`. 
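+
+    For example (column name is illustrative)::
+
+        # truncate timestamps in column "ts" to the start of their month
+        expr = datetrunc(lit("month"), col("ts"))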
+ """ + return date_trunc(part, date) + + +def date_bin(stride: Expr, source: Expr, origin: Expr) -> Expr: + """Coerces an arbitrary timestamp to the start of the nearest specified interval.""" + return Expr(f.date_bin(stride.expr, source.expr, origin.expr)) + + +def make_date(year: Expr, month: Expr, day: Expr) -> Expr: + """Make a date from year, month and day component parts.""" + return Expr(f.make_date(year.expr, month.expr, day.expr)) + + +def translate(string: Expr, from_val: Expr, to_val: Expr) -> Expr: + """Replaces the characters in ``from_val`` with the counterpart in ``to_val``.""" + return Expr(f.translate(string.expr, from_val.expr, to_val.expr)) + + +def trim(arg: Expr) -> Expr: + """Removes all characters, spaces by default, from both sides of a string.""" + return Expr(f.trim(arg.expr)) + + +def trunc(num: Expr, precision: Expr | None = None) -> Expr: + """Truncate the number toward zero with optional precision.""" + if precision is not None: + return Expr(f.trunc(num.expr, precision.expr)) + return Expr(f.trunc(num.expr)) + + +def upper(arg: Expr) -> Expr: + """Converts a string to uppercase.""" + return Expr(f.upper(arg.expr)) + + +def make_array(*args: Expr) -> Expr: + """Returns an array using the specified input expressions.""" + args = [arg.expr for arg in args] + return Expr(f.make_array(args)) + + +def array(*args: Expr) -> Expr: + """Returns an array using the specified input expressions. + + This is an alias for :py:func:`make_array`. + """ + return make_array(*args) + + +def range(start: Expr, stop: Expr, step: Expr) -> Expr: + """Create a list of values in the range between start and stop.""" + return Expr(f.range(start.expr, stop.expr, step.expr)) + + +def uuid(arg: Expr) -> Expr: + """Returns uuid v4 as a string value.""" + return Expr(f.uuid(arg.expr)) + + +def struct(*args: Expr) -> Expr: + """Returns a struct with the given arguments.""" + args = [arg.expr for arg in args] + return Expr(f.struct(*args)) + + +def named_struct(name_pairs: list[tuple[str, Expr]]) -> Expr: + """Returns a struct with the given names and arguments pairs.""" + name_pair_exprs = [[Expr.literal(pair[0]), pair[1]] for pair in name_pairs] + + # flatten + name_pairs = [x.expr for xs in name_pair_exprs for x in xs] + return Expr(f.named_struct(*name_pairs)) + + +def from_unixtime(arg: Expr) -> Expr: + """Converts an integer to RFC3339 timestamp format string.""" + return Expr(f.from_unixtime(arg.expr)) + + +def arrow_typeof(arg: Expr) -> Expr: + """Returns the Arrow type of the expression.""" + return Expr(f.arrow_typeof(arg.expr)) + + +def random() -> Expr: + """Returns a random value in the range ``0.0 <= x < 1.0``.""" + return Expr(f.random()) + + +def array_append(array: Expr, element: Expr) -> Expr: + """Appends an element to the end of an array.""" + return Expr(f.array_append(array.expr, element.expr)) + + +def array_push_back(array: Expr, element: Expr) -> Expr: + """Appends an element to the end of an array. + + This is an alias for :py:func:`array_append`. + """ + return array_append(array, element) + + +def list_append(array: Expr, element: Expr) -> Expr: + """Appends an element to the end of an array. + + This is an alias for :py:func:`array_append`. + """ + return array_append(array, element) + + +def list_push_back(array: Expr, element: Expr) -> Expr: + """Appends an element to the end of an array. + + This is an alias for :py:func:`array_append`. 
+ """ + return array_append(array, element) + + +def array_concat(*args: Expr) -> Expr: + """Concatenates the input arrays.""" + args = [arg.expr for arg in args] + return Expr(f.array_concat(args)) + + +def array_cat(*args: Expr) -> Expr: + """Concatenates the input arrays. + + This is an alias for :py:func:`array_concat`. + """ + return array_concat(*args) + + +def array_dims(array: Expr) -> Expr: + """Returns an array of the array's dimensions.""" + return Expr(f.array_dims(array.expr)) + + +def array_distinct(array: Expr) -> Expr: + """Returns distinct values from the array after removing duplicates.""" + return Expr(f.array_distinct(array.expr)) + + +def list_distinct(array: Expr) -> Expr: + """Returns distinct values from the array after removing duplicates. + + This is an alias for :py:func:`array_distinct`. + """ + return array_distinct(array) + + +def list_dims(array: Expr) -> Expr: + """Returns an array of the array's dimensions. + + This is an alias for :py:func:`array_dims`. + """ + return array_dims(array) + + +def array_element(array: Expr, n: Expr) -> Expr: + """Extracts the element with the index n from the array.""" + return Expr(f.array_element(array.expr, n.expr)) + + +def array_extract(array: Expr, n: Expr) -> Expr: + """Extracts the element with the index n from the array. + + This is an alias for :py:func:`array_element`. + """ + return array_element(array, n) + + +def list_element(array: Expr, n: Expr) -> Expr: + """Extracts the element with the index n from the array. + + This is an alias for :py:func:`array_element`. + """ + return array_element(array, n) + + +def list_extract(array: Expr, n: Expr) -> Expr: + """Extracts the element with the index n from the array. + + This is an alias for :py:func:`array_element`. + """ + return array_element(array, n) + + +def array_length(array: Expr) -> Expr: + """Returns the length of the array.""" + return Expr(f.array_length(array.expr)) + + +def list_length(array: Expr) -> Expr: + """Returns the length of the array. + + This is an alias for :py:func:`array_length`. + """ + return array_length(array) + + +def array_has(first_array: Expr, second_array: Expr) -> Expr: + """Returns true if the element appears in the first array, otherwise false.""" + return Expr(f.array_has(first_array.expr, second_array.expr)) + + +def array_has_all(first_array: Expr, second_array: Expr) -> Expr: + """Determines if there is complete overlap ``second_array`` in ``first_array``. + + Returns true if each element of the second array appears in the first array. + Otherwise, it returns false. + """ + return Expr(f.array_has_all(first_array.expr, second_array.expr)) + + +def array_has_any(first_array: Expr, second_array: Expr) -> Expr: + """Determine if there is an overlap between ``first_array`` and ``second_array``. + + Returns true if at least one element of the second array appears in the first + array. Otherwise, it returns false. + """ + return Expr(f.array_has_any(first_array.expr, second_array.expr)) + + +def array_position(array: Expr, element: Expr, index: int | None = 1) -> Expr: + """Return the position of the first occurrence of ``element`` in ``array``.""" + return Expr(f.array_position(array.expr, element.expr, index)) + + +def array_indexof(array: Expr, element: Expr, index: int | None = 1) -> Expr: + """Return the position of the first occurrence of ``element`` in ``array``. + + This is an alias for :py:func:`array_position`. 
+ """ + return array_position(array, element, index) + + +def list_position(array: Expr, element: Expr, index: int | None = 1) -> Expr: + """Return the position of the first occurrence of ``element`` in ``array``. + + This is an alias for :py:func:`array_position`. + """ + return array_position(array, element, index) + + +def list_indexof(array: Expr, element: Expr, index: int | None = 1) -> Expr: + """Return the position of the first occurrence of ``element`` in ``array``. + + This is an alias for :py:func:`array_position`. + """ + return array_position(array, element, index) + + +def array_positions(array: Expr, element: Expr) -> Expr: + """Searches for an element in the array and returns all occurrences.""" + return Expr(f.array_positions(array.expr, element.expr)) + + +def list_positions(array: Expr, element: Expr) -> Expr: + """Searches for an element in the array and returns all occurrences. + + This is an alias for :py:func:`array_positions`. + """ + return array_positions(array, element) + + +def array_ndims(array: Expr) -> Expr: + """Returns the number of dimensions of the array.""" + return Expr(f.array_ndims(array.expr)) + + +def list_ndims(array: Expr) -> Expr: + """Returns the number of dimensions of the array. + + This is an alias for :py:func:`array_ndims`. + """ + return array_ndims(array) + + +def array_prepend(element: Expr, array: Expr) -> Expr: + """Prepends an element to the beginning of an array.""" + return Expr(f.array_prepend(element.expr, array.expr)) + + +def array_push_front(element: Expr, array: Expr) -> Expr: + """Prepends an element to the beginning of an array. + + This is an alias for :py:func:`array_prepend`. + """ + return array_prepend(element, array) + + +def list_prepend(element: Expr, array: Expr) -> Expr: + """Prepends an element to the beginning of an array. + + This is an alias for :py:func:`array_prepend`. + """ + return array_prepend(element, array) + + +def list_push_front(element: Expr, array: Expr) -> Expr: + """Prepends an element to the beginning of an array. + + This is an alias for :py:func:`array_prepend`. + """ + return array_prepend(element, array) + + +def array_pop_back(array: Expr) -> Expr: + """Returns the array without the last element.""" + return Expr(f.array_pop_back(array.expr)) + + +def array_pop_front(array: Expr) -> Expr: + """Returns the array without the first element.""" + return Expr(f.array_pop_front(array.expr)) + + +def array_remove(array: Expr, element: Expr) -> Expr: + """Removes the first element from the array equal to the given value.""" + return Expr(f.array_remove(array.expr, element.expr)) + + +def list_remove(array: Expr, element: Expr) -> Expr: + """Removes the first element from the array equal to the given value. + + This is an alias for :py:func:`array_remove`. + """ + return array_remove(array, element) + + +def array_remove_n(array: Expr, element: Expr, max: Expr) -> Expr: + """Removes the first ``max`` elements from the array equal to the given value.""" + return Expr(f.array_remove_n(array.expr, element.expr, max.expr)) + + +def list_remove_n(array: Expr, element: Expr, max: Expr) -> Expr: + """Removes the first ``max`` elements from the array equal to the given value. + + This is an alias for :py:func:`array_remove_n`. 
+ """ + return array_remove_n(array, element, max) + + +def array_remove_all(array: Expr, element: Expr) -> Expr: + """Removes all elements from the array equal to the given value.""" + return Expr(f.array_remove_all(array.expr, element.expr)) + + +def list_remove_all(array: Expr, element: Expr) -> Expr: + """Removes all elements from the array equal to the given value. + + This is an alias for :py:func:`array_remove_all`. + """ + return array_remove_all(array, element) + + +def array_repeat(element: Expr, count: Expr) -> Expr: + """Returns an array containing ``element`` ``count`` times.""" + return Expr(f.array_repeat(element.expr, count.expr)) + + +def array_replace(array: Expr, from_val: Expr, to_val: Expr) -> Expr: + """Replaces the first occurrence of ``from_val`` with ``to_val``.""" + return Expr(f.array_replace(array.expr, from_val.expr, to_val.expr)) + + +def list_replace(array: Expr, from_val: Expr, to_val: Expr) -> Expr: + """Replaces the first occurrence of ``from_val`` with ``to_val``. + + This is an alias for :py:func:`array_replace`. + """ + return array_replace(array, from_val, to_val) + + +def array_replace_n(array: Expr, from_val: Expr, to_val: Expr, max: Expr) -> Expr: + """Replace ``n`` occurrences of ``from_val`` with ``to_val``. + + Replaces the first ``max`` occurrences of the specified element with another + specified element. + """ + return Expr(f.array_replace_n(array.expr, from_val.expr, to_val.expr, max.expr)) + + +def list_replace_n(array: Expr, from_val: Expr, to_val: Expr, max: Expr) -> Expr: + """Replace ``n`` occurrences of ``from_val`` with ``to_val``. + + Replaces the first ``max`` occurrences of the specified element with another + specified element. + + This is an alias for :py:func:`array_replace_n`. + """ + return array_replace_n(array, from_val, to_val, max) + + +def array_replace_all(array: Expr, from_val: Expr, to_val: Expr) -> Expr: + """Replaces all occurrences of ``from_val`` with ``to_val``.""" + return Expr(f.array_replace_all(array.expr, from_val.expr, to_val.expr)) + + +def list_replace_all(array: Expr, from_val: Expr, to_val: Expr) -> Expr: + """Replaces all occurrences of ``from_val`` with ``to_val``. + + This is an alias for :py:func:`array_replace_all`. + """ + return array_replace_all(array, from_val, to_val) + + +def array_sort(array: Expr, descending: bool = False, null_first: bool = False) -> Expr: + """Sort an array. + + Args: + array: The input array to sort. + descending: If True, sorts in descending order. + null_first: If True, nulls will be returned at the beginning of the array. + """ + desc = "DESC" if descending else "ASC" + nulls_first = "NULLS FIRST" if null_first else "NULLS LAST" + return Expr( + f.array_sort( + array.expr, Expr.literal(desc).expr, Expr.literal(nulls_first).expr + ) + ) + + +def list_sort(array: Expr, descending: bool = False, null_first: bool = False) -> Expr: + """This is an alias for :py:func:`array_sort`.""" + return array_sort(array, descending=descending, null_first=null_first) + + +def array_slice( + array: Expr, begin: Expr, end: Expr, stride: Expr | None = None +) -> Expr: + """Returns a slice of the array.""" + if stride is not None: + stride = stride.expr + return Expr(f.array_slice(array.expr, begin.expr, end.expr, stride)) + + +def list_slice(array: Expr, begin: Expr, end: Expr, stride: Expr | None = None) -> Expr: + """Returns a slice of the array. + + This is an alias for :py:func:`array_slice`. 
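A short sketch of how the array construction, sorting, and slicing helpers above compose, again assuming the `col`/`lit` re-exports mirror upstream datafusion-python:

# Illustrative sketch; `col`/`lit` re-exports and column names are assumptions.
from denormalized.datafusion import col, lit
from denormalized.datafusion import functions as f

arr = f.make_array(col("a"), col("b"), col("c"))
sorted_arr = f.array_sort(arr, descending=True)       # largest values first
top_two = f.array_slice(sorted_arr, lit(1), lit(2))   # indices are 1-based and inclusive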
+ """ + return array_slice(array, begin, end, stride) + + +def array_intersect(array1: Expr, array2: Expr) -> Expr: + """Returns the intersection of ``array1`` and ``array2``.""" + return Expr(f.array_intersect(array1.expr, array2.expr)) + + +def list_intersect(array1: Expr, array2: Expr) -> Expr: + """Returns an the intersection of ``array1`` and ``array2``. + + This is an alias for :py:func:`array_intersect`. + """ + return array_intersect(array1, array2) + + +def array_union(array1: Expr, array2: Expr) -> Expr: + """Returns an array of the elements in the union of array1 and array2. + + Duplicate rows will not be returned. + """ + return Expr(f.array_union(array1.expr, array2.expr)) + + +def list_union(array1: Expr, array2: Expr) -> Expr: + """Returns an array of the elements in the union of array1 and array2. + + Duplicate rows will not be returned. + + This is an alias for :py:func:`array_union`. + """ + return array_union(array1, array2) + + +def array_except(array1: Expr, array2: Expr) -> Expr: + """Returns the elements that appear in ``array1`` but not in ``array2``.""" + return Expr(f.array_except(array1.expr, array2.expr)) + + +def list_except(array1: Expr, array2: Expr) -> Expr: + """Returns the elements that appear in ``array1`` but not in the ``array2``. + + This is an alias for :py:func:`array_except`. + """ + return array_except(array1, array2) + + +def array_resize(array: Expr, size: Expr, value: Expr) -> Expr: + """Returns an array with the specified size filled. + + If ``size`` is greater than the ``array`` length, the additional entries will + be filled with the given ``value``. + """ + return Expr(f.array_resize(array.expr, size.expr, value.expr)) + + +def list_resize(array: Expr, size: Expr, value: Expr) -> Expr: + """Returns an array with the specified size filled. + + If ``size`` is greater than the ``array`` length, the additional entries will be + filled with the given ``value``. This is an alias for :py:func:`array_resize`. + """ + return array_resize(array, size, value) + + +def flatten(array: Expr) -> Expr: + """Flattens an array of arrays into a single array.""" + return Expr(f.flatten(array.expr)) + + +# aggregate functions +def approx_distinct( + expression: Expr, + filter: Optional[Expr] = None, +) -> Expr: + """Returns the approximate number of distinct values. + + This aggregate function is similar to :py:func:`count` with distinct set, but it + will approximate the number of distinct entries. It may return significantly faster + than :py:func:`count` for some DataFrames. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. + + Args: + expression: Values to check for distinct entries + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + + return Expr(f.approx_distinct(expression.expr, filter=filter_raw)) + + +def approx_median(expression: Expr, filter: Optional[Expr] = None) -> Expr: + """Returns the approximate median value. + + This aggregate function is similar to :py:func:`median`, but it will only + approximate the median. It may return significantly faster for some DataFrames. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by`` and ``null_treatment``, and ``distinct``. 
+
+    Args:
+        expression: Values to find the median for
+        filter: If provided, only compute against rows for which the filter is True
+    """
+    filter_raw = filter.expr if filter is not None else None
+    return Expr(f.approx_median(expression.expr, filter=filter_raw))
+
+
+def approx_percentile_cont(
+    expression: Expr,
+    percentile: float,
+    num_centroids: Optional[int] = None,
+    filter: Optional[Expr] = None,
+) -> Expr:
+    """Returns the value that is approximately at a given percentile of ``expression``.
+
+    This aggregate function assumes the input values form a continuous distribution.
+    Suppose you have a DataFrame which consists of 100 different test scores. If you
+    called this function with a percentile of 0.9, it would return the value of the
+    test score that is above 90% of the other test scores. The returned value may be
+    between two of the values.
+
+    This function uses the [t-digest](https://arxiv.org/abs/1902.04023) algorithm to
+    compute the percentile. You can limit the number of bins used in this algorithm by
+    setting the ``num_centroids`` parameter.
+
+    If using the builder functions described in ref:`_aggregation` this function ignores
+    the options ``order_by``, ``null_treatment``, and ``distinct``.
+
+    Args:
+        expression: Values for which to find the approximate percentile
+        percentile: This must be between 0.0 and 1.0, inclusive
+        num_centroids: Max bin size for the t-digest algorithm
+        filter: If provided, only compute against rows for which the filter is True
+    """
+    filter_raw = filter.expr if filter is not None else None
+    return Expr(
+        f.approx_percentile_cont(
+            expression.expr, percentile, num_centroids=num_centroids, filter=filter_raw
+        )
+    )
+
+
+def approx_percentile_cont_with_weight(
+    expression: Expr, weight: Expr, percentile: float, filter: Optional[Expr] = None
+) -> Expr:
+    """Returns the value of the weighted approximate percentile.
+
+    This aggregate function is similar to :py:func:`approx_percentile_cont` except that
+    it uses the associated weights.
+
+    If using the builder functions described in ref:`_aggregation` this function ignores
+    the options ``order_by``, ``null_treatment``, and ``distinct``.
+
+    Args:
+        expression: Values for which to find the approximate percentile
+        weight: Relative weight for each of the values in ``expression``
+        percentile: This must be between 0.0 and 1.0, inclusive
+        filter: If provided, only compute against rows for which the filter is True
+
+    """
+    filter_raw = filter.expr if filter is not None else None
+    return Expr(
+        f.approx_percentile_cont_with_weight(
+            expression.expr, weight.expr, percentile, filter=filter_raw
+        )
+    )
+
+
+def array_agg(
+    expression: Expr,
+    distinct: bool = False,
+    filter: Optional[Expr] = None,
+    order_by: Optional[list[Expr | SortExpr]] = None,
+) -> Expr:
+    """Aggregate values into an array.
+
+    Currently ``distinct`` and ``order_by`` cannot be used together. As a work around,
+    consider :py:func:`array_sort` after aggregation.
+    [Issue Tracker](https://github.com/apache/datafusion/issues/12371)
+
+    If using the builder functions described in ref:`_aggregation` this function ignores
+    the option ``null_treatment``.
+ + Args: + expression: Values to combine into an array + distinct: If True, a single entry for each distinct value will be in the result + filter: If provided, only compute against rows for which the filter is True + order_by: Order the resultant array values + """ + order_by_raw = sort_list_to_raw_sort_list(order_by) + filter_raw = filter.expr if filter is not None else None + + return Expr( + f.array_agg( + expression.expr, distinct=distinct, filter=filter_raw, order_by=order_by_raw + ) + ) + + +def avg( + expression: Expr, + filter: Optional[Expr] = None, +) -> Expr: + """Returns the average value. + + This aggregate function expects a numeric expression and will return a float. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. + + Args: + expression: Values to combine into an array + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + return Expr(f.avg(expression.expr, filter=filter_raw)) + + +def corr(value_y: Expr, value_x: Expr, filter: Optional[Expr] = None) -> Expr: + """Returns the correlation coefficient between ``value1`` and ``value2``. + + This aggregate function expects both values to be numeric and will return a float. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. + + Args: + value_y: The dependent variable for correlation + value_x: The independent variable for correlation + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + return Expr(f.corr(value_y.expr, value_x.expr, filter=filter_raw)) + + +def count( + expressions: Expr | list[Expr] | None = None, + distinct: bool = False, + filter: Optional[Expr] = None, +) -> Expr: + """Returns the number of rows that match the given arguments. + + This aggregate function will count the non-null rows provided in the expression. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by`` and ``null_treatment``. + + Args: + expressions: Argument to perform bitwise calculation on + distinct: If True, a single entry for each distinct value will be in the result + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + + if expressions is None: + args = [Expr.literal(1).expr] + elif isinstance(expressions, list): + args = [arg.expr for arg in expressions] + else: + args = [expressions.expr] + + return Expr(f.count(*args, distinct=distinct, filter=filter_raw)) + + +def covar_pop(value_y: Expr, value_x: Expr, filter: Optional[Expr] = None) -> Expr: + """Computes the population covariance. + + This aggregate function expects both values to be numeric and will return a float. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. 
+ + Args: + value_y: The dependent variable for covariance + value_x: The independent variable for covariance + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + return Expr(f.covar_pop(value_y.expr, value_x.expr, filter=filter_raw)) + + +def covar_samp(value_y: Expr, value_x: Expr, filter: Optional[Expr] = None) -> Expr: + """Computes the sample covariance. + + This aggregate function expects both values to be numeric and will return a float. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. + + Args: + value_y: The dependent variable for covariance + value_x: The independent variable for covariance + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + return Expr(f.covar_samp(value_y.expr, value_x.expr, filter=filter_raw)) + + +def covar(value_y: Expr, value_x: Expr, filter: Optional[Expr] = None) -> Expr: + """Computes the sample covariance. + + This is an alias for :py:func:`covar_samp`. + """ + return covar_samp(value_y, value_x, filter) + + +def max(expression: Expr, filter: Optional[Expr] = None) -> Expr: + """Aggregate function that returns the maximum value of the argument. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. + + Args: + expression: The value to find the maximum of + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + return Expr(f.max(expression.expr, filter=filter_raw)) + + +def mean(expression: Expr, filter: Optional[Expr] = None) -> Expr: + """Returns the average (mean) value of the argument. + + This is an alias for :py:func:`avg`. + """ + return avg(expression, filter) + + +def median( + expression: Expr, distinct: bool = False, filter: Optional[Expr] = None +) -> Expr: + """Computes the median of a set of numbers. + + This aggregate function returns the median value of the expression for the given + aggregate function. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by`` and ``null_treatment``. + + Args: + expression: The value to compute the median of + distinct: If True, a single entry for each distinct value will be in the result + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + return Expr(f.median(expression.expr, distinct=distinct, filter=filter_raw)) + + +def min(expression: Expr, filter: Optional[Expr] = None) -> Expr: + """Returns the minimum value of the argument. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. + + Args: + expression: The value to find the minimum of + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + return Expr(f.min(expression.expr, filter=filter_raw)) + + +def sum( + expression: Expr, + filter: Optional[Expr] = None, +) -> Expr: + """Computes the sum of a set of numbers. + + This aggregate function expects a numeric expression. 
+
+    If using the builder functions described in ref:`_aggregation` this function ignores
+    the options ``order_by``, ``null_treatment``, and ``distinct``.
+
+    Args:
+        expression: Values to sum
+        filter: If provided, only compute against rows for which the filter is True
+    """
+    filter_raw = filter.expr if filter is not None else None
+    return Expr(f.sum(expression.expr, filter=filter_raw))
+
+
+def stddev(expression: Expr, filter: Optional[Expr] = None) -> Expr:
+    """Computes the standard deviation of the argument.
+
+    If using the builder functions described in ref:`_aggregation` this function ignores
+    the options ``order_by``, ``null_treatment``, and ``distinct``.
+
+    Args:
+        expression: The values to compute the standard deviation of
+        filter: If provided, only compute against rows for which the filter is True
+    """
+    filter_raw = filter.expr if filter is not None else None
+    return Expr(f.stddev(expression.expr, filter=filter_raw))
+
+
+def stddev_pop(expression: Expr, filter: Optional[Expr] = None) -> Expr:
+    """Computes the population standard deviation of the argument.
+
+    If using the builder functions described in ref:`_aggregation` this function ignores
+    the options ``order_by``, ``null_treatment``, and ``distinct``.
+
+    Args:
+        expression: The values to compute the population standard deviation of
+        filter: If provided, only compute against rows for which the filter is True
+    """
+    filter_raw = filter.expr if filter is not None else None
+    return Expr(f.stddev_pop(expression.expr, filter=filter_raw))
+
+
+def stddev_samp(arg: Expr, filter: Optional[Expr] = None) -> Expr:
+    """Computes the sample standard deviation of the argument.
+
+    This is an alias for :py:func:`stddev`.
+    """
+    return stddev(arg, filter=filter)
+
+
+def var(expression: Expr, filter: Optional[Expr] = None) -> Expr:
+    """Computes the sample variance of the argument.
+
+    This is an alias for :py:func:`var_samp`.
+    """
+    return var_samp(expression, filter)
+
+
+def var_pop(expression: Expr, filter: Optional[Expr] = None) -> Expr:
+    """Computes the population variance of the argument.
+
+    If using the builder functions described in ref:`_aggregation` this function ignores
+    the options ``order_by``, ``null_treatment``, and ``distinct``.
+
+    Args:
+        expression: The variable to compute the variance for
+        filter: If provided, only compute against rows for which the filter is True
+    """
+    filter_raw = filter.expr if filter is not None else None
+    return Expr(f.var_pop(expression.expr, filter=filter_raw))
+
+
+def var_samp(expression: Expr, filter: Optional[Expr] = None) -> Expr:
+    """Computes the sample variance of the argument.
+
+    If using the builder functions described in ref:`_aggregation` this function ignores
+    the options ``order_by``, ``null_treatment``, and ``distinct``.
+
+    Args:
+        expression: The variable to compute the variance for
+        filter: If provided, only compute against rows for which the filter is True
+    """
+    filter_raw = filter.expr if filter is not None else None
+    return Expr(f.var_sample(expression.expr, filter=filter_raw))
+
+
+def var_sample(expression: Expr, filter: Optional[Expr] = None) -> Expr:
+    """Computes the sample variance of the argument.
+
+    This is an alias for :py:func:`var_samp`.
+    """
+    return var_samp(expression, filter)
+
+
+def regr_avgx(
+    y: Expr,
+    x: Expr,
+    filter: Optional[Expr] = None,
+) -> Expr:
+    """Computes the average of the independent variable ``x``.
+
+    This is a linear regression aggregate function. Only non-null pairs of the inputs
+    are evaluated.
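To make the ``filter`` and ``distinct`` arguments concrete, a small sketch combining the aggregate helpers above (the `col`/`lit` re-exports, column names, and the comparison operator on ``Expr`` are assumptions based on upstream datafusion-python):

# Illustrative sketch; imports and column names are assumptions.
from denormalized.datafusion import col, lit
from denormalized.datafusion import functions as f

large_orders = col("amount") > lit(100.0)                  # boolean filter expression
total = f.sum(col("amount"), filter=large_orders)          # sum only the large orders
spread = f.stddev(col("amount"))                           # sample standard deviation
n_customers = f.count(col("customer_id"), distinct=True)   # distinct count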
+ + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. + + Args: + y: The linear regression dependent variable + x: The linear regression independent variable + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + + return Expr(f.regr_avgx(y.expr, x.expr, filter=filter_raw)) + + +def regr_avgy( + y: Expr, + x: Expr, + filter: Optional[Expr] = None, +) -> Expr: + """Computes the average of the dependent variable ``y``. + + This is a linear regression aggregate function. Only non-null pairs of the inputs + are evaluated. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. + + Args: + y: The linear regression dependent variable + x: The linear regression independent variable + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + + return Expr(f.regr_avgy(y.expr, x.expr, filter=filter_raw)) + + +def regr_count( + y: Expr, + x: Expr, + filter: Optional[Expr] = None, +) -> Expr: + """Counts the number of rows in which both expressions are not null. + + This is a linear regression aggregate function. Only non-null pairs of the inputs + are evaluated. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. + + Args: + y: The linear regression dependent variable + x: The linear regression independent variable + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + + return Expr(f.regr_count(y.expr, x.expr, filter=filter_raw)) + + +def regr_intercept( + y: Expr, + x: Expr, + filter: Optional[Expr] = None, +) -> Expr: + """Computes the intercept from the linear regression. + + This is a linear regression aggregate function. Only non-null pairs of the inputs + are evaluated. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. + + Args: + y: The linear regression dependent variable + x: The linear regression independent variable + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + + return Expr(f.regr_intercept(y.expr, x.expr, filter=filter_raw)) + + +def regr_r2( + y: Expr, + x: Expr, + filter: Optional[Expr] = None, +) -> Expr: + """Computes the R-squared value from linear regression. + + This is a linear regression aggregate function. Only non-null pairs of the inputs + are evaluated. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. + + Args: + y: The linear regression dependent variable + x: The linear regression independent variable + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + + return Expr(f.regr_r2(y.expr, x.expr, filter=filter_raw)) + + +def regr_slope( + y: Expr, + x: Expr, + filter: Optional[Expr] = None, +) -> Expr: + """Computes the slope from linear regression. + + This is a linear regression aggregate function. 
Only non-null pairs of the inputs + are evaluated. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. + + Args: + y: The linear regression dependent variable + x: The linear regression independent variable + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + + return Expr(f.regr_slope(y.expr, x.expr, filter=filter_raw)) + + +def regr_sxx( + y: Expr, + x: Expr, + filter: Optional[Expr] = None, +) -> Expr: + """Computes the sum of squares of the independent variable ``x``. + + This is a linear regression aggregate function. Only non-null pairs of the inputs + are evaluated. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. + + Args: + y: The linear regression dependent variable + x: The linear regression independent variable + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + + return Expr(f.regr_sxx(y.expr, x.expr, filter=filter_raw)) + + +def regr_sxy( + y: Expr, + x: Expr, + filter: Optional[Expr] = None, +) -> Expr: + """Computes the sum of products of pairs of numbers. + + This is a linear regression aggregate function. Only non-null pairs of the inputs + are evaluated. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. + + Args: + y: The linear regression dependent variable + x: The linear regression independent variable + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + + return Expr(f.regr_sxy(y.expr, x.expr, filter=filter_raw)) + + +def regr_syy( + y: Expr, + x: Expr, + filter: Optional[Expr] = None, +) -> Expr: + """Computes the sum of squares of the dependent variable ``y``. + + This is a linear regression aggregate function. Only non-null pairs of the inputs + are evaluated. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. + + Args: + y: The linear regression dependent variable + x: The linear regression independent variable + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + + return Expr(f.regr_syy(y.expr, x.expr, filter=filter_raw)) + + +def first_value( + expression: Expr, + filter: Optional[Expr] = None, + order_by: Optional[list[Expr | SortExpr]] = None, + null_treatment: NullTreatment = NullTreatment.RESPECT_NULLS, +) -> Expr: + """Returns the first value in a group of values. + + This aggregate function will return the first value in the partition. + + If using the builder functions described in ref:`_aggregation` this function ignores + the option ``distinct``. + + Args: + expression: Argument to perform bitwise calculation on + filter: If provided, only compute against rows for which the filter is True + order_by: Set the ordering of the expression to evaluate + null_treatment: Assign whether to respect or ignull null values. 
+    """
+    order_by_raw = sort_list_to_raw_sort_list(order_by)
+    filter_raw = filter.expr if filter is not None else None
+
+    return Expr(
+        f.first_value(
+            expression.expr,
+            filter=filter_raw,
+            order_by=order_by_raw,
+            null_treatment=null_treatment.value,
+        )
+    )
+
+
+def last_value(
+    expression: Expr,
+    filter: Optional[Expr] = None,
+    order_by: Optional[list[Expr | SortExpr]] = None,
+    null_treatment: NullTreatment = NullTreatment.RESPECT_NULLS,
+) -> Expr:
+    """Returns the last value in a group of values.
+
+    This aggregate function will return the last value in the partition.
+
+    If using the builder functions described in ref:`_aggregation` this function ignores
+    the option ``distinct``.
+
+    Args:
+        expression: Values from which to return the last
+        filter: If provided, only compute against rows for which the filter is True
+        order_by: Set the ordering of the expression to evaluate
+        null_treatment: Assign whether to respect or ignore null values.
+    """
+    order_by_raw = sort_list_to_raw_sort_list(order_by)
+    filter_raw = filter.expr if filter is not None else None
+
+    return Expr(
+        f.last_value(
+            expression.expr,
+            filter=filter_raw,
+            order_by=order_by_raw,
+            null_treatment=null_treatment.value,
+        )
+    )
+
+
+def nth_value(
+    expression: Expr,
+    n: int,
+    filter: Optional[Expr] = None,
+    order_by: Optional[list[Expr | SortExpr]] = None,
+    null_treatment: NullTreatment = NullTreatment.RESPECT_NULLS,
+) -> Expr:
+    """Returns the n-th value in a group of values.
+
+    This aggregate function will return the n-th value in the partition.
+
+    If using the builder functions described in ref:`_aggregation` this function ignores
+    the option ``distinct``.
+
+    Args:
+        expression: Values from which to return the n-th
+        n: Index of value to return. Starts at 1.
+        filter: If provided, only compute against rows for which the filter is True
+        order_by: Set the ordering of the expression to evaluate
+        null_treatment: Assign whether to respect or ignore null values.
+    """
+    order_by_raw = sort_list_to_raw_sort_list(order_by)
+    filter_raw = filter.expr if filter is not None else None
+
+    return Expr(
+        f.nth_value(
+            expression.expr,
+            n,
+            filter=filter_raw,
+            order_by=order_by_raw,
+            null_treatment=null_treatment.value,
+        )
+    )
+
+
+def bit_and(expression: Expr, filter: Optional[Expr] = None) -> Expr:
+    """Computes the bitwise AND of the argument.
+
+    This aggregate function will bitwise compare every value in the input partition.
+
+    If using the builder functions described in ref:`_aggregation` this function ignores
+    the options ``order_by``, ``null_treatment``, and ``distinct``.
+
+    Args:
+        expression: Argument to perform bitwise calculation on
+        filter: If provided, only compute against rows for which the filter is True
+    """
+    filter_raw = filter.expr if filter is not None else None
+    return Expr(f.bit_and(expression.expr, filter=filter_raw))
+
+
+def bit_or(expression: Expr, filter: Optional[Expr] = None) -> Expr:
+    """Computes the bitwise OR of the argument.
+
+    This aggregate function will bitwise compare every value in the input partition.
+
+    If using the builder functions described in ref:`_aggregation` this function ignores
+    the options ``order_by``, ``null_treatment``, and ``distinct``.
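A short sketch of the ordered-value aggregates above (``first_value``, ``last_value``, ``nth_value``). It assumes ``Expr.sort()`` builds a ``SortExpr`` as in upstream datafusion-python and that `col` is re-exported; column names are illustrative:

# Illustrative sketch; Expr.sort() and the `col` re-export are assumptions.
from denormalized.datafusion import col
from denormalized.datafusion import functions as f

by_time = [col("event_time").sort(ascending=True)]
opening = f.first_value(col("price"), order_by=by_time)
closing = f.last_value(col("price"), order_by=by_time)
second = f.nth_value(col("price"), 2, order_by=by_time)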
+ + Args: + expression: Argument to perform bitwise calculation on + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + return Expr(f.bit_or(expression.expr, filter=filter_raw)) + + +def bit_xor( + expression: Expr, distinct: bool = False, filter: Optional[Expr] = None +) -> Expr: + """Computes the bitwise XOR of the argument. + + This aggregate function will bitwise compare every value in the input partition. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by`` and ``null_treatment``. + + Args: + expression: Argument to perform bitwise calculation on + distinct: If True, evaluate each unique value of expression only once + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + return Expr(f.bit_xor(expression.expr, distinct=distinct, filter=filter_raw)) + + +def bool_and(expression: Expr, filter: Optional[Expr] = None) -> Expr: + """Computes the boolean AND of the argument. + + This aggregate function will compare every value in the input partition. These are + expected to be boolean values. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. + + Args: + expression: Argument to perform calculation on + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + return Expr(f.bool_and(expression.expr, filter=filter_raw)) + + +def bool_or(expression: Expr, filter: Optional[Expr] = None) -> Expr: + """Computes the boolean OR of the argument. + + This aggregate function will compare every value in the input partition. These are + expected to be boolean values. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``order_by``, ``null_treatment``, and ``distinct``. + + Args: + expression: Argument to perform calculation on + filter: If provided, only compute against rows for which the filter is True + """ + filter_raw = filter.expr if filter is not None else None + return Expr(f.bool_or(expression.expr, filter=filter_raw)) + + +def lead( + arg: Expr, + shift_offset: int = 1, + default_value: Optional[Any] = None, + partition_by: Optional[list[Expr]] = None, + order_by: Optional[list[Expr | SortExpr]] = None, +) -> Expr: + """Create a lead window function. + + Lead operation will return the argument that is in the next shift_offset-th row in + the partition. For example ``lead(col("b"), shift_offset=3, default_value=5)`` will + return the 3rd following value in column ``b``. At the end of the partition, where + no futher values can be returned it will return the default value of 5. + + Here is an example of both the ``lead`` and :py:func:`datafusion.functions.lag` + functions on a simple DataFrame:: + + +--------+------+-----+ + | points | lead | lag | + +--------+------+-----+ + | 100 | 100 | | + | 100 | 50 | 100 | + | 50 | 25 | 100 | + | 25 | | 50 | + +--------+------+-----+ + + To set window function parameters use the window builder approach described in the + ref:`_window_functions` online documentation. + + Args: + arg: Value to return + shift_offset: Number of rows following the current row. + default_value: Value to return if shift_offet row does not exist. + partition_by: Expressions to partition the window frame on. 
+ order_by: Set ordering within the window frame. + """ + if not isinstance(default_value, pa.Scalar) and default_value is not None: + default_value = pa.scalar(default_value) + + partition_cols = ( + [col.expr for col in partition_by] if partition_by is not None else None + ) + order_by_raw = sort_list_to_raw_sort_list(order_by) + + return Expr( + f.lead( + arg.expr, + shift_offset, + default_value, + partition_by=partition_cols, + order_by=order_by_raw, + ) + ) + + +def lag( + arg: Expr, + shift_offset: int = 1, + default_value: Optional[Any] = None, + partition_by: Optional[list[Expr]] = None, + order_by: Optional[list[Expr | SortExpr]] = None, +) -> Expr: + """Create a lag window function. + + Lag operation will return the argument that is in the previous shift_offset-th row + in the partition. For example ``lag(col("b"), shift_offset=3, default_value=5)`` + will return the 3rd previous value in column ``b``. At the beginnig of the + partition, where no values can be returned it will return the default value of 5. + + Here is an example of both the ``lag`` and :py:func:`datafusion.functions.lead` + functions on a simple DataFrame:: + + +--------+------+-----+ + | points | lead | lag | + +--------+------+-----+ + | 100 | 100 | | + | 100 | 50 | 100 | + | 50 | 25 | 100 | + | 25 | | 50 | + +--------+------+-----+ + + Args: + arg: Value to return + shift_offset: Number of rows before the current row. + default_value: Value to return if shift_offet row does not exist. + partition_by: Expressions to partition the window frame on. + order_by: Set ordering within the window frame. + """ + if not isinstance(default_value, pa.Scalar): + default_value = pa.scalar(default_value) + + partition_cols = ( + [col.expr for col in partition_by] if partition_by is not None else None + ) + order_by_raw = sort_list_to_raw_sort_list(order_by) + + return Expr( + f.lag( + arg.expr, + shift_offset, + default_value, + partition_by=partition_cols, + order_by=order_by_raw, + ) + ) + + +def row_number( + partition_by: Optional[list[Expr]] = None, + order_by: Optional[list[Expr | SortExpr]] = None, +) -> Expr: + """Create a row number window function. + + Returns the row number of the window function. + + Here is an example of the ``row_number`` on a simple DataFrame:: + + +--------+------------+ + | points | row number | + +--------+------------+ + | 100 | 1 | + | 100 | 2 | + | 50 | 3 | + | 25 | 4 | + +--------+------------+ + + Args: + partition_by: Expressions to partition the window frame on. + order_by: Set ordering within the window frame. + """ + partition_cols = ( + [col.expr for col in partition_by] if partition_by is not None else None + ) + order_by_raw = sort_list_to_raw_sort_list(order_by) + + return Expr( + f.row_number( + partition_by=partition_cols, + order_by=order_by_raw, + ) + ) + + +def rank( + partition_by: Optional[list[Expr]] = None, + order_by: Optional[list[Expr | SortExpr]] = None, +) -> Expr: + """Create a rank window function. + + Returns the rank based upon the window order. Consecutive equal values will receive + the same rank, but the next different value will not be consecutive but rather the + number of rows that preceed it plus one. This is similar to Olympic medals. If two + people tie for gold, the next place is bronze. There would be no silver medal. Here + is an example of a dataframe with a window ordered by descending ``points`` and the + associated rank. 
+ + You should set ``order_by`` to produce meaningful results:: + + +--------+------+ + | points | rank | + +--------+------+ + | 100 | 1 | + | 100 | 1 | + | 50 | 3 | + | 25 | 4 | + +--------+------+ + + Args: + partition_by: Expressions to partition the window frame on. + order_by: Set ordering within the window frame. + """ + partition_cols = ( + [col.expr for col in partition_by] if partition_by is not None else None + ) + order_by_raw = sort_list_to_raw_sort_list(order_by) + + return Expr( + f.rank( + partition_by=partition_cols, + order_by=order_by_raw, + ) + ) + + +def dense_rank( + partition_by: Optional[list[Expr]] = None, + order_by: Optional[list[Expr | SortExpr]] = None, +) -> Expr: + """Create a dense_rank window function. + + This window function is similar to :py:func:`rank` except that the returned values + will be consecutive. Here is an example of a dataframe with a window ordered by + descending ``points`` and the associated dense rank:: + + +--------+------------+ + | points | dense_rank | + +--------+------------+ + | 100 | 1 | + | 100 | 1 | + | 50 | 2 | + | 25 | 3 | + +--------+------------+ + + Args: + partition_by: Expressions to partition the window frame on. + order_by: Set ordering within the window frame. + """ + partition_cols = ( + [col.expr for col in partition_by] if partition_by is not None else None + ) + order_by_raw = sort_list_to_raw_sort_list(order_by) + + return Expr( + f.dense_rank( + partition_by=partition_cols, + order_by=order_by_raw, + ) + ) + + +def percent_rank( + partition_by: Optional[list[Expr]] = None, + order_by: Optional[list[Expr | SortExpr]] = None, +) -> Expr: + """Create a percent_rank window function. + + This window function is similar to :py:func:`rank` except that the returned values + are the percentage from 0.0 to 1.0 from first to last. Here is an example of a + dataframe with a window ordered by descending ``points`` and the associated percent + rank:: + + +--------+--------------+ + | points | percent_rank | + +--------+--------------+ + | 100 | 0.0 | + | 100 | 0.0 | + | 50 | 0.666667 | + | 25 | 1.0 | + +--------+--------------+ + + Args: + partition_by: Expressions to partition the window frame on. + order_by: Set ordering within the window frame. + """ + partition_cols = ( + [col.expr for col in partition_by] if partition_by is not None else None + ) + order_by_raw = sort_list_to_raw_sort_list(order_by) + + return Expr( + f.percent_rank( + partition_by=partition_cols, + order_by=order_by_raw, + ) + ) + + +def cume_dist( + partition_by: Optional[list[Expr]] = None, + order_by: Optional[list[Expr | SortExpr]] = None, +) -> Expr: + """Create a cumulative distribution window function. + + This window function is similar to :py:func:`rank` except that the returned values + are the ratio of the row number to the total numebr of rows. Here is an example of a + dataframe with a window ordered by descending ``points`` and the associated + cumulative distribution:: + + +--------+-----------+ + | points | cume_dist | + +--------+-----------+ + | 100 | 0.5 | + | 100 | 0.5 | + | 50 | 0.75 | + | 25 | 1.0 | + +--------+-----------+ + + Args: + partition_by: Expressions to partition the window frame on. + order_by: Set ordering within the window frame. 
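And a corresponding sketch of the window helpers above (``lag``, ``lead``, ``rank``), under the same assumptions about the `col` re-export and ``Expr.sort()``:

# Illustrative sketch; imports, column names, and Expr.sort() are assumptions.
from denormalized.datafusion import col
from denormalized.datafusion import functions as f

order = [col("points").sort(ascending=False)]
prev_points = f.lag(col("points"), shift_offset=1, order_by=order)
next_points = f.lead(col("points"), shift_offset=1, order_by=order)
place = f.rank(partition_by=[col("team")], order_by=order)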
+ """ + partition_cols = ( + [col.expr for col in partition_by] if partition_by is not None else None + ) + order_by_raw = sort_list_to_raw_sort_list(order_by) + + return Expr( + f.cume_dist( + partition_by=partition_cols, + order_by=order_by_raw, + ) + ) + + +def ntile( + groups: int, + partition_by: Optional[list[Expr]] = None, + order_by: Optional[list[Expr | SortExpr]] = None, +) -> Expr: + """Create a n-tile window function. + + This window function orders the window frame into a give number of groups based on + the ordering criteria. It then returns which group the current row is assigned to. + Here is an example of a dataframe with a window ordered by descending ``points`` + and the associated n-tile function:: + + +--------+-------+ + | points | ntile | + +--------+-------+ + | 120 | 1 | + | 100 | 1 | + | 80 | 2 | + | 60 | 2 | + | 40 | 3 | + | 20 | 3 | + +--------+-------+ + + Args: + groups: Number of groups for the n-tile to be divided into. + partition_by: Expressions to partition the window frame on. + order_by: Set ordering within the window frame. + """ + partition_cols = ( + [col.expr for col in partition_by] if partition_by is not None else None + ) + order_by_raw = sort_list_to_raw_sort_list(order_by) + + return Expr( + f.ntile( + Expr.literal(groups).expr, + partition_by=partition_cols, + order_by=order_by_raw, + ) + ) + + +def string_agg( + expression: Expr, + delimiter: str, + filter: Optional[Expr] = None, + order_by: Optional[list[Expr | SortExpr]] = None, +) -> Expr: + """Concatenates the input strings. + + This aggregate function will concatenate input strings, ignoring null values, and + seperating them with the specified delimiter. Non-string values will be converted to + their string equivalents. + + If using the builder functions described in ref:`_aggregation` this function ignores + the options ``distinct`` and ``null_treatment``. + + Args: + expression: Argument to perform bitwise calculation on + delimiter: Text to place between each value of expression + filter: If provided, only compute against rows for which the filter is True + order_by: Set the ordering of the expression to evaluate + """ + order_by_raw = sort_list_to_raw_sort_list(order_by) + filter_raw = filter.expr if filter is not None else None + + return Expr( + f.string_agg( + expression.expr, + delimiter, + filter=filter_raw, + order_by=order_by_raw, + ) + ) diff --git a/py-denormalized/python/denormalized/datafusion/input/__init__.py b/py-denormalized/python/denormalized/datafusion/input/__init__.py new file mode 100644 index 0000000..f85ce21 --- /dev/null +++ b/py-denormalized/python/denormalized/datafusion/input/__init__.py @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""This package provides for input sources. 
+ +The primary class used within DataFusion is ``LocationInputPlugin``. +""" + +from .location import LocationInputPlugin + +__all__ = [ + LocationInputPlugin, +] diff --git a/py-denormalized/python/denormalized/datafusion/input/base.py b/py-denormalized/python/denormalized/datafusion/input/base.py new file mode 100644 index 0000000..4eba197 --- /dev/null +++ b/py-denormalized/python/denormalized/datafusion/input/base.py @@ -0,0 +1,48 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""This module provides ``BaseInputSource``. + +A user can extend this to provide a custom input source. +""" + +from abc import ABC, abstractmethod +from typing import Any + +from datafusion.common import SqlTable + + +class BaseInputSource(ABC): + """Base Input Source class. + + If a consuming library would like to provider their own InputSource this is + the class they should extend to write their own. + + Once completed the Plugin InputSource can be registered with the + SessionContext to ensure that it will be used in order + to obtain the SqlTable information from the custom datasource. + """ + + @abstractmethod + def is_correct_input(self, input_item: Any, table_name: str, **kwargs) -> bool: + """Returns `True` if the input is valid.""" + pass + + @abstractmethod + def build_table(self, input_item: Any, table_name: str, **kwarg) -> SqlTable: + """Create a table from the input source.""" + pass diff --git a/py-denormalized/python/denormalized/datafusion/input/location.py b/py-denormalized/python/denormalized/datafusion/input/location.py new file mode 100644 index 0000000..b274539 --- /dev/null +++ b/py-denormalized/python/denormalized/datafusion/input/location.py @@ -0,0 +1,89 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""The default input source for DataFusion.""" + +import os +import glob +from typing import Any + +from datafusion.common import DataTypeMap, SqlTable +from datafusion.input.base import BaseInputSource + + +class LocationInputPlugin(BaseInputSource): + """Input Plugin for everything. 
+ + This can be read in from a file (on disk, remote etc.). + """ + + def is_correct_input(self, input_item: Any, table_name: str, **kwargs): + """Returns `True` if the input is valid.""" + return isinstance(input_item, str) + + def build_table( + self, + input_file: str, + table_name: str, + **kwargs, + ) -> SqlTable: + """Create a table from the input source.""" + _, extension = os.path.splitext(input_file) + format = extension.lstrip(".").lower() + num_rows = 0 # Total number of rows in the file. Used for statistics + columns = [] + if format == "parquet": + import pyarrow.parquet as pq + + # Read the Parquet metadata + metadata = pq.read_metadata(input_file) + num_rows = metadata.num_rows + # Iterate through the schema and build the SqlTable + for col in metadata.schema: + columns.append( + ( + col.name, + DataTypeMap.from_parquet_type_str(col.physical_type), + ) + ) + elif format == "csv": + import csv + + # Consume header row and count number of rows for statistics. + # TODO: Possibly makes sense to have the eager number of rows + # calculated as a configuration since you must read the entire file + # to get that information. However, this should only be occurring + # at table creation time and therefore shouldn't + # slow down query performance. + with open(input_file, "r") as file: + reader = csv.reader(file) + header_row = next(reader) + print(header_row) + for _ in reader: + num_rows += 1 + # TODO: Need to actually consume this row into reasonable columns + raise RuntimeError("TODO: Currently unable to support CSV input files.") + else: + raise RuntimeError( + f"Input of format: `{format}` is currently not supported.\ + Only Parquet and CSV." + ) + + # Input could possibly be multiple files. Create a list if so + input_files = glob.glob(input_file) + + return SqlTable(table_name, columns, num_rows, input_files) diff --git a/py-denormalized/python/denormalized/datafusion/object_store.py b/py-denormalized/python/denormalized/datafusion/object_store.py new file mode 100644 index 0000000..3a3371e --- /dev/null +++ b/py-denormalized/python/denormalized/datafusion/object_store.py @@ -0,0 +1,35 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Object store functionality.""" + +from denormalized._internal import object_store + +AmazonS3 = object_store.AmazonS3 +GoogleCloud = object_store.GoogleCloud +LocalFileSystem = object_store.LocalFileSystem +MicrosoftAzure = object_store.MicrosoftAzure + +__all__ = [ + "AmazonS3", + "GoogleCloud", + "LocalFileSystem", + "MicrosoftAzure", +] + + +def __getattr__(name): + return getattr(object_store, name) diff --git a/py-denormalized/python/denormalized/datafusion/py.typed b/py-denormalized/python/denormalized/datafusion/py.typed new file mode 100644 index 0000000..d216be4 --- /dev/null +++ b/py-denormalized/python/denormalized/datafusion/py.typed @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. \ No newline at end of file diff --git a/py-denormalized/python/denormalized/datafusion/record_batch.py b/py-denormalized/python/denormalized/datafusion/record_batch.py new file mode 100644 index 0000000..e0e436e --- /dev/null +++ b/py-denormalized/python/denormalized/datafusion/record_batch.py @@ -0,0 +1,76 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""This module provides the classes for handling record batches. + +These are typically the result of dataframe +:py:func:`datafusion.dataframe.execute_stream` operations. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import pyarrow + import denormalized._internal as df_internal + import typing_extensions + + +class RecordBatch: + """This class is essentially a wrapper for :py:class:`pyarrow.RecordBatch`.""" + + def __init__(self, record_batch: df_internal.RecordBatch) -> None: + """This constructor is generally not called by the end user. + + See the :py:class:`RecordBatchStream` iterator for generating this class. + """ + self.record_batch = record_batch + + def to_pyarrow(self) -> pyarrow.RecordBatch: + """Convert to :py:class:`pyarrow.RecordBatch`.""" + return self.record_batch.to_pyarrow() + + +class RecordBatchStream: + """This class represents a stream of record batches. 
+
+    These are typically the result of a
+    :py:func:`~datafusion.dataframe.DataFrame.execute_stream` operation.
+    """
+
+    def __init__(self, record_batch_stream: df_internal.RecordBatchStream) -> None:
+        """This constructor is typically not called by the end user."""
+        self.rbs = record_batch_stream
+
+    def next(self) -> RecordBatch | None:
+        """See :py:func:`__next__` for the iterator function."""
+        try:
+            next_batch = next(self)
+        except StopIteration:
+            return None
+
+        return next_batch
+
+    def __next__(self) -> RecordBatch:
+        """Iterator function."""
+        next_batch = next(self.rbs)
+        return RecordBatch(next_batch)
+
+    def __iter__(self) -> typing_extensions.Self:
+        """Iterator function."""
+        return self
diff --git a/py-denormalized/python/denormalized/datafusion/udf.py b/py-denormalized/python/denormalized/datafusion/udf.py
new file mode 100644
index 0000000..c1d45f9
--- /dev/null
+++ b/py-denormalized/python/denormalized/datafusion/udf.py
@@ -0,0 +1,248 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Provides the user defined functions for evaluation of dataframes."""
+
+from __future__ import annotations
+
+import denormalized._internal as df_internal
+from datafusion.expr import Expr
+from typing import Callable, TYPE_CHECKING, TypeVar
+from abc import ABCMeta, abstractmethod
+from typing import List
+from enum import Enum
+import pyarrow
+
+if TYPE_CHECKING:
+    _R = TypeVar("_R", bound=pyarrow.DataType)
+
+
+class Volatility(Enum):
+    """Defines how stable or volatile a function is.
+
+    When setting the volatility of a function, you can either pass this
+    enumeration or a ``str``. The ``str`` equivalent is the lower case value of the
+    name (`"immutable"`, `"stable"`, or `"volatile"`).
+    """
+
+    Immutable = 1
+    """An immutable function will always return the same output when given the
+    same input.
+
+    DataFusion will attempt to inline immutable functions during planning.
+    """
+
+    Stable = 2
+    """
+    Returns the same value for a given input within a single query.
+
+    A stable function may return different values given the same input across
+    different queries but must return the same value for a given input within a
+    query. An example of this is the ``Now`` function. DataFusion will attempt to
+    inline ``Stable`` functions during planning, when possible. For the query
+    ``select col1, now() from t1``, the query might take a while to execute, but
+    the ``now()`` column will be the same for each output row because it is
+    evaluated during planning.
+    """
+
+    Volatile = 3
+    """A volatile function may change the return value from evaluation to
+    evaluation.
+
+    Multiple invocations of a volatile function may return different results
+    when used in the same query. An example of this is the random() function.
+    DataFusion cannot evaluate such functions during planning. In the query
+    ``select col1, random() from t1``, the ``random()`` function will be evaluated
+    for each output row, resulting in a unique random value for each row.
+    """
+
+    def __str__(self):
+        """Returns the string equivalent."""
+        return self.name.lower()
+
+
+class ScalarUDF:
+    """Class for performing scalar user defined functions (UDF).
+
+    Scalar UDFs operate on a row by row basis. See also :py:class:`AggregateUDF` for
+    operating on a group of rows.
+    """
+
+    def __init__(
+        self,
+        name: str | None,
+        func: Callable[..., _R],
+        input_types: list[pyarrow.DataType],
+        return_type: _R,
+        volatility: Volatility | str,
+    ) -> None:
+        """Instantiate a scalar user defined function (UDF).
+
+        See helper method :py:func:`udf` for argument details.
+        """
+        self._udf = df_internal.ScalarUDF(
+            name, func, input_types, return_type, str(volatility)
+        )
+
+    def __call__(self, *args: Expr) -> Expr:
+        """Execute the UDF.
+
+        This function is not typically called by an end user. These calls will
+        occur during the evaluation of the dataframe.
+        """
+        args = [arg.expr for arg in args]
+        return Expr(self._udf.__call__(*args))
+
+    @staticmethod
+    def udf(
+        func: Callable[..., _R],
+        input_types: list[pyarrow.DataType],
+        return_type: _R,
+        volatility: Volatility | str,
+        name: str | None = None,
+    ) -> ScalarUDF:
+        """Create a new User Defined Function.
+
+        Args:
+            func: A callable python function.
+            input_types: The data types of the arguments to ``func``. This list
+                must be of the same length as the number of arguments.
+            return_type: The data type of the return value from the python
+                function.
+            volatility: See ``Volatility`` for allowed values.
+            name: A descriptive name for the function.
+
+        Returns:
+            A user defined scalar function, which can be used in expressions
+            against a dataframe or data stream.
+        """
+        if not callable(func):
+            raise TypeError("`func` argument must be callable")
+        if name is None:
+            name = func.__qualname__.lower()
+        return ScalarUDF(
+            name=name,
+            func=func,
+            input_types=input_types,
+            return_type=return_type,
+            volatility=volatility,
+        )
+
+
+class Accumulator(metaclass=ABCMeta):
+    """Defines how an :py:class:`AggregateUDF` accumulates values."""
+
+    @abstractmethod
+    def state(self) -> List[pyarrow.Scalar]:
+        """Return the current state."""
+        pass
+
+    @abstractmethod
+    def update(self, values: pyarrow.Array) -> None:
+        """Evaluate an array of values and update state."""
+        pass
+
+    @abstractmethod
+    def merge(self, states: List[pyarrow.Array]) -> None:
+        """Merge a set of states."""
+        pass
+
+    @abstractmethod
+    def evaluate(self) -> pyarrow.Scalar:
+        """Return the resultant value."""
+        pass
+
+
+if TYPE_CHECKING:
+    _A = TypeVar("_A", bound=(Callable[..., _R], Accumulator))
+
+
+class AggregateUDF:
+    """Class for performing aggregate user defined functions (UDAF).
+
+    Aggregate UDFs operate on a group of rows and return a single value. See
+    also :py:class:`ScalarUDF` for operating on a row by row basis.
+    """
+
+    def __init__(
+        self,
+        name: str | None,
+        accumulator: _A,
+        input_types: list[pyarrow.DataType],
+        return_type: _R,
+        state_type: list[pyarrow.DataType],
+        volatility: Volatility | str,
+    ) -> None:
+        """Instantiate a user defined aggregate function (UDAF).
+
+        See :py:func:`udaf` for a convenience function and argument
+        descriptions.
+ """ + self._udf = df_internal.AggregateUDF( + name, accumulator, input_types, return_type, state_type, str(volatility) + ) + + def __call__(self, *args: Expr) -> Expr: + """Execute the UDAF. + + This function is not typically called by an end user. These calls will + occur during the evaluation of the dataframe. + """ + args = [arg.expr for arg in args] + return Expr(self._udf.__call__(*args)) + + @staticmethod + def udaf( + accum: _A, + input_types: list[pyarrow.DataType], + return_type: _R, + state_type: list[pyarrow.DataType], + volatility: Volatility | str, + name: str | None = None, + ) -> AggregateUDF: + """Create a new User Defined Aggregate Function. + + The accumulator function must be callable and implement :py:class:`Accumulator`. + + Args: + accum: The accumulator python function. + input_types: The data types of the arguments to ``accum``. + return_type: The data type of the return value. + state_type: The data types of the intermediate accumulation. + volatility: See :py:class:`Volatility` for allowed values. + name: A descriptive name for the function. + + Returns: + A user defined aggregate function, which can be used in either data + aggregation or window function calls. + """ + if not issubclass(accum, Accumulator): + raise TypeError( + "`accum` must implement the abstract base class Accumulator" + ) + if name is None: + name = accum.__qualname__.lower() + if isinstance(input_types, pyarrow.lib.DataType): + input_types = [input_types] + return AggregateUDF( + name=name, + accumulator=accum, + input_types=input_types, + return_type=return_type, + state_type=state_type, + volatility=volatility, + ) diff --git a/py-denormalized/python/denormalized/datastream.py b/py-denormalized/python/denormalized/datastream.py index 0919902..d961e75 100644 --- a/py-denormalized/python/denormalized/datastream.py +++ b/py-denormalized/python/denormalized/datastream.py @@ -1,7 +1,7 @@ import pyarrow as pa -from datafusion import Expr from denormalized._internal import PyDataStream -from denormalized._internal import expr as internal_expr +from denormalized.datafusion import Expr +from denormalized.utils import to_internal_expr, to_internal_exprs class DataStream: @@ -48,7 +48,7 @@ def select(self, expr_list: list[Expr]) -> "DataStream": Returns: DataStream: A new DataStream with the selected columns/expressions. """ - return DataStream(self.ds.select(expr_list)) + return DataStream(self.ds.select(to_internal_exprs(expr_list))) def filter(self, predicate: Expr) -> "DataStream": """Filter the DataStream based on a predicate. @@ -59,7 +59,19 @@ def filter(self, predicate: Expr) -> "DataStream": Returns: DataStream: A new DataStream with the filter applied. """ - return DataStream(self.ds.filter(predicate)) + return DataStream(self.ds.filter(to_internal_expr(predicate))) + + def with_column(self, name: str, predicate: Expr) -> "DataStream": + """Add a new column to the DataStream. + + Args: + name (str): The name of the new column. + predicate (Expr): The expression that defines the column's values. + + Returns: + DataStream: A new DataStream with the additional column. + """ + return DataStream(self.ds.with_column(name, to_internal_expr(predicate))) def join_on( self, right: "DataStream", join_type: str, on_exprs: list[Expr] @@ -82,7 +94,7 @@ def join( join_type: str, left_cols: list[str], right_cols: list[str], - filter: Expr = None, + filter: Expr | None = None, ) -> "DataStream": """Join this DataStream with another one based on column names. 
@@ -102,16 +114,16 @@ def join(
 
     def window(
         self,
-        group_expr: list[Expr],
-        aggr_expr: list[Expr],
+        group_exprs: list[Expr],
+        aggr_exprs: list[Expr],
         window_length_millis: int,
         slide_millis: int | None = None,
     ) -> "DataStream":
         """Apply a windowing operation to the DataStream.
 
         Args:
-            group_expr (list[Expr]): The expressions to group by.
-            aggr_expr (list[Expr]): The aggregation expressions to apply.
+            group_exprs (list[Expr]): The expressions to group by.
+            aggr_exprs (list[Expr]): The aggregation expressions to apply.
             window_length_millis (int): The length of the window in
                 milliseconds.
             slide_millis (int, optional): The slide interval of the window in
                 milliseconds.
@@ -120,7 +132,12 @@ def window(
             DataStream: A new DataStream with the windowing operation applied.
         """
         return DataStream(
-            self.ds.window(group_expr, aggr_expr, window_length_millis, slide_millis)
+            self.ds.window(
+                to_internal_exprs(group_exprs),
+                to_internal_exprs(aggr_exprs),
+                window_length_millis,
+                slide_millis,
+            )
         )
 
     def print_stream(self) -> None:
diff --git a/py-denormalized/python/denormalized/utils.py b/py-denormalized/python/denormalized/utils.py
new file mode 100644
index 0000000..13a5dbf
--- /dev/null
+++ b/py-denormalized/python/denormalized/utils.py
@@ -0,0 +1,13 @@
+from denormalized._internal import expr as internal_exprs
+from denormalized.datafusion import Expr
+
+
+def to_internal_expr(expr: Expr | str) -> internal_exprs.Expr:
+    """Convert a single Expr or string to an internal expr."""
+    return Expr.column(expr).expr if isinstance(expr, str) else expr.expr
+
+def to_internal_exprs(exprs: list[Expr] | list[str]) -> list[internal_exprs.Expr]:
+    """Convert a list of Exprs or strings to a list of internal exprs."""
+    return [
+        to_internal_expr(arg) for arg in exprs
+    ]
diff --git a/py-denormalized/python/examples/stream_aggregate.py b/py-denormalized/python/examples/stream_aggregate.py
index ccd8026..dbf96b5 100644
--- a/py-denormalized/python/examples/stream_aggregate.py
+++ b/py-denormalized/python/examples/stream_aggregate.py
@@ -3,8 +3,8 @@
 import pyarrow as pa
 
 from denormalized import Context
-from denormalized._internal import expr
-from denormalized._internal import functions as f
+from denormalized.datafusion import lit, col
+from denormalized.datafusion import functions as f
 
 import signal
 import sys
@@ -23,26 +23,25 @@ def signal_handler(sig, frame):
     "reading": 0.0,
 }
 
-def sample_func(rb):
-    print("hello world2!")
-    print(len(rb))
+def sample_sink_func(rb):
+    print(rb)
 
 ctx = Context()
 
 ds = ctx.from_topic("temperature", json.dumps(sample_event), bootstrap_server)
 
 ds.window(
-    [expr.Expr.column("sensor_name")],
+    [col("sensor_name")],
     [
-        f.count(expr.Expr.column("reading"), distinct=False, filter=None).alias(
+        f.count(col("reading"), distinct=False, filter=None).alias(
             "count"
         ),
-        f.min(expr.Expr.column("reading")).alias("min"),
-        f.max(expr.Expr.column("reading")).alias("max"),
-        f.avg(expr.Expr.column("reading")).alias("average"),
+        f.min(col("reading")).alias("min"),
+        f.max(col("reading")).alias("max"),
+        f.avg(col("reading")).alias("average"),
     ],
     1000,
     None,
 ).filter(
-    expr.Expr.column("max") > (expr.Expr.literal(pa.scalar(113)))
-).sink_python(sample_func)
+    col("max") > (lit(113))
+).sink_python(sample_sink_func)
diff --git a/py-denormalized/python/examples/udf_example.py b/py-denormalized/python/examples/udf_example.py
new file mode 100644
index 0000000..563bd99
--- /dev/null
+++ b/py-denormalized/python/examples/udf_example.py
@@ -0,0 +1,60 @@
+"""Python UDF example."""
+
+import json
+import signal
+import sys
+
+import pyarrow as pa
+import pyarrow.compute as pc
+from denormalized import Context
+from denormalized.datafusion import col
+from denormalized.datafusion import functions as f
+from denormalized.datafusion import lit, udf
+
+
+def signal_handler(sig, frame):
+    sys.exit(0)
+
+signal.signal(signal.SIGINT, signal_handler)
+
+bootstrap_server = "localhost:9092"
+
+sample_event = {
+    "occurred_at_ms": 100,
+    "sensor_name": "foo",
+    "reading": 0.0,
+}
+
+def gt(lhs: pa.Array, rhs: pa.Scalar) -> pa.Array:
+    return pc.greater(lhs, rhs)
+
+greater_than_udf = udf(gt, [pa.float64(), pa.float64()], pa.bool_(), "stable")
+
+def sample_sink_func(rb: pa.RecordBatch):
+    if not len(rb):
+        return
+    print(rb)
+
+
+ctx = Context()
+ds = ctx.from_topic("temperature", json.dumps(sample_event), bootstrap_server)
+
+ds.window(
+    [col("sensor_name")],
+    [
+        f.count(col("reading"), distinct=False, filter=None).alias("count"),
+        f.min(col("reading")).alias("min"),
+        f.max(col("reading")).alias("max"),
+        f.avg(col("reading")).alias("average"),
+    ],
+    1000,
+    None,
+).with_column(
+    "greater_than",
+    greater_than_udf(
+        col("count"),
+        lit(1400.0),
+    ),
+).sink_python(
+    sample_sink_func
+)
diff --git a/py-denormalized/src/datastream.rs b/py-denormalized/src/datastream.rs
index fa5f604..da9de9d 100644
--- a/py-denormalized/src/datastream.rs
+++ b/py-denormalized/src/datastream.rs
@@ -81,6 +81,11 @@ impl PyDataStream {
         Ok(Self::new(ds))
     }
 
+    pub fn with_column(&self, name: &str, expr: PyExpr) -> Result<Self> {
+        let ds = self.ds.as_ref().clone().with_column(name, expr.into())?;
+        Ok(Self::new(ds))
+    }
+
     pub fn join_on(
         &self,
         _right: PyDataStream,
diff --git a/py-denormalized/src/lib.rs b/py-denormalized/src/lib.rs
index 4e1c285..fd92024 100644
--- a/py-denormalized/src/lib.rs
+++ b/py-denormalized/src/lib.rs
@@ -1,7 +1,5 @@
 use pyo3::prelude::*;
 
-use datafusion_python::{expr, functions};
-
 pub mod context;
 pub mod datastream;
 
@@ -15,24 +13,14 @@ pub(crate) struct TokioRuntime(tokio::runtime::Runtime);
 /// A Python module implemented in Rust.
 #[pymodule]
 fn _internal(py: Python, m: Bound<'_, PyModule>) -> PyResult<()> {
-    // Register the Tokio Runtime as a module attribute so we can reuse it
-    m.add(
-        "runtime",
-        TokioRuntime(tokio::runtime::Runtime::new().unwrap()),
-    )?;
-
     m.add_class::<datastream::PyDataStream>()?;
     m.add_class::<context::PyContext>()?;
 
-    // Register `expr` as a submodule. Matching `datafusion-expr` https://docs.rs/datafusion-expr/latest/datafusion_expr/
-    let expr = PyModule::new_bound(py, "expr")?;
-    expr::init_module(&expr)?;
-    m.add_submodule(&expr)?;
-
+    datafusion_python::_internal(py, &m)?;
    // Register the functions as a submodule
-    let funcs = PyModule::new_bound(py, "functions")?;
-    functions::init_module(&funcs)?;
-    m.add_submodule(&funcs)?;
+    // let datafusion = &PyModule::new_bound(py, "datafusion")?;
+    // datafusion_python::_internal(py, datafusion)?;
+    // m.add_submodule(datafusion)?;
 
     Ok(())
 }
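
The ``BaseInputSource`` class introduced in input/base.py is the extension point for third-party data sources, but the patch only ships the file-location plugin. Below is a minimal sketch of what a custom plugin could look like; the newline-delimited-JSON format, the ``JsonLinesInputPlugin`` name, and the ``BYTE_ARRAY`` type mapping are hypothetical placeholders, and the import paths assume the vendored package layout added by this patch rather than an API it documents.

# Hypothetical sketch (not part of this patch): a custom input source
# built on denormalized/datafusion/input/base.py.
import json
from typing import Any

from denormalized.datafusion.common import DataTypeMap, SqlTable
from denormalized.datafusion.input.base import BaseInputSource


class JsonLinesInputPlugin(BaseInputSource):
    """Expose a `.jsonl` path as a table, one JSON object per line."""

    def is_correct_input(self, input_item: Any, table_name: str, **kwargs) -> bool:
        # Only claim string paths that end in .jsonl.
        return isinstance(input_item, str) and input_item.endswith(".jsonl")

    def build_table(self, input_item: str, table_name: str, **kwargs) -> SqlTable:
        num_rows = 0
        columns = []
        with open(input_item) as f:
            for line in f:
                record = json.loads(line)
                if not columns:
                    # Placeholder type mapping, mirroring how location.py maps
                    # Parquet physical type strings; a real plugin would infer types.
                    columns = [
                        (key, DataTypeMap.from_parquet_type_str("BYTE_ARRAY"))
                        for key in record
                    ]
                num_rows += 1
        # Same constructor shape as used by LocationInputPlugin above.
        return SqlTable(table_name, columns, num_rows, [input_item])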
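
record_batch.py wraps the batches produced by a streaming execution. As a small usage sketch, assuming ``df`` is a DataFrame from the vendored datafusion API whose ``execute_stream()`` returns the ``RecordBatchStream`` defined above:

# Sketch: drain a RecordBatchStream and inspect each batch via pyarrow.
# `df` is assumed to be a denormalized.datafusion DataFrame.
for batch in df.execute_stream():
    print(batch.to_pyarrow().num_rows)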
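
udf.py also ships the aggregate counterpart to ``udf`` (``Accumulator`` plus ``AggregateUDF.udaf``), which udf_example.py does not exercise. The following is a minimal sketch of an aggregate UDF built on that API; it assumes ``udaf``, ``Accumulator``, and ``col`` are re-exported by denormalized/datafusion/__init__.py the same way ``udf`` is, and that streaming windows accept UDAF expressions in their aggregate list. Neither assumption is confirmed by this diff.

# Hypothetical sketch (not part of this patch): a sum aggregate built on the
# Accumulator / udaf API from denormalized/datafusion/udf.py.
import pyarrow as pa
import pyarrow.compute as pc

from denormalized.datafusion import Accumulator, col, udaf  # assumed re-exports


class SumReading(Accumulator):
    """Keeps a running float64 sum of the values it is fed."""

    def __init__(self) -> None:
        self._sum = pa.scalar(0.0)

    def update(self, values: pa.Array) -> None:
        # Fold one incoming batch into the running total.
        self._sum = pa.scalar(self._sum.as_py() + (pc.sum(values).as_py() or 0.0))

    def merge(self, states: list[pa.Array]) -> None:
        # Combine partial sums produced by other partitions.
        self._sum = pa.scalar(self._sum.as_py() + (pc.sum(states[0]).as_py() or 0.0))

    def state(self) -> list[pa.Scalar]:
        return [self._sum]

    def evaluate(self) -> pa.Scalar:
        return self._sum


# Signature per AggregateUDF.udaf above:
# udaf(accum, input_types, return_type, state_type, volatility, name=None)
sum_udaf = udaf(SumReading, pa.float64(), pa.float64(), [pa.float64()], "stable")

If those assumptions hold, ``sum_udaf(col("reading")).alias("total")`` could sit alongside the built-in ``f.count`` / ``f.min`` aggregates in the ``ds.window(...)`` call shown in udf_example.py.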