From f6199d7d2f80cf3dcfe599b9debe2fb53902974f Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sun, 18 Dec 2022 06:36:00 +0000 Subject: [PATCH 01/30] Add fugue interfaceless util functions --- fugue/__init__.py | 2 +- fugue/dataframe/__init__.py | 4 +- fugue/dataframe/array_dataframe.py | 2 +- fugue/dataframe/arrow_dataframe.py | 71 ++++- fugue/dataframe/dataframe.py | 266 +++++++++++++++++- .../dataframe/dataframe_iterable_dataframe.py | 2 +- fugue/dataframe/iterable_dataframe.py | 2 +- fugue/dataframe/pandas_dataframe.py | 79 +++++- fugue/dataframe/utils.py | 151 ++-------- fugue/dataset.py | 51 +++- fugue/interfaceless/__init__.py | 29 ++ .../transformation.py} | 0 fugue/plugins.py | 35 +++ fugue/workflow/workflow.py | 2 +- fugue_dask/dataframe.py | 12 +- fugue_duckdb/dataframe.py | 2 +- fugue_ibis/dataframe.py | 2 +- fugue_ray/dataframe.py | 12 +- fugue_spark/dataframe.py | 10 +- tests/fugue/dataframe/test_utils.py | 46 +-- tests/fugue_dask/test_dataframe.py | 18 +- tests/fugue_ray/test_dataframe.py | 20 +- tests/fugue_spark/test_dataframe.py | 18 +- 23 files changed, 612 insertions(+), 224 deletions(-) create mode 100644 fugue/interfaceless/__init__.py rename fugue/{interfaceless.py => interfaceless/transformation.py} (100%) create mode 100644 fugue/plugins.py diff --git a/fugue/__init__.py b/fugue/__init__.py index 57abba71..9f3ef2b1 100644 --- a/fugue/__init__.py +++ b/fugue/__init__.py @@ -20,7 +20,7 @@ from fugue.dataframe.iterable_dataframe import IterableDataFrame from fugue.dataframe.pandas_dataframe import PandasDataFrame from fugue.dataframe.utils import to_local_bounded_df, to_local_df -from fugue.dataset import Dataset, DatasetDisplay, get_dataset_display +from fugue.dataset import Dataset, DatasetDisplay, as_fugue_dataset, get_dataset_display from fugue.execution.execution_engine import ExecutionEngine, MapEngine, SQLEngine from fugue.execution.factory import ( infer_execution_engine, diff --git a/fugue/dataframe/__init__.py b/fugue/dataframe/__init__.py index d8eadb34..b18a9631 100644 --- a/fugue/dataframe/__init__.py +++ b/fugue/dataframe/__init__.py @@ -12,9 +12,9 @@ from .iterable_dataframe import IterableDataFrame from .pandas_dataframe import PandasDataFrame from .utils import ( - get_dataframe_column_names, + get_column_names, normalize_dataframe_column_names, - rename_dataframe_column_names, + rename, to_local_bounded_df, to_local_df, ) diff --git a/fugue/dataframe/array_dataframe.py b/fugue/dataframe/array_dataframe.py index 5e89b63a..48ca9ee5 100644 --- a/fugue/dataframe/array_dataframe.py +++ b/fugue/dataframe/array_dataframe.py @@ -51,7 +51,7 @@ def native(self) -> List[Any]: def empty(self) -> bool: return self.count() == 0 - def peek_array(self) -> Any: + def peek_array(self) -> List[Any]: self.assert_not_empty() return list(self.native[0]) diff --git a/fugue/dataframe/arrow_dataframe.py b/fugue/dataframe/arrow_dataframe.py index 8ef1219b..0ba53ed8 100644 --- a/fugue/dataframe/arrow_dataframe.py +++ b/fugue/dataframe/arrow_dataframe.py @@ -2,12 +2,24 @@ import pandas as pd import pyarrow as pa -from fugue.dataframe.dataframe import DataFrame, LocalBoundedDataFrame, _input_schema -from fugue.exceptions import FugueDataFrameOperationError from triad.collections.schema import Schema from triad.exceptions import InvalidOperationError from triad.utils.assertion import assert_or_throw +from fugue.dataset import as_fugue_dataset, count, is_bounded, is_empty, is_local +from fugue.exceptions import FugueDataFrameOperationError + +from .dataframe import ( + DataFrame, + 
LocalBoundedDataFrame, + _input_schema, + drop_columns, + get_column_names, + get_schema, + rename, + select_columns, +) + class ArrowDataFrame(LocalBoundedDataFrame): """DataFrame that wraps :func:`pyarrow.Table `. Please also read @@ -105,7 +117,7 @@ def native(self) -> pa.Table: def empty(self) -> bool: return self.count() == 0 - def peek_array(self) -> Any: + def peek_array(self) -> List[Any]: self.assert_not_empty() data = self.native.take([0]).to_pydict() return [v[0] for v in data.values()] @@ -218,6 +230,59 @@ def as_array_iterable( yield list(arr) +@as_fugue_dataset.candidate(lambda df: isinstance(df, pa.Table)) +def _pa_table_as_fugue_df(df: pa.Table) -> "ArrowDataFrame": + return ArrowDataFrame(df) + + +@count.candidate(lambda df: isinstance(df, pa.Table)) +def _pa_table_count(df: pa.Table) -> int: + return df.shape[0] + + +@is_bounded.candidate(lambda df: isinstance(df, pa.Table)) +def _pa_table_is_bounded(df: pa.Table) -> bool: + return True + + +@is_empty.candidate(lambda df: isinstance(df, pa.Table)) +def _pa_table_is_empty(df: pa.Table) -> bool: + return df.shape[0] == 0 + + +@is_local.candidate(lambda df: isinstance(df, pa.Table)) +def _pa_table_is_local(df: pa.Table) -> bool: + return True + + +@get_column_names.candidate(lambda df: isinstance(df, pa.Table)) +def _get_pyarrow_table_columns(df: pa.Table) -> List[Any]: + return [f.name for f in df.schema] + + +@get_schema.candidate(lambda df: isinstance(df, pa.Table)) +def _get_pyarrow_table_schema(df: pa.Table) -> Schema: + return Schema(df.schema) + + +@rename.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table)) +def _rename_pyarrow_dataframe(df: pa.Table, names: Dict[str, Any]) -> pa.Table: + if len(names) == 0: + return df + return df.rename_columns([names.get(f.name, f.name) for f in df.schema]) + + +@drop_columns.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table)) +def _drop_pa_columns(df: pa.Table, columns: List[str]) -> pa.Table: + cols = [x for x in df.schema.names if x not in columns] + return df.select(cols) + + +@select_columns.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table)) +def _select_pa_columns(df: pa.Table, columns: List[Any]) -> pa.Table: + return df.select(columns) + + def _build_empty_arrow(schema: Schema) -> pa.Table: # pragma: no cover if pa.__version__ < "7": arr = [pa.array([])] * len(schema) diff --git a/fugue/dataframe/dataframe.py b/fugue/dataframe/dataframe.py index 2b7b0063..0aea3894 100644 --- a/fugue/dataframe/dataframe.py +++ b/fugue/dataframe/dataframe.py @@ -9,8 +9,12 @@ from triad.exceptions import InvalidOperationError from triad.utils.assertion import assert_or_throw from triad.utils.pandas_like import PD_UTILS +from triad.utils.rename import normalize_names + +from fugue.dataset import as_fugue_dataset from .._utils.display import PrettyTable +from .._utils.registry import fugue_plugin from ..collections.yielded import Yielded from ..dataset import Dataset, DatasetDisplay, get_dataset_display from ..exceptions import FugueDataFrameOperationError @@ -62,7 +66,7 @@ def as_local(self) -> "LocalDataFrame": # pragma: no cover raise NotImplementedError @abstractmethod - def peek_array(self) -> Any: # pragma: no cover + def peek_array(self) -> List[Any]: # pragma: no cover """Peek the first row of the dataframe as array :raises FugueDatasetEmptyError: if it is empty @@ -410,6 +414,260 @@ def show( print("") +def as_fugue_df(df: Any) -> DataFrame: + """Wrap the object as a Fugue DataFrame. 
This is a wrapper + of :func:`~fugue.dataset.as_fugue_dataset` + + :param df: the object to wrap + """ + res = as_fugue_dataset(df) + assert_or_throw( + isinstance(res, DataFrame), + TypeError(f"{type(df)} can't be converted to a Fugue DataFrame"), + ) + return res # type: ignore + + +@fugue_plugin +def get_schema(df: Any) -> Schema: + """Get the schema of the ``df`` + + :param df: the object that can be recognized as a dataframe by Fugue + :return: the Schema object + """ + return as_fugue_df(df).schema + + +@fugue_plugin +def as_pandas(df: Any) -> pd.DataFrame: + """Convert ``df`` to a Pandas DataFrame + + :param df: the object that can be recognized as a dataframe by Fugue + :return: the Pandas DataFrame + """ + return as_fugue_df(df).as_pandas() + + +@fugue_plugin +def as_arrow(df: Any) -> pa.Table: + """Convert ``df`` to a PyArrow Table + + :param df: the object that can be recognized as a dataframe by Fugue + :return: the PyArrow Table + """ + return as_fugue_df(df).as_arrow() + + +@fugue_plugin +def as_array( + df: Any, columns: Optional[List[str]] = None, type_safe: bool = False +) -> List[Any]: # pragma: no cover + """Convert df to 2-dimensional native python array + + :param df: the object that can be recognized as a dataframe by Fugue + :param columns: columns to extract, defaults to None + :param type_safe: whether to ensure output conforms with its schema, + defaults to False + :return: 2-dimensional native python array + + .. note:: + + If ``type_safe`` is False, then the returned values are 'raw' values. + """ + return as_fugue_df(df).as_array(columns=columns, type_safe=type_safe) + + +@fugue_plugin +def as_array_iterable( + df: Any, columns: Optional[List[str]] = None, type_safe: bool = False +) -> Iterable[Any]: # pragma: no cover + """Convert df to iterable of native python arrays + + :param df: the object that can be recognized as a dataframe by Fugue + :param columns: columns to extract, defaults to None + :param type_safe: whether to ensure output conforms with its schema, + defaults to False + :return: iterable of native python arrays + + .. note:: + + If ``type_safe`` is False, then the returned values are 'raw' values. + """ + + return as_fugue_df(df).as_array_iterable(columns=columns, type_safe=type_safe) + + +@fugue_plugin +def as_dict_iterable( + df: Any, columns: Optional[List[str]] = None +) -> Iterable[Dict[str, Any]]: + """Convert df to iterable of native python dicts + + :param df: the object that can be recognized as a dataframe by Fugue + :param columns: columns to extract, defaults to None + :return: iterable of native python dicts + + .. 
note:: + + The default implementation enforces ``type_safe`` True + """ + return as_fugue_df(df).as_array_iterable(columns=columns) + + +@fugue_plugin +def peek_array(df: Any) -> List[Any]: + """Peek the first row of the dataframe as an array + + :param df: the object that can be recognized as a dataframe by Fugue + :return: the first row as an array + """ + return as_fugue_df(df).peek_array() + + +@fugue_plugin +def peek_dict(df: Any) -> Dict[str, Any]: + """Peek the first row of the dataframe as a array + + :param df: the object that can be recognized as a dataframe by Fugue + :return: the first row as a dict + """ + return as_fugue_df(df).peek_dict() + + +@fugue_plugin +def head(df: Any, n: int, columns: Optional[List[str]] = None) -> Any: + """Get first n rows of the dataframe as a new local bounded dataframe + + :param n: number of rows + :param columns: selected columns, defaults to None (all columns) + :return: a local bounded dataframe + """ + res = as_fugue_df(df).head(n=n, columns=columns) + if isinstance(df, DataFrame): + return res + return res.as_pandas() + + +@fugue_plugin +def alter_columns(df: Any, columns: Any) -> Any: + """Change column types + + :param df: the object that can be recognized as a dataframe by Fugue + :param columns: |SchemaLikeObject|, + all columns should be contained by the dataframe schema + :return: a new dataframe with altered columns, the order of the + original schema will not change + """ + return _adjust_df(df, as_fugue_df(df).alter_columns(columns)) + + +@fugue_plugin +def drop_columns(df: Any, columns: List[str]) -> Any: + """Drop certain columns and return a new dataframe + + :param df: the object that can be recognized as a dataframe by Fugue + :param columns: columns to drop + :return: a new dataframe removing the columns + """ + return _adjust_df(df, as_fugue_df(df).drop(columns)) + + +@fugue_plugin +def select_columns(df: Any, columns: List[Any]) -> Any: + """Select certain columns and return a new dataframe + + :param df: the object that can be recognized as a dataframe by Fugue + :param columns: columns to return + :return: a new dataframe with the selected the columns + """ + return _adjust_df(df, as_fugue_df(df)[columns]) + + +@fugue_plugin +def get_column_names(df: Any) -> List[Any]: # pragma: no cover + """A generic function to get column names of any dataframe + + :param df: the dataframe object + :return: the column names + + .. note:: + + In order to support a new type of dataframe, an implementation must + be registered, for example + + .. code-block::python + + @get_column_names.candidate(lambda df: isinstance(df, pa.Table)) + def _get_pyarrow_dataframe_columns(df: pa.Table) -> List[Any]: + return [f.name for f in df.schema] + """ + return get_schema(df).names + + +@fugue_plugin +def rename(df: Any, names: Dict[str, Any]) -> Any: + """A generic function to rename column names of any dataframe + + :param df: the dataframe object + :param names: the rename operations as a dict: ``old name => new name`` + :return: the renamed dataframe + + .. note:: + + In order to support a new type of dataframe, an implementation must + be registered, for example + + .. 
code-block::python + + @rename.candidate( + lambda df, *args, **kwargs: isinstance(df, pd.DataFrame) + ) + def _rename_pandas_dataframe( + df: pd.DataFrame, names: Dict[str, Any] + ) -> pd.DataFrame: + if len(names) == 0: + return df + return df.rename(columns=names) + """ + if len(names) == 0: + return df + return _adjust_df(df, as_fugue_df(df).rename(names)) + + +def normalize_column_names(df: Any) -> Tuple[Any, Dict[str, Any]]: + """A generic function to normalize any dataframe's column names to follow + Fugue naming rules + + .. note:: + + This is a temporary solution before + :class:`~triad:triad.collections.schema.Schema` + can take arbitrary names + + .. admonition:: Examples + + * ``[0,1]`` => ``{"_0":0, "_1":1}`` + * ``["1a","2b"]`` => ``{"_1a":"1a", "_2b":"2b"}`` + * ``["*a","-a"]`` => ``{"_a":"*a", "_a_1":"-a"}`` + + :param df: a dataframe object + :return: the renamed dataframe and the rename operations as a dict that + can **undo** the change + + .. seealso:: + + * :func:`~.get_column_names` + * :func:`~.rename` + * :func:`~triad:triad.utils.rename.normalize_names` + """ + cols = get_column_names(df) + names = normalize_names(cols) + if len(names) == 0: + return df, {} + undo = {v: k for k, v in names.items()} + return (rename(df, names), undo) + + @get_dataset_display.candidate(lambda ds: isinstance(ds, DataFrame), priority=0.1) def _get_dataframe_display(ds: DataFrame): return DataFrameDisplay(ds) @@ -443,3 +701,9 @@ def _get_schema_change( def _input_schema(schema: Any) -> Schema: return schema if isinstance(schema, Schema) else Schema(schema) + + +def _adjust_df(input_df: Any, output_df: DataFrame) -> Any: + if isinstance(input_df, DataFrame): + return output_df + return output_df.native # type: ignore diff --git a/fugue/dataframe/dataframe_iterable_dataframe.py b/fugue/dataframe/dataframe_iterable_dataframe.py index 646d7b2d..a1bde9e8 100644 --- a/fugue/dataframe/dataframe_iterable_dataframe.py +++ b/fugue/dataframe/dataframe_iterable_dataframe.py @@ -108,7 +108,7 @@ def native(self) -> EmptyAwareIterable[LocalDataFrame]: def empty(self) -> bool: return self.native.empty or self.native.peek().empty - def peek_array(self) -> Any: + def peek_array(self) -> List[Any]: self.assert_not_empty() return self.native.peek().peek_array() diff --git a/fugue/dataframe/iterable_dataframe.py b/fugue/dataframe/iterable_dataframe.py index 653689fd..49a46288 100644 --- a/fugue/dataframe/iterable_dataframe.py +++ b/fugue/dataframe/iterable_dataframe.py @@ -61,7 +61,7 @@ def native(self) -> EmptyAwareIterable[Any]: def empty(self) -> bool: return self.native.empty - def peek_array(self) -> Any: + def peek_array(self) -> List[Any]: self.assert_not_empty() return list(self.native.peek()) diff --git a/fugue/dataframe/pandas_dataframe.py b/fugue/dataframe/pandas_dataframe.py index dc52a0b2..38dd17b2 100644 --- a/fugue/dataframe/pandas_dataframe.py +++ b/fugue/dataframe/pandas_dataframe.py @@ -2,12 +2,25 @@ import pandas as pd import pyarrow as pa -from fugue.dataframe.dataframe import DataFrame, LocalBoundedDataFrame, _input_schema -from fugue.exceptions import FugueDataFrameOperationError from triad.collections.schema import Schema from triad.utils.assertion import assert_or_throw from triad.utils.pandas_like import PD_UTILS +from fugue.dataset import as_fugue_dataset, count, is_bounded, is_empty, is_local +from fugue.exceptions import FugueDataFrameOperationError + +from .dataframe import ( + DataFrame, + LocalBoundedDataFrame, + _input_schema, + drop_columns, + get_column_names, + get_schema, 
+ head, + rename, + select_columns, +) + class PandasDataFrame(LocalBoundedDataFrame): """DataFrame that wraps pandas DataFrame. Please also read @@ -76,7 +89,7 @@ def native(self) -> pd.DataFrame: def empty(self) -> bool: return self.native.empty - def peek_array(self) -> Any: + def peek_array(self) -> List[Any]: self.assert_not_empty() return self.native.iloc[0].values.tolist() @@ -170,3 +183,63 @@ def _apply_schema( ) pdf.columns = schema.names return PD_UTILS.enforce_type(pdf, schema.pa_schema, null_safe=True), schema + + +@as_fugue_dataset.candidate(lambda df: isinstance(df, pd.DataFrame)) +def _pd_as_fugue_df(df: pd.DataFrame) -> "PandasDataFrame": + return PandasDataFrame(df) + + +@count.candidate(lambda df: isinstance(df, pd.DataFrame)) +def _pd_count(df: pd.DataFrame) -> int: + return df.shape[0] + + +@is_bounded.candidate(lambda df: isinstance(df, pd.DataFrame)) +def _pd_is_bounded(df: pd.DataFrame) -> bool: + return True + + +@is_empty.candidate(lambda df: isinstance(df, pd.DataFrame)) +def _pd_is_empty(df: pd.DataFrame) -> bool: + return df.shape[0] == 0 + + +@is_local.candidate(lambda df: isinstance(df, pd.DataFrame)) +def _pd_is_local(df: pd.DataFrame) -> bool: + return True + + +@get_column_names.candidate(lambda df: isinstance(df, pd.DataFrame)) +def _get_pandas_dataframe_columns(df: pd.DataFrame) -> List[Any]: + return list(df.columns) + + +@get_schema.candidate(lambda df: isinstance(df, pd.DataFrame)) +def _get_pandas_dataframe_schema(df: pd.DataFrame) -> Schema: + return Schema(df) + + +@rename.candidate(lambda df, *args, **kwargs: isinstance(df, pd.DataFrame)) +def _rename_pandas_dataframe(df: pd.DataFrame, names: Dict[str, Any]) -> pd.DataFrame: + if len(names) == 0: + return df + return df.rename(columns=names) + + +@drop_columns.candidate(lambda df, *args, **kwargs: isinstance(df, pd.DataFrame)) +def _drop_pd_columns(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame: + cols = [x for x in df.columns if x not in columns] + return df[cols] + + +@select_columns.candidate(lambda df, *args, **kwargs: isinstance(df, pd.DataFrame)) +def _select_pd_columns(df: pd.DataFrame, columns: List[Any]) -> pd.DataFrame: + return df[columns] + + +@head.candidate(lambda df, *args, **kwargs: isinstance(df, pd.DataFrame)) +def _pd_head(df: Any, n: int, columns: Optional[List[str]] = None) -> pd.DataFrame: + if columns is not None: + df = df[columns] + return df.head(n) diff --git a/fugue/dataframe/utils.py b/fugue/dataframe/utils.py index 320be8c3..8a3e3e2c 100644 --- a/fugue/dataframe/utils.py +++ b/fugue/dataframe/utils.py @@ -2,152 +2,33 @@ import json import os import pickle -from typing import Any, Dict, Iterable, List, Optional, Tuple +from typing import Any, Iterable, List, Optional, Tuple import pandas as pd import pyarrow as pa from fs import open_fs -from fugue.dataframe.array_dataframe import ArrayDataFrame -from fugue.dataframe.dataframe import DataFrame, LocalBoundedDataFrame, LocalDataFrame -from fugue.dataframe.iterable_dataframe import IterableDataFrame -from fugue.dataframe.pandas_dataframe import PandasDataFrame -from triad.collections import Schema -from triad.collections.fs import FileSystem +from triad import FileSystem, Schema from triad.collections.schema import SchemaError from triad.exceptions import InvalidOperationError from triad.utils.assertion import assert_arg_not_none from triad.utils.assertion import assert_or_throw as aot -from triad.utils.rename import normalize_names -from .._utils.registry import fugue_plugin - - -@fugue_plugin -def 
get_dataframe_column_names(df: Any) -> List[Any]: # pragma: no cover - """A generic function to get column names of any dataframe - - :param df: the dataframe object - :return: the column names - - .. note:: - - In order to support a new type of dataframe, an implementation must - be registered, for example - - .. code-block::python - - @get_dataframe_column_names.candidate(lambda df: isinstance(df, pa.Table)) - def _get_pyarrow_dataframe_columns(df: pa.Table) -> List[Any]: - return [f.name for f in df.schema] - """ - raise NotImplementedError(f"{type(df)} is not supported") - - -@fugue_plugin -def rename_dataframe_column_names(df: Any, names: Dict[str, Any]) -> Any: - """A generic function to rename column names of any dataframe - - :param df: the dataframe object - :param names: the rename operations as a dict: ``old name => new name`` - :return: the renamed dataframe - - .. note:: - - In order to support a new type of dataframe, an implementation must - be registered, for example - - .. code-block::python - - @rename_dataframe_column_names.candidate( - lambda df, *args, **kwargs: isinstance(df, pd.DataFrame) - ) - def _rename_pandas_dataframe( - df: pd.DataFrame, names: Dict[str, Any] - ) -> pd.DataFrame: - if len(names) == 0: - return df - return df.rename(columns=names) - """ - if len(names) == 0: - return df - else: # pragma: no cover - raise NotImplementedError(f"{type(df)} is not supported") - - -def normalize_dataframe_column_names(df: Any) -> Tuple[Any, Dict[str, Any]]: - """A generic function to normalize any dataframe's column names to follow - Fugue naming rules - - .. note:: - - This is a temporary solution before - :class:`~triad:triad.collections.schema.Schema` - can take arbitrary names - - .. admonition:: Examples - - * ``[0,1]`` => ``{"_0":0, "_1":1}`` - * ``["1a","2b"]`` => ``{"_1a":"1a", "_2b":"2b"}`` - * ``["*a","-a"]`` => ``{"_a":"*a", "_a_1":"-a"}`` - - :param df: a dataframe object - :return: the renamed dataframe and the rename operations as a dict that - can **undo** the change - - .. 
seealso:: - - * :func:`~.get_dataframe_column_names` - * :func:`~.rename_dataframe_column_names` - * :func:`~triad:triad.utils.rename.normalize_names` - """ - cols = get_dataframe_column_names(df) - names = normalize_names(cols) - if len(names) == 0: - return df, {} - undo = {v: k for k, v in names.items()} - return (rename_dataframe_column_names(df, names), undo) - - -@get_dataframe_column_names.candidate(lambda df: isinstance(df, pd.DataFrame)) -def _get_pandas_dataframe_columns(df: pd.DataFrame) -> List[Any]: - return list(df.columns) - - -@rename_dataframe_column_names.candidate( - lambda df, *args, **kwargs: isinstance(df, pd.DataFrame) +from .array_dataframe import ArrayDataFrame +from .dataframe import ( + DataFrame, + LocalBoundedDataFrame, + LocalDataFrame, + get_column_names, + normalize_column_names, + rename, ) -def _rename_pandas_dataframe(df: pd.DataFrame, names: Dict[str, Any]) -> pd.DataFrame: - if len(names) == 0: - return df - return df.rename(columns=names) +from .iterable_dataframe import IterableDataFrame +from .pandas_dataframe import PandasDataFrame - -@get_dataframe_column_names.candidate(lambda df: isinstance(df, pa.Table)) -def _get_pyarrow_dataframe_columns(df: pa.Table) -> List[Any]: - return [f.name for f in df.schema] - - -@rename_dataframe_column_names.candidate( - lambda df, *args, **kwargs: isinstance(df, pa.Table) -) -def _rename_pyarrow_dataframe(df: pa.Table, names: Dict[str, Any]) -> pa.Table: - if len(names) == 0: - return df - return df.rename_columns([names.get(f.name, f.name) for f in df.schema]) - - -@get_dataframe_column_names.candidate(lambda df: isinstance(df, DataFrame)) -def _get_fugue_dataframe_columns(df: "DataFrame") -> List[Any]: - return df.schema.names - - -@rename_dataframe_column_names.candidate( - lambda df, *args, **kwargs: isinstance(df, DataFrame) -) -def _rename_fugue_dataframe(df: "DataFrame", names: Dict[str, Any]) -> "DataFrame": - if len(names) == 0: - return df - return df.rename(columns=names) +# For backward compatibility, TODO: remove! 
+get_dataframe_column_names = get_column_names
+normalize_dataframe_column_names = normalize_column_names
+rename_dataframe_column_names = rename
 
 
 def _pa_type_eq(t1: pa.DataType, t2: pa.DataType) -> bool:
diff --git a/fugue/dataset.py b/fugue/dataset.py
index 080337c9..2ea70bde 100644
--- a/fugue/dataset.py
+++ b/fugue/dataset.py
@@ -39,7 +39,7 @@ def reset_metadata(self, metadata: Any) -> None:
     @property
     @abstractmethod
     def is_local(self) -> bool:  # pragma: no cover
-        """Whether this dataframe is a :class:`.LocalDataFrame`"""
+        """Whether this dataframe is a local Dataset"""
         raise NotImplementedError
 
     @property
@@ -146,4 +146,51 @@ def get_dataset_display(ds: "Dataset") -> DatasetDisplay:  # pragma: no cover
 
     :param ds: the Dataset to be displayed
     """
-    raise NotImplementedError(f"No matching DatasetDisplay registered for {type(ds)}")
+    raise NotImplementedError(f"no matching DatasetDisplay registered for {type(ds)}")
+
+
+@fugue_plugin
+def as_fugue_dataset(data: Any) -> Dataset:
+    """Wrap the input as a :class:`~.Dataset`
+
+    :param data: the data to be wrapped
+    """
+    if isinstance(data, Dataset):
+        return data
+    raise NotImplementedError(f"no registered dataset conversion for {type(data)}")
+
+
+@fugue_plugin
+def is_local(data: Any) -> bool:
+    """Whether the dataset is local
+
+    :param data: the data that can be recognized by Fugue
+    """
+    return as_fugue_dataset(data).is_local
+
+
+@fugue_plugin
+def is_bounded(data: Any) -> bool:
+    """Whether the dataset is bounded
+
+    :param data: the data that can be recognized by Fugue
+    """
+    return as_fugue_dataset(data).is_bounded
+
+
+@fugue_plugin
+def is_empty(data: Any) -> bool:
+    """Whether the dataset is empty
+
+    :param data: the data that can be recognized by Fugue
+    """
+    return as_fugue_dataset(data).empty
+
+
+@fugue_plugin
+def count(data: Any) -> int:
+    """The number of elements in the dataset
+
+    :param data: the data that can be recognized by Fugue
+    """
+    return as_fugue_dataset(data).count()
diff --git a/fugue/interfaceless/__init__.py b/fugue/interfaceless/__init__.py
new file mode 100644
index 00000000..c8da4c88
--- /dev/null
+++ b/fugue/interfaceless/__init__.py
@@ -0,0 +1,29 @@
+# flake8: noqa
+from fugue.dataframe.dataframe import (
+    alter_columns,
+    as_array,
+    as_array_iterable,
+    as_arrow,
+    as_dict_iterable,
+    as_fugue_df,
+    as_pandas,
+    drop_columns,
+    get_column_names,
+    get_schema,
+    head,
+    normalize_column_names,
+    peek_array,
+    peek_dict,
+    rename,
+    select_columns,
+)
+from fugue.dataset import (
+    as_fugue_dataset,
+    count,
+    get_dataset_display,
+    is_bounded,
+    is_empty,
+    is_local,
+)
+
+from .transformation import out_transform, transform
diff --git a/fugue/interfaceless.py b/fugue/interfaceless/transformation.py
similarity index 100%
rename from fugue/interfaceless.py
rename to fugue/interfaceless/transformation.py
diff --git a/fugue/plugins.py b/fugue/plugins.py
new file mode 100644
index 00000000..6fec357e
--- /dev/null
+++ b/fugue/plugins.py
@@ -0,0 +1,35 @@
+# flake8: noqa
+# pylint: disable-all
+from fugue.dataframe.dataframe import (
+    alter_columns,
+    as_array,
+    as_array_iterable,
+    as_arrow,
+    as_dict_iterable,
+    as_pandas,
+    drop_columns,
+    get_column_names,
+    get_schema,
+    head,
+    peek_array,
+    peek_dict,
+    rename,
+    select_columns,
+)
+from fugue.dataset import (
+    as_fugue_dataset,
+    count,
+    get_dataset_display,
+    is_bounded,
+    is_empty,
+    is_local,
+)
+from fugue.execution.factory import (
+    infer_execution_engine,
+    parse_execution_engine,
+    parse_sql_engine,
+)
+from fugue.extensions.creator 
import parse_creator +from fugue.extensions.outputter import parse_outputter +from fugue.extensions.processor import parse_processor +from fugue.extensions.transformer import parse_output_transformer, parse_transformer diff --git a/fugue/workflow/workflow.py b/fugue/workflow/workflow.py index fca6864b..bc051b7c 100644 --- a/fugue/workflow/workflow.py +++ b/fugue/workflow/workflow.py @@ -1312,7 +1312,7 @@ def num_partitions(self) -> int: # pragma: no cover """ raise NotImplementedError("WorkflowDataFrame does not support this method") - def peek_array(self) -> Any: # pragma: no cover + def peek_array(self) -> List[Any]: # pragma: no cover """ :raises NotImplementedError: don't call this method """ diff --git a/fugue_dask/dataframe.py b/fugue_dask/dataframe.py index 85068273..b86e8299 100644 --- a/fugue_dask/dataframe.py +++ b/fugue_dask/dataframe.py @@ -12,8 +12,8 @@ ) from fugue.dataframe.dataframe import _input_schema from fugue.dataframe.utils import ( - get_dataframe_column_names, - rename_dataframe_column_names, + get_column_names, + rename, ) from fugue.exceptions import FugueDataFrameOperationError from triad.collections.schema import Schema @@ -27,14 +27,12 @@ from fugue_dask._utils import DASK_UTILS -@get_dataframe_column_names.candidate(lambda df: isinstance(df, pd.DataFrame)) +@get_column_names.candidate(lambda df: isinstance(df, pd.DataFrame)) def _get_dask_dataframe_columns(df: pd.DataFrame) -> List[Any]: return list(df.columns) -@rename_dataframe_column_names.candidate( - lambda df, *args, **kwargs: isinstance(df, pd.DataFrame) -) +@rename.candidate(lambda df, *args, **kwargs: isinstance(df, pd.DataFrame)) def _rename_dask_dataframe(df: pd.DataFrame, names: Dict[str, Any]) -> pd.DataFrame: if len(names) == 0: return df @@ -132,7 +130,7 @@ def _select_cols(self, cols: List[Any]) -> DataFrame: schema = self.schema.extract(cols) return DaskDataFrame(self.native[schema.names], schema, type_safe=False) - def peek_array(self) -> Any: + def peek_array(self) -> List[Any]: self.assert_not_empty() return self.as_pandas().iloc[0].values.tolist() diff --git a/fugue_duckdb/dataframe.py b/fugue_duckdb/dataframe.py index 89744676..0efe7390 100644 --- a/fugue_duckdb/dataframe.py +++ b/fugue_duckdb/dataframe.py @@ -38,7 +38,7 @@ def native(self) -> DuckDBPyRelation: def empty(self) -> bool: return self._rel.fetchone() is None - def peek_array(self) -> Any: + def peek_array(self) -> List[Any]: res = self._rel.fetchone() if res is None: raise FugueDatasetEmptyError() diff --git a/fugue_ibis/dataframe.py b/fugue_ibis/dataframe.py index 80b3aad3..d21c23e7 100644 --- a/fugue_ibis/dataframe.py +++ b/fugue_ibis/dataframe.py @@ -69,7 +69,7 @@ def empty(self) -> bool: def num_partitions(self) -> int: return 1 # pragma: no cover - def peek_array(self) -> Any: + def peek_array(self) -> List[Any]: res = self._to_local_df(self._table.head(1)).as_array() if len(res) == 0: raise FugueDatasetEmptyError() diff --git a/fugue_ray/dataframe.py b/fugue_ray/dataframe.py index cc007a8b..8780845a 100644 --- a/fugue_ray/dataframe.py +++ b/fugue_ray/dataframe.py @@ -12,8 +12,8 @@ ) from fugue.dataframe.dataframe import _input_schema from fugue.dataframe.utils import ( - get_dataframe_column_names, - rename_dataframe_column_names, + get_column_names, + rename, ) from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError from triad.collections.schema import Schema @@ -21,7 +21,7 @@ from ._utils.dataframe import _build_empty_arrow, build_empty, get_dataset_format 
-@get_dataframe_column_names.candidate(lambda df: isinstance(df, rd.Dataset)) +@get_column_names.candidate(lambda df: isinstance(df, rd.Dataset)) def _get_ray_dataframe_columns(df: rd.Dataset) -> List[Any]: fmt = get_dataset_format(df) if fmt == "pandas": @@ -31,9 +31,7 @@ def _get_ray_dataframe_columns(df: rd.Dataset) -> List[Any]: raise NotImplementedError(f"{fmt} is not supported") # pragma: no cover -@rename_dataframe_column_names.candidate( - lambda df, *args, **kwargs: isinstance(df, rd.Dataset) -) +@rename.candidate(lambda df, *args, **kwargs: isinstance(df, rd.Dataset)) def _rename_ray_dataframe(df: rd.Dataset, names: Dict[str, Any]) -> rd.Dataset: if len(names) == 0: return df @@ -153,7 +151,7 @@ def _select_cols(self, cols: List[Any]) -> DataFrame: ) return RayDataFrame(rdf, self.schema.extract(cols), internal_schema=True) - def peek_array(self) -> Any: + def peek_array(self) -> List[Any]: data = self.native.limit(1).to_pandas().values.tolist() if len(data) == 0: raise FugueDatasetEmptyError diff --git a/fugue_spark/dataframe.py b/fugue_spark/dataframe.py index c5a37d22..6c42a8e5 100644 --- a/fugue_spark/dataframe.py +++ b/fugue_spark/dataframe.py @@ -12,8 +12,8 @@ PandasDataFrame, ) from fugue.dataframe.utils import ( - get_dataframe_column_names, - rename_dataframe_column_names, + get_column_names, + rename, ) from fugue.exceptions import FugueDataFrameOperationError from pyspark.sql.functions import col @@ -24,14 +24,12 @@ from fugue_spark._utils.convert import to_cast_expression, to_schema, to_type_safe_input -@get_dataframe_column_names.candidate(lambda df: isinstance(df, ps.DataFrame)) +@get_column_names.candidate(lambda df: isinstance(df, ps.DataFrame)) def _get_spark_dataframe_columns(df: ps.DataFrame) -> List[Any]: return [f.name for f in df.schema] -@rename_dataframe_column_names.candidate( - lambda df, *args, **kwargs: isinstance(df, ps.DataFrame) -) +@rename.candidate(lambda df, *args, **kwargs: isinstance(df, ps.DataFrame)) def _rename_spark_dataframe(df: ps.DataFrame, names: Dict[str, Any]) -> ps.DataFrame: if len(names) == 0: return df diff --git a/tests/fugue/dataframe/test_utils.py b/tests/fugue/dataframe/test_utils.py index 4bd7add3..4c33cded 100644 --- a/tests/fugue/dataframe/test_utils.py +++ b/tests/fugue/dataframe/test_utils.py @@ -11,11 +11,11 @@ from fugue.dataframe.utils import ( _schema_eq, deserialize_df, - get_dataframe_column_names, + get_column_names, get_join_schemas, normalize_dataframe_column_names, pickle_df, - rename_dataframe_column_names, + rename, serialize_df, unpickle_df, ) @@ -200,50 +200,50 @@ def assert_eq(df, df_expected=None, raw=False): raises(ValueError, lambda: deserialize_df('{"x":1}')) -def test_get_dataframe_column_names(): +def test_get_column_names(): df = pd.DataFrame([[0, 1, 2]]) - assert get_dataframe_column_names(df) == [0, 1, 2] + assert get_column_names(df) == [0, 1, 2] adf = pa.Table.from_pandas(df) - assert get_dataframe_column_names(adf) == ["0", "1", "2"] + assert get_column_names(adf) == ["0", "1", "2"] pdf = PandasDataFrame(pd.DataFrame([[0, 1]], columns=["a", "b"])) - assert get_dataframe_column_names(pdf) == ["a", "b"] + assert get_column_names(pdf) == ["a", "b"] -def test_rename_dataframe_column_names(): - assert rename_dataframe_column_names("dummy", {}) == "dummy" +def test_rename(): + assert rename("dummy", {}) == "dummy" pdf = pd.DataFrame([[0, 1, 2]], columns=["a", "b", "c"]) - df = rename_dataframe_column_names(pdf, {}) - assert get_dataframe_column_names(df) == ["a", "b", "c"] - df = 
rename_dataframe_column_names(pdf, {"b": "bb"}) - assert get_dataframe_column_names(df) == ["a", "bb", "c"] + df = rename(pdf, {}) + assert get_column_names(df) == ["a", "b", "c"] + df = rename(pdf, {"b": "bb"}) + assert get_column_names(df) == ["a", "bb", "c"] adf = pa.Table.from_pandas(pdf) - adf = rename_dataframe_column_names(adf, {}) - assert get_dataframe_column_names(adf) == ["a", "b", "c"] - adf = rename_dataframe_column_names(adf, {"b": "bb"}) - assert get_dataframe_column_names(adf) == ["a", "bb", "c"] + adf = rename(adf, {}) + assert get_column_names(adf) == ["a", "b", "c"] + adf = rename(adf, {"b": "bb"}) + assert get_column_names(adf) == ["a", "bb", "c"] fdf = PandasDataFrame(pdf) - fdf = rename_dataframe_column_names(fdf, {}) - assert get_dataframe_column_names(fdf) == ["a", "b", "c"] - fdf = rename_dataframe_column_names(fdf, {"b": "bb"}) - assert get_dataframe_column_names(fdf) == ["a", "bb", "c"] + fdf = rename(fdf, {}) + assert get_column_names(fdf) == ["a", "b", "c"] + fdf = rename(fdf, {"b": "bb"}) + assert get_column_names(fdf) == ["a", "bb", "c"] def test_normalize_dataframe_column_names(): df = pd.DataFrame([[0, 1, 2]], columns=["a", "b", "c"]) df, names = normalize_dataframe_column_names(df) - assert get_dataframe_column_names(df) == ["a", "b", "c"] + assert get_column_names(df) == ["a", "b", "c"] assert names == {} df = pd.DataFrame([[0, 1, 2]]) df, names = normalize_dataframe_column_names(df) - assert get_dataframe_column_names(df) == ["_0", "_1", "_2"] + assert get_column_names(df) == ["_0", "_1", "_2"] assert names == {"_0": 0, "_1": 1, "_2": 2} df = pd.DataFrame([[0, 1, 2, 3]], columns=["1", "2", "_2", "大"]) df, names = normalize_dataframe_column_names(df) - assert get_dataframe_column_names(df) == ["_1", "_2_1", "_2", "_1_1"] + assert get_column_names(df) == ["_1", "_2_1", "_2", "_1_1"] assert names == {"_1": "1", "_2_1": "2", "_1_1": "大"} diff --git a/tests/fugue_dask/test_dataframe.py b/tests/fugue_dask/test_dataframe.py index f8b41064..8e739134 100644 --- a/tests/fugue_dask/test_dataframe.py +++ b/tests/fugue_dask/test_dataframe.py @@ -14,8 +14,8 @@ from pytest import raises from triad.collections.schema import Schema from fugue.dataframe.utils import ( - get_dataframe_column_names, - rename_dataframe_column_names, + get_column_names, + rename, ) @@ -201,20 +201,20 @@ def _test_as_array_perf(): print(nts, ts) -def test_get_dataframe_column_names(): +def test_get_column_names(): df = pd.from_pandas(pandas.DataFrame([[0, 1, 2]]), npartitions=1) - assert get_dataframe_column_names(df) == [0, 1, 2] + assert get_column_names(df) == [0, 1, 2] -def test_rename_dataframe_column_names(): +def test_rename(): pdf = pd.from_pandas( pandas.DataFrame([[0, 1, 2]], columns=["a", "b", "c"]), npartitions=1 ) - df = rename_dataframe_column_names(pdf, {}) + df = rename(pdf, {}) assert isinstance(df, pd.DataFrame) - assert get_dataframe_column_names(df) == ["a", "b", "c"] + assert get_column_names(df) == ["a", "b", "c"] pdf = pd.from_pandas(pandas.DataFrame([[0, 1, 2]]), npartitions=1) - df = rename_dataframe_column_names(pdf, {0: "_0", 1: "_1", 2: "_2"}) + df = rename(pdf, {0: "_0", 1: "_1", 2: "_2"}) assert isinstance(df, pd.DataFrame) - assert get_dataframe_column_names(df) == ["_0", "_1", "_2"] + assert get_column_names(df) == ["_0", "_1", "_2"] diff --git a/tests/fugue_ray/test_dataframe.py b/tests/fugue_ray/test_dataframe.py index 411ec20e..2b09b62a 100644 --- a/tests/fugue_ray/test_dataframe.py +++ b/tests/fugue_ray/test_dataframe.py @@ -7,8 +7,8 @@ from 
fugue.dataframe.array_dataframe import ArrayDataFrame from fugue.dataframe.arrow_dataframe import _build_empty_arrow from fugue.dataframe.utils import ( - get_dataframe_column_names, - rename_dataframe_column_names, + get_column_names, + rename, ) from fugue_test.dataframe_suite import DataFrameTests from pytest import raises @@ -101,22 +101,22 @@ def test_ray_num_partitions(self): df = RayDataFrame(rdf.repartition(5)) assert 5 == df.num_partitions - def test_get_dataframe_column_names(self): + def test_get_column_names(self): df = rd.from_pandas(pd.DataFrame([[0, 10, 20]], columns=["0", "1", "2"])) - assert get_dataframe_column_names(df) == ["0", "1", "2"] + assert get_column_names(df) == ["0", "1", "2"] df = rd.from_arrow( pa.Table.from_pandas(pd.DataFrame([[0, 10, 20]], columns=["0", "1", "2"])) ) - assert get_dataframe_column_names(df) == ["0", "1", "2"] + assert get_column_names(df) == ["0", "1", "2"] - def test_rename_dataframe_column_names(self): + def test_rename(self): rdf = rd.from_pandas(pd.DataFrame([[0, 10, 20]], columns=["a", "b", "c"])) - df = rename_dataframe_column_names(rdf, {}) + df = rename(rdf, {}) assert isinstance(df, rd.Dataset) - assert get_dataframe_column_names(df) == ["a", "b", "c"] + assert get_column_names(df) == ["a", "b", "c"] pdf = rd.from_pandas(pd.DataFrame([[0, 10, 20]], columns=["0", "1", "2"])) - df = rename_dataframe_column_names(pdf, {"0": "_0", "1": "_1", "2": "_2"}) + df = rename(pdf, {"0": "_0", "1": "_1", "2": "_2"}) assert isinstance(df, rd.Dataset) - assert get_dataframe_column_names(df) == ["_0", "_1", "_2"] + assert get_column_names(df) == ["_0", "_1", "_2"] diff --git a/tests/fugue_spark/test_dataframe.py b/tests/fugue_spark/test_dataframe.py index 4a988019..0bbef2e8 100644 --- a/tests/fugue_spark/test_dataframe.py +++ b/tests/fugue_spark/test_dataframe.py @@ -7,8 +7,8 @@ import pytest from fugue.dataframe.pandas_dataframe import PandasDataFrame from fugue.dataframe.utils import ( - get_dataframe_column_names, - rename_dataframe_column_names, + get_column_names, + rename, ) from fugue_test.dataframe_suite import DataFrameTests from pyspark.sql import SparkSession @@ -123,24 +123,24 @@ def _df(data, schema=None): return SparkDataFrame(df, schema) -def test_get_dataframe_column_names(spark_session): +def test_get_column_names(spark_session): df = spark_session.createDataFrame( pd.DataFrame([[0, 1, 2]], columns=["0", "1", "2"]) ) - assert get_dataframe_column_names(df) == ["0", "1", "2"] + assert get_column_names(df) == ["0", "1", "2"] -def test_rename_dataframe_column_names(spark_session): +def test_rename(spark_session): pdf = spark_session.createDataFrame( pd.DataFrame([[0, 1, 2]], columns=["a", "b", "c"]) ) - df = rename_dataframe_column_names(pdf, {}) + df = rename(pdf, {}) assert isinstance(df, ps.DataFrame) - assert get_dataframe_column_names(df) == ["a", "b", "c"] + assert get_column_names(df) == ["a", "b", "c"] pdf = spark_session.createDataFrame( pd.DataFrame([[0, 1, 2]], columns=["0", "1", "2"]) ) - df = rename_dataframe_column_names(pdf, {"0": "_0", "1": "_1", "2": "_2"}) + df = rename(pdf, {"0": "_0", "1": "_1", "2": "_2"}) assert isinstance(df, ps.DataFrame) - assert get_dataframe_column_names(df) == ["_0", "_1", "_2"] + assert get_column_names(df) == ["_0", "_1", "_2"] From b1243f57712ed93bf054127545d717dd07cc243f Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sun, 18 Dec 2022 08:52:41 +0000 Subject: [PATCH 02/30] update tests --- fugue/dataframe/arrow_dataframe.py | 18 +- fugue/dataframe/dataframe.py | 53 ++-- 
fugue/dataframe/pandas_dataframe.py | 45 +++-
 fugue/dataset.py | 22 +-
 fugue/interfaceless/__init__.py | 1 +
 fugue_test/dataframe_suite.py | 252 +++++++++---------
 tests/fugue/dataframe/test_arrow_dataframe.py | 8 +
 .../fugue/dataframe/test_pandas_dataframe.py | 21 +-
 8 files changed, 264 insertions(+), 156 deletions(-)

diff --git a/fugue/dataframe/arrow_dataframe.py b/fugue/dataframe/arrow_dataframe.py
index 0ba53ed8..8a9dd482 100644
--- a/fugue/dataframe/arrow_dataframe.py
+++ b/fugue/dataframe/arrow_dataframe.py
@@ -266,20 +266,26 @@ def _get_pyarrow_table_schema(df: pa.Table) -> Schema:
 
 
 @rename.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table))
-def _rename_pyarrow_dataframe(df: pa.Table, names: Dict[str, Any]) -> pa.Table:
-    if len(names) == 0:
+def _rename_pyarrow_dataframe(df: pa.Table, columns: Dict[str, Any]) -> pa.Table:
+    if len(columns) == 0:
         return df
-    return df.rename_columns([names.get(f.name, f.name) for f in df.schema])
+    _assert_no_missing(df, columns.keys())
+    return df.rename_columns([columns.get(f.name, f.name) for f in df.schema])
 
 
 @drop_columns.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table))
 def _drop_pa_columns(df: pa.Table, columns: List[str]) -> pa.Table:
     cols = [x for x in df.schema.names if x not in columns]
+    if len(cols) == 0:
+        raise FugueDataFrameOperationError("cannot drop all columns")
+    if len(cols) + len(columns) != len(df.columns):
+        _assert_no_missing(df, columns)
     return df.select(cols)
 
 
 @select_columns.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table))
 def _select_pa_columns(df: pa.Table, columns: List[Any]) -> pa.Table:
+    _assert_no_missing(df, columns=columns)
     return df.select(columns)
 
 
@@ -288,3 +294,9 @@ def _build_empty_arrow(schema: Schema) -> pa.Table:  # pragma: no cover
         arr = [pa.array([])] * len(schema)
         return pa.Table.from_arrays(arr, schema=schema.pa_schema)
     return pa.Table.from_pylist([], schema=schema.pa_schema)
+
+
+def _assert_no_missing(df: pa.Table, columns: Iterable[Any]) -> None:
+    missing = [x for x in columns if x not in df.schema.names]
+    if len(missing) > 0:
+        raise FugueDataFrameOperationError(f"cannot drop nonexistent columns: {missing}")
diff --git a/fugue/dataframe/dataframe.py b/fugue/dataframe/dataframe.py
index 0aea3894..d31603b0 100644
--- a/fugue/dataframe/dataframe.py
+++ b/fugue/dataframe/dataframe.py
@@ -511,7 +511,7 @@ def as_dict_iterable(
 
         The default implementation enforces ``type_safe`` True
     """
-    return as_fugue_df(df).as_array_iterable(columns=columns)
+    return as_fugue_df(df).as_dict_iterable(columns=columns)
 
 
 @fugue_plugin
@@ -535,52 +535,66 @@ def peek_dict(df: Any) -> Dict[str, Any]:
 
 
 @fugue_plugin
-def head(df: Any, n: int, columns: Optional[List[str]] = None) -> Any:
+def head(
+    df: Any, n: int, columns: Optional[List[str]] = None, as_fugue: bool = False
+) -> Any:
     """Get first n rows of the dataframe as a new local bounded dataframe
 
     :param n: number of rows
     :param columns: selected columns, defaults to None (all columns)
+    :param as_fugue: whether return a Fugue :class:`~.DataFrame`, default to
+        False. If False, then if the input ``df`` is not a Fugue DataFrame
+        then it will return the underlying DataFrame object. 
:return: a local bounded dataframe """ res = as_fugue_df(df).head(n=n, columns=columns) - if isinstance(df, DataFrame): + if as_fugue or isinstance(df, DataFrame): return res return res.as_pandas() @fugue_plugin -def alter_columns(df: Any, columns: Any) -> Any: +def alter_columns(df: Any, columns: Any, as_fugue: bool = False) -> Any: """Change column types :param df: the object that can be recognized as a dataframe by Fugue :param columns: |SchemaLikeObject|, all columns should be contained by the dataframe schema + :param as_fugue: whether return a Fugue :class:`~.DataFrame`, default to + False. If False, then if the input ``df`` is not a Fugue DataFrame + then it will return the underlying DataFrame object. :return: a new dataframe with altered columns, the order of the original schema will not change """ - return _adjust_df(df, as_fugue_df(df).alter_columns(columns)) + return _adjust_df(df, as_fugue_df(df).alter_columns(columns), as_fugue=as_fugue) @fugue_plugin -def drop_columns(df: Any, columns: List[str]) -> Any: +def drop_columns(df: Any, columns: List[str], as_fugue: bool = False) -> Any: """Drop certain columns and return a new dataframe :param df: the object that can be recognized as a dataframe by Fugue :param columns: columns to drop + :param as_fugue: whether return a Fugue :class:`~.DataFrame`, default to + False. If False, then if the input ``df`` is not a Fugue DataFrame + then it will return the underlying DataFrame object. :return: a new dataframe removing the columns """ - return _adjust_df(df, as_fugue_df(df).drop(columns)) + return _adjust_df(df, as_fugue_df(df).drop(columns), as_fugue=as_fugue) @fugue_plugin -def select_columns(df: Any, columns: List[Any]) -> Any: +def select_columns(df: Any, columns: List[Any], as_fugue: bool = False) -> Any: """Select certain columns and return a new dataframe :param df: the object that can be recognized as a dataframe by Fugue :param columns: columns to return + :param as_fugue: whether return a Fugue :class:`~.DataFrame`, default to + False. If False, then if the input ``df`` is not a Fugue DataFrame + then it will return the underlying DataFrame object. :return: a new dataframe with the selected the columns """ - return _adjust_df(df, as_fugue_df(df)[columns]) + return _adjust_df(df, as_fugue_df(df)[columns], as_fugue=as_fugue) @fugue_plugin @@ -605,11 +619,14 @@ def _get_pyarrow_dataframe_columns(df: pa.Table) -> List[Any]: @fugue_plugin -def rename(df: Any, names: Dict[str, Any]) -> Any: +def rename(df: Any, columns: Dict[str, Any], as_fugue: bool = False) -> Any: """A generic function to rename column names of any dataframe :param df: the dataframe object - :param names: the rename operations as a dict: ``old name => new name`` + :param columns: the rename operations as a dict: ``old name => new name`` + :param as_fugue: whether return a Fugue :class:`~.DataFrame`, default to + False. If False, then if the input ``df`` is not a Fugue DataFrame + then it will return the underlying DataFrame object. :return: the renamed dataframe .. 
note::
 
         In order to support a new type of dataframe, an implementation must
         be registered, for example
 
@@ -623,15 +640,15 @@ def rename(df: Any, names: Dict[str, Any]) -> Any:
             lambda df, *args, **kwargs: isinstance(df, pd.DataFrame)
         )
         def _rename_pandas_dataframe(
-            df: pd.DataFrame, names: Dict[str, Any]
+            df: pd.DataFrame, columns: Dict[str, Any]
         ) -> pd.DataFrame:
-            if len(names) == 0:
+            if len(columns) == 0:
                 return df
-            return df.rename(columns=names)
+            return df.rename(columns=columns)
     """
-    if len(names) == 0:
+    if len(columns) == 0:
         return df
-    return _adjust_df(df, as_fugue_df(df).rename(names))
+    return _adjust_df(df, as_fugue_df(df).rename(columns), as_fugue=as_fugue)
 
 
 def normalize_column_names(df: Any) -> Tuple[Any, Dict[str, Any]]:
@@ -703,7 +720,7 @@ def _input_schema(schema: Any) -> Schema:
     return schema if isinstance(schema, Schema) else Schema(schema)
 
 
-def _adjust_df(input_df: Any, output_df: DataFrame) -> Any:
-    if isinstance(input_df, DataFrame):
+def _adjust_df(input_df: Any, output_df: DataFrame, as_fugue: bool) -> Any:
+    if as_fugue or isinstance(input_df, DataFrame):
         return output_df
     return output_df.native  # type: ignore
diff --git a/fugue/dataframe/pandas_dataframe.py b/fugue/dataframe/pandas_dataframe.py
index 38dd17b2..2850ecb1 100644
--- a/fugue/dataframe/pandas_dataframe.py
+++ b/fugue/dataframe/pandas_dataframe.py
@@ -221,25 +221,52 @@ def _get_pandas_dataframe_schema(df: pd.DataFrame) -> Schema:
 
 
 @rename.candidate(lambda df, *args, **kwargs: isinstance(df, pd.DataFrame))
-def _rename_pandas_dataframe(df: pd.DataFrame, names: Dict[str, Any]) -> pd.DataFrame:
-    if len(names) == 0:
+def _rename_pandas_dataframe(
+    df: pd.DataFrame, columns: Dict[str, Any], as_fugue: bool = False
+) -> Any:
+    if len(columns) == 0:
         return df
-    return df.rename(columns=names)
+    _assert_no_missing(df, columns.keys())
+    return _adjust_df(df.rename(columns=columns), as_fugue=as_fugue)
 
 
 @drop_columns.candidate(lambda df, *args, **kwargs: isinstance(df, pd.DataFrame))
-def _drop_pd_columns(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
+def _drop_pd_columns(
+    df: pd.DataFrame, columns: List[str], as_fugue: bool = False
+) -> Any:
     cols = [x for x in df.columns if x not in columns]
-    return df[cols]
+    if len(cols) == 0:
+        raise FugueDataFrameOperationError("cannot drop all columns")
+    if len(cols) + len(columns) != len(df.columns):
+        _assert_no_missing(df, columns)
+    return _adjust_df(df[cols], as_fugue=as_fugue)
 
 
 @select_columns.candidate(lambda df, *args, **kwargs: isinstance(df, pd.DataFrame))
-def _select_pd_columns(df: pd.DataFrame, columns: List[Any]) -> pd.DataFrame:
-    return df[columns]
+def _select_pd_columns(
+    df: pd.DataFrame, columns: List[Any], as_fugue: bool = False
+) -> Any:
+    _assert_no_missing(df, columns)
+    return _adjust_df(df[columns], as_fugue=as_fugue)
 
 
 @head.candidate(lambda df, *args, **kwargs: isinstance(df, pd.DataFrame))
-def _pd_head(df: Any, n: int, columns: Optional[List[str]] = None) -> pd.DataFrame:
+def _pd_head(
+    df: pd.DataFrame,
+    n: int,
+    columns: Optional[List[str]] = None,
+    as_fugue: bool = False,
+) -> pd.DataFrame:
     if columns is not None:
         df = df[columns]
-    return df.head(n)
+    return _adjust_df(df.head(n), as_fugue=as_fugue)
+
+
+def _adjust_df(res: pd.DataFrame, as_fugue: bool):
+    return res if not as_fugue else PandasDataFrame(res)
+
+
+def _assert_no_missing(df: pd.DataFrame, columns: Iterable[Any]) -> None:
+    missing = [x for x in columns if x not in df.columns]
+    if len(missing) > 0:
+        raise FugueDataFrameOperationError(f"cannot drop nonexistent columns: {missing}")
diff --git a/fugue/dataset.py b/fugue/dataset.py index 
2ea70bde..075a540f 100644 --- a/fugue/dataset.py +++ b/fugue/dataset.py @@ -79,7 +79,7 @@ def show( ) -> None: """Display the Dataset - :param rows: number of rows to print, defaults to 10 + :param n: number of rows to print, defaults to 10 :param with_count: whether to show dataset count, defaults to False :param title: title of the dataset, defaults to None @@ -160,6 +160,26 @@ def as_fugue_dataset(data: Any) -> Dataset: raise NotImplementedError(f"no registered dataset conversion for {type(data)}") +def show( + data: Any, n: int = 10, with_count: bool = False, title: Optional[str] = None +) -> None: + """Display the Dataset + + :param data: the data that can be recognized by Fugue + :param n: number of rows to print, defaults to 10 + :param with_count: whether to show dataset count, defaults to False + :param title: title of the dataset, defaults to None + + .. note:: + + When ``with_count`` is True, it can trigger expensive calculation for + a distributed dataframe. So if you call this function directly, you may + need to :func:`fugue.execution.execution_engine.ExecutionEngine.persist` + the dataset. + """ + return as_fugue_dataset(data).show(n=n, with_count=with_count, title=title) + + @fugue_plugin def is_local(data: Any) -> bool: """Whether the dataset is local diff --git a/fugue/interfaceless/__init__.py b/fugue/interfaceless/__init__.py index c8da4c88..df0d7674 100644 --- a/fugue/interfaceless/__init__.py +++ b/fugue/interfaceless/__init__.py @@ -24,6 +24,7 @@ is_bounded, is_empty, is_local, + show, ) from .transformation import out_transform, transform diff --git a/fugue_test/dataframe_suite.py b/fugue_test/dataframe_suite.py index c956c9dc..959d4535 100644 --- a/fugue_test/dataframe_suite.py +++ b/fugue_test/dataframe_suite.py @@ -6,11 +6,12 @@ import numpy as np import pandas as pd -from fugue.dataframe import ArrowDataFrame, DataFrame +from pytest import raises + +import fugue.interfaceless as fi +from fugue.dataframe import ArrowDataFrame from fugue.dataframe.utils import _df_eq as df_eq from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError -from pytest import raises -from triad.collections.schema import Schema class DataFrameTests(object): @@ -27,100 +28,97 @@ def setUpClass(cls): def tearDownClass(cls): pass - def df( - self, data: Any = None, schema: Any = None - ) -> DataFrame: # pragma: no cover + def df(self, data: Any = None, schema: Any = None) -> Any: # pragma: no cover raise NotImplementedError - def test_init_basic(self): - raises(Exception, lambda: self.df()) - raises(Exception, lambda: self.df([])) - raises(Exception, lambda: self.df([[]], Schema())) - raises(Exception, lambda: self.df([[1]], Schema())) - # raises(SchemaError, lambda: self.df([[1]])) # schema can be inferred - - df = self.df([], "a:str,b:int") - assert df.empty - - def test_datetime(self): - df = self.df([["2020-01-01"], [None]], "a:datetime") - assert [[datetime(2020, 1, 1)], [None]] == df.as_array(type_safe=True) - def test_peek(self): df = self.df([], "x:str,y:double") - raises(FugueDatasetEmptyError, lambda: df.peek_array()) - raises(FugueDatasetEmptyError, lambda: df.peek_dict()) + raises(FugueDatasetEmptyError, lambda: fi.peek_array(df)) + raises(FugueDatasetEmptyError, lambda: fi.peek_dict(df)) df = self.df([["a", 1.0], ["b", 2.0]], "x:str,y:double") - assert not df.is_bounded or 2 == df.count() - assert not df.empty - assert ["a", 1.0] == df.peek_array() - assert dict(x="a", y=1.0) == df.peek_dict() + assert not fi.is_bounded(df) or 2 == fi.count(df) + assert 
not fi.is_empty(df) + assert ["a", 1.0] == fi.peek_array(df) + assert dict(x="a", y=1.0) == fi.peek_dict(df) def test_as_pandas(self): df = self.df([["a", 1.0], ["b", 2.0]], "x:str,y:double") - pdf = df.as_pandas() + pdf = fi.as_pandas(df) assert [["a", 1.0], ["b", 2.0]] == pdf.values.tolist() df = self.df([], "x:str,y:double") - pdf = df.as_pandas() + pdf = fi.as_pandas(df) assert [] == pdf.values.tolist() - def test_drop(self): - df = self.df([], "a:str,b:int").drop(["a"]) - assert df.schema == "b:int" + def test_drop_columns(self): + df = fi.drop_columns(self.df([], "a:str,b:int"), ["a"]) + assert fi.get_schema(df) == "b:int" raises( - FugueDataFrameOperationError, lambda: df.drop(["b"]) + FugueDataFrameOperationError, lambda: fi.drop_columns(df, ["b"]) ) # can't be empty raises( - FugueDataFrameOperationError, lambda: df.drop(["x"]) + FugueDataFrameOperationError, lambda: fi.drop_columns(df, ["x"]) ) # cols must exist - df = self.df([["a", 1]], "a:str,b:int").drop(["a"]) - assert df.schema == "b:int" + df = fi.drop_columns(self.df([["a", 1]], "a:str,b:int"), ["a"]) + assert fi.get_schema(df) == "b:int" raises( - FugueDataFrameOperationError, lambda: df.drop(["b"]) + FugueDataFrameOperationError, lambda: fi.drop_columns(df, ["b"]) ) # can't be empty raises( - FugueDataFrameOperationError, lambda: df.drop(["x"]) + FugueDataFrameOperationError, lambda: fi.drop_columns(df, ["x"]) ) # cols must exist - assert [[1]] == df.as_array(type_safe=True) + assert [[1]] == fi.as_array(df, type_safe=True) def test_select(self): - df = self.df([], "a:str,b:int")[["b"]] - assert df.schema == "b:int" - raises(FugueDataFrameOperationError, lambda: df[["a"]]) # not existed - raises(FugueDataFrameOperationError, lambda: df[[]]) # empty + df = fi.select_columns(self.df([], "a:str,b:int"), ["b"]) + assert fi.get_schema(df) == "b:int" + raises( + FugueDataFrameOperationError, lambda: fi.select_columns(df, ["a"]) + ) # not existed + raises( + FugueDataFrameOperationError, lambda: fi.select_columns(df, ["a"]) + ) # empty - df = self.df([["a", 1]], "a:str,b:int")[["b"]] - assert df.schema == "b:int" - raises(FugueDataFrameOperationError, lambda: df[["a"]]) # not existed - raises(FugueDataFrameOperationError, lambda: df[[]]) # empty - assert [[1]] == df.as_array(type_safe=True) + df = fi.select_columns(self.df([["a", 1]], "a:str,b:int"), ["b"]) + assert fi.get_schema(df) == "b:int" + raises( + FugueDataFrameOperationError, lambda: fi.select_columns(df, ["a"]) + ) # not existed + raises( + FugueDataFrameOperationError, lambda: fi.select_columns(df, ["a"]) + ) # empty + assert [[1]] == fi.as_array(df, type_safe=True) df = self.df([["a", 1, 2]], "a:str,b:int,c:int") - df_eq(df[["c", "a"]], [[2, "a"]], "a:str,c:int") + df_eq( + fi.as_fugue_df(fi.select_columns(df, ["c", "a"])), + [[2, "a"]], + "a:str,c:int", + ) def test_rename(self): for data in [[["a", 1]], []]: df = self.df(data, "a:str,b:int") - df2 = df.rename(columns=dict(a="aa")) - assert df.schema == "a:str,b:int" - df_eq(df2, data, "aa:str,b:int", throw=True) + df2 = fi.rename(df, columns=dict(a="aa")) + assert fi.get_schema(df) == "a:str,b:int" + df_eq(fi.as_fugue_df(df2), data, "aa:str,b:int", throw=True) def test_rename_invalid(self): df = self.df([["a", 1]], "a:str,b:int") raises( - FugueDataFrameOperationError, lambda: df.rename(columns=dict(aa="ab")) + FugueDataFrameOperationError, + lambda: fi.rename(df, columns=dict(aa="ab")), ) def test_as_array(self): for func in [ - lambda df, *args, **kwargs: df.as_array( - *args, **kwargs, type_safe=True + lambda 
df, *args, **kwargs: fi.as_array( + df, *args, **kwargs, type_safe=True ), lambda df, *args, **kwargs: list( - df.as_array_iterable(*args, **kwargs, type_safe=True) + fi.as_array_iterable(df, *args, **kwargs, type_safe=True) ), ]: df = self.df([], "a:str,b:int") @@ -142,11 +140,11 @@ def test_as_array(self): def test_as_array_special_values(self): for func in [ - lambda df, *args, **kwargs: df.as_array( - *args, **kwargs, type_safe=True + lambda df, *args, **kwargs: fi.as_array( + df, *args, **kwargs, type_safe=True ), lambda df, *args, **kwargs: list( - df.as_array_iterable(*args, **kwargs, type_safe=True) + fi.as_array_iterable(df, *args, **kwargs, type_safe=True) ), ]: df = self.df([[pd.Timestamp("2020-01-01"), 1]], "a:datetime,b:int") @@ -166,92 +164,92 @@ def test_as_array_special_values(self): def test_as_dict_iterable(self): df = self.df([[pd.NaT, 1]], "a:datetime,b:int") - assert [dict(a=None, b=1)] == list(df.as_dict_iterable()) + assert [dict(a=None, b=1)] == list(fi.as_dict_iterable(df)) df = self.df([[pd.Timestamp("2020-01-01"), 1]], "a:datetime,b:int") - assert [dict(a=datetime(2020, 1, 1), b=1)] == list(df.as_dict_iterable()) + assert [dict(a=datetime(2020, 1, 1), b=1)] == list(fi.as_dict_iterable(df)) def test_list_type(self): data = [[[30, 40]]] df = self.df(data, "a:[int]") - a = df.as_array(type_safe=True) + a = fi.as_array(df, type_safe=True) assert data == a def test_struct_type(self): data = [[{"a": 1}], [{"a": 2}]] df = self.df(data, "x:{a:int}") - a = df.as_array(type_safe=True) + a = fi.as_array(df, type_safe=True) assert data == a def test_map_type(self): data = [[[("a", 1), ("b", 3)]], [[("b", 2)]]] df = self.df(data, "x:") - a = df.as_array(type_safe=True) + a = fi.as_array(df, type_safe=True) assert data == a def test_deep_nested_types(self): data = [[dict(a="1", b=[3, 4], d=1.0)], [dict(b=[30, 40])]] df = self.df(data, "a:{a:str,b:[int]}") - a = df.as_array(type_safe=True) + a = fi.as_array(df, type_safe=True) assert [[dict(a="1", b=[3, 4])], [dict(a=None, b=[30, 40])]] == a data = [[[dict(b=[30, 40])]]] df = self.df(data, "a:[{a:str,b:[int]}]") - a = df.as_array(type_safe=True) + a = fi.as_array(df, type_safe=True) assert [[[dict(a=None, b=[30, 40])]]] == a def test_binary_type(self): data = [[b"\x01\x05"]] df = self.df(data, "a:bytes") - a = df.as_array(type_safe=True) + a = fi.as_array(df, type_safe=True) assert data == a def test_as_arrow(self): # empty df = self.df([], "a:int,b:int") - assert [] == list(ArrowDataFrame(df.as_arrow()).as_dict_iterable()) + assert [] == list(ArrowDataFrame(fi.as_arrow(df)).as_dict_iterable()) # pd.Nat df = self.df([[pd.NaT, 1]], "a:datetime,b:int") assert [dict(a=None, b=1)] == list( - ArrowDataFrame(df.as_arrow()).as_dict_iterable() + ArrowDataFrame(fi.as_arrow(df)).as_dict_iterable() ) # pandas timestamps df = self.df([[pd.Timestamp("2020-01-01"), 1]], "a:datetime,b:int") assert [dict(a=datetime(2020, 1, 1), b=1)] == list( - ArrowDataFrame(df.as_arrow()).as_dict_iterable() + ArrowDataFrame(fi.as_arrow(df)).as_dict_iterable() ) # float nan, list data = [[[float("nan"), 2.0]]] df = self.df(data, "a:[float]") - assert [[[None, 2.0]]] == ArrowDataFrame(df.as_arrow()).as_array() + assert [[[None, 2.0]]] == ArrowDataFrame(fi.as_arrow(df)).as_array() # dict data = [[dict(b="x")]] df = self.df(data, "a:{b:str}") - assert data == ArrowDataFrame(df.as_arrow()).as_array() + assert data == ArrowDataFrame(fi.as_arrow(df)).as_array() # list[dict] data = [[[dict(b=[30, 40])]]] df = self.df(data, "a:[{b:[int]}]") - assert data == 
ArrowDataFrame(df.as_arrow()).as_array() + assert data == ArrowDataFrame(fi.as_arrow(df)).as_array() def test_head(self): df = self.df([], "a:str,b:int") - assert [] == df.head(1).as_array() - assert [] == df.head(1, ["b"]).as_array() + assert [] == fi.as_array(fi.head(df, 1)) + assert [] == fi.as_array(fi.head(df, 1, ["b"])) df = self.df([["a", 1]], "a:str,b:int") - if df.is_bounded: - assert [["a", 1]] == df.head(1).as_array() - assert [[1, "a"]] == df.head(1, ["b", "a"]).as_array() - assert [] == df.head(0).as_array() + if fi.is_bounded(df): + assert [["a", 1]] == fi.as_array(fi.head(df, 1)) + assert [[1, "a"]] == fi.as_array(fi.head(df, 1, ["b", "a"])) + assert [] == fi.as_array(fi.head(df, 0)) df = self.df([[0, 1], [0, 2], [1, 1], [1, 3]], "a:int,b:int") - assert 2 == df.head(2).count() + assert 2 == fi.count(fi.head(df, 2)) df = self.df([[0, 1], [0, 2], [1, 1], [1, 3]], "a:int,b:int") - assert 4 == df.head(10).count() - h = df.head(10) - assert h.is_local and h.is_bounded + assert 4 == fi.count(fi.head(df, 10)) + h = fi.head(df, 10) + assert fi.is_local(h) and fi.is_bounded(h) def test_show(self): df = self.df([["a", 1]], "a:str,b:int") - df.show() + fi.show(df) def test_get_altered_schema(self): df = self.df([["a", 1]], "a:str,b:int") @@ -270,47 +268,55 @@ def test_get_altered_schema(self): def test_alter_columns(self): # empty df = self.df([], "a:str,b:int") - ndf = df.alter_columns("a:str,b:str") - assert [] == ndf.as_array(type_safe=True) - assert ndf.schema == "a:str,b:str" + ndf = fi.alter_columns(df, "a:str,b:str") + assert [] == fi.as_array(ndf, type_safe=True) + assert fi.get_schema(ndf) == "a:str,b:str" # no change df = self.df([["a", 1], ["c", None]], "a:str,b:int") - ndf = df.alter_columns("b:int,a:str") - assert [["a", 1], ["c", None]] == ndf.as_array(type_safe=True) - assert ndf.schema == df.schema + ndf = fi.alter_columns(df, "b:int,a:str", as_fugue=True) + assert [["a", 1], ["c", None]] == fi.as_array(ndf, type_safe=True) + assert fi.get_schema(ndf) == "a:str,b:int" # bool -> str df = self.df([["a", True], ["b", False], ["c", None]], "a:str,b:bool") - ndf = df.alter_columns("b:str") - actual = ndf.as_array(type_safe=True) + ndf = fi.alter_columns(df, "b:str", as_fugue=True) + actual = fi.as_array(ndf, type_safe=True) # Capitalization doesn't matter # and dataframes don't need to be consistent on capitalization expected1 = [["a", "True"], ["b", "False"], ["c", None]] expected2 = [["a", "true"], ["b", "false"], ["c", None]] assert expected1 == actual or expected2 == actual - assert ndf.schema == "a:str,b:str" + assert fi.get_schema(ndf) == "a:str,b:str" # int -> str df = self.df([["a", 1], ["c", None]], "a:str,b:int") - ndf = df.alter_columns("b:str") - assert [["a", "1"], ["c", None]] == ndf.as_array(type_safe=True) - assert ndf.schema == "a:str,b:str" + ndf = fi.alter_columns(df, "b:str", as_fugue=True) + arr = fi.as_array(ndf, type_safe=True) + assert [["a", "1"], ["c", None]] == arr or [ + ["a", "1.0"], + ["c", None], + ] == arr # in pandas case, it can't treat [1, None] as an int col + assert fi.get_schema(ndf) == "a:str,b:str" # int -> double df = self.df([["a", 1], ["c", None]], "a:str,b:int") - ndf = df.alter_columns("b:double") - assert [["a", 1], ["c", None]] == ndf.as_array(type_safe=True) - assert ndf.schema == "a:str,b:double" + ndf = fi.alter_columns(df, "b:double", as_fugue=True) + assert [["a", 1], ["c", None]] == fi.as_array(ndf, type_safe=True) + assert fi.get_schema(ndf) == "a:str,b:double" # double -> str df = self.df([["a", 1.1], ["b", None]], 
"a:str,b:double") - data = df.alter_columns("b:str").as_array(type_safe=True) + data = fi.as_array( + fi.alter_columns(df, "b:str", as_fugue=True), type_safe=True + ) assert [["a", "1.1"], ["b", None]] == data # double -> int df = self.df([["a", 1.0], ["b", None]], "a:str,b:double") - data = df.alter_columns("b:int").as_array(type_safe=True) + data = fi.as_array( + fi.alter_columns(df, "b:int", as_fugue=True), type_safe=True + ) assert [["a", 1], ["b", None]] == data # date -> str @@ -318,7 +324,9 @@ def test_alter_columns(self): [["a", date(2020, 1, 1)], ["b", date(2020, 1, 2)], ["c", None]], "a:str,b:date", ) - data = df.alter_columns("b:str").as_array(type_safe=True) + data = fi.as_array( + fi.alter_columns(df, "b:str", as_fugue=True), type_safe=True + ) assert [["a", "2020-01-01"], ["b", "2020-01-02"], ["c", None]] == data # datetime -> str @@ -330,7 +338,9 @@ def test_alter_columns(self): ], "a:str,b:datetime", ) - data = df.alter_columns("b:str").as_array(type_safe=True) + data = fi.as_array( + fi.alter_columns(df, "b:str", as_fugue=True), type_safe=True + ) assert [ ["a", "2020-01-01 03:04:05"], ["b", "2020-01-02 16:07:08"], @@ -339,49 +349,51 @@ def test_alter_columns(self): # str -> bool df = self.df([["a", "trUe"], ["b", "False"], ["c", None]], "a:str,b:str") - ndf = df.alter_columns("b:bool,a:str") - assert [["a", True], ["b", False], ["c", None]] == ndf.as_array( - type_safe=True + ndf = fi.alter_columns(df, "b:bool,a:str", as_fugue=True) + assert [["a", True], ["b", False], ["c", None]] == fi.as_array( + ndf, type_safe=True ) - assert ndf.schema == "a:str,b:bool" + assert fi.get_schema(ndf) == "a:str,b:bool" # str -> int df = self.df([["a", "1"]], "a:str,b:str") - ndf = df.alter_columns("b:int,a:str") - assert [["a", 1]] == ndf.as_array(type_safe=True) - assert ndf.schema == "a:str,b:int" + ndf = fi.alter_columns(df, "b:int,a:str") + assert [["a", 1]] == fi.as_array(ndf, type_safe=True) + assert fi.get_schema(ndf) == "a:str,b:int" # str -> double df = self.df([["a", "1.1"], ["b", "2"], ["c", None]], "a:str,b:str") - ndf = df.alter_columns("b:double") - assert [["a", 1.1], ["b", 2.0], ["c", None]] == ndf.as_array(type_safe=True) - assert ndf.schema == "a:str,b:double" + ndf = fi.alter_columns(df, "b:double", as_fugue=True) + assert [["a", 1.1], ["b", 2.0], ["c", None]] == fi.as_array( + ndf, type_safe=True + ) + assert fi.get_schema(ndf) == "a:str,b:double" # str -> date df = self.df( [["1", "2020-01-01"], ["2", "2020-01-02 01:02:03"], ["3", None]], "a:str,b:str", ) - ndf = df.alter_columns("b:date,a:int") + ndf = fi.alter_columns(df, "b:date,a:int", as_fugue=True) assert [ [1, date(2020, 1, 1)], [2, date(2020, 1, 2)], [3, None], - ] == ndf.as_array(type_safe=True) - assert ndf.schema == "a:int,b:date" + ] == fi.as_array(ndf, type_safe=True) + assert fi.get_schema(ndf) == "a:int,b:date" # str -> datetime df = self.df( [["1", "2020-01-01"], ["2", "2020-01-02 01:02:03"], ["3", None]], "a:str,b:str", ) - ndf = df.alter_columns("b:datetime,a:int") + ndf = fi.alter_columns(df, "b:datetime,a:int", as_fugue=True) assert [ [1, datetime(2020, 1, 1)], [2, datetime(2020, 1, 2, 1, 2, 3)], [3, None], - ] == ndf.as_array(type_safe=True) - assert ndf.schema == "a:int,b:datetime" + ] == fi.as_array(ndf, type_safe=True) + assert fi.get_schema(ndf) == "a:int,b:datetime" def test_alter_columns_invalid(self): # invalid conversion @@ -390,5 +402,5 @@ def test_alter_columns_invalid(self): [["1", "x"], ["2", "y"], ["3", None]], "a:str,b:str", ) - ndf = df.alter_columns("b:int") - ndf.show() # lazy 
dataframes will force to materialize + ndf = fi.alter_columns(df, "b:int") + fi.show(ndf) # lazy dataframes will force to materialize diff --git a/tests/fugue/dataframe/test_arrow_dataframe.py b/tests/fugue/dataframe/test_arrow_dataframe.py index 551b7ecb..63e68edb 100644 --- a/tests/fugue/dataframe/test_arrow_dataframe.py +++ b/tests/fugue/dataframe/test_arrow_dataframe.py @@ -17,6 +17,14 @@ def df(self, data: Any = None, schema: Any = None) -> ArrowDataFrame: return ArrowDataFrame(data, schema) +class NativeArrowDataFrameTests(DataFrameTests.Tests): + def df(self, data: Any = None, schema: Any = None) -> pd.DataFrame: + return ArrowDataFrame(data, schema).as_arrow() + + def test_get_altered_schema(self): + pass + + def test_init(): df = ArrowDataFrame(schema="a:str,b:int") assert df.empty diff --git a/tests/fugue/dataframe/test_pandas_dataframe.py b/tests/fugue/dataframe/test_pandas_dataframe.py index c0bc5e7a..73c28b6e 100644 --- a/tests/fugue/dataframe/test_pandas_dataframe.py +++ b/tests/fugue/dataframe/test_pandas_dataframe.py @@ -5,7 +5,7 @@ import numpy as np import pandas as pd -from fugue.dataframe import PandasDataFrame +from fugue.dataframe import PandasDataFrame, ArrowDataFrame from fugue.dataframe.array_dataframe import ArrayDataFrame from fugue.dataframe.utils import _df_eq as df_eq from fugue_test.dataframe_suite import DataFrameTests @@ -19,6 +19,17 @@ def df(self, data: Any = None, schema: Any = None) -> PandasDataFrame: return PandasDataFrame(data, schema) +class NativePandasDataFrameTests(DataFrameTests.Tests): + def df(self, data: Any = None, schema: Any = None) -> pd.DataFrame: + return ArrowDataFrame(data, schema).as_pandas() + + def test_get_altered_schema(self): + pass + + def test_map_type(self): + pass + + def test_init(): df = PandasDataFrame(schema="a:str,b:int") assert df.is_bounded @@ -76,10 +87,10 @@ def test_simple_methods(): def test_nested(): - #data = [[dict(a=1, b=[3, 4], d=1.0)], [json.dumps(dict(b=[30, "40"]))]] - #df = PandasDataFrame(data, "a:{a:str,b:[int]}") - #a = df.as_array(type_safe=True) - #assert [[dict(a="1", b=[3, 4])], [dict(a=None, b=[30, 40])]] == a + # data = [[dict(a=1, b=[3, 4], d=1.0)], [json.dumps(dict(b=[30, "40"]))]] + # df = PandasDataFrame(data, "a:{a:str,b:[int]}") + # a = df.as_array(type_safe=True) + # assert [[dict(a="1", b=[3, 4])], [dict(a=None, b=[30, 40])]] == a data = [[[json.dumps(dict(b=[30, "40"]))]]] df = PandasDataFrame(data, "a:[{a:str,b:[int]}]") From f7be9780f5d129453848cf5dbb95fb4ad6c4721b Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sun, 18 Dec 2022 21:02:36 +0000 Subject: [PATCH 03/30] fix test coverage --- fugue/__init__.py | 21 +++------------------ fugue/dataframe/arrow_dataframe.py | 2 ++ fugue/dataframe/pandas_dataframe.py | 2 ++ fugue/extensions/creator/convert.py | 3 ++- fugue_dask/registry.py | 8 ++------ fugue_duckdb/registry.py | 2 +- fugue_ray/registry.py | 8 ++------ fugue_spark/registry.py | 10 ++-------- fugue_test/dataframe_suite.py | 5 +++++ tests/fugue/dataframe/test_dataframe.py | 16 +++++++++++++--- tests/fugue_dask/test_execution_engine.py | 12 +++++------- tests/fugue_duckdb/test_execution_engine.py | 16 ++++++++-------- tests/fugue_ray/test_execution_engine.py | 18 ++++++------------ tests/fugue_spark/test_execution_engine.py | 19 ++++++++++--------- 14 files changed, 63 insertions(+), 79 deletions(-) diff --git a/fugue/__init__.py b/fugue/__init__.py index 9f3ef2b1..7de5bc35 100644 --- a/fugue/__init__.py +++ b/fugue/__init__.py @@ -23,12 +23,9 @@ from fugue.dataset import Dataset, 
DatasetDisplay, as_fugue_dataset, get_dataset_display from fugue.execution.execution_engine import ExecutionEngine, MapEngine, SQLEngine from fugue.execution.factory import ( - infer_execution_engine, is_pandas_or, make_execution_engine, make_sql_engine, - parse_execution_engine, - parse_sql_engine, register_default_execution_engine, register_default_sql_engine, register_execution_engine, @@ -40,19 +37,9 @@ QPDPandasEngine, SqliteEngine, ) -from fugue.extensions.creator import Creator, creator, parse_creator, register_creator -from fugue.extensions.outputter import ( - Outputter, - outputter, - parse_outputter, - register_outputter, -) -from fugue.extensions.processor import ( - Processor, - parse_processor, - processor, - register_processor, -) +from fugue.extensions.creator import Creator, creator, register_creator +from fugue.extensions.outputter import Outputter, outputter, register_outputter +from fugue.extensions.processor import Processor, processor, register_processor from fugue.extensions.transformer import ( CoTransformer, OutputCoTransformer, @@ -61,8 +48,6 @@ cotransformer, output_cotransformer, output_transformer, - parse_output_transformer, - parse_transformer, register_output_transformer, register_transformer, transformer, diff --git a/fugue/dataframe/arrow_dataframe.py b/fugue/dataframe/arrow_dataframe.py index 8a9dd482..63d96c2f 100644 --- a/fugue/dataframe/arrow_dataframe.py +++ b/fugue/dataframe/arrow_dataframe.py @@ -285,6 +285,8 @@ def _drop_pa_columns(df: pa.Table, columns: List[str]) -> pa.Table: @select_columns.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table)) def _select_pa_columns(df: pa.Table, columns: List[Any]) -> pa.Table: + if len(columns) == 0: + raise FugueDataFrameOperationError("must select at least one column") _assert_no_missing(df, columns=columns) return df.select(columns) diff --git a/fugue/dataframe/pandas_dataframe.py b/fugue/dataframe/pandas_dataframe.py index 2850ecb1..3dd3ba5b 100644 --- a/fugue/dataframe/pandas_dataframe.py +++ b/fugue/dataframe/pandas_dataframe.py @@ -246,6 +246,8 @@ def _drop_pd_columns( def _select_pd_columns( df: pd.DataFrame, columns: List[Any], as_fugue: bool = False ) -> Any: + if len(columns) == 0: + raise FugueDataFrameOperationError("must select at least one column") _assert_no_missing(df, columns) return _adjust_df(df[columns], as_fugue=as_fugue) diff --git a/fugue/extensions/creator/convert.py b/fugue/extensions/creator/convert.py index 86970f84..18b8ae94 100644 --- a/fugue/extensions/creator/convert.py +++ b/fugue/extensions/creator/convert.py @@ -24,7 +24,8 @@ def parse_creator(obj: Any) -> Any: .. 
code-block:: python - from fugue import Creator, parse_creator, FugueWorkflow + from fugue import Creator, FugueWorkflow + from fugue.plugins import parse_creator from triad import to_uuid class My(Creator): diff --git a/fugue_dask/registry.py b/fugue_dask/registry.py index d4dc386b..a7b0f530 100644 --- a/fugue_dask/registry.py +++ b/fugue_dask/registry.py @@ -5,18 +5,14 @@ from dask.distributed import Client from triad import run_at_def -from fugue import ( - DataFrame, - infer_execution_engine, - is_pandas_or, - register_execution_engine, -) +from fugue import DataFrame, is_pandas_or, register_execution_engine from fugue._utils.interfaceless import ( DataFrameParam, ExecutionEngineParam, SimpleAnnotationConverter, register_annotation_converter, ) +from fugue.plugins import infer_execution_engine from fugue.workflow import register_raw_df_type from fugue_dask._utils import DASK_UTILS from fugue_dask.dataframe import DaskDataFrame diff --git a/fugue_duckdb/registry.py b/fugue_duckdb/registry.py index ccdc3b23..4fbb9274 100644 --- a/fugue_duckdb/registry.py +++ b/fugue_duckdb/registry.py @@ -7,7 +7,6 @@ from fugue import ( DataFrame, ExecutionEngine, - infer_execution_engine, is_pandas_or, register_execution_engine, register_sql_engine, @@ -18,6 +17,7 @@ SimpleAnnotationConverter, register_annotation_converter, ) +from fugue.plugins import infer_execution_engine from fugue.workflow import register_raw_df_type from fugue_duckdb.dataframe import DuckDataFrame from fugue_duckdb.execution_engine import DuckDBEngine, DuckExecutionEngine diff --git a/fugue_ray/registry.py b/fugue_ray/registry.py index 4445ee77..9f85868e 100644 --- a/fugue_ray/registry.py +++ b/fugue_ray/registry.py @@ -4,18 +4,14 @@ import ray.data as rd from triad import run_at_def -from fugue import ( - DataFrame, - infer_execution_engine, - is_pandas_or, - register_execution_engine, -) +from fugue import DataFrame, is_pandas_or, register_execution_engine from fugue._utils.interfaceless import ( DataFrameParam, ExecutionEngineParam, SimpleAnnotationConverter, register_annotation_converter, ) +from fugue.plugins import infer_execution_engine from fugue.workflow import register_raw_df_type from .dataframe import RayDataFrame diff --git a/fugue_spark/registry.py b/fugue_spark/registry.py index 27c5b55e..bf3b3fa7 100644 --- a/fugue_spark/registry.py +++ b/fugue_spark/registry.py @@ -7,20 +7,14 @@ from pyspark.sql import SparkSession from triad import run_at_def -from fugue import ( - DataFrame, - ExecutionEngine, - infer_execution_engine, - is_pandas_or, - parse_creator, - register_execution_engine, -) +from fugue import DataFrame, ExecutionEngine, is_pandas_or, register_execution_engine from fugue._utils.interfaceless import ( DataFrameParam, ExecutionEngineParam, SimpleAnnotationConverter, register_annotation_converter, ) +from fugue.plugins import infer_execution_engine, parse_creator from fugue.workflow import register_raw_df_type from fugue_spark.dataframe import SparkDataFrame from fugue_spark.execution_engine import SparkExecutionEngine diff --git a/fugue_test/dataframe_suite.py b/fugue_test/dataframe_suite.py index 959d4535..cbe7a966 100644 --- a/fugue_test/dataframe_suite.py +++ b/fugue_test/dataframe_suite.py @@ -50,6 +50,7 @@ def test_as_pandas(self): df = self.df([], "x:str,y:double") pdf = fi.as_pandas(df) assert [] == pdf.values.tolist() + assert fi.is_local(pdf) def test_drop_columns(self): df = fi.drop_columns(self.df([], "a:str,b:int"), ["a"]) @@ -74,6 +75,9 @@ def test_drop_columns(self): def test_select(self): df 
= fi.select_columns(self.df([], "a:str,b:int"), ["b"]) assert fi.get_schema(df) == "b:int" + raises( + FugueDataFrameOperationError, lambda: fi.select_columns(df, []) + ) # select empty raises( FugueDataFrameOperationError, lambda: fi.select_columns(df, ["a"]) ) # not existed @@ -207,6 +211,7 @@ def test_as_arrow(self): # empty df = self.df([], "a:int,b:int") assert [] == list(ArrowDataFrame(fi.as_arrow(df)).as_dict_iterable()) + assert fi.is_local(fi.as_arrow(df)) # pd.Nat df = self.df([[pd.NaT, 1]], "a:datetime,b:int") assert [dict(a=None, b=1)] == list( diff --git a/tests/fugue/dataframe/test_dataframe.py b/tests/fugue/dataframe/test_dataframe.py index 506e6de8..e69cf0bf 100644 --- a/tests/fugue/dataframe/test_dataframe.py +++ b/tests/fugue/dataframe/test_dataframe.py @@ -1,6 +1,16 @@ -from fugue.dataframe import ArrayDataFrame, DataFrame -from triad.collections.schema import Schema import copy +import pandas as pd +from triad.collections.schema import Schema +from pytest import raises +from fugue.dataframe import ArrayDataFrame, DataFrame +from fugue.interfaceless import as_fugue_df + + +def test_as_fugue_df(): + with raises(NotImplementedError): + as_fugue_df(10) + df = pd.DataFrame([[0]], columns=["a"]) + assert isinstance(as_fugue_df(df), DataFrame) def test_show(): @@ -56,5 +66,5 @@ def test_copy(): class MockDF(ArrayDataFrame): def __init__(self, df=None, schema=None): - super(). __init__(df=df, schema=schema) + super().__init__(df=df, schema=schema) DataFrame.__init__(self, lambda: Schema(schema)) diff --git a/tests/fugue_dask/test_execution_engine.py b/tests/fugue_dask/test_execution_engine.py index b96f95be..380e3179 100644 --- a/tests/fugue_dask/test_execution_engine.py +++ b/tests/fugue_dask/test_execution_engine.py @@ -5,16 +5,17 @@ import dask.dataframe as dd import pandas as pd from dask.distributed import Client -from fugue import infer_execution_engine, transform + +from fugue import transform from fugue.collections.partition import PartitionSpec from fugue.dataframe.pandas_dataframe import PandasDataFrame from fugue.dataframe.utils import _df_eq as df_eq +from fugue.plugins import infer_execution_engine from fugue.workflow.workflow import FugueWorkflow -from fugue_test.builtin_suite import BuiltInTests -from fugue_test.execution_suite import ExecutionEngineTests - from fugue_dask.dataframe import DaskDataFrame from fugue_dask.execution_engine import DaskExecutionEngine +from fugue_test.builtin_suite import BuiltInTests +from fugue_test.execution_suite import ExecutionEngineTests _CONF = { "fugue.rpc.server": "fugue.rpc.flask.FlaskRPCServer", @@ -192,6 +193,3 @@ def tr(df: List[List[Any]], add: Optional[callable]) -> List[List[Any]]: assert not res.is_local assert 5 == res.count() assert 5 == cb.n - - - diff --git a/tests/fugue_duckdb/test_execution_engine.py b/tests/fugue_duckdb/test_execution_engine.py index 5e2323a6..f0ddad65 100644 --- a/tests/fugue_duckdb/test_execution_engine.py +++ b/tests/fugue_duckdb/test_execution_engine.py @@ -3,15 +3,15 @@ import duckdb import pandas as pd import pyarrow as pa -from fugue import ArrowDataFrame, DataFrame, FugueWorkflow, infer_execution_engine -from fugue.dataframe.utils import _df_eq as df_eq -from fugue import fsql -from fugue_test.builtin_suite import BuiltInTests -from fugue_test.execution_suite import ExecutionEngineTests from pytest import raises +from fugue import ArrowDataFrame, DataFrame, FugueWorkflow, fsql +from fugue.dataframe.utils import _df_eq as df_eq +from fugue.plugins import infer_execution_engine from 
fugue_duckdb import DuckExecutionEngine from fugue_duckdb.dataframe import DuckDataFrame +from fugue_test.builtin_suite import BuiltInTests +from fugue_test.execution_suite import ExecutionEngineTests class DuckExecutionEngineTests(ExecutionEngineTests.Tests): @@ -178,9 +178,9 @@ def test_sql_yield(): def test_infer_engine(): con = duckdb.connect() df = con.from_df(pd.DataFrame([[0]], columns=["a"])) - assert infer_execution_engine([df])=="duckdb" + assert infer_execution_engine([df]) == "duckdb" fdf = DuckDataFrame(df) - assert infer_execution_engine([fdf])=="duckdb" + assert infer_execution_engine([fdf]) == "duckdb" - con.close() \ No newline at end of file + con.close() diff --git a/tests/fugue_ray/test_execution_engine.py b/tests/fugue_ray/test_execution_engine.py index 49aa99b1..b0ce1f04 100644 --- a/tests/fugue_ray/test_execution_engine.py +++ b/tests/fugue_ray/test_execution_engine.py @@ -4,21 +4,15 @@ import pandas as pd import ray import ray.data as rd -from fugue import ( - ArrayDataFrame, - FugueWorkflow, - transform, - DataFrame, - infer_execution_engine, -) -from fugue.dataframe.utils import _df_eq as df_eq -from fugue import fsql -from fugue_test.builtin_suite import BuiltInTests -from fugue_test.execution_suite import ExecutionEngineTests from pytest import raises from triad import FileSystem -from fugue_ray import RayExecutionEngine, RayDataFrame +from fugue import ArrayDataFrame, DataFrame, FugueWorkflow, fsql, transform +from fugue.dataframe.utils import _df_eq as df_eq +from fugue.plugins import infer_execution_engine +from fugue_ray import RayDataFrame, RayExecutionEngine +from fugue_test.builtin_suite import BuiltInTests +from fugue_test.execution_suite import ExecutionEngineTests _CONF = { "fugue.rpc.server": "fugue.rpc.flask.FlaskRPCServer", diff --git a/tests/fugue_spark/test_execution_engine.py b/tests/fugue_spark/test_execution_engine.py index 61fb8a13..e4a4b80a 100644 --- a/tests/fugue_spark/test_execution_engine.py +++ b/tests/fugue_spark/test_execution_engine.py @@ -6,7 +6,13 @@ import pyspark.rdd as pr import pyspark.sql as ps import pytest -from fugue import infer_execution_engine, transform +from pyspark import SparkContext, StorageLevel +from pyspark.sql import DataFrame as SDataFrame +from pyspark.sql import SparkSession +from pytest import raises +from triad import Schema + +from fugue import transform from fugue.collections.partition import PartitionSpec from fugue.dataframe import ( ArrayDataFrame, @@ -16,17 +22,12 @@ ) from fugue.dataframe.utils import _df_eq as df_eq from fugue.extensions.transformer import Transformer, transformer +from fugue.plugins import infer_execution_engine from fugue.workflow.workflow import FugueWorkflow -from fugue_test.builtin_suite import BuiltInTests -from fugue_test.execution_suite import ExecutionEngineTests -from pyspark import SparkContext, StorageLevel -from pyspark.sql import DataFrame as SDataFrame -from pyspark.sql import SparkSession -from pytest import raises -from triad import Schema - from fugue_spark.dataframe import SparkDataFrame from fugue_spark.execution_engine import SparkExecutionEngine +from fugue_test.builtin_suite import BuiltInTests +from fugue_test.execution_suite import ExecutionEngineTests class SparkExecutionEngineTests(ExecutionEngineTests.Tests): From 610f4139e1d8add7776e36b736dba17ab41e4dc5 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sun, 18 Dec 2022 21:16:08 +0000 Subject: [PATCH 04/30] fix numpy brreaking change --- fugue/execution/interfaceless.py | 0 tests/fugue_duckdb/test_utils.py | 2 
+- 2 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 fugue/execution/interfaceless.py diff --git a/fugue/execution/interfaceless.py b/fugue/execution/interfaceless.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/fugue_duckdb/test_utils.py b/tests/fugue_duckdb/test_utils.py index 1cc2deaa..fa709903 100644 --- a/tests/fugue_duckdb/test_utils.py +++ b/tests/fugue_duckdb/test_utils.py @@ -13,7 +13,7 @@ def test_encode_value_to_expr(): assert "1" == encode_value_to_expr(1) assert "1" == encode_value_to_expr(np.int32(1)) assert "FALSE" == encode_value_to_expr(False) - assert "TRUE" == encode_value_to_expr(np.bool(1)) + assert "TRUE" == encode_value_to_expr(np.bool_(1)) assert "E'abc'" == encode_value_to_expr("abc") assert "E'abc\\n;def'" == encode_value_to_expr("abc\n;def") assert "'\\xcaABC'::BLOB" == encode_value_to_expr(b"\xCAABC") From 3665ef653338c2dcd988306578f7415d4e511a12 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 19 Dec 2022 06:41:10 +0000 Subject: [PATCH 05/30] update backends for utils functions --- fugue/dataframe/arrow_dataframe.py | 2 +- fugue/dataframe/pandas_dataframe.py | 2 +- fugue/dataframe/utils.py | 4 +- fugue/execution/execution_engine.py | 7 +- fugue/execution/interfaceless.py | 184 +++++++++++++++++++++ fugue/execution/native_execution_engine.py | 3 +- fugue/interfaceless/__init__.py | 9 + fugue_dask/dataframe.py | 129 +++++++++++---- fugue_dask/execution_engine.py | 25 ++- fugue_dask/registry.py | 7 +- fugue_duckdb/_utils.py | 4 + fugue_duckdb/dataframe.py | 56 +++++-- fugue_duckdb/execution_engine.py | 3 +- fugue_duckdb/registry.py | 7 +- fugue_ibis/execution_engine.py | 19 +-- fugue_ray/dataframe.py | 48 +++--- fugue_ray/registry.py | 7 +- fugue_spark/dataframe.py | 101 ++++++++++- fugue_spark/execution_engine.py | 11 +- fugue_spark/registry.py | 7 +- fugue_test/dataframe_suite.py | 1 + tests/fugue_dask/test_dataframe.py | 37 ++++- tests/fugue_duckdb/test_dataframe.py | 15 +- tests/fugue_ray/test_dataframe.py | 18 ++ tests/fugue_spark/test_dataframe.py | 22 +++ 25 files changed, 598 insertions(+), 130 deletions(-) diff --git a/fugue/dataframe/arrow_dataframe.py b/fugue/dataframe/arrow_dataframe.py index 63d96c2f..9c66b0ba 100644 --- a/fugue/dataframe/arrow_dataframe.py +++ b/fugue/dataframe/arrow_dataframe.py @@ -301,4 +301,4 @@ def _build_empty_arrow(schema: Schema) -> pa.Table: # pragma: no cover def _assert_no_missing(df: pa.Table, columns: Iterable[Any]) -> None: missing = [x for x in columns if x not in df.schema.names] if len(missing) > 0: - raise FugueDataFrameOperationError("cannot drop nonexistent columns: {missing}") + raise FugueDataFrameOperationError("found nonexistent columns: {missing}") diff --git a/fugue/dataframe/pandas_dataframe.py b/fugue/dataframe/pandas_dataframe.py index 3dd3ba5b..77577981 100644 --- a/fugue/dataframe/pandas_dataframe.py +++ b/fugue/dataframe/pandas_dataframe.py @@ -271,4 +271,4 @@ def _adjust_df(res: pd.DataFrame, as_fugue: bool): def _assert_no_missing(df: pd.DataFrame, columns: Iterable[Any]) -> None: missing = [x for x in columns if x not in df.columns] if len(missing) > 0: - raise FugueDataFrameOperationError("cannot drop nonexistent columns: {missing}") + raise FugueDataFrameOperationError("found nonexistent columns: {missing}") diff --git a/fugue/dataframe/utils.py b/fugue/dataframe/utils.py index 8a3e3e2c..791c755c 100644 --- a/fugue/dataframe/utils.py +++ b/fugue/dataframe/utils.py @@ -294,7 +294,7 @@ def deserialize_df( def get_join_schemas( - df1: DataFrame, df2: DataFrame, 
how: str, on: Iterable[str] + df1: DataFrame, df2: DataFrame, how: str, on: Optional[Iterable[str]] ) -> Tuple[Schema, Schema]: """Get :class:`~triad:triad.collections.schema.Schema` object after joining ``df1`` and ``df2``. If ``on`` is not empty, it's mainly for @@ -332,7 +332,7 @@ def get_join_schemas( ], ValueError(f"{how} is not a valid join type"), ) - on = list(on) + on = list(on) if on is not None else [] aot(len(on) == len(set(on)), f"{on} has duplication") if how != "cross" and len(on) == 0: on = list(df1.schema.intersect(df2.schema.names).names) diff --git a/fugue/execution/execution_engine.py b/fugue/execution/execution_engine.py index 94394c08..3eb8d5a9 100644 --- a/fugue/execution/execution_engine.py +++ b/fugue/execution/execution_engine.py @@ -30,8 +30,6 @@ "_FUGUE_EXECUTION_ENGINE_CONTEXT", default=None ) -_DEFAULT_JOIN_KEYS: List[str] = [] - class ExecutionEngineFacet: """The base class for different factes of the execution engines. @@ -307,7 +305,6 @@ def persist( :param lazy: ``True``: first usage of the output will trigger persisting to happen; ``False`` (eager): persist is forced to happend immediately. Default to ``False`` - :param args: parameter to pass to the underlying persist implementation :param kwargs: parameter to pass to the underlying persist implementation :return: the persisted dataframe @@ -327,7 +324,7 @@ def join( df1: DataFrame, df2: DataFrame, how: str, - on: List[str] = _DEFAULT_JOIN_KEYS, + on: Optional[List[str]] = None, ) -> DataFrame: # pragma: no cover """Join two dataframes @@ -341,7 +338,7 @@ def join( .. note:: - Please read :func:`this ` + Please read :func:`~.fugue.dataframe.utils.get_join_schemas` """ raise NotImplementedError diff --git a/fugue/execution/interfaceless.py b/fugue/execution/interfaceless.py index e69de29b..c7bb03fe 100644 --- a/fugue/execution/interfaceless.py +++ b/fugue/execution/interfaceless.py @@ -0,0 +1,184 @@ +from ..collections.partition import PartitionSpec +from ..dataframe.dataframe import DataFrame +from .factory import make_execution_engine +from typing import Any, Optional, List + + +def repartition( + df: Any, + partition_spec: PartitionSpec, + engine: Any = None, + engine_conf: Any = None, + as_fugue: bool = False, +) -> Any: + """Partition the input dataframe using ``partition_spec``. + + :param df: an input dataframe that can be recognized by Fugue + :param partition_spec: how you want to partition the dataframe + :return: the repartitioned dataframe + """ + e = make_execution_engine(engine, engine_conf, infer_by=[df]) + edf = e.to_df(df) + return _adjust_df( + [df], e.repartition(edf, partition_spec=partition_spec), as_fugue=as_fugue + ) + + +def broadcast( + df: Any, + engine: Any = None, + engine_conf: Any = None, + as_fugue: bool = False, +) -> Any: + """Broadcast the dataframe to all workers for a distributed computing framework + + :param df: an input dataframe that can be recognized by Fugue + :return: the broadcasted dataframe + """ + e = make_execution_engine(engine, engine_conf, infer_by=[df]) + edf = e.to_df(df) + return _adjust_df([df], e.broadcast(edf), as_fugue=as_fugue) + + +def persist( + df: Any, + lazy: bool = False, + engine: Any = None, + engine_conf: Any = None, + as_fugue: bool = False, + **kwargs: Any, +) -> Any: + """Force materializing and caching the dataframe + + :param df: an input dataframe that can be recognized by Fugue + :param lazy: ``True``: first usage of the output will trigger persisting + to happen; ``False`` (eager): persist is forced to happend immediately. 
+ Default to ``False`` + :param kwargs: parameter to pass to the underlying persist implementation + :return: the persisted dataframe + """ + e = make_execution_engine(engine, engine_conf, infer_by=[df]) + edf = e.to_df(df) + return _adjust_df([df], e.persist(edf, lazy=lazy, **kwargs), as_fugue=as_fugue) + + +def join( + df1: Any, + df2: Any, + how: str, + on: Optional[List[str]] = None, + engine: Any = None, + engine_conf: Any = None, + as_fugue: bool = False, +) -> Any: + """Join two dataframes + + :param df1: the first dataframe + :param df2: the second dataframe + :param how: can accept ``semi``, ``left_semi``, ``anti``, ``left_anti``, + ``inner``, ``left_outer``, ``right_outer``, ``full_outer``, ``cross`` + :param on: it can always be inferred, but if you provide, it will be + validated against the inferred keys. + :return: the joined dataframe + + .. note:: + + Please read :func:`~.fugue.dataframe.utils.get_join_schemas` + """ + e = make_execution_engine(engine, engine_conf, infer_by=[df1, df2]) + edf1 = e.to_df(df1) + edf2 = e.to_df(df2) + return _adjust_df([df1, df2], e.join(edf1, edf2, how=how, on=on), as_fugue=as_fugue) + + +def union( + df1: Any, + df2: Any, + distinct: bool = True, + engine: Any = None, + engine_conf: Any = None, + as_fugue: bool = False, +) -> Any: + """Join two dataframes + + :param df1: the first dataframe + :param df2: the second dataframe + :param distinct: ``true`` for ``UNION`` (== ``UNION DISTINCT``), + ``false`` for ``UNION ALL`` + :return: the unioned dataframe + + .. note:: + + Currently, the schema of ``df1`` and ``df2`` must be identical, or + an exception will be thrown. + """ + e = make_execution_engine(engine, engine_conf, infer_by=[df1, df2]) + edf1 = e.to_df(df1) + edf2 = e.to_df(df2) + return _adjust_df( + [df1, df2], e.union(edf1, edf2, distinct=distinct), as_fugue=as_fugue + ) + + +def subtract( + df1: Any, + df2: Any, + distinct: bool = True, + engine: Any = None, + engine_conf: Any = None, + as_fugue: bool = False, +) -> Any: + """``df1 - df2`` + + :param df1: the first dataframe + :param df2: the second dataframe + :param distinct: ``true`` for ``EXCEPT`` (== ``EXCEPT DISTINCT``), + ``false`` for ``EXCEPT ALL`` + :return: the unioned dataframe + + .. note:: + + Currently, the schema of ``df1`` and ``df2`` must be identical, or + an exception will be thrown. + """ + e = make_execution_engine(engine, engine_conf, infer_by=[df1, df2]) + edf1 = e.to_df(df1) + edf2 = e.to_df(df2) + return _adjust_df( + [df1, df2], e.subtract(edf1, edf2, distinct=distinct), as_fugue=as_fugue + ) + + +def intersect( + df1: Any, + df2: Any, + distinct: bool = True, + engine: Any = None, + engine_conf: Any = None, + as_fugue: bool = False, +) -> Any: + """Intersect ``df1`` and ``df2`` + + :param df1: the first dataframe + :param df2: the second dataframe + :param distinct: ``true`` for ``INTERSECT`` (== ``INTERSECT DISTINCT``), + ``false`` for ``INTERSECT ALL`` + :return: the unioned dataframe + + .. note:: + + Currently, the schema of ``df1`` and ``df2`` must be identical, or + an exception will be thrown. 
+ """ + e = make_execution_engine(engine, engine_conf, infer_by=[df1, df2]) + edf1 = e.to_df(df1) + edf2 = e.to_df(df2) + return _adjust_df( + [df1, df2], e.intersect(edf1, edf2, distinct=distinct), as_fugue=as_fugue + ) + + +def _adjust_df(input_dfs: Any, output_df: DataFrame, as_fugue: bool) -> Any: + if as_fugue or any(isinstance(x, DataFrame) for x in input_dfs): + return output_df + return output_df.native # type: ignore diff --git a/fugue/execution/native_execution_engine.py b/fugue/execution/native_execution_engine.py index c0e82fd7..87caef93 100644 --- a/fugue/execution/native_execution_engine.py +++ b/fugue/execution/native_execution_engine.py @@ -34,7 +34,6 @@ ) from fugue.dataframe.utils import get_join_schemas, to_local_df from fugue.execution.execution_engine import ( - _DEFAULT_JOIN_KEYS, ExecutionEngine, MapEngine, SQLEngine, @@ -188,7 +187,7 @@ def join( df1: DataFrame, df2: DataFrame, how: str, - on: List[str] = _DEFAULT_JOIN_KEYS, + on: Optional[List[str]] = None, ) -> DataFrame: key_schema, output_schema = get_join_schemas(df1, df2, how=how, on=on) d = self.pl_utils.join( diff --git a/fugue/interfaceless/__init__.py b/fugue/interfaceless/__init__.py index df0d7674..ac554b3e 100644 --- a/fugue/interfaceless/__init__.py +++ b/fugue/interfaceless/__init__.py @@ -26,5 +26,14 @@ is_local, show, ) +from fugue.execution.interfaceless import ( + broadcast, + intersect, + join, + persist, + repartition, + subtract, + union, +) from .transformation import out_transform, transform diff --git a/fugue_dask/dataframe.py b/fugue_dask/dataframe.py index b86e8299..909269bf 100644 --- a/fugue_dask/dataframe.py +++ b/fugue_dask/dataframe.py @@ -1,8 +1,12 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple -import dask.dataframe as pd +import dask.dataframe as dd import pandas import pyarrow as pa +from triad.collections.schema import Schema +from triad.utils.assertion import assert_arg_not_none, assert_or_throw +from triad.utils.pyarrow import to_pandas_dtype + from fugue.dataframe import ( ArrowDataFrame, DataFrame, @@ -11,15 +15,18 @@ PandasDataFrame, ) from fugue.dataframe.dataframe import _input_schema -from fugue.dataframe.utils import ( +from fugue.exceptions import FugueDataFrameOperationError +from fugue.plugins import ( + count, + drop_columns, get_column_names, + head, + is_bounded, + is_empty, + is_local, rename, + select_columns, ) -from fugue.exceptions import FugueDataFrameOperationError -from triad.collections.schema import Schema -from triad.utils.assertion import assert_arg_not_none, assert_or_throw -from triad.utils.pyarrow import to_pandas_dtype - from fugue_dask._constants import ( FUGUE_DASK_CONF_DATAFRAME_DEFAULT_PARTITIONS, FUGUE_DASK_DEFAULT_CONF, @@ -27,18 +34,6 @@ from fugue_dask._utils import DASK_UTILS -@get_column_names.candidate(lambda df: isinstance(df, pd.DataFrame)) -def _get_dask_dataframe_columns(df: pd.DataFrame) -> List[Any]: - return list(df.columns) - - -@rename.candidate(lambda df, *args, **kwargs: isinstance(df, pd.DataFrame)) -def _rename_dask_dataframe(df: pd.DataFrame, names: Dict[str, Any]) -> pd.DataFrame: - if len(names) == 0: - return df - return df.rename(columns=names) - - class DaskDataFrame(DataFrame): """DataFrame that wraps Dask DataFrame. 
Please also read |DataFrameTutorial| to understand this Fugue concept @@ -72,22 +67,22 @@ def __init__( # noqa: C901 df = [] if isinstance(df, DaskDataFrame): super().__init__(df.schema) - self._native: pd.DataFrame = df._native + self._native: dd.DataFrame = df._native return - elif isinstance(df, (pd.DataFrame, pd.Series)): - if isinstance(df, pd.Series): + elif isinstance(df, (dd.DataFrame, dd.Series)): + if isinstance(df, dd.Series): df = df.to_frame() pdf = df schema = None if schema is None else _input_schema(schema) elif isinstance(df, (pandas.DataFrame, pandas.Series)): if isinstance(df, pandas.Series): df = df.to_frame() - pdf = pd.from_pandas(df, npartitions=num_partitions, sort=False) + pdf = dd.from_pandas(df, npartitions=num_partitions, sort=False) schema = None if schema is None else _input_schema(schema) elif isinstance(df, Iterable): schema = _input_schema(schema).assert_not_empty() t = PandasDataFrame(df, schema) - pdf = pd.from_pandas(t.native, npartitions=num_partitions, sort=False) + pdf = dd.from_pandas(t.native, npartitions=num_partitions, sort=False) type_safe = False else: raise ValueError(f"{df} is incompatible with DaskDataFrame") @@ -96,7 +91,7 @@ def __init__( # noqa: C901 self._native = pdf @property - def native(self) -> pd.DataFrame: + def native(self) -> dd.DataFrame: """The wrapped Dask DataFrame :rtype: :class:`dask:dask.dataframe.DataFrame` @@ -140,7 +135,7 @@ def persist(self, **kwargs: Any) -> "DaskDataFrame": return self def count(self) -> int: - return self.as_pandas().shape[0] + return self.native.shape[0].compute() def as_pandas(self) -> pandas.DataFrame: return self.native.compute().reset_index(drop=True) @@ -220,8 +215,8 @@ def head( return PandasDataFrame(ddf.head(n, compute=True, npartitions=-1), schema=schema) def _apply_schema( - self, pdf: pd.DataFrame, schema: Optional[Schema], type_safe: bool = True - ) -> Tuple[pd.DataFrame, Schema]: + self, pdf: dd.DataFrame, schema: Optional[Schema], type_safe: bool = True + ) -> Tuple[dd.DataFrame, Schema]: if not type_safe: assert_arg_not_none(pdf, "pdf") assert_arg_not_none(schema, "schema") @@ -242,3 +237,81 @@ def _apply_schema( ) pdf.columns = schema.names return DASK_UTILS.enforce_type(pdf, schema.pa_schema, null_safe=True), schema + + +@count.candidate(lambda df: isinstance(df, dd.DataFrame)) +def _dd_count(df: dd.DataFrame) -> int: + return df.shape[0].compute() + + +@is_bounded.candidate(lambda df: isinstance(df, dd.DataFrame)) +def _dd_is_bounded(df: dd.DataFrame) -> bool: + return True + + +@is_empty.candidate(lambda df: isinstance(df, dd.DataFrame)) +def _dd_is_empty(df: dd.DataFrame) -> bool: + return DASK_UTILS.empty(df) + + +@is_local.candidate(lambda df: isinstance(df, dd.DataFrame)) +def _dd_is_local(df: dd.DataFrame) -> bool: + return False + + +@get_column_names.candidate(lambda df: isinstance(df, dd.DataFrame)) +def _get_dask_dataframe_columns(df: dd.DataFrame) -> List[Any]: + return list(df.columns) + + +@rename.candidate(lambda df, *args, **kwargs: isinstance(df, dd.DataFrame)) +def _rename_dask_dataframe(df: dd.DataFrame, columns: Dict[str, Any]) -> dd.DataFrame: + if len(columns) == 0: + return df + _assert_no_missing(df, columns.keys()) + return df.rename(columns=columns) + + +@drop_columns.candidate(lambda df, *args, **kwargs: isinstance(df, dd.DataFrame)) +def _drop_dd_columns( + df: dd.DataFrame, columns: List[str], as_fugue: bool = False +) -> Any: + cols = [x for x in df.columns if x not in columns] + if len(cols) == 0: + raise FugueDataFrameOperationError("cannot drop all 
columns") + if len(cols) + len(columns) != len(df.columns): + _assert_no_missing(df, columns) + return _adjust_df(df[cols], as_fugue=as_fugue) + + +@select_columns.candidate(lambda df, *args, **kwargs: isinstance(df, dd.DataFrame)) +def _select_dd_columns( + df: dd.DataFrame, columns: List[Any], as_fugue: bool = False +) -> Any: + if len(columns) == 0: + raise FugueDataFrameOperationError("must select at least one column") + _assert_no_missing(df, columns) + return _adjust_df(df[columns], as_fugue=as_fugue) + + +@head.candidate(lambda df, *args, **kwargs: isinstance(df, dd.DataFrame)) +def _dd_head( + df: dd.DataFrame, + n: int, + columns: Optional[List[str]] = None, + as_fugue: bool = False, +) -> pandas.DataFrame: + if columns is not None: + df = df[columns] + res = df.head(n, compute=True, npartitions=-1) + return PandasDataFrame(res) if as_fugue else res + + +def _assert_no_missing(df: dd.DataFrame, columns: Iterable[Any]) -> None: + missing = set(columns) - set(df.columns) + if len(missing) > 0: + raise FugueDataFrameOperationError("found nonexistent columns: {missing}") + + +def _adjust_df(res: dd.DataFrame, as_fugue: bool): + return res if not as_fugue else DaskDataFrame(res) diff --git a/fugue_dask/execution_engine.py b/fugue_dask/execution_engine.py index 966479e3..b3e54c31 100644 --- a/fugue_dask/execution_engine.py +++ b/fugue_dask/execution_engine.py @@ -4,6 +4,14 @@ import dask.dataframe as dd from distributed import Client +from qpd_dask import run_sql_on_dask +from triad.collections import Schema +from triad.collections.dict import IndexedOrderedDict, ParamDict +from triad.collections.fs import FileSystem +from triad.utils.assertion import assert_or_throw +from triad.utils.hash import to_uuid +from triad.utils.threading import RunOnce + from fugue.collections.partition import ( EMPTY_PARTITION_SPEC, PartitionCursor, @@ -13,21 +21,8 @@ from fugue.constants import KEYWORD_CORECOUNT, KEYWORD_ROWCOUNT from fugue.dataframe import DataFrame, DataFrames, LocalDataFrame, PandasDataFrame from fugue.dataframe.utils import get_join_schemas -from fugue.execution.execution_engine import ( - _DEFAULT_JOIN_KEYS, - ExecutionEngine, - SQLEngine, - MapEngine, -) +from fugue.execution.execution_engine import ExecutionEngine, MapEngine, SQLEngine from fugue.execution.native_execution_engine import NativeExecutionEngine -from qpd_dask import run_sql_on_dask -from triad.collections import Schema -from triad.collections.dict import IndexedOrderedDict, ParamDict -from triad.collections.fs import FileSystem -from triad.utils.assertion import assert_or_throw -from triad.utils.hash import to_uuid -from triad.utils.threading import RunOnce - from fugue_dask._constants import ( CPU_COUNT, FUGUE_DASK_CONF_DATAFRAME_DEFAULT_PARTITIONS, @@ -249,7 +244,7 @@ def join( df1: DataFrame, df2: DataFrame, how: str, - on: List[str] = _DEFAULT_JOIN_KEYS, + on: Optional[List[str]] = None, ) -> DataFrame: key_schema, output_schema = get_join_schemas(df1, df2, how=how, on=on) d = self.pl_utils.join( diff --git a/fugue_dask/registry.py b/fugue_dask/registry.py index a7b0f530..0aaeddc3 100644 --- a/fugue_dask/registry.py +++ b/fugue_dask/registry.py @@ -12,7 +12,7 @@ SimpleAnnotationConverter, register_annotation_converter, ) -from fugue.plugins import infer_execution_engine +from fugue.plugins import infer_execution_engine, as_fugue_dataset from fugue.workflow import register_raw_df_type from fugue_dask._utils import DASK_UTILS from fugue_dask.dataframe import DaskDataFrame @@ -26,6 +26,11 @@ def 
_infer_dask_client(objs: Any) -> Any: return DASK_UTILS.get_or_create_client() +@as_fugue_dataset.candidate(lambda df: isinstance(df, dd.DataFrame)) +def _dask_as_fugue_df(df: dd.DataFrame) -> DaskDataFrame: + return DaskDataFrame(df) + + def _register_raw_dataframes() -> None: register_raw_df_type(dd.DataFrame) diff --git a/fugue_duckdb/_utils.py b/fugue_duckdb/_utils.py index 9502ed61..f42beef0 100644 --- a/fugue_duckdb/_utils.py +++ b/fugue_duckdb/_utils.py @@ -32,6 +32,10 @@ _PA_TYPES_TO_DUCK: Dict[pa.DataType, str] = {v: k for k, v in _DUCK_TYPES_TO_PA.items()} +def encode_column_name(name: str) -> str: + return '"' + name.replace('"', '""') + '"' + + def encode_value_to_expr(value: Any) -> str: # noqa: C901 if isinstance(value, list): return "[" + ", ".join(encode_value_to_expr(x) for x in value) + "]" diff --git a/fugue_duckdb/dataframe.py b/fugue_duckdb/dataframe.py index 0efe7390..635961ea 100644 --- a/fugue_duckdb/dataframe.py +++ b/fugue_duckdb/dataframe.py @@ -3,17 +3,19 @@ import pandas as pd import pyarrow as pa from duckdb import DuckDBPyRelation +from triad import Schema + from fugue import ( + ArrayDataFrame, ArrowDataFrame, DataFrame, LocalBoundedDataFrame, LocalDataFrame, - ArrayDataFrame, ) -from fugue.exceptions import FugueDatasetEmptyError, FugueDataFrameOperationError -from triad import Schema +from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError +from fugue.plugins import get_column_names -from fugue_duckdb._utils import to_duck_type, to_pa_type +from ._utils import encode_column_name, to_duck_type, to_pa_type class DuckDataFrame(LocalBoundedDataFrame): @@ -24,10 +26,15 @@ class DuckDataFrame(LocalBoundedDataFrame): def __init__(self, rel: DuckDBPyRelation): self._rel = rel - schema = Schema( - [pa.field(x, to_pa_type(y)) for x, y in zip(rel.columns, rel.types)] + super().__init__(schema=self._get_schema) + + def _get_schema(self) -> Schema: + return Schema( + [ + pa.field(x, to_pa_type(y)) + for x, y in zip(self._rel.columns, self._rel.types) + ] ) - super().__init__(schema=schema) @property def native(self) -> DuckDBPyRelation: @@ -48,21 +55,23 @@ def count(self) -> int: return self._rel.aggregate("count(1) AS ct").fetchone()[0] def _drop_cols(self, cols: List[str]) -> DataFrame: - schema = self.schema.exclude(cols) - rel = self._rel.project(",".join(n for n in schema.names)) + cols = [col for col in self._rel.columns if col not in cols] + rel = self._rel.project(",".join(encode_column_name(n) for n in cols)) return DuckDataFrame(rel) def _select_cols(self, keys: List[Any]) -> DataFrame: - schema = self.schema.extract(keys) - rel = self._rel.project(",".join(n for n in schema.names)) + rel = self._rel.project(",".join(encode_column_name(n) for n in keys)) return DuckDataFrame(rel) def rename(self, columns: Dict[str, str]) -> DataFrame: - try: - schema = self.schema.rename(columns) - except Exception as e: - raise FugueDataFrameOperationError from e - expr = ", ".join(f"{a} AS {b}" for a, b in zip(self.schema.names, schema.names)) + _assert_no_missing(self._rel, columns.keys()) + expr = ", ".join( + f"{a} AS {b}" + for a, b in [ + (encode_column_name(name), encode_column_name(columns.get(name, name))) + for name in self._rel.columns + ] + ) return DuckDataFrame(self._rel.project(expr)) def alter_columns(self, columns: Any) -> DataFrame: @@ -75,7 +84,9 @@ def alter_columns(self, columns: Any) -> DataFrame: fields.append(f1.name) else: tp = to_duck_type(f2.type) - fields.append(f"CAST({f1.name} AS {tp}) AS {f1.name}") + fields.append( + 
f"CAST({encode_column_name(f1.name)} AS {tp}) AS {f1.name}" + ) return DuckDataFrame(self._rel.project(", ".join(fields))) def as_arrow(self, type_safe: bool = False) -> pa.Table: @@ -125,3 +136,14 @@ def to_list(row: Any) -> List[Any]: return res return [to_list(x) for x in rel.fetchall()] + + +@get_column_names.candidate(lambda df: isinstance(df, DuckDBPyRelation)) +def _get_duckdb_columns(df: DuckDBPyRelation) -> List[Any]: + return list(df.columns) + + +def _assert_no_missing(df: DuckDBPyRelation, columns: Iterable[Any]) -> None: + missing = set(columns) - set(df.columns) + if len(missing) > 0: + raise FugueDataFrameOperationError("found nonexistent columns: {missing}") diff --git a/fugue_duckdb/execution_engine.py b/fugue_duckdb/execution_engine.py index fe80a22b..e192851c 100644 --- a/fugue_duckdb/execution_engine.py +++ b/fugue_duckdb/execution_engine.py @@ -28,7 +28,6 @@ PandasDataFrame, ) from fugue.dataframe.utils import get_join_schemas -from fugue.execution.execution_engine import _DEFAULT_JOIN_KEYS from fugue_duckdb._io import DuckDBIO from fugue_duckdb._utils import encode_value_to_expr, get_temp_df_name from fugue_duckdb.dataframe import DuckDataFrame @@ -167,7 +166,7 @@ def join( df1: DataFrame, df2: DataFrame, how: str, - on: List[str] = _DEFAULT_JOIN_KEYS, + on: Optional[List[str]] = None, ) -> DataFrame: key_schema, output_schema = get_join_schemas(df1, df2, how=how, on=on) t1, t2, t3 = ( diff --git a/fugue_duckdb/registry.py b/fugue_duckdb/registry.py index 4fbb9274..a63ed7d7 100644 --- a/fugue_duckdb/registry.py +++ b/fugue_duckdb/registry.py @@ -17,7 +17,7 @@ SimpleAnnotationConverter, register_annotation_converter, ) -from fugue.plugins import infer_execution_engine +from fugue.plugins import infer_execution_engine, as_fugue_dataset from fugue.workflow import register_raw_df_type from fugue_duckdb.dataframe import DuckDataFrame from fugue_duckdb.execution_engine import DuckDBEngine, DuckExecutionEngine @@ -30,6 +30,11 @@ def _infer_duckdb_client(objs: Any) -> Any: return "duckdb" +@as_fugue_dataset.candidate(lambda df: isinstance(df, DuckDBPyRelation)) +def _duckdb_as_fugue_df(df: DuckDBPyRelation) -> DuckDataFrame: + return DuckDataFrame(df) + + def _register_raw_dataframes() -> None: register_raw_df_type(DuckDBPyRelation) diff --git a/fugue_ibis/execution_engine.py b/fugue_ibis/execution_engine.py index 54636680..1240a586 100644 --- a/fugue_ibis/execution_engine.py +++ b/fugue_ibis/execution_engine.py @@ -1,6 +1,10 @@ -from typing import Any, List, Optional, Dict +import itertools +from typing import Any, Dict, List, Optional import ibis +from ibis import BaseBackend +from triad.utils.assertion import assert_or_throw + from fugue.collections.partition import ( EMPTY_PARTITION_SPEC, PartitionSpec, @@ -8,17 +12,10 @@ ) from fugue.dataframe import DataFrame, DataFrames from fugue.dataframe.utils import get_join_schemas -from fugue.execution.execution_engine import ( - _DEFAULT_JOIN_KEYS, - ExecutionEngine, - SQLEngine, -) -from ibis import BaseBackend -from triad.utils.assertion import assert_or_throw +from fugue.execution.execution_engine import ExecutionEngine, SQLEngine -from .dataframe import IbisDataFrame from ._compat import IbisTable -import itertools +from .dataframe import IbisDataFrame _JOIN_RIGHT_SUFFIX = "_ibis_y__" _GEN_TABLE_NAMES = (f"_fugue_temp_table_{i:d}" for i in itertools.count()) @@ -82,7 +79,7 @@ def join( df1: DataFrame, df2: DataFrame, how: str, - on: List[str] = _DEFAULT_JOIN_KEYS, + on: Optional[List[str]] = None, ) -> DataFrame: _df1 = 
self._to_ibis_dataframe(df1) _df2 = self._to_ibis_dataframe(df2) diff --git a/fugue_ray/dataframe.py b/fugue_ray/dataframe.py index 8780845a..fd791a95 100644 --- a/fugue_ray/dataframe.py +++ b/fugue_ray/dataframe.py @@ -4,6 +4,8 @@ import pyarrow as pa import ray import ray.data as rd +from triad.collections.schema import Schema + from fugue.dataframe import ( ArrowDataFrame, DataFrame, @@ -11,34 +13,12 @@ LocalDataFrame, ) from fugue.dataframe.dataframe import _input_schema -from fugue.dataframe.utils import ( - get_column_names, - rename, -) from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError -from triad.collections.schema import Schema +from fugue.plugins import get_column_names, rename from ._utils.dataframe import _build_empty_arrow, build_empty, get_dataset_format -@get_column_names.candidate(lambda df: isinstance(df, rd.Dataset)) -def _get_ray_dataframe_columns(df: rd.Dataset) -> List[Any]: - fmt = get_dataset_format(df) - if fmt == "pandas": - return list(df.schema(True).names) - elif fmt == "arrow": - return [f.name for f in df.schema(True)] - raise NotImplementedError(f"{fmt} is not supported") # pragma: no cover - - -@rename.candidate(lambda df, *args, **kwargs: isinstance(df, rd.Dataset)) -def _rename_ray_dataframe(df: rd.Dataset, names: Dict[str, Any]) -> rd.Dataset: - if len(names) == 0: - return df - new_cols = [names.get(name, name) for name in _get_ray_dataframe_columns(df)] - return df.map_batches(lambda b: b.rename_columns(new_cols), batch_format="pyarrow") - - class RayDataFrame(DataFrame): """DataFrame that wraps Ray DataSet. Please also read |DataFrameTutorial| to understand this Fugue concept @@ -254,3 +234,25 @@ def _alter(table: pa.Table) -> pa.Table: # pragma: no cover def _remote_args(self) -> Dict[str, Any]: return {"num_cpus": 1} + + +@get_column_names.candidate(lambda df: isinstance(df, rd.Dataset)) +def _get_ray_dataframe_columns(df: rd.Dataset) -> List[Any]: + fmt = get_dataset_format(df) + if fmt == "pandas": + return list(df.schema(True).names) + elif fmt == "arrow": + return [f.name for f in df.schema(True)] + raise NotImplementedError(f"{fmt} is not supported") # pragma: no cover + + +@rename.candidate(lambda df, *args, **kwargs: isinstance(df, rd.Dataset)) +def _rename_ray_dataframe(df: rd.Dataset, columns: Dict[str, Any]) -> rd.Dataset: + if len(columns) == 0: + return df + cols = _get_ray_dataframe_columns(df) + missing = set(columns.keys()) - set(cols) + if len(missing) > 0: + raise FugueDataFrameOperationError("found nonexistent columns: {missing}") + new_cols = [columns.get(name, name) for name in cols] + return df.map_batches(lambda b: b.rename_columns(new_cols), batch_format="pyarrow") diff --git a/fugue_ray/registry.py b/fugue_ray/registry.py index 9f85868e..8ec17013 100644 --- a/fugue_ray/registry.py +++ b/fugue_ray/registry.py @@ -11,7 +11,7 @@ SimpleAnnotationConverter, register_annotation_converter, ) -from fugue.plugins import infer_execution_engine +from fugue.plugins import infer_execution_engine, as_fugue_dataset from fugue.workflow import register_raw_df_type from .dataframe import RayDataFrame @@ -25,6 +25,11 @@ def _infer_ray_client(objs: Any) -> Any: return "ray" +@as_fugue_dataset.candidate(lambda df: isinstance(df, rd.Dataset)) +def _ray_as_fugue_df(df: rd.Dataset) -> RayDataFrame: + return RayDataFrame(df) + + def _register_raw_dataframes() -> None: register_raw_df_type(rd.Dataset) diff --git a/fugue_spark/dataframe.py b/fugue_spark/dataframe.py index 6c42a8e5..10c9168a 100644 --- 
a/fugue_spark/dataframe.py +++ b/fugue_spark/dataframe.py @@ -3,6 +3,11 @@ import pandas as pd import pyarrow as pa import pyspark.sql as ps +from pyspark.sql.functions import col +from triad import SerializableRLock +from triad.collections.schema import SchemaError +from triad.utils.assertion import assert_or_throw + from fugue.dataframe import ( ArrayDataFrame, DataFrame, @@ -11,16 +16,18 @@ LocalDataFrame, PandasDataFrame, ) -from fugue.dataframe.utils import ( +from fugue.exceptions import FugueDataFrameOperationError +from fugue.plugins import ( + count, + drop_columns, get_column_names, + head, + is_bounded, + is_empty, + is_local, rename, + select_columns, ) -from fugue.exceptions import FugueDataFrameOperationError -from pyspark.sql.functions import col -from triad import SerializableRLock -from triad.collections.schema import SchemaError -from triad.utils.assertion import assert_or_throw - from fugue_spark._utils.convert import to_cast_expression, to_schema, to_type_safe_input @@ -178,3 +185,83 @@ def _select_columns(self, columns: Optional[List[str]]) -> "SparkDataFrame": if columns is None: return self return SparkDataFrame(self.native.select(*columns)) + + +@count.candidate(lambda df: isinstance(df, ps.DataFrame)) +def _spark_df_count(df: ps.DataFrame) -> int: + return df.count() + + +@is_bounded.candidate(lambda df: isinstance(df, ps.DataFrame)) +def _spark_df_is_bounded(df: ps.DataFrame) -> bool: + return True + + +@is_empty.candidate(lambda df: isinstance(df, ps.DataFrame)) +def _spark_df_is_empty(df: ps.DataFrame) -> bool: + return df.first() is None + + +@is_local.candidate(lambda df: isinstance(df, ps.DataFrame)) +def _spark_df_is_local(df: ps.DataFrame) -> bool: + return False + + +@get_column_names.candidate(lambda df: isinstance(df, ps.DataFrame)) +def _get_spark_df_columns(df: ps.DataFrame) -> List[Any]: + return df.columns + + +@rename.candidate(lambda df, *args, **kwargs: isinstance(df, ps.DataFrame)) +def _rename_spark_df( + df: ps.DataFrame, columns: Dict[str, Any], as_fugue: bool = False +) -> ps.DataFrame: + if len(columns) == 0: + return df + _assert_no_missing(df, columns.keys()) + return _adjust_df(_rename_spark_dataframe(df, columns), as_fugue=as_fugue) + + +@drop_columns.candidate(lambda df, *args, **kwargs: isinstance(df, ps.DataFrame)) +def _drop_spark_df_columns( + df: ps.DataFrame, columns: List[str], as_fugue: bool = False +) -> Any: + cols = [x for x in df.columns if x not in columns] + if len(cols) == 0: + raise FugueDataFrameOperationError("cannot drop all columns") + if len(cols) + len(columns) != len(df.columns): + _assert_no_missing(df, columns) + return _adjust_df(df[cols], as_fugue=as_fugue) + + +@select_columns.candidate(lambda df, *args, **kwargs: isinstance(df, ps.DataFrame)) +def _select_spark_df_columns( + df: ps.DataFrame, columns: List[Any], as_fugue: bool = False +) -> Any: + if len(columns) == 0: + raise FugueDataFrameOperationError("must select at least one column") + _assert_no_missing(df, columns) + return _adjust_df(df[columns], as_fugue=as_fugue) + + +@head.candidate(lambda df, *args, **kwargs: isinstance(df, ps.DataFrame)) +def _spark_df_head( + df: ps.DataFrame, + n: int, + columns: Optional[List[str]] = None, + as_fugue: bool = False, +) -> pd.DataFrame: + if columns is not None: + df = df[columns] + res = df.limit(n) + return SparkDataFrame(res).as_local() if as_fugue else res.toPandas() + + +def _assert_no_missing(df: ps.DataFrame, columns: Iterable[Any]) -> None: + missing = set(columns) - set(df.columns) + if len(missing) > 
0: + raise FugueDataFrameOperationError("found nonexistent columns: {missing}") + + +def _adjust_df(res: ps.DataFrame, as_fugue: bool): + return res if not as_fugue else SparkDataFrame(res) diff --git a/fugue_spark/execution_engine.py b/fugue_spark/execution_engine.py index ae63f9e3..80a4ec01 100644 --- a/fugue_spark/execution_engine.py +++ b/fugue_spark/execution_engine.py @@ -4,8 +4,8 @@ import pandas as pd import pyarrow as pa -import pyspark.sql as ps import pyspark +import pyspark.sql as ps from pyspark import StorageLevel from pyspark.rdd import RDD from pyspark.sql import SparkSession @@ -37,12 +37,7 @@ ) from fugue.dataframe.utils import get_join_schemas from fugue.exceptions import FugueDataFrameInitError -from fugue.execution.execution_engine import ( - _DEFAULT_JOIN_KEYS, - ExecutionEngine, - MapEngine, - SQLEngine, -) +from fugue.execution.execution_engine import ExecutionEngine, MapEngine, SQLEngine from fugue_spark._constants import ( FUGUE_SPARK_CONF_USE_PANDAS_UDF, FUGUE_SPARK_DEFAULT_CONF, @@ -444,7 +439,7 @@ def join( df1: DataFrame, df2: DataFrame, how: str, - on: List[str] = _DEFAULT_JOIN_KEYS, + on: Optional[List[str]] = None, ) -> DataFrame: key_schema, output_schema = get_join_schemas(df1, df2, how=how, on=on) how = how.lower().replace("_", "").replace(" ", "") diff --git a/fugue_spark/registry.py b/fugue_spark/registry.py index bf3b3fa7..8700f802 100644 --- a/fugue_spark/registry.py +++ b/fugue_spark/registry.py @@ -14,7 +14,7 @@ SimpleAnnotationConverter, register_annotation_converter, ) -from fugue.plugins import infer_execution_engine, parse_creator +from fugue.plugins import as_fugue_dataset, infer_execution_engine, parse_creator from fugue.workflow import register_raw_df_type from fugue_spark.dataframe import SparkDataFrame from fugue_spark.execution_engine import SparkExecutionEngine @@ -35,6 +35,11 @@ def _infer_spark_client(obj: Any) -> Any: return SparkSession.builder.getOrCreate() +@as_fugue_dataset.candidate(lambda df: isinstance(df, ps.DataFrame)) +def _spark_as_fugue_df(df: ps.DataFrame) -> SparkDataFrame: + return SparkDataFrame(df) + + @parse_creator.candidate(lambda obj: _is_sparksql(obj)) def _parse_sparksql_creator(sql): def _run_sql(spark: SparkSession) -> ps.DataFrame: diff --git a/fugue_test/dataframe_suite.py b/fugue_test/dataframe_suite.py index cbe7a966..127fe990 100644 --- a/fugue_test/dataframe_suite.py +++ b/fugue_test/dataframe_suite.py @@ -75,6 +75,7 @@ def test_drop_columns(self): def test_select(self): df = fi.select_columns(self.df([], "a:str,b:int"), ["b"]) assert fi.get_schema(df) == "b:int" + assert fi.get_column_names(df) == ["b"] raises( FugueDataFrameOperationError, lambda: fi.select_columns(df, []) ) # select empty diff --git a/tests/fugue_dask/test_dataframe.py b/tests/fugue_dask/test_dataframe.py index 8e739134..4c2a2ffd 100644 --- a/tests/fugue_dask/test_dataframe.py +++ b/tests/fugue_dask/test_dataframe.py @@ -6,6 +6,7 @@ import dask.dataframe as pd import numpy as np import pandas +import fugue.interfaceless as fi from fugue.dataframe.array_dataframe import ArrayDataFrame from fugue.dataframe.pandas_dataframe import PandasDataFrame from fugue.dataframe.utils import _df_eq as df_eq @@ -20,12 +21,42 @@ class DaskDataFrameTests(DataFrameTests.Tests): - def df( - self, data: Any = None, schema: Any = None - ) -> DaskDataFrame: + def df(self, data: Any = None, schema: Any = None) -> DaskDataFrame: return DaskDataFrame(data, schema) +class NativeDaskDataFrameTests(DataFrameTests.Tests): + def df(self, data: Any = None, schema: 
Any = None): + return DaskDataFrame(data, schema).native + + def test_not_local(self): + assert not fi.is_local(self.df([], "a:int,b:str")) + + def test_get_altered_schema(self): + pass + + def test_alter_columns(self): + pass + + def test_as_arrow(self): + pass + + def test_binary_type(self): + pass + + def test_deep_nested_types(self): + pass + + def test_list_type(self): + pass + + def test_map_type(self): + pass + + def test_struct_type(self): + pass + + def test_init(): df = DaskDataFrame(schema="a:str,b:int") assert df.is_bounded diff --git a/tests/fugue_duckdb/test_dataframe.py b/tests/fugue_duckdb/test_dataframe.py index fa0dab7e..a43f6922 100644 --- a/tests/fugue_duckdb/test_dataframe.py +++ b/tests/fugue_duckdb/test_dataframe.py @@ -64,6 +64,19 @@ def test_init(self): assert df.is_bounded assert df.is_local - def test_duck_as_locak(self): + def test_duck_as_local(self): df = self.df([[2.1, 1]], "a:double,b:int") assert isinstance(df.as_local(), ArrowDataFrame) + + +class NativeDuckDataFrameTests(DataFrameTests.Tests): + @classmethod + def setUpClass(cls): + cls._con = duckdb.connect() + + def df(self, data: Any = None, schema: Any = None) -> DuckDataFrame: + df = ArrowDataFrame(data, schema) + return DuckDataFrame(duckdb.arrow(df.native, self._con)).native + + def test_get_altered_schema(self): + pass diff --git a/tests/fugue_ray/test_dataframe.py b/tests/fugue_ray/test_dataframe.py index 2b09b62a..9dc973bc 100644 --- a/tests/fugue_ray/test_dataframe.py +++ b/tests/fugue_ray/test_dataframe.py @@ -120,3 +120,21 @@ def test_rename(self): df = rename(pdf, {"0": "_0", "1": "_1", "2": "_2"}) assert isinstance(df, rd.Dataset) assert get_column_names(df) == ["_0", "_1", "_2"] + + +class NativeRayDataFrameTests(DataFrameTests.Tests): + @classmethod + def setUpClass(cls): + ray.init(num_cpus=2) + + @classmethod + def tearDownClass(cls): + ray.shutdown() + + def df(self, data: Any = None, schema: Any = None): + res = RayDataFrame(data, schema) + # native ray dataset can't handle the schema when empty + return res if res.empty else res.native + + def test_get_altered_schema(self): + pass diff --git a/tests/fugue_spark/test_dataframe.py b/tests/fugue_spark/test_dataframe.py index 0bbef2e8..da8c99a3 100644 --- a/tests/fugue_spark/test_dataframe.py +++ b/tests/fugue_spark/test_dataframe.py @@ -38,6 +38,28 @@ def test_map_type(self): return super().test_map_type() +class NativeSparkDataFrameTests(DataFrameTests.Tests): + @pytest.fixture(autouse=True) + def init_session(self, spark_session): + self.spark_session = spark_session + + def df(self, data: Any = None, schema: Any = None): + session = SparkSession.builder.getOrCreate() + engine = SparkExecutionEngine(session) + return engine.to_df(data, schema=schema).native + + def test_alter_columns_invalid(self): + # TODO: Spark will silently cast invalid data to nulls without exceptions + pass + + def test_map_type(self): + if pyspark.__version__ >= "3": + return super().test_map_type() + + def test_get_altered_schema(self): + pass + + def test_init(spark_session): sdf = spark_session.createDataFrame([["a", 1]]) df = SparkDataFrame(sdf, "a:str,b:double") From 4c9055a54dd96097f006b1266f3605d77dda5ea7 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 19 Dec 2022 08:03:34 +0000 Subject: [PATCH 06/30] fix --- fugue_spark/dataframe.py | 28 ++++++++++------------------ tests/fugue_spark/test_dataframe.py | 13 +++++++------ 2 files changed, 17 insertions(+), 24 deletions(-) diff --git a/fugue_spark/dataframe.py b/fugue_spark/dataframe.py index 
10c9168a..cdeee587 100644 --- a/fugue_spark/dataframe.py +++ b/fugue_spark/dataframe.py @@ -31,24 +31,6 @@ from fugue_spark._utils.convert import to_cast_expression, to_schema, to_type_safe_input -@get_column_names.candidate(lambda df: isinstance(df, ps.DataFrame)) -def _get_spark_dataframe_columns(df: ps.DataFrame) -> List[Any]: - return [f.name for f in df.schema] - - -@rename.candidate(lambda df, *args, **kwargs: isinstance(df, ps.DataFrame)) -def _rename_spark_dataframe(df: ps.DataFrame, names: Dict[str, Any]) -> ps.DataFrame: - if len(names) == 0: - return df - cols: List[ps.Column] = [] - for f in df.schema: - c = col(f.name) - if f.name in names: - c = c.alias(names[f.name]) - cols.append(c) - return df.select(cols) - - class SparkDataFrame(DataFrame): """DataFrame that wraps Spark DataFrame. Please also read |DataFrameTutorial| to understand this Fugue concept @@ -257,6 +239,16 @@ def _spark_df_head( return SparkDataFrame(res).as_local() if as_fugue else res.toPandas() +def _rename_spark_dataframe(df: ps.DataFrame, names: Dict[str, Any]) -> ps.DataFrame: + cols: List[ps.Column] = [] + for f in df.schema: + c = col(f.name) + if f.name in names: + c = c.alias(names[f.name]) + cols.append(c) + return df.select(cols) + + def _assert_no_missing(df: ps.DataFrame, columns: Iterable[Any]) -> None: missing = set(columns) - set(df.columns) if len(missing) > 0: diff --git a/tests/fugue_spark/test_dataframe.py b/tests/fugue_spark/test_dataframe.py index da8c99a3..29837927 100644 --- a/tests/fugue_spark/test_dataframe.py +++ b/tests/fugue_spark/test_dataframe.py @@ -5,18 +5,16 @@ import pyspark import pyspark.sql as ps import pytest -from fugue.dataframe.pandas_dataframe import PandasDataFrame -from fugue.dataframe.utils import ( - get_column_names, - rename, -) -from fugue_test.dataframe_suite import DataFrameTests from pyspark.sql import SparkSession from triad.collections.schema import Schema +import fugue.interfaceless as fi +from fugue.dataframe.pandas_dataframe import PandasDataFrame +from fugue.plugins import get_column_names, rename from fugue_spark import SparkExecutionEngine from fugue_spark._utils.convert import to_schema, to_spark_schema from fugue_spark.dataframe import SparkDataFrame +from fugue_test.dataframe_suite import DataFrameTests class SparkDataFrameTests(DataFrameTests.Tests): @@ -48,6 +46,9 @@ def df(self, data: Any = None, schema: Any = None): engine = SparkExecutionEngine(session) return engine.to_df(data, schema=schema).native + def test_not_local(self): + assert not fi.is_local(self.df([], "a:int,b:str")) + def test_alter_columns_invalid(self): # TODO: Spark will silently cast invalid data to nulls without exceptions pass From 2889830dadadc7690d29ca923945a9b60b34cf01 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 19 Dec 2022 08:09:22 +0000 Subject: [PATCH 07/30] update qpd --- setup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 80849629..4065bd0e 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ def get_version() -> str: install_requires=[ "triad>=0.7.0", "adagio>=0.2.4", - "qpd>=0.3.1", + "qpd>=0.3.4", "fugue-sql-antlr>=0.1.1", "sqlalchemy", "pyarrow>=0.15.1", @@ -43,7 +43,7 @@ def get_version() -> str: extras_require={ "cpp_sql_parser": ["fugue-sql-antlr[cpp]>=0.1.1"], "spark": ["pyspark"], - "dask": ["dask[distributed,dataframe]", "qpd[dask]>=0.3.1"], + "dask": ["dask[distributed,dataframe]", "qpd[dask]>=0.3.4"], "ray": ["ray[data]>=2.0.0", "duckdb>=0.5.0", "pyarrow>=6.0.1"], "duckdb": [ 
"duckdb>=0.5.0", @@ -60,7 +60,7 @@ def get_version() -> str: "pyspark", "dask[distributed,dataframe]", "ray[data]>=2.0.0", - "qpd[dask]>=0.3.1", + "qpd[dask]>=0.3.4", "notebook", "jupyterlab", "ipython>=7.10.0", From 98b04e1c1b120fb1ab0c3ab0fa9ee724049459af Mon Sep 17 00:00:00 2001 From: Han Wang Date: Tue, 20 Dec 2022 07:50:16 +0000 Subject: [PATCH 08/30] refactor code --- docs/api/fugue.rst | 4 +-- fugue/__init__.py | 2 +- fugue/dataframe/arrow_dataframe.py | 9 +++++ fugue/dataframe/dataframe.py | 34 ++++++++++++++++++- fugue/dataframe/pandas_dataframe.py | 9 +++++ fugue/dataset.py | 6 ++++ .../{interfaceless.py => express.py} | 0 fugue/execution/factory.py | 2 +- fugue/{interfaceless => express}/__init__.py | 4 ++- .../transformation.py | 0 fugue/plugins.py | 1 + fugue/workflow/workflow.py | 7 ++++ fugue_dask/dataframe.py | 12 +++++-- fugue_duckdb/dataframe.py | 10 +++++- fugue_ibis/dataframe.py | 12 ++++++- fugue_ray/dataframe.py | 10 +++++- fugue_spark/dataframe.py | 9 +++++ fugue_test/dataframe_suite.py | 16 +++++++-- tests/fugue/dataframe/test_dataframe.py | 12 +++++-- tests/fugue_dask/test_dataframe.py | 2 +- tests/fugue_spark/test_dataframe.py | 2 +- 21 files changed, 145 insertions(+), 18 deletions(-) rename fugue/execution/{interfaceless.py => express.py} (100%) rename fugue/{interfaceless => express}/__init__.py (89%) rename fugue/{interfaceless => express}/transformation.py (100%) diff --git a/docs/api/fugue.rst b/docs/api/fugue.rst index c22e96c8..52455f62 100644 --- a/docs/api/fugue.rst +++ b/docs/api/fugue.rst @@ -64,10 +64,10 @@ fugue.exceptions :undoc-members: :show-inheritance: -fugue.interfaceless +fugue.express ------------------- -.. automodule:: fugue.interfaceless +.. automodule:: fugue.express :members: :undoc-members: :show-inheritance: diff --git a/fugue/__init__.py b/fugue/__init__.py index 7de5bc35..482da900 100644 --- a/fugue/__init__.py +++ b/fugue/__init__.py @@ -52,7 +52,7 @@ register_transformer, transformer, ) -from fugue.interfaceless import out_transform, transform +from fugue.express import out_transform, transform from fugue.registry import _register from fugue.rpc import ( EmptyRPCHandler, diff --git a/fugue/dataframe/arrow_dataframe.py b/fugue/dataframe/arrow_dataframe.py index 9c66b0ba..91781064 100644 --- a/fugue/dataframe/arrow_dataframe.py +++ b/fugue/dataframe/arrow_dataframe.py @@ -16,6 +16,7 @@ drop_columns, get_column_names, get_schema, + is_df, rename, select_columns, ) @@ -113,6 +114,9 @@ def native(self) -> pa.Table: """:func:`pyarrow.Table `""" return self._native + def native_as_df(self) -> pa.Table: + return self._native + @property def empty(self) -> bool: return self.count() == 0 @@ -235,6 +239,11 @@ def _pa_table_as_fugue_df(df: pa.Table) -> "ArrowDataFrame": return ArrowDataFrame(df) +@is_df.candidate(lambda df: isinstance(df, pa.Table)) +def _pa_table_is_df(df: pa.Table) -> bool: + return True + + @count.candidate(lambda df: isinstance(df, pa.Table)) def _pa_table_count(df: pa.Table) -> int: return df.shape[0] diff --git a/fugue/dataframe/dataframe.py b/fugue/dataframe/dataframe.py index d31603b0..fc9c2449 100644 --- a/fugue/dataframe/dataframe.py +++ b/fugue/dataframe/dataframe.py @@ -47,7 +47,7 @@ def __init__(self, schema: Any = None): @property def schema(self) -> Schema: - """Schema of the dataframe""" + """The schema of the dataframe""" if self._schema_discovered: # we must keep it simple because it could be called on every row by a user assert isinstance(self._schema, Schema) @@ -60,6 +60,16 @@ def schema(self) -> Schema: 
self._schema_discovered = True return self._schema + @abstractmethod + def native_as_df(self) -> Any: # pragma: no cover + """The dataframe form of the native object this Dataset class wraps. + Dataframe form means the object contains schema information. For example + the native an ArrayDataFrame is a python array, it doesn't contain schema + information, and its ``native_as_df`` should be either a pandas dataframe + or an arrow dataframe. + """ + raise NotImplementedError + @abstractmethod def as_local(self) -> "LocalDataFrame": # pragma: no cover """Convert this dataframe to a :class:`.LocalDataFrame`""" @@ -284,6 +294,9 @@ class LocalDataFrame(DataFrame): implementing a new :class:`~fugue.execution.execution_engine.ExecutionEngine` """ + def native_as_df(self) -> Any: + return self.as_pandas() + @property def is_local(self) -> bool: """Always True because it's a LocalDataFrame""" @@ -428,6 +441,25 @@ def as_fugue_df(df: Any) -> DataFrame: return res # type: ignore +@fugue_plugin +def is_df(df: Any) -> bool: + """Whether ``df`` is a DataFrame like object""" + return isinstance(df, DataFrame) + + +def get_native_as_df(df: Any) -> Any: + """Return the dataframe form of the input ``df``. + If ``df`` is a :class:`~.DataFrame`, then call the + :meth:`~.DataFrame.native_as_df`, otherwise, it depends on whether there is + a correspondent function handling it. + """ + if isinstance(df, DataFrame): + return df.native_as_df() + if is_df(df): + return df + raise NotImplementedError(f"cannot get a dataframe like object from {type(df)}") + + @fugue_plugin def get_schema(df: Any) -> Schema: """Get the schema of the ``df`` diff --git a/fugue/dataframe/pandas_dataframe.py b/fugue/dataframe/pandas_dataframe.py index 77577981..00424619 100644 --- a/fugue/dataframe/pandas_dataframe.py +++ b/fugue/dataframe/pandas_dataframe.py @@ -17,6 +17,7 @@ get_column_names, get_schema, head, + is_df, rename, select_columns, ) @@ -85,6 +86,9 @@ def native(self) -> pd.DataFrame: """Pandas DataFrame""" return self._native + def native_as_df(self) -> pd.DataFrame: + return self._native + @property def empty(self) -> bool: return self.native.empty @@ -190,6 +194,11 @@ def _pd_as_fugue_df(df: pd.DataFrame) -> "PandasDataFrame": return PandasDataFrame(df) +@is_df.candidate(lambda df: isinstance(df, pd.DataFrame)) +def _pd_is_df(df: pd.DataFrame) -> bool: + return True + + @count.candidate(lambda df: isinstance(df, pd.DataFrame)) def _pd_count(df: pd.DataFrame) -> int: return df.shape[0] diff --git a/fugue/dataset.py b/fugue/dataset.py index 075a540f..2752255a 100644 --- a/fugue/dataset.py +++ b/fugue/dataset.py @@ -36,6 +36,12 @@ def reset_metadata(self, metadata: Any) -> None: """Reset metadata""" self._metadata = ParamDict(metadata) if metadata is not None else None + @property + @abstractmethod + def native(self) -> Any: # pragma: no cover + """The native object this Dataset class wraps""" + raise NotImplementedError + @property @abstractmethod def is_local(self) -> bool: # pragma: no cover diff --git a/fugue/execution/interfaceless.py b/fugue/execution/express.py similarity index 100% rename from fugue/execution/interfaceless.py rename to fugue/execution/express.py diff --git a/fugue/execution/factory.py b/fugue/execution/factory.py index 22f56cbb..4d581c0d 100644 --- a/fugue/execution/factory.py +++ b/fugue/execution/factory.py @@ -398,7 +398,7 @@ def is_pandas_or(objs: List[Any], obj_type: Any) -> bool: @fugue_plugin def infer_execution_engine(obj: List[Any]) -> Any: """Infer the correspondent ExecutionEngine based on 
the input objects. This is - used in interfaceless functions. + used in express functions. :param objs: the objects :return: if the inference succeeded, it returns an object that can be used by diff --git a/fugue/interfaceless/__init__.py b/fugue/express/__init__.py similarity index 89% rename from fugue/interfaceless/__init__.py rename to fugue/express/__init__.py index ac554b3e..071a9526 100644 --- a/fugue/interfaceless/__init__.py +++ b/fugue/express/__init__.py @@ -9,8 +9,10 @@ as_pandas, drop_columns, get_column_names, + get_native_as_df, get_schema, head, + is_df, normalize_column_names, peek_array, peek_dict, @@ -26,7 +28,7 @@ is_local, show, ) -from fugue.execution.interfaceless import ( +from fugue.execution.express import ( broadcast, intersect, join, diff --git a/fugue/interfaceless/transformation.py b/fugue/express/transformation.py similarity index 100% rename from fugue/interfaceless/transformation.py rename to fugue/express/transformation.py diff --git a/fugue/plugins.py b/fugue/plugins.py index 6fec357e..6ae2f4f3 100644 --- a/fugue/plugins.py +++ b/fugue/plugins.py @@ -11,6 +11,7 @@ get_column_names, get_schema, head, + is_df, peek_array, peek_dict, rename, diff --git a/fugue/workflow/workflow.py b/fugue/workflow/workflow.py index bc051b7c..e6faeff2 100644 --- a/fugue/workflow/workflow.py +++ b/fugue/workflow/workflow.py @@ -96,6 +96,13 @@ def spec_uuid(self) -> str: """UUID of its task spec""" return self._task.__uuid__() + @property + def native(self) -> Any: # pragma: no cover + raise NotImplementedError + + def native_as_df(self) -> Any: # pragma: no cover + raise NotImplementedError + @property def name(self) -> str: """Name of its task spec""" diff --git a/fugue_dask/dataframe.py b/fugue_dask/dataframe.py index 909269bf..28710143 100644 --- a/fugue_dask/dataframe.py +++ b/fugue_dask/dataframe.py @@ -22,6 +22,7 @@ get_column_names, head, is_bounded, + is_df, is_empty, is_local, rename, @@ -92,10 +93,10 @@ def __init__( # noqa: C901 @property def native(self) -> dd.DataFrame: - """The wrapped Dask DataFrame + """The wrapped Dask DataFrame""" + return self._native - :rtype: :class:`dask:dask.dataframe.DataFrame` - """ + def native_as_df(self) -> dd.DataFrame: return self._native @property @@ -239,6 +240,11 @@ def _apply_schema( return DASK_UTILS.enforce_type(pdf, schema.pa_schema, null_safe=True), schema +@is_df.candidate(lambda df: isinstance(df, dd.DataFrame)) +def _dd_is_df(df: dd.DataFrame) -> bool: + return True + + @count.candidate(lambda df: isinstance(df, dd.DataFrame)) def _dd_count(df: dd.DataFrame) -> int: return df.shape[0].compute() diff --git a/fugue_duckdb/dataframe.py b/fugue_duckdb/dataframe.py index 635961ea..8a49e1b7 100644 --- a/fugue_duckdb/dataframe.py +++ b/fugue_duckdb/dataframe.py @@ -13,7 +13,7 @@ LocalDataFrame, ) from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError -from fugue.plugins import get_column_names +from fugue.plugins import get_column_names, is_df from ._utils import encode_column_name, to_duck_type, to_pa_type @@ -41,6 +41,9 @@ def native(self) -> DuckDBPyRelation: """DuckDB relation object""" return self._rel + def native_as_df(self) -> DuckDBPyRelation: + return self._rel + @property def empty(self) -> bool: return self._rel.fetchone() is None @@ -138,6 +141,11 @@ def to_list(row: Any) -> List[Any]: return [to_list(x) for x in rel.fetchall()] +@is_df.candidate(lambda df: isinstance(df, DuckDBPyRelation)) +def _duck_is_df(df: DuckDBPyRelation) -> bool: + return True + + 
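The registrations above and below all use the same dispatch pattern: a matcher lambda decides whether a backend can handle the input object, and the decorated body implements that behavior for the backend. As a minimal sketch (not taken from this PR), a hypothetical third-party table type could plug into ``get_column_names`` the same way; ``MyTable`` and its ``columns`` attribute are made-up names:

from typing import List

from fugue.plugins import get_column_names


class MyTable:  # hypothetical wrapper type, only for illustration
    def __init__(self, columns: List[str]):
        self.columns = columns


@get_column_names.candidate(lambda df: isinstance(df, MyTable))
def _get_my_table_columns(df: MyTable) -> List[str]:
    # return the plain list of column names for the custom type
    return list(df.columns)


# after registration, the top-level function dispatches to the candidate above
assert get_column_names(MyTable(["a", "b"])) == ["a", "b"]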
@get_column_names.candidate(lambda df: isinstance(df, DuckDBPyRelation)) def _get_duckdb_columns(df: DuckDBPyRelation) -> List[Any]: return list(df.columns) diff --git a/fugue_ibis/dataframe.py b/fugue_ibis/dataframe.py index d21c23e7..325c8f45 100644 --- a/fugue_ibis/dataframe.py +++ b/fugue_ibis/dataframe.py @@ -2,6 +2,8 @@ import pandas as pd import pyarrow as pa +from triad import Schema + from fugue import ( DataFrame, IterableDataFrame, @@ -11,7 +13,7 @@ ) from fugue.dataframe.dataframe import _input_schema from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError -from triad import Schema +from fugue.plugins import is_df from ._compat import IbisTable from ._utils import _pa_to_ibis_type, to_schema @@ -39,6 +41,9 @@ def native(self) -> IbisTable: """Ibis Table object""" return self._table + def native_as_df(self) -> IbisTable: + return self._table + def _to_local_df(self, table: IbisTable, schema: Any = None) -> LocalDataFrame: raise NotImplementedError # pragma: no cover @@ -152,3 +157,8 @@ def _alter_table_columns( def _type_equal(self, tp1: pa.DataType, tp2: pa.DataType) -> bool: return tp1 == tp2 + + +@is_df.candidate(lambda df: isinstance(df, IbisTable)) +def _ibis_is_df(df: IbisTable) -> bool: + return True diff --git a/fugue_ray/dataframe.py b/fugue_ray/dataframe.py index fd791a95..c14dc5e8 100644 --- a/fugue_ray/dataframe.py +++ b/fugue_ray/dataframe.py @@ -14,7 +14,7 @@ ) from fugue.dataframe.dataframe import _input_schema from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError -from fugue.plugins import get_column_names, rename +from fugue.plugins import get_column_names, rename, is_df from ._utils.dataframe import _build_empty_arrow, build_empty, get_dataset_format @@ -97,6 +97,9 @@ def native(self) -> rd.Dataset: """The wrapped ray Dataset""" return self._native + def native_as_df(self) -> rd.Dataset: + return self._native + @property def is_local(self) -> bool: return False @@ -236,6 +239,11 @@ def _remote_args(self) -> Dict[str, Any]: return {"num_cpus": 1} +@is_df.candidate(lambda df: isinstance(df, rd.Dataset)) +def _rd_is_df(df: rd.Dataset) -> bool: + return True + + @get_column_names.candidate(lambda df: isinstance(df, rd.Dataset)) def _get_ray_dataframe_columns(df: rd.Dataset) -> List[Any]: fmt = get_dataset_format(df) diff --git a/fugue_spark/dataframe.py b/fugue_spark/dataframe.py index cdeee587..090479c4 100644 --- a/fugue_spark/dataframe.py +++ b/fugue_spark/dataframe.py @@ -23,6 +23,7 @@ get_column_names, head, is_bounded, + is_df, is_empty, is_local, rename, @@ -72,6 +73,9 @@ def native(self) -> ps.DataFrame: """ return self._native + def native_as_df(self) -> ps.DataFrame: + return self._native + @property def is_local(self) -> bool: return False @@ -169,6 +173,11 @@ def _select_columns(self, columns: Optional[List[str]]) -> "SparkDataFrame": return SparkDataFrame(self.native.select(*columns)) +@is_df.candidate(lambda df: isinstance(df, ps.DataFrame)) +def _spark_is_df(df: ps.DataFrame) -> bool: + return True + + @count.candidate(lambda df: isinstance(df, ps.DataFrame)) def _spark_df_count(df: ps.DataFrame) -> int: return df.count() diff --git a/fugue_test/dataframe_suite.py b/fugue_test/dataframe_suite.py index 127fe990..20f4a18a 100644 --- a/fugue_test/dataframe_suite.py +++ b/fugue_test/dataframe_suite.py @@ -8,8 +8,8 @@ import pandas as pd from pytest import raises -import fugue.interfaceless as fi -from fugue.dataframe import ArrowDataFrame +import fugue.express as fi +from fugue.dataframe import 
ArrowDataFrame, DataFrame from fugue.dataframe.utils import _df_eq as df_eq from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError @@ -31,6 +31,18 @@ def tearDownClass(cls): def df(self, data: Any = None, schema: Any = None) -> Any: # pragma: no cover raise NotImplementedError + def test_native(self): + df = self.df([1], "a:int") + assert fi.is_df(df) + fdf = fi.as_fugue_df(df) + assert isinstance(fdf, DataFrame) + assert fi.is_df(fdf) + ndf = fi.get_native_as_df(fdf) + assert fi.is_df(ndf) + assert not isinstance(ndf, DataFrame) + ndf2 = fi.get_native_as_df(ndf) + assert ndf2 is ndf + def test_peek(self): df = self.df([], "x:str,y:double") raises(FugueDatasetEmptyError, lambda: fi.peek_array(df)) diff --git a/tests/fugue/dataframe/test_dataframe.py b/tests/fugue/dataframe/test_dataframe.py index e69cf0bf..26b28712 100644 --- a/tests/fugue/dataframe/test_dataframe.py +++ b/tests/fugue/dataframe/test_dataframe.py @@ -1,9 +1,11 @@ import copy + import pandas as pd -from triad.collections.schema import Schema from pytest import raises +from triad.collections.schema import Schema + from fugue.dataframe import ArrayDataFrame, DataFrame -from fugue.interfaceless import as_fugue_df +from fugue.express import as_fugue_df, get_native_as_df def test_as_fugue_df(): @@ -13,6 +15,12 @@ def test_as_fugue_df(): assert isinstance(as_fugue_df(df), DataFrame) +def test_get_native_as_df(): + with raises(NotImplementedError): + get_native_as_df(10) + # other tests are in the suites + + def test_show(): df = ArrayDataFrame(schema="a:str,b:str") df.show() diff --git a/tests/fugue_dask/test_dataframe.py b/tests/fugue_dask/test_dataframe.py index 4c2a2ffd..0fef361c 100644 --- a/tests/fugue_dask/test_dataframe.py +++ b/tests/fugue_dask/test_dataframe.py @@ -6,7 +6,7 @@ import dask.dataframe as pd import numpy as np import pandas -import fugue.interfaceless as fi +import fugue.express as fi from fugue.dataframe.array_dataframe import ArrayDataFrame from fugue.dataframe.pandas_dataframe import PandasDataFrame from fugue.dataframe.utils import _df_eq as df_eq diff --git a/tests/fugue_spark/test_dataframe.py b/tests/fugue_spark/test_dataframe.py index 29837927..057e40b6 100644 --- a/tests/fugue_spark/test_dataframe.py +++ b/tests/fugue_spark/test_dataframe.py @@ -8,7 +8,7 @@ from pyspark.sql import SparkSession from triad.collections.schema import Schema -import fugue.interfaceless as fi +import fugue.express as fi from fugue.dataframe.pandas_dataframe import PandasDataFrame from fugue.plugins import get_column_names, rename from fugue_spark import SparkExecutionEngine From 3de983e2cbd3dcaeb70e46527c2a2ad82201876f Mon Sep 17 00:00:00 2001 From: Han Wang Date: Wed, 21 Dec 2022 08:37:14 +0000 Subject: [PATCH 09/30] Add test suite for express functions --- fugue_ibis/dataframe.py | 42 ++++++++++++--- fugue_test/dataframe_suite.py | 20 +++++++ tests/fugue/dataframe/test_arrow_dataframe.py | 7 +-- .../fugue/dataframe/test_pandas_dataframe.py | 6 +-- tests/fugue/dataframe/test_utils.py | 4 +- tests/fugue_dask/test_dataframe.py | 12 ++--- tests/fugue_duckdb/test_dataframe.py | 6 +-- tests/fugue_ibis/mock/dataframe.py | 7 +++ tests/fugue_ibis/test_dataframe.py | 53 ++++++++++++++++--- tests/fugue_ray/test_dataframe.py | 10 ++-- tests/fugue_spark/test_dataframe.py | 15 +++--- 11 files changed, 137 insertions(+), 45 deletions(-) diff --git a/fugue_ibis/dataframe.py b/fugue_ibis/dataframe.py index 325c8f45..bd6aeef8 100644 --- a/fugue_ibis/dataframe.py +++ b/fugue_ibis/dataframe.py @@ -13,7 +13,7 @@ ) 
 from fugue.dataframe.dataframe import _input_schema
 from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError
-from fugue.plugins import is_df
+from fugue.plugins import is_df, get_column_names, rename
 
 from ._compat import IbisTable
 from ._utils import _pa_to_ibis_type, to_schema
@@ -96,13 +96,8 @@
             schema = self.schema.rename(columns)
         except Exception as e:
             raise FugueDataFrameOperationError from e
-        cols: List[Any] = []
-        for a, b in zip(self.schema.names, schema.names):
-            if a == b:
-                cols.append(self._table[a])
-            else:
-                cols.append(self._table[a].name(b))
-        return self._to_new_df(self._table.projection(cols), schema=schema)
+        df = _rename(self._table, self.schema.names, schema.names)
+        return self if df is self._table else self._to_new_df(df, schema=schema)
 
     def alter_columns(self, columns: Any) -> DataFrame:
         new_schema = self._get_altered_schema(columns)
@@ -162,3 +157,34 @@ def _type_equal(self, tp1: pa.DataType, tp2: pa.DataType) -> bool:
     return tp1 == tp2
 
 
 @is_df.candidate(lambda df: isinstance(df, IbisTable))
 def _ibis_is_df(df: IbisTable) -> bool:
     return True
+
+
+@get_column_names.candidate(lambda df: isinstance(df, IbisTable))
+def _get_ibis_columns(df: IbisTable) -> List[Any]:
+    return df.columns
+
+
+@rename.candidate(lambda df, *args, **kwargs: isinstance(df, IbisTable))
+def _rename_ibis_dataframe(df: IbisTable, columns: Dict[str, Any]) -> IbisTable:
+    _assert_no_missing(df, columns.keys())
+    old_names = df.columns
+    new_names = [columns.get(name, name) for name in old_names]
+    return _rename(df, old_names, new_names)
+
+
+def _rename(df: IbisTable, old_names: List[str], new_names: List[str]) -> IbisTable:
+    cols: List[Any] = []
+    has_change = False
+    for a, b in zip(old_names, new_names):
+        if a == b:
+            cols.append(df[a])
+        else:
+            cols.append(df[a].name(b))
+            has_change = True
+    return df.projection(cols) if has_change else df
+
+
+def _assert_no_missing(df: IbisTable, columns: Iterable[Any]) -> None:
+    missing = set(columns) - set(df.columns)
+    if len(missing) > 0:
+        raise FugueDataFrameOperationError(f"found nonexistent columns: {missing}")
diff --git a/fugue_test/dataframe_suite.py b/fugue_test/dataframe_suite.py
index 20f4a18a..126a8318 100644
--- a/fugue_test/dataframe_suite.py
+++ b/fugue_test/dataframe_suite.py
@@ -422,3 +422,23 @@ def test_alter_columns_invalid(self):
             )
             ndf = fi.alter_columns(df, "b:int")
             fi.show(ndf)  # lazy dataframes will force to materialize
+
+    class NativeTests(Tests):
+        def to_native_df(self, pdf: pd.DataFrame) -> Any:  # pragma: no cover
+            raise NotImplementedError
+
+        def test_get_altered_schema(self):
+            pass
+
+        def _test_get_column_names(self):
+            df = self.to_native_df(pd.DataFrame([[0, 1, 2]], columns=["0", "1", "2"]))
+            assert fi.get_column_names(df) == ["0", "1", "2"]
+
+        def test_rename_any_names(self):
+            pdf = self.to_native_df(pd.DataFrame([[0, 1, 2]], columns=["a", "b", "c"]))
+            df = fi.rename(pdf, {})
+            assert fi.get_column_names(df) == ["a", "b", "c"]
+
+            pdf = self.to_native_df(pd.DataFrame([[0, 1, 2]], columns=["0", "1", "2"]))
+            df = fi.rename(pdf, {"0": "_0", "1": "_1", "2": "_2"})
+            assert fi.get_column_names(df) == ["_0", "_1", "_2"]
diff --git a/tests/fugue/dataframe/test_arrow_dataframe.py b/tests/fugue/dataframe/test_arrow_dataframe.py
index 63e68edb..5b80c212 100644
--- a/tests/fugue/dataframe/test_arrow_dataframe.py
+++ b/tests/fugue/dataframe/test_arrow_dataframe.py
@@ -4,6 +4,7 @@
 
 import numpy as np
 import pandas as pd
+import pyarrow as pa
 from 
fugue.dataframe import ArrowDataFrame, PandasDataFrame from fugue.dataframe.utils import _df_eq as df_eq from fugue_test.dataframe_suite import DataFrameTests @@ -17,12 +18,12 @@ def df(self, data: Any = None, schema: Any = None) -> ArrowDataFrame: return ArrowDataFrame(data, schema) -class NativeArrowDataFrameTests(DataFrameTests.Tests): +class NativeArrowDataFrameTests(DataFrameTests.NativeTests): def df(self, data: Any = None, schema: Any = None) -> pd.DataFrame: return ArrowDataFrame(data, schema).as_arrow() - def test_get_altered_schema(self): - pass + def to_native_df(self, pdf: pd.DataFrame) -> Any: # pragma: no cover + return pa.Table.from_pandas(pdf) def test_init(): diff --git a/tests/fugue/dataframe/test_pandas_dataframe.py b/tests/fugue/dataframe/test_pandas_dataframe.py index 73c28b6e..80d38789 100644 --- a/tests/fugue/dataframe/test_pandas_dataframe.py +++ b/tests/fugue/dataframe/test_pandas_dataframe.py @@ -19,12 +19,12 @@ def df(self, data: Any = None, schema: Any = None) -> PandasDataFrame: return PandasDataFrame(data, schema) -class NativePandasDataFrameTests(DataFrameTests.Tests): +class NativePandasDataFrameTests(DataFrameTests.NativeTests): def df(self, data: Any = None, schema: Any = None) -> pd.DataFrame: return ArrowDataFrame(data, schema).as_pandas() - def test_get_altered_schema(self): - pass + def to_native_df(self, pdf: pd.DataFrame) -> Any: # pragma: no cover + return pdf def test_map_type(self): pass diff --git a/tests/fugue/dataframe/test_utils.py b/tests/fugue/dataframe/test_utils.py index 4c33cded..4ee75973 100644 --- a/tests/fugue/dataframe/test_utils.py +++ b/tests/fugue/dataframe/test_utils.py @@ -200,7 +200,7 @@ def assert_eq(df, df_expected=None, raw=False): raises(ValueError, lambda: deserialize_df('{"x":1}')) -def test_get_column_names(): +def _test_get_column_names(): df = pd.DataFrame([[0, 1, 2]]) assert get_column_names(df) == [0, 1, 2] @@ -211,7 +211,7 @@ def test_get_column_names(): assert get_column_names(pdf) == ["a", "b"] -def test_rename(): +def _test_rename(): assert rename("dummy", {}) == "dummy" pdf = pd.DataFrame([[0, 1, 2]], columns=["a", "b", "c"]) df = rename(pdf, {}) diff --git a/tests/fugue_dask/test_dataframe.py b/tests/fugue_dask/test_dataframe.py index 0fef361c..8e17e010 100644 --- a/tests/fugue_dask/test_dataframe.py +++ b/tests/fugue_dask/test_dataframe.py @@ -25,16 +25,16 @@ def df(self, data: Any = None, schema: Any = None) -> DaskDataFrame: return DaskDataFrame(data, schema) -class NativeDaskDataFrameTests(DataFrameTests.Tests): +class NativeDaskDataFrameTests(DataFrameTests.NativeTests): def df(self, data: Any = None, schema: Any = None): return DaskDataFrame(data, schema).native + def to_native_df(self, pdf: pandas.DataFrame) -> Any: + return pd.from_pandas(pdf, npartitions=2) + def test_not_local(self): assert not fi.is_local(self.df([], "a:int,b:str")) - def test_get_altered_schema(self): - pass - def test_alter_columns(self): pass @@ -232,12 +232,12 @@ def _test_as_array_perf(): print(nts, ts) -def test_get_column_names(): +def _test_get_column_names(): df = pd.from_pandas(pandas.DataFrame([[0, 1, 2]]), npartitions=1) assert get_column_names(df) == [0, 1, 2] -def test_rename(): +def _test_rename(): pdf = pd.from_pandas( pandas.DataFrame([[0, 1, 2]], columns=["a", "b", "c"]), npartitions=1 ) diff --git a/tests/fugue_duckdb/test_dataframe.py b/tests/fugue_duckdb/test_dataframe.py index a43f6922..64d32d26 100644 --- a/tests/fugue_duckdb/test_dataframe.py +++ b/tests/fugue_duckdb/test_dataframe.py @@ -69,7 +69,7 @@ def 
test_duck_as_local(self): assert isinstance(df.as_local(), ArrowDataFrame) -class NativeDuckDataFrameTests(DataFrameTests.Tests): +class NativeDuckDataFrameTests(DataFrameTests.NativeTests): @classmethod def setUpClass(cls): cls._con = duckdb.connect() @@ -78,5 +78,5 @@ def df(self, data: Any = None, schema: Any = None) -> DuckDataFrame: df = ArrowDataFrame(data, schema) return DuckDataFrame(duckdb.arrow(df.native, self._con)).native - def test_get_altered_schema(self): - pass + def to_native_df(self, pdf: pd.DataFrame) -> Any: + return duckdb.from_df(pdf) diff --git a/tests/fugue_ibis/mock/dataframe.py b/tests/fugue_ibis/mock/dataframe.py index 4cf03587..66f14667 100644 --- a/tests/fugue_ibis/mock/dataframe.py +++ b/tests/fugue_ibis/mock/dataframe.py @@ -3,6 +3,7 @@ from fugue import ArrowDataFrame, DataFrame, LocalDataFrame from fugue_ibis import IbisDataFrame, IbisTable from fugue_ibis._utils import to_schema +from fugue.plugins import as_fugue_dataset class MockDuckDataFrame(IbisDataFrame): @@ -14,3 +15,9 @@ def _to_local_df(self, table: IbisTable, schema: Any = None) -> LocalDataFrame: def _to_iterable_df(self, table: IbisTable, schema: Any = None) -> LocalDataFrame: return self._to_local_df(table, schema=schema) + + +# should also check the df._findbackend is duckdb +@as_fugue_dataset.candidate(lambda df: isinstance(df, IbisTable)) +def _ibis_as_fugue(df: IbisTable) -> bool: + return MockDuckDataFrame(df) diff --git a/tests/fugue_ibis/test_dataframe.py b/tests/fugue_ibis/test_dataframe.py index 67ca52bc..16da2d67 100644 --- a/tests/fugue_ibis/test_dataframe.py +++ b/tests/fugue_ibis/test_dataframe.py @@ -4,7 +4,10 @@ import ibis import pandas as pd +import pyarrow as pa import pytest + +import fugue.express as fe from fugue import ArrowDataFrame from fugue_duckdb.dataframe import DuckDataFrame from fugue_test.dataframe_suite import DataFrameTests @@ -31,13 +34,8 @@ def test_init_df(self): def test_is_local(self): df = self.df([["x", 1]], "a:str,b:int") - assert not df.is_local - assert df.is_bounded - - def _test_as_arrow(self): - # empty - df = self.df([["a", 1]], "a:str,b:int") - assert [["a", 1]] == list(ArrowDataFrame(df.as_arrow()).as_array()) + assert not fe.is_local(df) + assert fe.is_bounded(df) def test_map_type(self): pass @@ -56,3 +54,44 @@ def test_as_arrow(self): assert [dict(a=datetime(2020, 1, 1), b=1)] == list( ArrowDataFrame(df.as_arrow()).as_dict_iterable() ) + + def test_deep_nested_types(self): + pass + + def test_list_type(self): + pass + + +@pytest.mark.skipif(sys.version_info < (3, 8), reason="< 3.8") +class NativeIbisDataFrameTests(DataFrameTests.NativeTests): + @classmethod + def setUpClass(cls): + cls._con = ibis.duckdb.connect() + + def df(self, data: Any = None, schema: Any = None): + df = ArrowDataFrame(data, schema) + name = f"_{id(df.native)}" + self._con.con.execute("register", (name, df.native)) + return MockDuckDataFrame(self._con.table(name), schema=schema).native + + def to_native_df(self, pdf: pd.DataFrame) -> Any: + name = f"_{id(pdf)}" + self._con.con.execute("register", (name, pa.Table.from_pandas(pdf))) + return self._con.table(name) + + def test_is_local(self): + df = self.df([["x", 1]], "a:str,b:int") + assert not fe.is_local(df) + assert fe.is_bounded(df) + + def test_map_type(self): + pass + + def test_as_arrow(self): + pass + + def test_deep_nested_types(self): + pass + + def test_list_type(self): + pass diff --git a/tests/fugue_ray/test_dataframe.py b/tests/fugue_ray/test_dataframe.py index 9dc973bc..8ca3bdcd 100644 --- 
a/tests/fugue_ray/test_dataframe.py +++ b/tests/fugue_ray/test_dataframe.py @@ -101,7 +101,7 @@ def test_ray_num_partitions(self): df = RayDataFrame(rdf.repartition(5)) assert 5 == df.num_partitions - def test_get_column_names(self): + def _test_get_column_names(self): df = rd.from_pandas(pd.DataFrame([[0, 10, 20]], columns=["0", "1", "2"])) assert get_column_names(df) == ["0", "1", "2"] @@ -110,7 +110,7 @@ def test_get_column_names(self): ) assert get_column_names(df) == ["0", "1", "2"] - def test_rename(self): + def _test_rename(self): rdf = rd.from_pandas(pd.DataFrame([[0, 10, 20]], columns=["a", "b", "c"])) df = rename(rdf, {}) assert isinstance(df, rd.Dataset) @@ -122,7 +122,7 @@ def test_rename(self): assert get_column_names(df) == ["_0", "_1", "_2"] -class NativeRayDataFrameTests(DataFrameTests.Tests): +class NativeRayDataFrameTests(DataFrameTests.NativeTests): @classmethod def setUpClass(cls): ray.init(num_cpus=2) @@ -136,5 +136,5 @@ def df(self, data: Any = None, schema: Any = None): # native ray dataset can't handle the schema when empty return res if res.empty else res.native - def test_get_altered_schema(self): - pass + def to_native_df(self, pdf: pd.DataFrame) -> Any: + return rd.from_pandas(pdf) diff --git a/tests/fugue_spark/test_dataframe.py b/tests/fugue_spark/test_dataframe.py index 057e40b6..b3d5b697 100644 --- a/tests/fugue_spark/test_dataframe.py +++ b/tests/fugue_spark/test_dataframe.py @@ -36,16 +36,18 @@ def test_map_type(self): return super().test_map_type() -class NativeSparkDataFrameTests(DataFrameTests.Tests): +class NativeSparkDataFrameTests(DataFrameTests.NativeTests): @pytest.fixture(autouse=True) def init_session(self, spark_session): self.spark_session = spark_session def df(self, data: Any = None, schema: Any = None): - session = SparkSession.builder.getOrCreate() - engine = SparkExecutionEngine(session) + engine = SparkExecutionEngine(self.spark_session) return engine.to_df(data, schema=schema).native + def to_native_df(self, pdf: pd.DataFrame) -> Any: + return self.spark_session.createDataFrame(pdf) + def test_not_local(self): assert not fi.is_local(self.df([], "a:int,b:str")) @@ -57,9 +59,6 @@ def test_map_type(self): if pyspark.__version__ >= "3": return super().test_map_type() - def test_get_altered_schema(self): - pass - def test_init(spark_session): sdf = spark_session.createDataFrame([["a", 1]]) @@ -146,14 +145,14 @@ def _df(data, schema=None): return SparkDataFrame(df, schema) -def test_get_column_names(spark_session): +def _test_get_column_names(spark_session): df = spark_session.createDataFrame( pd.DataFrame([[0, 1, 2]], columns=["0", "1", "2"]) ) assert get_column_names(df) == ["0", "1", "2"] -def test_rename(spark_session): +def _test_rename(spark_session): pdf = spark_session.createDataFrame( pd.DataFrame([[0, 1, 2]], columns=["a", "b", "c"]) ) From 2a52b799149419bf327d4e25e4937e1cc16bbcd2 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Thu, 22 Dec 2022 08:49:34 +0000 Subject: [PATCH 10/30] add engine level utils --- fugue/collections/partition.py | 3 - fugue/execution/execution_engine.py | 13 +- fugue/execution/express.py | 236 +++++++++++++++++++-- fugue/execution/native_execution_engine.py | 7 +- fugue/express/__init__.py | 7 + fugue_dask/execution_engine.py | 7 +- fugue_duckdb/dask.py | 4 +- fugue_duckdb/execution_engine.py | 20 +- fugue_ibis/execution_engine.py | 4 +- fugue_ray/_utils/io.py | 5 +- fugue_ray/execution_engine.py | 4 +- fugue_spark/_utils/io.py | 5 +- fugue_spark/execution_engine.py | 7 +- 
tests/fugue_ibis/mock/execution_engine.py | 4 +- 14 files changed, 274 insertions(+), 52 deletions(-) diff --git a/fugue/collections/partition.py b/fugue/collections/partition.py index acdc02fd..bec2db0d 100644 --- a/fugue/collections/partition.py +++ b/fugue/collections/partition.py @@ -323,9 +323,6 @@ def _update_dict(self, d: Dict[str, Any], u: Dict[str, Any]) -> None: d[k] = v -EMPTY_PARTITION_SPEC = PartitionSpec() - - class DatasetPartitionCursor: """The cursor pointing at the first item of each logical partition inside a physical partition. diff --git a/fugue/execution/execution_engine.py b/fugue/execution/execution_engine.py index 3eb8d5a9..b2f6801d 100644 --- a/fugue/execution/execution_engine.py +++ b/fugue/execution/execution_engine.py @@ -13,7 +13,6 @@ from fugue.bag import Bag, LocalBag from fugue.collections.partition import ( - EMPTY_PARTITION_SPEC, BagPartitionCursor, PartitionCursor, PartitionSpec, @@ -494,7 +493,7 @@ def take( n: int, presort: str, na_position: str = "last", - partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC, + partition_spec: Optional[PartitionSpec] = None, ) -> DataFrame: # pragma: no cover """ Get the first n rows of a DataFrame per partition. If a presort is defined, @@ -742,7 +741,7 @@ def zip( df1: DataFrame, df2: DataFrame, how: str = "inner", - partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC, + partition_spec: Optional[PartitionSpec] = None, temp_path: Optional[str] = None, to_file_threshold: Any = -1, df1_name: Optional[str] = None, @@ -780,6 +779,7 @@ def zip( For more details and examples, read |ZipComap|. """ + partition_spec = partition_spec or PartitionSpec() on = list(partition_spec.partition_by) how = how.lower() assert_or_throw( @@ -806,7 +806,7 @@ def update_df(df: DataFrame, name: Optional[str]) -> DataFrame: if not df.metadata.get("serialized", False): df = self._serialize_by_partition( df, - partition_spec, + partition_spec or PartitionSpec(), name, temp_path, to_file_threshold, @@ -836,7 +836,7 @@ def zip_all( self, dfs: DataFrames, how: str = "inner", - partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC, + partition_spec: Optional[PartitionSpec] = None, temp_path: Optional[str] = None, to_file_threshold: Any = -1, ) -> DataFrame: @@ -865,6 +865,7 @@ def zip_all( For more details and examples, read |ZipComap| """ + partition_spec = partition_spec or PartitionSpec() assert_or_throw(len(dfs) > 0, "can't zip 0 dataframes") pairs = list(dfs.items()) has_name = dfs.has_key @@ -977,7 +978,7 @@ def save_df( path: str, format_hint: Any = None, mode: str = "overwrite", - partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC, + partition_spec: Optional[PartitionSpec] = None, force_single: bool = False, **kwargs: Any, ) -> None: # pragma: no cover diff --git a/fugue/execution/express.py b/fugue/execution/express.py index c7bb03fe..6eefa4be 100644 --- a/fugue/execution/express.py +++ b/fugue/execution/express.py @@ -1,7 +1,8 @@ +from typing import Any, List, Optional, Union + from ..collections.partition import PartitionSpec from ..dataframe.dataframe import DataFrame from .factory import make_execution_engine -from typing import Any, Optional, List def repartition( @@ -62,9 +63,203 @@ def persist( return _adjust_df([df], e.persist(edf, lazy=lazy, **kwargs), as_fugue=as_fugue) +def distinct( + df: Any, engine: Any = None, engine_conf: Any = None, as_fugue: bool = False +) -> Any: + """Equivalent to ``SELECT DISTINCT * FROM df`` + + :param df: an input dataframe that can be recognized by Fugue + :return: [description] + """ + e = 
make_execution_engine(engine, engine_conf, infer_by=[df])
+    edf = e.distinct(e.to_df(df))
+    return _adjust_df([df], edf, as_fugue=as_fugue)
+
+
+def dropna(
+    df: Any,
+    how: str = "any",
+    thresh: int = None,
+    subset: List[str] = None,
+    engine: Any = None,
+    engine_conf: Any = None,
+    as_fugue: bool = False,
+) -> Any:
+    """Drop NA records from a dataframe
+
+    :param df: an input dataframe that can be recognized by Fugue
+    :param how: 'any' or 'all'. 'any' drops rows that contain any nulls.
+        'all' drops rows that contain all nulls.
+    :param thresh: int, drops rows that have less than thresh non-null values
+    :param subset: list of columns to operate on
+
+    :return: DataFrame with NA records dropped
+    """
+    e = make_execution_engine(engine, engine_conf, infer_by=[df])
+    edf = e.dropna(e.to_df(df), how=how, thresh=thresh, subset=subset)
+    return _adjust_df([df], edf, as_fugue=as_fugue)
+
+
+def fillna(
+    df: Any,
+    value: Any,
+    subset: List[str] = None,
+    engine: Any = None,
+    engine_conf: Any = None,
+    as_fugue: bool = False,
+) -> Any:
+    """
+    Fill ``NULL``, ``NAN``, ``NAT`` values in a dataframe
+
+    :param df: an input dataframe that can be recognized by Fugue
+    :param value: if scalar, fills all columns with same value.
+        if dictionary, fills NA using the keys as column names and the
+        values as the replacement values.
+    :param subset: list of columns to operate on. ignored if value is
+        a dictionary
+
+    :return: DataFrame with NA records filled
+    """
+    e = make_execution_engine(engine, engine_conf, infer_by=[df])
+    edf = e.fillna(e.to_df(df), value=value, subset=subset)
+    return _adjust_df([df], edf, as_fugue=as_fugue)
+
+
+def sample(
+    df: Any,
+    n: Optional[int] = None,
+    frac: Optional[float] = None,
+    replace: bool = False,
+    seed: Optional[int] = None,
+    engine: Any = None,
+    engine_conf: Any = None,
+    as_fugue: bool = False,
+) -> Any:
+    """
+    Sample dataframe by number of rows or by fraction
+
+    :param df: an input dataframe that can be recognized by Fugue
+    :param n: number of rows to sample, one and only one of ``n`` and ``frac``
+        must be set
+    :param frac: fraction [0,1] to sample, one and only one of ``n`` and ``frac``
+        must be set
+    :param replace: whether replacement is allowed. With replacement,
+        there may be duplicated rows in the result, defaults to False
+    :param seed: seed for randomness, defaults to None
+
+    :return: the sampled dataframe
+    """
+    e = make_execution_engine(engine, engine_conf, infer_by=[df])
+    edf = e.sample(e.to_df(df), n=n, frac=frac, replace=replace, seed=seed)
+    return _adjust_df([df], edf, as_fugue=as_fugue)
+
+
+def take(
+    df: Any,
+    n: int,
+    presort: str,
+    na_position: str = "last",
+    partition_spec: Optional[PartitionSpec] = None,
+    engine: Any = None,
+    engine_conf: Any = None,
+    as_fugue: bool = False,
+) -> Any:
+    """
+    Get the first n rows of a DataFrame per partition. If a presort is defined,
+    use the presort before applying take. presort overrides partition_spec.presort.
+    The Fugue implementation of the presort follows Pandas convention of specifying
+    NULLs first or NULLs last. This is different from the Spark and SQL convention
+    of NULLs as the smallest value.
+
+    :param df: an input dataframe that can be recognized by Fugue
+    :param n: number of rows to return
+    :param presort: presort expression similar to partition presort
+    :param na_position: position of null values during the presort. 
+ can accept ``first`` or ``last`` + :param partition_spec: PartitionSpec to apply the take operation, + defaults to None + + :return: n rows of DataFrame per partition + """ + e = make_execution_engine(engine, engine_conf, infer_by=[df]) + edf = e.take( + e.to_df(df), + n=n, + presort=presort, + na_position=na_position, + partition_spec=partition_spec, + ) + return _adjust_df([df], edf, as_fugue=as_fugue) + + +def load( + path: Union[str, List[str]], + format_hint: Any = None, + columns: Any = None, + engine: Any = None, + engine_conf: Any = None, + as_fugue: bool = False, + **kwargs: Any, +) -> Any: + """Load dataframe from persistent storage + + :param path: the path to the dataframe + :param format_hint: can accept ``parquet``, ``csv``, ``json``, + defaults to None, meaning to infer + :param columns: list of columns or a |SchemaLikeObject|, defaults to None + :param kwargs: parameters to pass to the underlying framework + :return: an engine compatible dataframe + + For more details and examples, read |ZipComap|. + """ + e = make_execution_engine(engine, engine_conf) + res = e.load_df(path=path, format_hint=format_hint, columns=columns, **kwargs) + return _adjust_df([], res, as_fugue=as_fugue) + + +def save( + df: Any, + path: str, + format_hint: Any = None, + mode: str = "overwrite", + partition_spec: Optional[PartitionSpec] = None, + force_single: bool = False, + engine: Any = None, + engine_conf: Any = None, + **kwargs: Any, +) -> None: + """Save dataframe to a persistent storage + + :param df: an input dataframe that can be recognized by Fugue + :param path: output path + :param format_hint: can accept ``parquet``, ``csv``, ``json``, + defaults to None, meaning to infer + :param mode: can accept ``overwrite``, ``append``, ``error``, + defaults to "overwrite" + :param partition_spec: how to partition the dataframe before saving, + defaults to empty + :param force_single: force the output as a single file, defaults to False + :param kwargs: parameters to pass to the underlying framework + + For more details and examples, read |LoadSave|. + """ + e = make_execution_engine(engine, engine_conf, infer_by=[df]) + edf = e.to_df(df) + e.save_df( + edf, + path=path, + format_hint=format_hint, + mode=mode, + partition_spec=partition_spec, + force_single=force_single, + **kwargs, + ) + + def join( df1: Any, df2: Any, + *dfs: Any, how: str, on: Optional[List[str]] = None, engine: Any = None, @@ -75,6 +270,7 @@ def join( :param df1: the first dataframe :param df2: the second dataframe + :param dfs: more dataframes to join :param how: can accept ``semi``, ``left_semi``, ``anti``, ``left_anti``, ``inner``, ``left_outer``, ``right_outer``, ``full_outer``, ``cross`` :param on: it can always be inferred, but if you provide, it will be @@ -88,12 +284,16 @@ def join( e = make_execution_engine(engine, engine_conf, infer_by=[df1, df2]) edf1 = e.to_df(df1) edf2 = e.to_df(df2) - return _adjust_df([df1, df2], e.join(edf1, edf2, how=how, on=on), as_fugue=as_fugue) + res = e.join(edf1, edf2, how=how, on=on) + for odf in dfs: + res = e.join(res, e.to_df(odf), how=how, on=on) + return _adjust_df([df1, df2, *dfs], res, as_fugue=as_fugue) def union( df1: Any, df2: Any, + *dfs: Any, distinct: bool = True, engine: Any = None, engine_conf: Any = None, @@ -103,26 +303,29 @@ def union( :param df1: the first dataframe :param df2: the second dataframe + :param dfs: more dataframes to union :param distinct: ``true`` for ``UNION`` (== ``UNION DISTINCT``), ``false`` for ``UNION ALL`` :return: the unioned dataframe .. 
note::
-
-        Currently, the schema of ``df1`` and ``df2`` must be identical, or
+        Currently, the schema of all dataframes must be identical, or
         an exception will be thrown.
     """
     e = make_execution_engine(engine, engine_conf, infer_by=[df1, df2])
     edf1 = e.to_df(df1)
     edf2 = e.to_df(df2)
-    return _adjust_df(
-        [df1, df2], e.union(edf1, edf2, distinct=distinct), as_fugue=as_fugue
-    )
+    res = e.union(edf1, edf2, distinct=distinct)
+    for odf in dfs:
+        res = e.union(res, e.to_df(odf), distinct=distinct)
+    return _adjust_df([df1, df2, *dfs], res, as_fugue=as_fugue)
 
 
 def subtract(
     df1: Any,
     df2: Any,
+    *dfs: Any,
     distinct: bool = True,
     engine: Any = None,
     engine_conf: Any = None,
@@ -132,27 +335,30 @@
     :param df1: the first dataframe
     :param df2: the second dataframe
+    :param dfs: more dataframes to subtract
     :param distinct: ``true`` for ``EXCEPT`` (== ``EXCEPT DISTINCT``),
         ``false`` for ``EXCEPT ALL``
     :return: the unioned dataframe
 
     .. note::
 
-        Currently, the schema of ``df1`` and ``df2`` must be identical, or
+        Currently, the schema of all dataframes must be identical, or
         an exception will be thrown.
     """
     e = make_execution_engine(engine, engine_conf, infer_by=[df1, df2])
     edf1 = e.to_df(df1)
     edf2 = e.to_df(df2)
-    return _adjust_df(
-        [df1, df2], e.subtract(edf1, edf2, distinct=distinct), as_fugue=as_fugue
-    )
+    res = e.subtract(edf1, edf2, distinct=distinct)
+    for odf in dfs:
+        res = e.subtract(res, e.to_df(odf), distinct=distinct)
+    return _adjust_df([df1, df2, *dfs], res, as_fugue=as_fugue)
 
 
 def intersect(
     df1: Any,
     df2: Any,
-    distinct: bool = True,
+    *dfs: Any,
+    distinct: bool = True,  # pylint: disable-all
     engine: Any = None,
     engine_conf: Any = None,
     as_fugue: bool = False,
@@ -161,6 +367,7 @@
     :param df1: the first dataframe
     :param df2: the second dataframe
+    :param dfs: more dataframes to intersect with
     :param distinct: ``true`` for ``INTERSECT`` (== ``INTERSECT DISTINCT``),
         ``false`` for ``INTERSECT ALL``
     :return: the unioned dataframe
@@ -173,9 +380,10 @@
     e = make_execution_engine(engine, engine_conf, infer_by=[df1, df2])
     edf1 = e.to_df(df1)
     edf2 = e.to_df(df2)
-    return _adjust_df(
-        [df1, df2], e.intersect(edf1, edf2, distinct=distinct), as_fugue=as_fugue
-    )
+    res = e.intersect(edf1, edf2, distinct=distinct)
+    for odf in dfs:
+        res = e.intersect(res, e.to_df(odf), distinct=distinct)
+    return _adjust_df([df1, df2, *dfs], res, as_fugue=as_fugue)
 
 
 def _adjust_df(input_dfs: Any, output_df: DataFrame, as_fugue: bool) -> Any:
diff --git a/fugue/execution/native_execution_engine.py b/fugue/execution/native_execution_engine.py
index 87caef93..5e9997f6 100644
--- a/fugue/execution/native_execution_engine.py
+++ b/fugue/execution/native_execution_engine.py
@@ -19,7 +19,6 @@
 )
 from fugue._utils.io import load_df, save_df
 from fugue.collections.partition import (
-    EMPTY_PARTITION_SPEC,
     PartitionCursor,
     PartitionSpec,
     parse_presort_exp,
@@ -308,8 +307,9 @@ def take(
         n: int,
         presort: str,
         na_position: str = "last",
-        partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC,
+        partition_spec: Optional[PartitionSpec] = None,
     ) -> DataFrame:
+        partition_spec = partition_spec or PartitionSpec()
         assert_or_throw(
             isinstance(n, int),
             ValueError("n needs to be an integer"),
@@ -356,10 +356,11 @@ def save_df(
         path: str,
         format_hint: Any = None,
         mode: str = "overwrite",
-        partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC,
+        partition_spec: Optional[PartitionSpec] = None,
         force_single: bool = False,
         **kwargs: Any,
     ) -> None:
+        partition_spec = partition_spec or PartitionSpec()
         if 
not force_single and not partition_spec.empty: kwargs["partition_cols"] = partition_spec.partition_by self.fs.makedirs(os.path.dirname(path), recreate=True) diff --git a/fugue/express/__init__.py b/fugue/express/__init__.py index 071a9526..53e58024 100644 --- a/fugue/express/__init__.py +++ b/fugue/express/__init__.py @@ -30,11 +30,18 @@ ) from fugue.execution.express import ( broadcast, + distinct, + dropna, + fillna, intersect, join, + load, persist, repartition, + sample, + save, subtract, + take, union, ) diff --git a/fugue_dask/execution_engine.py b/fugue_dask/execution_engine.py index b3e54c31..c384e0ea 100644 --- a/fugue_dask/execution_engine.py +++ b/fugue_dask/execution_engine.py @@ -13,7 +13,6 @@ from triad.utils.threading import RunOnce from fugue.collections.partition import ( - EMPTY_PARTITION_SPEC, PartitionCursor, PartitionSpec, parse_presort_exp, @@ -371,8 +370,9 @@ def take( n: int, presort: str, na_position: str = "last", - partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC, + partition_spec: Optional[PartitionSpec] = None, ) -> DataFrame: + partition_spec = partition_spec or PartitionSpec() assert_or_throw( isinstance(n, int), ValueError("n needs to be an integer"), @@ -439,10 +439,11 @@ def save_df( path: str, format_hint: Any = None, mode: str = "overwrite", - partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC, + partition_spec: Optional[PartitionSpec] = None, force_single: bool = False, **kwargs: Any, ) -> None: + partition_spec = partition_spec or PartitionSpec() if force_single: self._native.save_df( df, diff --git a/fugue_duckdb/dask.py b/fugue_duckdb/dask.py index b968e6b7..78004b42 100644 --- a/fugue_duckdb/dask.py +++ b/fugue_duckdb/dask.py @@ -8,7 +8,6 @@ from triad import assert_or_throw from fugue import DataFrame, MapEngine, PartitionSpec -from fugue.collections.partition import EMPTY_PARTITION_SPEC from fugue_dask import DaskDataFrame, DaskExecutionEngine from fugue_dask.execution_engine import DaskMapEngine from fugue_duckdb.dataframe import DuckDataFrame @@ -79,10 +78,11 @@ def save_df( path: str, format_hint: Any = None, mode: str = "overwrite", - partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC, + partition_spec: Optional[PartitionSpec] = None, force_single: bool = False, **kwargs: Any, ) -> None: + partition_spec = partition_spec or PartitionSpec() if isinstance(df, DaskDataFrame) or not partition_spec.empty: return self._dask_engine.save_df( self._to_dask_df(df), diff --git a/fugue_duckdb/execution_engine.py b/fugue_duckdb/execution_engine.py index e192851c..8ec0b9f1 100644 --- a/fugue_duckdb/execution_engine.py +++ b/fugue_duckdb/execution_engine.py @@ -3,7 +3,7 @@ import duckdb import pyarrow as pa -from duckdb import DuckDBPyConnection +from duckdb import DuckDBPyConnection, DuckDBPyRelation from triad import SerializableRLock from triad.collections.fs import FileSystem from triad.utils.assertion import assert_or_throw @@ -16,11 +16,7 @@ PandasMapEngine, SQLEngine, ) -from fugue.collections.partition import ( - EMPTY_PARTITION_SPEC, - PartitionSpec, - parse_presort_exp, -) +from fugue.collections.partition import PartitionSpec, parse_presort_exp from fugue.dataframe import ( DataFrame, DataFrames, @@ -336,8 +332,9 @@ def take( n: int, presort: str, na_position: str = "last", - partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC, + partition_spec: Optional[PartitionSpec] = None, ) -> DataFrame: + partition_spec = partition_spec or PartitionSpec() assert_or_throw( isinstance(n, int), ValueError("n needs to be an integer"), @@ -399,10 +396,11 
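# Illustrative sketch of the default-argument pattern repeated across the engine
# methods in the hunks above: partition_spec moves from the shared
# EMPTY_PARTITION_SPEC constant to Optional[...] = None and is resolved inside
# the function body. The helper below is invented for this sketch, not part of
# the patch itself.
from typing import Optional

from fugue.collections.partition import PartitionSpec


def resolve_partition_spec(partition_spec: Optional[PartitionSpec] = None) -> PartitionSpec:
    # None means "no partitioning requested", same meaning as the old empty constant
    return partition_spec or PartitionSpec()


assert resolve_partition_spec().empty
assert resolve_partition_spec(PartitionSpec(by=["a"])).partition_by == ["a"]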
@@ def save_df( path: str, format_hint: Any = None, mode: str = "overwrite", - partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC, + partition_spec: Optional[PartitionSpec] = None, force_single: bool = False, **kwargs: Any, ) -> None: + partition_spec = partition_spec or PartitionSpec() if not partition_spec.empty and not force_single: kwargs["partition_cols"] = partition_spec.partition_by dio = DuckDBIO(self.fs, self.connection) @@ -417,6 +415,12 @@ def _sql(self, sql: str, dfs: Dict[str, DataFrame]) -> DuckDataFrame: return DuckDataFrame(df.native) # type: ignore def _to_duck_df(self, df: Any, schema: Any = None) -> DuckDataFrame: + if isinstance(df, DuckDBPyRelation): + assert_or_throw( + schema is None, + ValueError("schema must be None when df is a DuckDBPyRelation"), + ) + return DuckDataFrame(df) if isinstance(df, DataFrame): assert_or_throw( schema is None, diff --git a/fugue_ibis/execution_engine.py b/fugue_ibis/execution_engine.py index 1240a586..2fd951e9 100644 --- a/fugue_ibis/execution_engine.py +++ b/fugue_ibis/execution_engine.py @@ -6,7 +6,6 @@ from triad.utils.assertion import assert_or_throw from fugue.collections.partition import ( - EMPTY_PARTITION_SPEC, PartitionSpec, parse_presort_exp, ) @@ -210,8 +209,9 @@ def take( n: int, presort: str, na_position: str = "last", - partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC, + partition_spec: Optional[PartitionSpec] = None, ) -> DataFrame: + partition_spec = partition_spec or PartitionSpec() assert_or_throw( isinstance(n, int), ValueError("n needs to be an integer"), diff --git a/fugue_ray/_utils/io.py b/fugue_ray/_utils/io.py index cece9c8f..72dddd17 100644 --- a/fugue_ray/_utils/io.py +++ b/fugue_ray/_utils/io.py @@ -6,7 +6,7 @@ import ray.data as rd from fugue import ExecutionEngine from fugue._utils.io import FileParser, load_df, save_df -from fugue.collections.partition import EMPTY_PARTITION_SPEC, PartitionSpec +from fugue.collections.partition import PartitionSpec from fugue.dataframe import DataFrame from fugue_ray.dataframe import RayDataFrame from pyarrow import csv as pacsv @@ -59,11 +59,12 @@ def save_df( df: RayDataFrame, uri: str, format_hint: Optional[str] = None, - partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC, + partition_spec: Optional[PartitionSpec] = None, mode: str = "overwrite", force_single: bool = False, **kwargs: Any, ) -> None: + partition_spec = partition_spec or PartitionSpec() if self._fs.exists(uri): assert_or_throw(mode == "overwrite", FileExistsError(uri)) try: diff --git a/fugue_ray/execution_engine.py b/fugue_ray/execution_engine.py index 6c170973..e8a24a4b 100644 --- a/fugue_ray/execution_engine.py +++ b/fugue_ray/execution_engine.py @@ -13,7 +13,6 @@ PartitionCursor, PartitionSpec, ) -from fugue.collections.partition import EMPTY_PARTITION_SPEC from fugue.constants import KEYWORD_ROWCOUNT from fugue.dataframe.arrow_dataframe import _build_empty_arrow from fugue_duckdb.dataframe import DuckDataFrame @@ -239,10 +238,11 @@ def save_df( path: str, format_hint: Any = None, mode: str = "overwrite", - partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC, + partition_spec: Optional[PartitionSpec] = None, force_single: bool = False, **kwargs: Any, ) -> None: + partition_spec = partition_spec or PartitionSpec() df = self._to_ray_df(df) self._io.save_df( df, diff --git a/fugue_spark/_utils/io.py b/fugue_spark/_utils/io.py index 80c5e6b0..b925ef77 100644 --- a/fugue_spark/_utils/io.py +++ b/fugue_spark/_utils/io.py @@ -1,7 +1,7 @@ from typing import Any, Callable, Dict, List, Optional, 
Union import pyspark.sql as ps -from fugue.collections.partition import EMPTY_PARTITION_SPEC, PartitionSpec +from fugue.collections.partition import PartitionSpec from fugue.dataframe import DataFrame from fugue._utils.io import FileParser, save_df from fugue_spark.dataframe import SparkDataFrame @@ -48,11 +48,12 @@ def save_df( df: SparkDataFrame, uri: str, format_hint: Optional[str] = None, - partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC, + partition_spec: Optional[PartitionSpec] = None, mode: str = "overwrite", force_single: bool = False, **kwargs: Any, ) -> None: + partition_spec = partition_spec or PartitionSpec() if not force_single: p = FileParser(uri, format_hint) writer = self._get_writer(df.native, partition_spec) diff --git a/fugue_spark/execution_engine.py b/fugue_spark/execution_engine.py index 80a4ec01..4aaed7e9 100644 --- a/fugue_spark/execution_engine.py +++ b/fugue_spark/execution_engine.py @@ -19,7 +19,6 @@ from triad.utils.threading import RunOnce from fugue.collections.partition import ( - EMPTY_PARTITION_SPEC, PartitionCursor, PartitionSpec, parse_presort_exp, @@ -579,8 +578,9 @@ def take( n: int, presort: str, na_position: str = "last", - partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC, + partition_spec: Optional[PartitionSpec] = None, ) -> DataFrame: + partition_spec = partition_spec or PartitionSpec() assert_or_throw( isinstance(n, int), ValueError("n needs to be an integer"), @@ -650,10 +650,11 @@ def save_df( path: str, format_hint: Any = None, mode: str = "overwrite", - partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC, + partition_spec: Optional[PartitionSpec] = None, force_single: bool = False, **kwargs: Any, ) -> None: + partition_spec = partition_spec or PartitionSpec() df = self.to_df(df) self._io.save_df( df, diff --git a/tests/fugue_ibis/mock/execution_engine.py b/tests/fugue_ibis/mock/execution_engine.py index 21dfe7ae..64523876 100644 --- a/tests/fugue_ibis/mock/execution_engine.py +++ b/tests/fugue_ibis/mock/execution_engine.py @@ -12,7 +12,6 @@ PartitionCursor, PartitionSpec, ) -from fugue.collections.partition import EMPTY_PARTITION_SPEC from fugue_ibis import IbisDataFrame, IbisExecutionEngine, IbisTable from triad import FileSystem, assert_or_throw @@ -124,10 +123,11 @@ def save_df( path: str, format_hint: Any = None, mode: str = "overwrite", - partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC, + partition_spec: Optional[PartitionSpec] = None, force_single: bool = False, **kwargs: Any, ) -> None: + partition_spec = partition_spec or PartitionSpec() return self._native_engine.save_df( df, path, format_hint, mode, partition_spec, force_single, **kwargs ) From fdd4998f8b2379713bb235258c4d793a757cef17 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Fri, 23 Dec 2022 05:40:19 +0000 Subject: [PATCH 11/30] refactor code --- docs/api/fugue.dataframe.rst | 8 + docs/api/fugue.dataset.rst | 45 +++ docs/api/fugue.execution.rst | 8 + docs/api/fugue.rst | 19 +- docs/api/fugue.workflow.rst | 8 + fugue/__init__.py | 2 +- fugue/{express/__init__.py => api.py} | 20 +- fugue/dataframe/__init__.py | 1 + fugue/dataframe/api.py | 308 ++++++++++++++++++ fugue/dataframe/arrow_dataframe.py | 8 +- fugue/dataframe/dataframe.py | 300 ----------------- fugue/dataframe/pandas_dataframe.py | 8 +- fugue/dataframe/utils.py | 10 +- fugue/dataset/__init__.py | 3 + fugue/dataset/api.py | 71 ++++ fugue/{ => dataset}/dataset.py | 71 +--- fugue/execution/__init__.py | 7 +- fugue/execution/{express.py => api.py} | 246 ++++++++++---- fugue/execution/execution_engine.py | 38 
++- fugue/plugins.py | 2 +- fugue/workflow/__init__.py | 9 +- .../transformation.py => workflow/api.py} | 54 ++- fugue/workflow/workflow.py | 2 +- fugue_duckdb/execution_engine.py | 4 +- fugue_test/dataframe_suite.py | 2 +- tests/fugue/dataframe/test_dataframe.py | 2 +- tests/fugue/execution/test_factory.py | 6 + tests/fugue/test_interfaceless.py | 14 +- tests/fugue_dask/test_dataframe.py | 2 +- tests/fugue_dask/test_execution_engine.py | 6 +- tests/fugue_duckdb/test_execution_engine.py | 15 + tests/fugue_ibis/test_dataframe.py | 2 +- tests/fugue_ray/test_execution_engine.py | 8 +- tests/fugue_spark/test_dataframe.py | 2 +- 34 files changed, 782 insertions(+), 529 deletions(-) create mode 100644 docs/api/fugue.dataset.rst rename fugue/{express/__init__.py => api.py} (64%) create mode 100644 fugue/dataframe/api.py create mode 100644 fugue/dataset/__init__.py create mode 100644 fugue/dataset/api.py rename fugue/{ => dataset}/dataset.py (70%) rename fugue/execution/{express.py => api.py} (64%) rename fugue/{express/transformation.py => workflow/api.py} (87%) diff --git a/docs/api/fugue.dataframe.rst b/docs/api/fugue.dataframe.rst index 67e2a87b..99b67137 100644 --- a/docs/api/fugue.dataframe.rst +++ b/docs/api/fugue.dataframe.rst @@ -27,6 +27,14 @@ fugue.dataframe .. |FugueDataTypes| replace:: :doc:`Fugue Data Types ` +fugue.dataframe.api +------------------- + +.. automodule:: fugue.dataframe.api + :members: + :undoc-members: + :show-inheritance: + fugue.dataframe.array\_dataframe -------------------------------- diff --git a/docs/api/fugue.dataset.rst b/docs/api/fugue.dataset.rst new file mode 100644 index 00000000..8c4ec9a1 --- /dev/null +++ b/docs/api/fugue.dataset.rst @@ -0,0 +1,45 @@ +fugue.dataset +============== + +.. |SchemaLikeObject| replace:: :ref:`Schema like object ` +.. |ParamsLikeObject| replace:: :ref:`Parameters like object ` +.. |DataFrameLikeObject| replace:: :ref:`DataFrame like object ` +.. |DataFramesLikeObject| replace:: :ref:`DataFrames like object ` +.. |PartitionLikeObject| replace:: :ref:`Partition like object ` +.. |RPCHandlerLikeObject| replace:: :ref:`RPChandler like object ` + +.. |ExecutionEngine| replace:: :class:`~fugue.execution.execution_engine.ExecutionEngine` +.. |NativeExecutionEngine| replace:: :class:`~fugue.execution.native_execution_engine.NativeExecutionEngine` +.. |FugueWorkflow| replace:: :class:`~fugue.workflow.workflow.FugueWorkflow` + +.. |ReadJoin| replace:: Read Join tutorials on :ref:`workflow ` and :ref:`engine ` for details +.. |FugueConfig| replace:: :doc:`the Fugue Configuration Tutorial ` +.. |PartitionTutorial| replace:: :doc:`the Partition Tutorial ` +.. |FugueSQLTutorial| replace:: :doc:`the Fugue SQL Tutorial ` +.. |DataFrameTutorial| replace:: :ref:`the DataFrame Tutorial ` +.. |ExecutionEngineTutorial| replace:: :doc:`the ExecutionEngine Tutorial ` +.. |ZipComap| replace:: :ref:`Zip & Comap ` +.. |LoadSave| replace:: :ref:`Load & Save ` +.. |AutoPersist| replace:: :ref:`Auto Persist ` +.. |TransformerTutorial| replace:: :doc:`the Transformer Tutorial ` +.. |CoTransformer| replace:: :ref:`CoTransformer ` +.. |CoTransformerTutorial| replace:: :doc:`the CoTransformer Tutorial ` +.. |FugueDataTypes| replace:: :doc:`Fugue Data Types ` + + +fugue.dataset.api +----------------- + +.. automodule:: fugue.dataset.api + :members: + :undoc-members: + :show-inheritance: + +fugue.dataset.dataset +--------------------- + +.. 
automodule:: fugue.dataset.dataset + :members: + :undoc-members: + :show-inheritance: + diff --git a/docs/api/fugue.execution.rst b/docs/api/fugue.execution.rst index 6000ace3..d09e526b 100644 --- a/docs/api/fugue.execution.rst +++ b/docs/api/fugue.execution.rst @@ -27,6 +27,14 @@ fugue.execution .. |FugueDataTypes| replace:: :doc:`Fugue Data Types ` +fugue.execution.api +------------------- + +.. automodule:: fugue.execution.api + :members: + :undoc-members: + :show-inheritance: + fugue.execution.execution\_engine --------------------------------- diff --git a/docs/api/fugue.rst b/docs/api/fugue.rst index 52455f62..c3364baf 100644 --- a/docs/api/fugue.rst +++ b/docs/api/fugue.rst @@ -8,6 +8,7 @@ fugue fugue.collections fugue.column fugue.dataframe + fugue.dataset fugue.execution fugue.extensions fugue.rpc @@ -40,18 +41,18 @@ fugue .. |FugueDataTypes| replace:: :doc:`Fugue Data Types ` -fugue.constants ---------------- +fugue.api +--------- -.. automodule:: fugue.constants +.. automodule:: fugue.api :members: :undoc-members: :show-inheritance: -fugue.dataset -------------- +fugue.constants +--------------- -.. automodule:: fugue.dataset +.. automodule:: fugue.constants :members: :undoc-members: :show-inheritance: @@ -64,10 +65,10 @@ fugue.exceptions :undoc-members: :show-inheritance: -fugue.express -------------------- +fugue.plugins +------------- -.. automodule:: fugue.express +.. automodule:: fugue.plugins :members: :undoc-members: :show-inheritance: diff --git a/docs/api/fugue.workflow.rst b/docs/api/fugue.workflow.rst index 2c85cef5..5b97d8d1 100644 --- a/docs/api/fugue.workflow.rst +++ b/docs/api/fugue.workflow.rst @@ -27,6 +27,14 @@ fugue.workflow .. |FugueDataTypes| replace:: :doc:`Fugue Data Types ` +fugue.workflow.api +------------------ + +.. 
automodule:: fugue.workflow.api + :members: + :undoc-members: + :show-inheritance: + fugue.workflow.input -------------------- diff --git a/fugue/__init__.py b/fugue/__init__.py index 482da900..d73af6cb 100644 --- a/fugue/__init__.py +++ b/fugue/__init__.py @@ -52,7 +52,7 @@ register_transformer, transformer, ) -from fugue.express import out_transform, transform +from fugue.api import out_transform, transform from fugue.registry import _register from fugue.rpc import ( EmptyRPCHandler, diff --git a/fugue/express/__init__.py b/fugue/api.py similarity index 64% rename from fugue/express/__init__.py rename to fugue/api.py index 53e58024..4cfdd39b 100644 --- a/fugue/express/__init__.py +++ b/fugue/api.py @@ -1,5 +1,6 @@ # flake8: noqa -from fugue.dataframe.dataframe import ( +# pylint: disable-all +from .dataframe.api import ( alter_columns, as_array, as_array_iterable, @@ -19,30 +20,23 @@ rename, select_columns, ) -from fugue.dataset import ( - as_fugue_dataset, - count, - get_dataset_display, - is_bounded, - is_empty, - is_local, - show, -) -from fugue.execution.express import ( +from .dataset.api import as_fugue_dataset, count, is_bounded, is_empty, is_local, show +from .execution.api import ( broadcast, distinct, dropna, + engine_context, fillna, intersect, join, load, persist, repartition, + run_engine_function, sample, save, subtract, take, union, ) - -from .transformation import out_transform, transform +from .workflow.api import out_transform, raw_sql, transform diff --git a/fugue/dataframe/__init__.py b/fugue/dataframe/__init__.py index b18a9631..cd88f2b0 100644 --- a/fugue/dataframe/__init__.py +++ b/fugue/dataframe/__init__.py @@ -1,4 +1,5 @@ # flake8: noqa +from .api import * from .array_dataframe import ArrayDataFrame from .arrow_dataframe import ArrowDataFrame from .dataframe import ( diff --git a/fugue/dataframe/api.py b/fugue/dataframe/api.py new file mode 100644 index 00000000..1ed27cce --- /dev/null +++ b/fugue/dataframe/api.py @@ -0,0 +1,308 @@ +from typing import Any, Dict, Iterable, List, Optional, Tuple + +import pandas as pd +import pyarrow as pa +from triad.collections.schema import Schema +from triad.utils.assertion import assert_or_throw +from triad.utils.rename import normalize_names + +from fugue.dataset.api import as_fugue_dataset + +from .._utils.registry import fugue_plugin +from .dataframe import DataFrame + + +def as_fugue_df(df: Any) -> DataFrame: + """Wrap the object as a Fugue DataFrame. This is a wrapper + of :func:`~fugue.dataset.as_fugue_dataset` + + :param df: the object to wrap + """ + res = as_fugue_dataset(df) + assert_or_throw( + isinstance(res, DataFrame), + TypeError(f"{type(df)} can't be converted to a Fugue DataFrame"), + ) + return res # type: ignore + + +@fugue_plugin +def is_df(df: Any) -> bool: + """Whether ``df`` is a DataFrame like object""" + return isinstance(df, DataFrame) + + +def get_native_as_df(df: Any) -> Any: + """Return the dataframe form of the input ``df``. + If ``df`` is a :class:`~.DataFrame`, then call the + :meth:`~.DataFrame.native_as_df`, otherwise, it depends on whether there is + a correspondent function handling it. 
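# A minimal usage sketch of the as_fugue_df / is_df / get_native_as_df helpers
# defined in the new fugue/dataframe/api.py, assuming a pandas input whose
# conversion candidates are registered elsewhere in this patch; the column
# name "a" and the data are invented for the example.
import pandas as pd

from fugue.dataframe.api import as_fugue_df, get_native_as_df, is_df

pdf = pd.DataFrame({"a": [1, 2]})
fdf = as_fugue_df(pdf)                      # wrapped into a Fugue DataFrame
assert is_df(fdf)                           # Fugue DataFrames always pass is_df
assert isinstance(get_native_as_df(fdf), pd.DataFrame)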
+ """ + if isinstance(df, DataFrame): + return df.native_as_df() + if is_df(df): + return df + raise NotImplementedError(f"cannot get a dataframe like object from {type(df)}") + + +@fugue_plugin +def get_schema(df: Any) -> Schema: + """Get the schema of the ``df`` + + :param df: the object that can be recognized as a dataframe by Fugue + :return: the Schema object + """ + return as_fugue_df(df).schema + + +@fugue_plugin +def as_pandas(df: Any) -> pd.DataFrame: + """Convert ``df`` to a Pandas DataFrame + + :param df: the object that can be recognized as a dataframe by Fugue + :return: the Pandas DataFrame + """ + return as_fugue_df(df).as_pandas() + + +@fugue_plugin +def as_arrow(df: Any) -> pa.Table: + """Convert ``df`` to a PyArrow Table + + :param df: the object that can be recognized as a dataframe by Fugue + :return: the PyArrow Table + """ + return as_fugue_df(df).as_arrow() + + +@fugue_plugin +def as_array( + df: Any, columns: Optional[List[str]] = None, type_safe: bool = False +) -> List[Any]: # pragma: no cover + """Convert df to 2-dimensional native python array + + :param df: the object that can be recognized as a dataframe by Fugue + :param columns: columns to extract, defaults to None + :param type_safe: whether to ensure output conforms with its schema, + defaults to False + :return: 2-dimensional native python array + + .. note:: + + If ``type_safe`` is False, then the returned values are 'raw' values. + """ + return as_fugue_df(df).as_array(columns=columns, type_safe=type_safe) + + +@fugue_plugin +def as_array_iterable( + df: Any, columns: Optional[List[str]] = None, type_safe: bool = False +) -> Iterable[Any]: # pragma: no cover + """Convert df to iterable of native python arrays + + :param df: the object that can be recognized as a dataframe by Fugue + :param columns: columns to extract, defaults to None + :param type_safe: whether to ensure output conforms with its schema, + defaults to False + :return: iterable of native python arrays + + .. note:: + + If ``type_safe`` is False, then the returned values are 'raw' values. + """ + + return as_fugue_df(df).as_array_iterable(columns=columns, type_safe=type_safe) + + +@fugue_plugin +def as_dict_iterable( + df: Any, columns: Optional[List[str]] = None +) -> Iterable[Dict[str, Any]]: + """Convert df to iterable of native python dicts + + :param df: the object that can be recognized as a dataframe by Fugue + :param columns: columns to extract, defaults to None + :return: iterable of native python dicts + + .. note:: + + The default implementation enforces ``type_safe`` True + """ + return as_fugue_df(df).as_dict_iterable(columns=columns) + + +@fugue_plugin +def peek_array(df: Any) -> List[Any]: + """Peek the first row of the dataframe as an array + + :param df: the object that can be recognized as a dataframe by Fugue + :return: the first row as an array + """ + return as_fugue_df(df).peek_array() + + +@fugue_plugin +def peek_dict(df: Any) -> Dict[str, Any]: + """Peek the first row of the dataframe as a array + + :param df: the object that can be recognized as a dataframe by Fugue + :return: the first row as a dict + """ + return as_fugue_df(df).peek_dict() + + +@fugue_plugin +def head( + df: Any, n: int, columns: Optional[List[str]] = None, as_fugue: bool = False +) -> Any: + """Get first n rows of the dataframe as a new local bounded dataframe + + :param n: number of rows + :param columns: selected columns, defaults to None (all columns) + :param as_fugue: whether return a Fugue :class:`~.DataFrame`, default to + False. 
If False, then if the input ``df`` is not a Fugue DataFrame + then it will return the underlying DataFrame object. + :return: a local bounded dataframe + """ + res = as_fugue_df(df).head(n=n, columns=columns) + if as_fugue or isinstance(df, DataFrame): + return res + return res.as_pandas() + + +@fugue_plugin +def alter_columns(df: Any, columns: Any, as_fugue: bool = False) -> Any: + """Change column types + + :param df: the object that can be recognized as a dataframe by Fugue + :param columns: |SchemaLikeObject|, + all columns should be contained by the dataframe schema + :param as_fugue: whether return a Fugue :class:`~.DataFrame`, default to + False. If False, then if the input ``df`` is not a Fugue DataFrame + then it will return the underlying DataFrame object. + :return: a new dataframe with altered columns, the order of the + original schema will not change + """ + return _adjust_df(df, as_fugue_df(df).alter_columns(columns), as_fugue=as_fugue) + + +@fugue_plugin +def drop_columns(df: Any, columns: List[str], as_fugue: bool = False) -> Any: + """Drop certain columns and return a new dataframe + + :param df: the object that can be recognized as a dataframe by Fugue + :param columns: columns to drop + :param as_fugue: whether return a Fugue :class:`~.DataFrame`, default to + False. If False, then if the input ``df`` is not a Fugue DataFrame + then it will return the underlying DataFrame object. + :return: a new dataframe removing the columns + """ + return _adjust_df(df, as_fugue_df(df).drop(columns), as_fugue=as_fugue) + + +@fugue_plugin +def select_columns(df: Any, columns: List[Any], as_fugue: bool = False) -> Any: + """Select certain columns and return a new dataframe + + :param df: the object that can be recognized as a dataframe by Fugue + :param columns: columns to return + :param as_fugue: whether return a Fugue :class:`~.DataFrame`, default to + False. If False, then if the input ``df`` is not a Fugue DataFrame + then it will return the underlying DataFrame object. + :return: a new dataframe with the selected the columns + """ + return _adjust_df(df, as_fugue_df(df)[columns], as_fugue=as_fugue) + + +@fugue_plugin +def get_column_names(df: Any) -> List[Any]: # pragma: no cover + """A generic function to get column names of any dataframe + + :param df: the dataframe object + :return: the column names + + .. note:: + + In order to support a new type of dataframe, an implementation must + be registered, for example + + .. code-block::python + + @get_column_names.candidate(lambda df: isinstance(df, pa.Table)) + def _get_pyarrow_dataframe_columns(df: pa.Table) -> List[Any]: + return [f.name for f in df.schema] + """ + return get_schema(df).names + + +@fugue_plugin +def rename(df: Any, columns: Dict[str, Any], as_fugue: bool = False) -> Any: + """A generic function to rename column names of any dataframe + + :param df: the dataframe object + :param columns: the rename operations as a dict: ``old name => new name`` + :param as_fugue: whether return a Fugue :class:`~.DataFrame`, default to + False. If False, then if the input ``df`` is not a Fugue DataFrame + then it will return the underlying DataFrame object. + :return: the renamed dataframe + + .. note:: + + In order to support a new type of dataframe, an implementation must + be registered, for example + + .. 
code-block::python + + @rename.candidate( + lambda df, *args, **kwargs: isinstance(df, pd.DataFrame) + ) + def _rename_pandas_dataframe( + df: pd.DataFrame, columns: Dict[str, Any] + ) -> pd.DataFrame: + if len(columns) == 0: + return df + return df.rename(columns=columns) + """ + if len(columns) == 0: + return df + return _adjust_df(df, as_fugue_df(df).rename(columns), as_fugue=as_fugue) + + +def normalize_column_names(df: Any) -> Tuple[Any, Dict[str, Any]]: + """A generic function to normalize any dataframe's column names to follow + Fugue naming rules + + .. note:: + + This is a temporary solution before + :class:`~triad:triad.collections.schema.Schema` + can take arbitrary names + + .. admonition:: Examples + + * ``[0,1]`` => ``{"_0":0, "_1":1}`` + * ``["1a","2b"]`` => ``{"_1a":"1a", "_2b":"2b"}`` + * ``["*a","-a"]`` => ``{"_a":"*a", "_a_1":"-a"}`` + + :param df: a dataframe object + :return: the renamed dataframe and the rename operations as a dict that + can **undo** the change + + .. seealso:: + + * :func:`~.get_column_names` + * :func:`~.rename` + * :func:`~triad:triad.utils.rename.normalize_names` + """ + cols = get_column_names(df) + names = normalize_names(cols) + if len(names) == 0: + return df, {} + undo = {v: k for k, v in names.items()} + return (rename(df, names), undo) + + +def _adjust_df(input_df: Any, output_df: DataFrame, as_fugue: bool) -> Any: + if as_fugue or isinstance(input_df, DataFrame): + return output_df + return output_df.native # type: ignore diff --git a/fugue/dataframe/arrow_dataframe.py b/fugue/dataframe/arrow_dataframe.py index 91781064..44a25454 100644 --- a/fugue/dataframe/arrow_dataframe.py +++ b/fugue/dataframe/arrow_dataframe.py @@ -6,13 +6,10 @@ from triad.exceptions import InvalidOperationError from triad.utils.assertion import assert_or_throw -from fugue.dataset import as_fugue_dataset, count, is_bounded, is_empty, is_local +from fugue.dataset.api import as_fugue_dataset, count, is_bounded, is_empty, is_local from fugue.exceptions import FugueDataFrameOperationError -from .dataframe import ( - DataFrame, - LocalBoundedDataFrame, - _input_schema, +from .api import ( drop_columns, get_column_names, get_schema, @@ -20,6 +17,7 @@ rename, select_columns, ) +from .dataframe import DataFrame, LocalBoundedDataFrame, _input_schema class ArrowDataFrame(LocalBoundedDataFrame): diff --git a/fugue/dataframe/dataframe.py b/fugue/dataframe/dataframe.py index fc9c2449..3f8fde94 100644 --- a/fugue/dataframe/dataframe.py +++ b/fugue/dataframe/dataframe.py @@ -9,12 +9,8 @@ from triad.exceptions import InvalidOperationError from triad.utils.assertion import assert_or_throw from triad.utils.pandas_like import PD_UTILS -from triad.utils.rename import normalize_names - -from fugue.dataset import as_fugue_dataset from .._utils.display import PrettyTable -from .._utils.registry import fugue_plugin from ..collections.yielded import Yielded from ..dataset import Dataset, DatasetDisplay, get_dataset_display from ..exceptions import FugueDataFrameOperationError @@ -427,296 +423,6 @@ def show( print("") -def as_fugue_df(df: Any) -> DataFrame: - """Wrap the object as a Fugue DataFrame. 
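# End-to-end sketch of the plugin idiom documented in the docstrings above:
# register candidates for a custom table class and then call the generic
# functions on it. "MiniTable" and its fields are invented for this
# illustration only.
from typing import Any, Dict, List

from fugue.dataframe.api import get_column_names, rename


class MiniTable:
    def __init__(self, columns: List[str]):
        self.columns = list(columns)


@get_column_names.candidate(lambda df: isinstance(df, MiniTable))
def _minitable_columns(df: MiniTable) -> List[Any]:
    return list(df.columns)


@rename.candidate(lambda df, *args, **kwargs: isinstance(df, MiniTable))
def _minitable_rename(df: MiniTable, columns: Dict[str, Any]) -> MiniTable:
    return MiniTable([columns.get(c, c) for c in df.columns])


tbl = MiniTable(["a", "b"])
assert get_column_names(tbl) == ["a", "b"]
assert get_column_names(rename(tbl, {"a": "x"})) == ["x", "b"]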
This is a wrapper - of :func:`~fugue.dataset.as_fugue_dataset` - - :param df: the object to wrap - """ - res = as_fugue_dataset(df) - assert_or_throw( - isinstance(res, DataFrame), - TypeError(f"{type(df)} can't be converted to a Fugue DataFrame"), - ) - return res # type: ignore - - -@fugue_plugin -def is_df(df: Any) -> bool: - """Whether ``df`` is a DataFrame like object""" - return isinstance(df, DataFrame) - - -def get_native_as_df(df: Any) -> Any: - """Return the dataframe form of the input ``df``. - If ``df`` is a :class:`~.DataFrame`, then call the - :meth:`~.DataFrame.native_as_df`, otherwise, it depends on whether there is - a correspondent function handling it. - """ - if isinstance(df, DataFrame): - return df.native_as_df() - if is_df(df): - return df - raise NotImplementedError(f"cannot get a dataframe like object from {type(df)}") - - -@fugue_plugin -def get_schema(df: Any) -> Schema: - """Get the schema of the ``df`` - - :param df: the object that can be recognized as a dataframe by Fugue - :return: the Schema object - """ - return as_fugue_df(df).schema - - -@fugue_plugin -def as_pandas(df: Any) -> pd.DataFrame: - """Convert ``df`` to a Pandas DataFrame - - :param df: the object that can be recognized as a dataframe by Fugue - :return: the Pandas DataFrame - """ - return as_fugue_df(df).as_pandas() - - -@fugue_plugin -def as_arrow(df: Any) -> pa.Table: - """Convert ``df`` to a PyArrow Table - - :param df: the object that can be recognized as a dataframe by Fugue - :return: the PyArrow Table - """ - return as_fugue_df(df).as_arrow() - - -@fugue_plugin -def as_array( - df: Any, columns: Optional[List[str]] = None, type_safe: bool = False -) -> List[Any]: # pragma: no cover - """Convert df to 2-dimensional native python array - - :param df: the object that can be recognized as a dataframe by Fugue - :param columns: columns to extract, defaults to None - :param type_safe: whether to ensure output conforms with its schema, - defaults to False - :return: 2-dimensional native python array - - .. note:: - - If ``type_safe`` is False, then the returned values are 'raw' values. - """ - return as_fugue_df(df).as_array(columns=columns, type_safe=type_safe) - - -@fugue_plugin -def as_array_iterable( - df: Any, columns: Optional[List[str]] = None, type_safe: bool = False -) -> Iterable[Any]: # pragma: no cover - """Convert df to iterable of native python arrays - - :param df: the object that can be recognized as a dataframe by Fugue - :param columns: columns to extract, defaults to None - :param type_safe: whether to ensure output conforms with its schema, - defaults to False - :return: iterable of native python arrays - - .. note:: - - If ``type_safe`` is False, then the returned values are 'raw' values. - """ - - return as_fugue_df(df).as_array_iterable(columns=columns, type_safe=type_safe) - - -@fugue_plugin -def as_dict_iterable( - df: Any, columns: Optional[List[str]] = None -) -> Iterable[Dict[str, Any]]: - """Convert df to iterable of native python dicts - - :param df: the object that can be recognized as a dataframe by Fugue - :param columns: columns to extract, defaults to None - :return: iterable of native python dicts - - .. 
note:: - - The default implementation enforces ``type_safe`` True - """ - return as_fugue_df(df).as_dict_iterable(columns=columns) - - -@fugue_plugin -def peek_array(df: Any) -> List[Any]: - """Peek the first row of the dataframe as an array - - :param df: the object that can be recognized as a dataframe by Fugue - :return: the first row as an array - """ - return as_fugue_df(df).peek_array() - - -@fugue_plugin -def peek_dict(df: Any) -> Dict[str, Any]: - """Peek the first row of the dataframe as a array - - :param df: the object that can be recognized as a dataframe by Fugue - :return: the first row as a dict - """ - return as_fugue_df(df).peek_dict() - - -@fugue_plugin -def head( - df: Any, n: int, columns: Optional[List[str]] = None, as_fugue: bool = False -) -> Any: - """Get first n rows of the dataframe as a new local bounded dataframe - - :param n: number of rows - :param columns: selected columns, defaults to None (all columns) - :param as_fugue: whether return a Fugue :class:`~.DataFrame`, default to - False. If False, then if the input ``df`` is not a Fugue DataFrame - then it will return the underlying DataFrame object. - :return: a local bounded dataframe - """ - res = as_fugue_df(df).head(n=n, columns=columns) - if as_fugue or isinstance(df, DataFrame): - return res - return res.as_pandas() - - -@fugue_plugin -def alter_columns(df: Any, columns: Any, as_fugue: bool = False) -> Any: - """Change column types - - :param df: the object that can be recognized as a dataframe by Fugue - :param columns: |SchemaLikeObject|, - all columns should be contained by the dataframe schema - :param as_fugue: whether return a Fugue :class:`~.DataFrame`, default to - False. If False, then if the input ``df`` is not a Fugue DataFrame - then it will return the underlying DataFrame object. - :return: a new dataframe with altered columns, the order of the - original schema will not change - """ - return _adjust_df(df, as_fugue_df(df).alter_columns(columns), as_fugue=as_fugue) - - -@fugue_plugin -def drop_columns(df: Any, columns: List[str], as_fugue: bool = False) -> Any: - """Drop certain columns and return a new dataframe - - :param df: the object that can be recognized as a dataframe by Fugue - :param columns: columns to drop - :param as_fugue: whether return a Fugue :class:`~.DataFrame`, default to - False. If False, then if the input ``df`` is not a Fugue DataFrame - then it will return the underlying DataFrame object. - :return: a new dataframe removing the columns - """ - return _adjust_df(df, as_fugue_df(df).drop(columns), as_fugue=as_fugue) - - -@fugue_plugin -def select_columns(df: Any, columns: List[Any], as_fugue: bool = False) -> Any: - """Select certain columns and return a new dataframe - - :param df: the object that can be recognized as a dataframe by Fugue - :param columns: columns to return - :param as_fugue: whether return a Fugue :class:`~.DataFrame`, default to - False. If False, then if the input ``df`` is not a Fugue DataFrame - then it will return the underlying DataFrame object. - :return: a new dataframe with the selected the columns - """ - return _adjust_df(df, as_fugue_df(df)[columns], as_fugue=as_fugue) - - -@fugue_plugin -def get_column_names(df: Any) -> List[Any]: # pragma: no cover - """A generic function to get column names of any dataframe - - :param df: the dataframe object - :return: the column names - - .. note:: - - In order to support a new type of dataframe, an implementation must - be registered, for example - - .. 
code-block::python - - @get_column_names.candidate(lambda df: isinstance(df, pa.Table)) - def _get_pyarrow_dataframe_columns(df: pa.Table) -> List[Any]: - return [f.name for f in df.schema] - """ - return get_schema(df).names - - -@fugue_plugin -def rename(df: Any, columns: Dict[str, Any], as_fugue: bool = False) -> Any: - """A generic function to rename column names of any dataframe - - :param df: the dataframe object - :param columns: the rename operations as a dict: ``old name => new name`` - :param as_fugue: whether return a Fugue :class:`~.DataFrame`, default to - False. If False, then if the input ``df`` is not a Fugue DataFrame - then it will return the underlying DataFrame object. - :return: the renamed dataframe - - .. note:: - - In order to support a new type of dataframe, an implementation must - be registered, for example - - .. code-block::python - - @rename.candidate( - lambda df, *args, **kwargs: isinstance(df, pd.DataFrame) - ) - def _rename_pandas_dataframe( - df: pd.DataFrame, columns: Dict[str, Any] - ) -> pd.DataFrame: - if len(columns) == 0: - return df - return df.rename(columns=columns) - """ - if len(columns) == 0: - return df - return _adjust_df(df, as_fugue_df(df).rename(columns), as_fugue=as_fugue) - - -def normalize_column_names(df: Any) -> Tuple[Any, Dict[str, Any]]: - """A generic function to normalize any dataframe's column names to follow - Fugue naming rules - - .. note:: - - This is a temporary solution before - :class:`~triad:triad.collections.schema.Schema` - can take arbitrary names - - .. admonition:: Examples - - * ``[0,1]`` => ``{"_0":0, "_1":1}`` - * ``["1a","2b"]`` => ``{"_1a":"1a", "_2b":"2b"}`` - * ``["*a","-a"]`` => ``{"_a":"*a", "_a_1":"-a"}`` - - :param df: a dataframe object - :return: the renamed dataframe and the rename operations as a dict that - can **undo** the change - - .. 
seealso:: - - * :func:`~.get_column_names` - * :func:`~.rename` - * :func:`~triad:triad.utils.rename.normalize_names` - """ - cols = get_column_names(df) - names = normalize_names(cols) - if len(names) == 0: - return df, {} - undo = {v: k for k, v in names.items()} - return (rename(df, names), undo) - - @get_dataset_display.candidate(lambda ds: isinstance(ds, DataFrame), priority=0.1) def _get_dataframe_display(ds: DataFrame): return DataFrameDisplay(ds) @@ -750,9 +456,3 @@ def _get_schema_change( def _input_schema(schema: Any) -> Schema: return schema if isinstance(schema, Schema) else Schema(schema) - - -def _adjust_df(input_df: Any, output_df: DataFrame, as_fugue: bool) -> Any: - if as_fugue or isinstance(input_df, DataFrame): - return output_df - return output_df.native # type: ignore diff --git a/fugue/dataframe/pandas_dataframe.py b/fugue/dataframe/pandas_dataframe.py index 00424619..e2b456b8 100644 --- a/fugue/dataframe/pandas_dataframe.py +++ b/fugue/dataframe/pandas_dataframe.py @@ -6,13 +6,10 @@ from triad.utils.assertion import assert_or_throw from triad.utils.pandas_like import PD_UTILS -from fugue.dataset import as_fugue_dataset, count, is_bounded, is_empty, is_local +from fugue.dataset.api import as_fugue_dataset, count, is_bounded, is_empty, is_local from fugue.exceptions import FugueDataFrameOperationError -from .dataframe import ( - DataFrame, - LocalBoundedDataFrame, - _input_schema, +from .api import ( drop_columns, get_column_names, get_schema, @@ -21,6 +18,7 @@ rename, select_columns, ) +from .dataframe import DataFrame, LocalBoundedDataFrame, _input_schema class PandasDataFrame(LocalBoundedDataFrame): diff --git a/fugue/dataframe/utils.py b/fugue/dataframe/utils.py index 791c755c..f90df592 100644 --- a/fugue/dataframe/utils.py +++ b/fugue/dataframe/utils.py @@ -13,15 +13,9 @@ from triad.utils.assertion import assert_arg_not_none from triad.utils.assertion import assert_or_throw as aot +from .api import get_column_names, normalize_column_names, rename from .array_dataframe import ArrayDataFrame -from .dataframe import ( - DataFrame, - LocalBoundedDataFrame, - LocalDataFrame, - get_column_names, - normalize_column_names, - rename, -) +from .dataframe import DataFrame, LocalBoundedDataFrame, LocalDataFrame from .iterable_dataframe import IterableDataFrame from .pandas_dataframe import PandasDataFrame diff --git a/fugue/dataset/__init__.py b/fugue/dataset/__init__.py new file mode 100644 index 00000000..4b140844 --- /dev/null +++ b/fugue/dataset/__init__.py @@ -0,0 +1,3 @@ +# flake8: noqa +from .dataset import Dataset, DatasetDisplay, get_dataset_display +from .api import * diff --git a/fugue/dataset/api.py b/fugue/dataset/api.py new file mode 100644 index 00000000..af3754b4 --- /dev/null +++ b/fugue/dataset/api.py @@ -0,0 +1,71 @@ +from typing import Any, Optional + +from .._utils.registry import fugue_plugin +from .dataset import Dataset + + +@fugue_plugin +def as_fugue_dataset(data: Any) -> Dataset: + """Wrap the input as a :class:`~.Dataset` + + :param data: the data to be wrapped + """ + if isinstance(data, Dataset): + return data + raise NotImplementedError(f"no registered dataset conversion for {type(data)}") + + +def show( + data: Any, n: int = 10, with_count: bool = False, title: Optional[str] = None +) -> None: + """Display the Dataset + + :param data: the data that can be recognized by Fugue + :param n: number of rows to print, defaults to 10 + :param with_count: whether to show dataset count, defaults to False + :param title: title of the dataset, defaults to 
None + + .. note:: + + When ``with_count`` is True, it can trigger expensive calculation for + a distributed dataframe. So if you call this function directly, you may + need to :func:`fugue.execution.execution_engine.ExecutionEngine.persist` + the dataset. + """ + return as_fugue_dataset(data).show(n=n, with_count=with_count, title=title) + + +@fugue_plugin +def is_local(data: Any) -> bool: + """Whether the dataset is local + + :param data: the data that can be recognized by Fugue + """ + return as_fugue_dataset(data).is_local + + +@fugue_plugin +def is_bounded(data: Any) -> bool: + """Whether the dataset is local + + :param data: the data that can be recognized by Fugue + """ + return as_fugue_dataset(data).is_bounded + + +@fugue_plugin +def is_empty(data: Any) -> bool: + """Whether the dataset is empty + + :param data: the data that can be recognized by Fugue + """ + return as_fugue_dataset(data).empty + + +@fugue_plugin +def count(data: Any) -> int: + """The number of elements in the dataset + + :param data: the data that can be recognized by Fugue + """ + return as_fugue_dataset(data).count() diff --git a/fugue/dataset.py b/fugue/dataset/dataset.py similarity index 70% rename from fugue/dataset.py rename to fugue/dataset/dataset.py index 2752255a..9be2875b 100644 --- a/fugue/dataset.py +++ b/fugue/dataset/dataset.py @@ -4,8 +4,8 @@ from triad import ParamDict, SerializableRLock, assert_or_throw -from ._utils.registry import fugue_plugin -from .exceptions import FugueDatasetEmptyError +from .._utils.registry import fugue_plugin +from ..exceptions import FugueDatasetEmptyError class Dataset(ABC): @@ -153,70 +153,3 @@ def get_dataset_display(ds: "Dataset") -> DatasetDisplay: # pragma: no cover """ raise NotImplementedError(f"no matching DatasetDisplay registered for {type(ds)}") - - -@fugue_plugin -def as_fugue_dataset(data: Any) -> Dataset: - """Wrap the input as a :class:`~.Dataset` - - :param data: the data to be wrapped - """ - if isinstance(data, Dataset): - return data - raise NotImplementedError(f"no registered dataset conversion for {type(data)}") - - -def show( - data: Any, n: int = 10, with_count: bool = False, title: Optional[str] = None -) -> None: - """Display the Dataset - - :param data: the data that can be recognized by Fugue - :param n: number of rows to print, defaults to 10 - :param with_count: whether to show dataset count, defaults to False - :param title: title of the dataset, defaults to None - - .. note:: - - When ``with_count`` is True, it can trigger expensive calculation for - a distributed dataframe. So if you call this function directly, you may - need to :func:`fugue.execution.execution_engine.ExecutionEngine.persist` - the dataset. 
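# Quick sketch of the dataset-level helpers above, applied to a plain pandas
# DataFrame (supported through the pandas candidates registered elsewhere in
# this patch); the data is invented.
import pandas as pd

from fugue.dataset.api import count, is_bounded, is_empty, is_local

pdf = pd.DataFrame({"a": [1, 2, 3]})
assert is_local(pdf) and is_bounded(pdf)
assert not is_empty(pdf)
assert count(pdf) == 3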
- """ - return as_fugue_dataset(data).show(n=n, with_count=with_count, title=title) - - -@fugue_plugin -def is_local(data: Any) -> bool: - """Whether the dataset is local - - :param data: the data that can be recognized by Fugue - """ - return as_fugue_dataset(data).is_local - - -@fugue_plugin -def is_bounded(data: Any) -> bool: - """Whether the dataset is local - - :param data: the data that can be recognized by Fugue - """ - return as_fugue_dataset(data).is_bounded - - -@fugue_plugin -def is_empty(data: Any) -> bool: - """Whether the dataset is empty - - :param data: the data that can be recognized by Fugue - """ - return as_fugue_dataset(data).empty - - -@fugue_plugin -def count(data: Any) -> int: - """The number of elements in the dataset - - :param data: the data that can be recognized by Fugue - """ - return as_fugue_dataset(data).count() diff --git a/fugue/execution/__init__.py b/fugue/execution/__init__.py index 35d6f7aa..95cbb7cd 100644 --- a/fugue/execution/__init__.py +++ b/fugue/execution/__init__.py @@ -1,6 +1,7 @@ # flake8: noqa -from fugue.execution.execution_engine import ExecutionEngine, MapEngine, SQLEngine -from fugue.execution.factory import ( +from .api import * +from .execution_engine import ExecutionEngine, MapEngine, SQLEngine +from .factory import ( infer_execution_engine, make_execution_engine, make_sql_engine, @@ -9,7 +10,7 @@ register_execution_engine, register_sql_engine, ) -from fugue.execution.native_execution_engine import ( +from .native_execution_engine import ( NativeExecutionEngine, QPDPandasEngine, SqliteEngine, diff --git a/fugue/execution/express.py b/fugue/execution/api.py similarity index 64% rename from fugue/execution/express.py rename to fugue/execution/api.py index 6eefa4be..f1205d46 100644 --- a/fugue/execution/express.py +++ b/fugue/execution/api.py @@ -1,10 +1,37 @@ -from typing import Any, List, Optional, Union +from contextlib import contextmanager +from typing import Any, Callable, Iterator, List, Optional, Union from ..collections.partition import PartitionSpec from ..dataframe.dataframe import DataFrame +from .execution_engine import ExecutionEngine from .factory import make_execution_engine +@contextmanager +def engine_context( + engine: Any = None, engine_conf: Any = None, infer_by: Optional[List[Any]] = None +) -> Iterator[ExecutionEngine]: + e = make_execution_engine(engine, engine_conf, infer_by=infer_by) + return e._as_context() + + +def run_engine_function( + func: Callable[[ExecutionEngine], Any], + engine: Any = None, + engine_conf: Any = None, + as_fugue: bool = False, + infer_by: Optional[List[Any]] = None, +) -> Any: + with engine_context(engine, engine_conf, infer_by=infer_by) as e: + res = func(e) + + if isinstance(res, DataFrame): + if as_fugue or any(isinstance(x, DataFrame) for x in (infer_by or [])): + return res + return res.native_as_df() + return res + + def repartition( df: Any, partition_spec: PartitionSpec, @@ -18,10 +45,12 @@ def repartition( :param partition_spec: how you want to partition the dataframe :return: the repartitioned dataframe """ - e = make_execution_engine(engine, engine_conf, infer_by=[df]) - edf = e.to_df(df) - return _adjust_df( - [df], e.repartition(edf, partition_spec=partition_spec), as_fugue=as_fugue + return run_engine_function( + lambda e: e.repartition(e.to_df(df), partition_spec=partition_spec), + engine=engine, + engine_conf=engine_conf, + infer_by=[df], + as_fugue=as_fugue, ) @@ -36,9 +65,13 @@ def broadcast( :param df: an input dataframe that can be recognized by Fugue :return: the 
broadcasted dataframe """ - e = make_execution_engine(engine, engine_conf, infer_by=[df]) - edf = e.to_df(df) - return _adjust_df([df], e.broadcast(edf), as_fugue=as_fugue) + return run_engine_function( + lambda e: e.broadcast(e.to_df(df)), + engine=engine, + engine_conf=engine_conf, + infer_by=[df], + as_fugue=as_fugue, + ) def persist( @@ -58,9 +91,13 @@ def persist( :param kwargs: parameter to pass to the underlying persist implementation :return: the persisted dataframe """ - e = make_execution_engine(engine, engine_conf, infer_by=[df]) - edf = e.to_df(df) - return _adjust_df([df], e.persist(edf, lazy=lazy, **kwargs), as_fugue=as_fugue) + return run_engine_function( + lambda e: e.persist(e.to_df(df), lazy=lazy, **kwargs), + engine=engine, + engine_conf=engine_conf, + infer_by=[df], + as_fugue=as_fugue, + ) def distinct( @@ -71,9 +108,13 @@ def distinct( :param df: an input dataframe that can be recognized by Fugue :return: [description] """ - e = make_execution_engine(engine, engine_conf, infer_by=[df]) - edf = e.distinct(e.to_df(df)) - return _adjust_df([df], edf, as_fugue=as_fugue) + return run_engine_function( + lambda e: e.distinct(e.to_df(df)), + engine=engine, + engine_conf=engine_conf, + infer_by=[df], + as_fugue=as_fugue, + ) def dropna( @@ -95,9 +136,13 @@ def dropna( :return: DataFrame with NA records dropped """ - e = make_execution_engine(engine, engine_conf, infer_by=[df]) - edf = e.dropna(e.to_df(df), how=how, thresh=thresh, subset=subset) - return _adjust_df([df], edf, as_fugue=as_fugue) + return run_engine_function( + lambda e: e.dropna(e.to_df(df), how=how, thresh=thresh, subset=subset), + engine=engine, + engine_conf=engine_conf, + infer_by=[df], + as_fugue=as_fugue, + ) def fillna( @@ -120,9 +165,13 @@ def fillna( :return: DataFrame with NA records filled """ - e = make_execution_engine(engine, engine_conf, infer_by=[df]) - edf = e.fillna(e.to_df(df), value=value, subset=subset) - return _adjust_df([df], edf, as_fugue=as_fugue) + return run_engine_function( + lambda e: e.fillna(e.to_df(df), value=value, subset=subset), + engine=engine, + engine_conf=engine_conf, + infer_by=[df], + as_fugue=as_fugue, + ) def sample( @@ -149,9 +198,13 @@ def sample( :return: the sampled dataframe """ - e = make_execution_engine(engine, engine_conf, infer_by=[df]) - edf = e.sample(e.to_df(df), n=n, frac=frac, replace=replace, seed=seed) - return _adjust_df([df], edf, as_fugue=as_fugue) + return run_engine_function( + lambda e: e.sample(e.to_df(df), n=n, frac=frac, replace=replace, seed=seed), + engine=engine, + engine_conf=engine_conf, + infer_by=[df], + as_fugue=as_fugue, + ) def take( @@ -181,15 +234,20 @@ def take( :return: n rows of DataFrame per partition """ - e = make_execution_engine(engine, engine_conf, infer_by=[df]) - edf = e.take( - e.to_df(df), - n=n, - presort=presort, - na_position=na_position, - partition_spec=partition_spec, + + return run_engine_function( + lambda e: e.take( + e.to_df(df), + n=n, + presort=presort, + na_position=na_position, + partition_spec=partition_spec, + ), + engine=engine, + engine_conf=engine_conf, + infer_by=[df], + as_fugue=as_fugue, ) - return _adjust_df([df], edf, as_fugue=as_fugue) def load( @@ -212,9 +270,14 @@ def load( For more details and examples, read |ZipComap|. 
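# Sketch of how the refactored functions above are meant to be used: the
# context engine set by engine_context is picked up by every call inside the
# block, and run_engine_function is the shared building block. The data and
# column names are invented; the default pandas-based engine is assumed.
import pandas as pd

import fugue.api as fa

pdf = pd.DataFrame({"a": [1.0, None, 3.0], "b": [1, 1, 2]})

with fa.engine_context():                   # NativeExecutionEngine by default
    cleaned = fa.dropna(pdf, how="any")     # pandas in, pandas out (as_fugue=False)
    filled = fa.fillna(pdf, value=0)
    deduped = fa.distinct(filled[["b"]])

rows = fa.run_engine_function(lambda e: e.to_df(pdf).count(), infer_by=[pdf])
assert rows == 3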
""" - e = make_execution_engine(engine, engine_conf) - res = e.load_df(path=path, format_hint=format_hint, columns=columns, **kwargs) - return _adjust_df([], res, as_fugue=as_fugue) + return run_engine_function( + lambda e: e.load_df( + path=path, format_hint=format_hint, columns=columns, **kwargs + ), + engine=engine, + engine_conf=engine_conf, + as_fugue=as_fugue, + ) def save( @@ -243,16 +306,19 @@ def save( For more details and examples, read |LoadSave|. """ - e = make_execution_engine(engine, engine_conf, infer_by=[df]) - edf = e.to_df(df) - e.save_df( - edf, - path=path, - format_hint=format_hint, - mode=mode, - partition_spec=partition_spec, - force_single=force_single, - **kwargs, + run_engine_function( + lambda e: e.save_df( + e.to_df(df), + path=path, + format_hint=format_hint, + mode=mode, + partition_spec=partition_spec, + force_single=force_single, + **kwargs, + ), + engine=engine, + engine_conf=engine_conf, + infer_by=[df], ) @@ -281,13 +347,22 @@ def join( Please read :func:`~.fugue.dataframe.utils.get_join_schemas` """ - e = make_execution_engine(engine, engine_conf, infer_by=[df1, df2]) - edf1 = e.to_df(df1) - edf2 = e.to_df(df2) - res = e.join(edf1, edf2, how=how, on=on) - for odf in dfs: - res = e.join(res, e.to_df(odf), how=how, on=on) - return _adjust_df([df1, df2, *dfs], res, as_fugue=as_fugue) + + def _join(e: ExecutionEngine): + edf1 = e.to_df(df1) + edf2 = e.to_df(df2) + res = e.join(edf1, edf2, how=how, on=on) + for odf in dfs: + res = e.join(res, e.to_df(odf), how=how, on=on) + return res + + run_engine_function( + _join, + engine=engine, + engine_conf=engine_conf, + as_fugue=as_fugue, + infer_by=[df1, df2, *dfs], + ) def union( @@ -313,13 +388,22 @@ def union( Currently, the schema of all dataframes must be identical, or an exception will be thrown. """ - e = make_execution_engine(engine, engine_conf, infer_by=[df1, df2]) - edf1 = e.to_df(df1) - edf2 = e.to_df(df2) - res = e.union(edf1, edf2, distinct=distinct) - for odf in dfs: - res = e.union(res, e.to_df(odf), distinct=distinct) - return _adjust_df([df1, df2, *dfs], res, as_fugue=as_fugue) + + def _union(e: ExecutionEngine): + edf1 = e.to_df(df1) + edf2 = e.to_df(df2) + res = e.union(edf1, edf2, distinct=distinct) + for odf in dfs: + res = e.union(res, e.to_df(odf), distinct=distinct) + return res + + run_engine_function( + _union, + engine=engine, + engine_conf=engine_conf, + as_fugue=as_fugue, + infer_by=[df1, df2, *dfs], + ) def subtract( @@ -345,13 +429,22 @@ def subtract( Currently, the schema of all datafrmes must be identical, or an exception will be thrown. """ - e = make_execution_engine(engine, engine_conf, infer_by=[df1, df2]) - edf1 = e.to_df(df1) - edf2 = e.to_df(df2) - res = e.subtract(edf1, edf2, distinct=distinct) - for odf in dfs: - res = e.subtract(edf1, e.to_df(odf), distinct=distinct) - return _adjust_df([df1, df2, *dfs], res, as_fugue=as_fugue) + + def _subtract(e: ExecutionEngine): + edf1 = e.to_df(df1) + edf2 = e.to_df(df2) + res = e.subtract(edf1, edf2, distinct=distinct) + for odf in dfs: + res = e.subtract(res, e.to_df(odf), distinct=distinct) + return res + + run_engine_function( + _subtract, + engine=engine, + engine_conf=engine_conf, + as_fugue=as_fugue, + infer_by=[df1, df2, *dfs], + ) def intersect( @@ -377,13 +470,22 @@ def intersect( Currently, the schema of ``df1`` and ``df2`` must be identical, or an exception will be thrown. 
""" - e = make_execution_engine(engine, engine_conf, infer_by=[df1, df2]) - edf1 = e.to_df(df1) - edf2 = e.to_df(df2) - res = e.intersect(edf1, edf2, distinct=distinct) - for odf in dfs: - res = e.intersect(res, e.to_df(odf), distinct=distinct) - return _adjust_df([df1, df2, *dfs], res, as_fugue=as_fugue) + + def _intersect(e: ExecutionEngine): + edf1 = e.to_df(df1) + edf2 = e.to_df(df2) + res = e.intersect(edf1, edf2, distinct=distinct) + for odf in dfs: + res = e.intersect(res, e.to_df(odf), distinct=distinct) + return res + + run_engine_function( + _intersect, + engine=engine, + engine_conf=engine_conf, + as_fugue=as_fugue, + infer_by=[df1, df2, *dfs], + ) def _adjust_df(input_dfs: Any, output_df: DataFrame, as_fugue: bool) -> Any: diff --git a/fugue/execution/execution_engine.py b/fugue/execution/execution_engine.py index b2f6801d..9d480a49 100644 --- a/fugue/execution/execution_engine.py +++ b/fugue/execution/execution_engine.py @@ -5,7 +5,7 @@ from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Union from uuid import uuid4 -from triad import ParamDict, Schema, assert_or_throw +from triad import ParamDict, Schema, SerializableRLock, assert_or_throw from triad.collections.fs import FileSystem from triad.exceptions import InvalidOperationError from triad.utils.convert import to_size @@ -29,6 +29,8 @@ "_FUGUE_EXECUTION_ENGINE_CONTEXT", default=None ) +_CONTEXT_LOCK = SerializableRLock() + class ExecutionEngineFacet: """The base class for different factes of the execution engines. @@ -152,6 +154,7 @@ def __init__(self, conf: Any): self._compile_conf = ParamDict() self._sql_engine: Optional[SQLEngine] = None self._map_engine: Optional[MapEngine] = None + self._ctx_count = 0 @contextmanager def as_context(self) -> Iterator["ExecutionEngine"]: @@ -166,11 +169,12 @@ def as_context(self) -> Iterator["ExecutionEngine"]: transform(df, func) # will use engine in this transformation """ - token = _FUGUE_EXECUTION_ENGINE_CONTEXT.set(self) # type: ignore - try: - yield self - finally: - _FUGUE_EXECUTION_ENGINE_CONTEXT.reset(token) + return self._as_context() + + @property + def in_context(self) -> bool: + with _CONTEXT_LOCK: + return self._ctx_count > 0 def stop(self) -> None: """Stop this execution engine, do not override @@ -1005,6 +1009,28 @@ def __copy__(self) -> "ExecutionEngine": def __deepcopy__(self, memo: Any) -> "ExecutionEngine": return self + def _as_context(self) -> Iterator["ExecutionEngine"]: + """Set this execution engine as the context engine. This function + is thread safe and async safe. + + .. admonition:: Examples + + .. 
code-block:: python + + with engine.as_context(): + transform(df, func) # will use engine in this transformation + + """ + with _CONTEXT_LOCK: + token = _FUGUE_EXECUTION_ENGINE_CONTEXT.set(self) # type: ignore + self._ctx_count += 1 + try: + yield self + finally: + with _CONTEXT_LOCK: + self._ctx_count -= 1 + _FUGUE_EXECUTION_ENGINE_CONTEXT.reset(token) + def _serialize_by_partition( self, df: DataFrame, diff --git a/fugue/plugins.py b/fugue/plugins.py index 6ae2f4f3..9866c2e0 100644 --- a/fugue/plugins.py +++ b/fugue/plugins.py @@ -1,6 +1,6 @@ # flake8: noqa # pylint: disable-all -from fugue.dataframe.dataframe import ( +from fugue.dataframe import ( alter_columns, as_array, as_array_iterable, diff --git a/fugue/workflow/__init__.py b/fugue/workflow/__init__.py index cb13f310..bbb6a7e2 100644 --- a/fugue/workflow/__init__.py +++ b/fugue/workflow/__init__.py @@ -1,6 +1,7 @@ # flake8: noqa -from fugue.workflow._workflow_context import FugueWorkflowContext -from fugue.workflow.input import is_acceptable_raw_df, register_raw_df_type -from fugue.workflow.module import module -from fugue.workflow.workflow import FugueWorkflow, WorkflowDataFrame, WorkflowDataFrames +from ._workflow_context import FugueWorkflowContext +from .api import * +from .input import is_acceptable_raw_df, register_raw_df_type +from .module import module +from .workflow import FugueWorkflow, WorkflowDataFrame, WorkflowDataFrames diff --git a/fugue/express/transformation.py b/fugue/workflow/api.py similarity index 87% rename from fugue/express/transformation.py rename to fugue/workflow/api.py index c7bff4f0..16eebe4f 100644 --- a/fugue/express/transformation.py +++ b/fugue/workflow/api.py @@ -1,13 +1,14 @@ -from typing import Any, List, Optional +from typing import Any, Dict, List, Optional from triad.utils.assertion import assert_or_throw -from fugue.collections.yielded import Yielded -from fugue.constants import FUGUE_CONF_WORKFLOW_EXCEPTION_INJECT -from fugue.dataframe import DataFrame -from fugue.exceptions import FugueInterfacelessError, FugueWorkflowCompileError -from fugue.execution import make_execution_engine -from fugue.workflow import FugueWorkflow +from ..collections.yielded import Yielded +from ..constants import FUGUE_CONF_WORKFLOW_EXCEPTION_INJECT +from ..dataframe import DataFrame +from ..dataframe.api import get_native_as_df +from ..exceptions import FugueInterfacelessError, FugueWorkflowCompileError +from ..execution import make_execution_engine +from .workflow import FugueWorkflow def _check_valid_input(df: Any, save_path: Optional[str]) -> None: @@ -38,13 +39,13 @@ def transform( # noqa: C901 partition: Any = None, callback: Any = None, ignore_errors: Optional[List[Any]] = None, - engine: Any = None, - engine_conf: Any = None, - force_output_fugue_dataframe: bool = False, persist: bool = False, as_local: bool = False, save_path: Optional[str] = None, checkpoint: bool = False, + engine: Any = None, + engine_conf: Any = None, + as_fugue: bool = False, ) -> Any: """Transform this dataframe using transformer. 
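# Small sketch of the new in_context flag added to ExecutionEngine above,
# using the built-in NativeExecutionEngine:
from fugue import NativeExecutionEngine

engine = NativeExecutionEngine()
assert not engine.in_context
with engine.as_context():
    assert engine.in_context   # context-local and guarded by _CONTEXT_LOCK
assert not engine.in_context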
It's a wrapper of :meth:`~fugue.workflow.workflow.FugueWorkflow.transform` and @@ -77,7 +78,7 @@ def transform( # noqa: C901 engine and the second value represents the sql engine (you can use ``None`` for either of them to use the default one), defaults to None :param engine_conf: |ParamsLikeObject|, defaults to None - :param force_output_fugue_dataframe: If true, the function will always return + :param as_fugue: If true, the function will always return a ``FugueDataFrame``, otherwise, if ``df`` is in native dataframe types such as pandas dataframe, then the output will also in its native format. Defaults to False @@ -178,7 +179,7 @@ def _no_op_processor(df: DataFrame) -> DataFrame: result = dag.yields["result"].result # type:ignore else: return save_path - if force_output_fugue_dataframe or isinstance(df, (DataFrame, Yielded)): + if as_fugue or isinstance(df, (DataFrame, Yielded)): return result return result.as_pandas() if result.is_local else result.native # type:ignore @@ -247,3 +248,32 @@ def out_transform( ) dag.run(make_execution_engine(engine, conf=engine_conf, infer_by=[df])) + + +def raw_sql( + *statements: Any, + engine: Any = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, +): + dag = FugueWorkflow(compile_conf={FUGUE_CONF_WORKFLOW_EXCEPTION_INJECT: 0}) + sp: List[Any] = [] + infer_by: List[Any] = [] + inputs: Dict[int, Any] = {} + for x in statements: + if isinstance(x, str): + sp.append(x) + else: + if id(x) in inputs: + sp.append(inputs[id(x)]) + else: + inputs[id(x)] = dag.create(x) + sp.append(inputs[id(x)]) + infer_by.append(x) + + engine = make_execution_engine(engine, engine_conf, infer_by=infer_by) + dag.select(*sp).yield_dataframe_as("result", as_local=as_local) + res = dag.run(engine) + + return res["result"] if as_fugue else get_native_as_df(res["result"]) diff --git a/fugue/workflow/workflow.py b/fugue/workflow/workflow.py index e6faeff2..70d3227d 100644 --- a/fugue/workflow/workflow.py +++ b/fugue/workflow/workflow.py @@ -2066,7 +2066,7 @@ def select( for s in statements: if isinstance(s, str): s_str.append(s) - if isinstance(s, DataFrame): + else: ws = self.df(s) dfs[ws.name] = ws s_str.append(ws.name) diff --git a/fugue_duckdb/execution_engine.py b/fugue_duckdb/execution_engine.py index 8ec0b9f1..d39fccfc 100644 --- a/fugue_duckdb/execution_engine.py +++ b/fugue_duckdb/execution_engine.py @@ -407,7 +407,9 @@ def save_df( dio.save_df(self._to_duck_df(df), path, format_hint, mode, **kwargs) def convert_yield_dataframe(self, df: DataFrame, as_local: bool) -> DataFrame: - return df.as_local() if not self._external_con or as_local else df + if as_local: + return df.as_local() + return df.as_local() if not self.in_context and not self._external_con else df def _sql(self, sql: str, dfs: Dict[str, DataFrame]) -> DuckDataFrame: with self._context_lock: diff --git a/fugue_test/dataframe_suite.py b/fugue_test/dataframe_suite.py index 126a8318..aff9cd34 100644 --- a/fugue_test/dataframe_suite.py +++ b/fugue_test/dataframe_suite.py @@ -8,7 +8,7 @@ import pandas as pd from pytest import raises -import fugue.express as fi +import fugue.api as fi from fugue.dataframe import ArrowDataFrame, DataFrame from fugue.dataframe.utils import _df_eq as df_eq from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError diff --git a/tests/fugue/dataframe/test_dataframe.py b/tests/fugue/dataframe/test_dataframe.py index 26b28712..35cf5d2a 100644 --- a/tests/fugue/dataframe/test_dataframe.py +++ 
b/tests/fugue/dataframe/test_dataframe.py @@ -5,7 +5,7 @@ from triad.collections.schema import Schema from fugue.dataframe import ArrayDataFrame, DataFrame -from fugue.express import as_fugue_df, get_native_as_df +from fugue.api import as_fugue_df, get_native_as_df def test_as_fugue_df(): diff --git a/tests/fugue/execution/test_factory.py b/tests/fugue/execution/test_factory.py index 58352182..612d876c 100644 --- a/tests/fugue/execution/test_factory.py +++ b/tests/fugue/execution/test_factory.py @@ -255,8 +255,11 @@ def test_make_execution_engine(): def test_context_and_infer_execution_engine(): e1 = _MockExecutionEngine({}) e2 = _MockExecutionEngine2(Dummy2(), {}) + assert not e1.in_context and not e2.in_context with e2.as_context(): + assert not e1.in_context and e2.in_context with e1.as_context() as ex: + assert e1.in_context and e2.in_context assert ex is e1 e = make_execution_engine( None, conf={"x": False}, infer_by=[pd.DataFrame(), Dummy2()] @@ -264,10 +267,13 @@ def test_context_and_infer_execution_engine(): assert isinstance(e, _MockExecutionEngine) assert not isinstance(e, _MockExecutionEngine2) assert not e.conf["x"] + assert not e1.in_context and e2.in_context e = make_execution_engine(None, conf={"x": True}) assert isinstance(e, _MockExecutionEngine2) + assert not e1.in_context and not e2.in_context + e = make_execution_engine(None) assert isinstance(e, NativeExecutionEngine) assert not isinstance(e, _MockExecutionEngine) diff --git a/tests/fugue/test_interfaceless.py b/tests/fugue/test_interfaceless.py index e78608ce..530b3f7b 100644 --- a/tests/fugue/test_interfaceless.py +++ b/tests/fugue/test_interfaceless.py @@ -42,7 +42,7 @@ def f2(df: pd.DataFrame) -> pd.DataFrame: assert isinstance(result, pd.DataFrame) assert sorted(result.values.tolist(), key=lambda x: x[0]) == [[0, 0], [1, 1]] result = transform( - pdf, f2, partition=dict(by=["a"]), force_output_fugue_dataframe=True + pdf, f2, partition=dict(by=["a"]), as_fugue=True ) assert isinstance(result, DataFrame) @@ -93,7 +93,7 @@ def test_transform_from_file(tmpdir): def f(df: pd.DataFrame) -> pd.DataFrame: return df.assign(x=1) - result = transform(fp, f, force_output_fugue_dataframe=True) + result = transform(fp, f, as_fugue=True) assert result.as_array(type_safe=True) == [[2, 1]] with raises(FugueInterfacelessError): @@ -116,7 +116,7 @@ def f(df: pd.DataFrame) -> pd.DataFrame: # checkpoint is True, save_path is None result = transform( - tdf, f, force_output_fugue_dataframe=True, checkpoint=True, engine=engine + tdf, f, as_fugue=True, checkpoint=True, engine=engine ) assert result.as_array() == [[2, 1]] @@ -124,7 +124,7 @@ def f(df: pd.DataFrame) -> pd.DataFrame: result = transform( tdf, f, - force_output_fugue_dataframe=True, + as_fugue=True, checkpoint=True, save_path=fp, engine=engine, @@ -136,7 +136,7 @@ def f(df: pd.DataFrame) -> pd.DataFrame: result = transform( tdf, f, - force_output_fugue_dataframe=True, + as_fugue=True, save_path=fp, engine=engine, ) @@ -163,7 +163,7 @@ def f(df: pd.DataFrame) -> pd.DataFrame: transform( tdf, f, - force_output_fugue_dataframe=True, + as_fugue=True, save_path="f.csv", engine=engine, ) @@ -171,7 +171,7 @@ def f(df: pd.DataFrame) -> pd.DataFrame: transform( tdf, f, - force_output_fugue_dataframe=True, + as_fugue=True, save_path="f.json", engine=engine, ) diff --git a/tests/fugue_dask/test_dataframe.py b/tests/fugue_dask/test_dataframe.py index 8e17e010..03aa5f40 100644 --- a/tests/fugue_dask/test_dataframe.py +++ b/tests/fugue_dask/test_dataframe.py @@ -6,7 +6,7 @@ import 
dask.dataframe as pd import numpy as np import pandas -import fugue.express as fi +import fugue.api as fi from fugue.dataframe.array_dataframe import ArrayDataFrame from fugue.dataframe.pandas_dataframe import PandasDataFrame from fugue.dataframe.utils import _df_eq as df_eq diff --git a/tests/fugue_dask/test_execution_engine.py b/tests/fugue_dask/test_execution_engine.py index 380e3179..79e4dc5f 100644 --- a/tests/fugue_dask/test_execution_engine.py +++ b/tests/fugue_dask/test_execution_engine.py @@ -160,7 +160,7 @@ def tr(df: List[List[Any]], add: Optional[callable]) -> List[List[Any]]: schema="b:binary", callback=cb.add, as_local=True, - force_output_fugue_dataframe=True, + as_fugue=True, engine="dask", engine_conf=_CONF, ) @@ -172,7 +172,7 @@ def tr(df: List[List[Any]], add: Optional[callable]) -> List[List[Any]]: pdf, tr, schema="b:binary", - force_output_fugue_dataframe=True, + as_fugue=True, engine="dask", ) assert not res.is_local @@ -185,7 +185,7 @@ def tr(df: List[List[Any]], add: Optional[callable]) -> List[List[Any]]: tr, schema="b:binary", callback=cb.add, - force_output_fugue_dataframe=True, + as_fugue=True, engine="dask", engine_conf=_CONF, persist=True, # when you have a persist, you can use callback diff --git a/tests/fugue_duckdb/test_execution_engine.py b/tests/fugue_duckdb/test_execution_engine.py index f0ddad65..f01d63bf 100644 --- a/tests/fugue_duckdb/test_execution_engine.py +++ b/tests/fugue_duckdb/test_execution_engine.py @@ -12,6 +12,7 @@ from fugue_duckdb.dataframe import DuckDataFrame from fugue_test.builtin_suite import BuiltInTests from fugue_test.execution_suite import ExecutionEngineTests +from fugue.api import engine_context class DuckExecutionEngineTests(ExecutionEngineTests.Tests): @@ -174,6 +175,20 @@ def test_sql_yield(): assert isinstance(res["a"], ArrowDataFrame) assert isinstance(res["b"], ArrowDataFrame) + # in context + with engine_context("duck"): + res = fsql( + """ + CREATE [[0]] SCHEMA a:int + YIELD DATAFRAME AS a + CREATE [[0]] SCHEMA b:int + YIELD LOCAL DATAFRAME AS b + """ + ).run() + + assert isinstance(res["a"], DuckDataFrame) + assert isinstance(res["b"], ArrowDataFrame) + def test_infer_engine(): con = duckdb.connect() diff --git a/tests/fugue_ibis/test_dataframe.py b/tests/fugue_ibis/test_dataframe.py index 16da2d67..f863c83c 100644 --- a/tests/fugue_ibis/test_dataframe.py +++ b/tests/fugue_ibis/test_dataframe.py @@ -7,7 +7,7 @@ import pyarrow as pa import pytest -import fugue.express as fe +import fugue.api as fe from fugue import ArrowDataFrame from fugue_duckdb.dataframe import DuckDataFrame from fugue_test.dataframe_suite import DataFrameTests diff --git a/tests/fugue_ray/test_execution_engine.py b/tests/fugue_ray/test_execution_engine.py index b0ce1f04..5ca8ded8 100644 --- a/tests/fugue_ray/test_execution_engine.py +++ b/tests/fugue_ray/test_execution_engine.py @@ -55,7 +55,7 @@ def t(df: pd.DataFrame) -> pd.DataFrame: partition="per_row", engine="ray", as_local=True, - force_output_fugue_dataframe=True, + as_fugue=True, ) df_eq( res, @@ -71,7 +71,7 @@ def t(df: pd.DataFrame) -> pd.DataFrame: partition=dict(num=3, algo="rand"), engine="ray", as_local=True, - force_output_fugue_dataframe=True, + as_fugue=True, ) df_eq( res, @@ -87,7 +87,7 @@ def t(df: pd.DataFrame) -> pd.DataFrame: partition=dict(num=40), engine="ray", as_local=True, - force_output_fugue_dataframe=True, + as_fugue=True, ) df_eq( res, @@ -113,7 +113,7 @@ def t(df: pd.DataFrame) -> pd.DataFrame: "fugue.ray.remote.num_cpus": 1, }, as_local=True, - 
force_output_fugue_dataframe=True, + as_fugue=True, ) df_eq( res, diff --git a/tests/fugue_spark/test_dataframe.py b/tests/fugue_spark/test_dataframe.py index b3d5b697..d9fafc21 100644 --- a/tests/fugue_spark/test_dataframe.py +++ b/tests/fugue_spark/test_dataframe.py @@ -8,7 +8,7 @@ from pyspark.sql import SparkSession from triad.collections.schema import Schema -import fugue.express as fi +import fugue.api as fi from fugue.dataframe.pandas_dataframe import PandasDataFrame from fugue.plugins import get_column_names, rename from fugue_spark import SparkExecutionEngine From df869551f169a09dc86c5b69e1f6288b653e660e Mon Sep 17 00:00:00 2001 From: Han Wang Date: Fri, 23 Dec 2022 07:33:59 +0000 Subject: [PATCH 12/30] add engine operations --- .gitignore | 1 + fugue/api.py | 3 ++ fugue/execution/api.py | 79 ++++++++++++++++++++++++++--- fugue/execution/execution_engine.py | 51 +++++++++++++++++++ fugue/execution/factory.py | 13 ++++- tests/fugue/execution/test_api.py | 25 +++++++++ 6 files changed, 165 insertions(+), 7 deletions(-) create mode 100644 tests/fugue/execution/test_api.py diff --git a/.gitignore b/.gitignore index 2d295fc5..9568b79c 100644 --- a/.gitignore +++ b/.gitignore @@ -120,6 +120,7 @@ pythonenv* # mkdocs documentation /site +.virtual_documents # mypy .mypy_cache diff --git a/fugue/api.py b/fugue/api.py index 4cfdd39b..846cb959 100644 --- a/fugue/api.py +++ b/fugue/api.py @@ -23,10 +23,12 @@ from .dataset.api import as_fugue_dataset, count, is_bounded, is_empty, is_local, show from .execution.api import ( broadcast, + clear_global_engine, distinct, dropna, engine_context, fillna, + get_current_engine, intersect, join, load, @@ -35,6 +37,7 @@ run_engine_function, sample, save, + set_global_engine, subtract, take, union, diff --git a/fugue/execution/api.py b/fugue/execution/api.py index f1205d46..15861c68 100644 --- a/fugue/execution/api.py +++ b/fugue/execution/api.py @@ -1,9 +1,9 @@ from contextlib import contextmanager from typing import Any, Callable, Iterator, List, Optional, Union - +from triad import assert_or_throw from ..collections.partition import PartitionSpec from ..dataframe.dataframe import DataFrame -from .execution_engine import ExecutionEngine +from .execution_engine import ExecutionEngine, _FUGUE_GLOBAL_EXECUTION_ENGINE_CONTEXT from .factory import make_execution_engine @@ -11,10 +11,77 @@ def engine_context( engine: Any = None, engine_conf: Any = None, infer_by: Optional[List[Any]] = None ) -> Iterator[ExecutionEngine]: + """Make an execution engine and set it as the context engine. This function + is thread safe and async safe. + + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param infer_by: a list of objects to infer the engine, defaults to None + + .. note:: + + For more details, please read + :func:`~.fugue.execution.factory.make_execution_engine` + + .. admonition:: Examples + + .. code-block:: python + + import fugue.api as fa + + with fa.engine_context(spark_session): + transform(df, func) # will use spark in this transformation + + """ e = make_execution_engine(engine, engine_conf, infer_by=infer_by) return e._as_context() +def set_global_engine(engine: Any, engine_conf: Any = None) -> ExecutionEngine: + """Make an execution engine and set it as the global execution engine + + :param engine: an engine like object, must not be None + :param engine_conf: the configs for the engine, defaults to None + + .. 
caution:: + + In general, it is not a good practice to set a global engine. You should + consider :func:`~.engine_context` instead. The exception + is when you iterate in a notebook and cross cells, this could simplify + the code. + + .. note:: + + For more details, please read + :func:`~.fugue.execution.factory.make_execution_engine` and + :meth:`~fugue.execution.execution_engine.ExecutionEngine.set_global` + + .. admonition:: Examples + + .. code-block:: python + + import fugue.api as fa + + fa.set_global_engine(spark_session) + transform(df, func) # will use spark in this transformation + fa.clear_global_engine() # remove the global setting + """ + assert_or_throw(engine is not None, ValueError("engine must be specified")) + return make_execution_engine(engine, engine_conf).set_global() + + +def clear_global_engine() -> None: + """Remove the global exeuction engine (if set)""" + _FUGUE_GLOBAL_EXECUTION_ENGINE_CONTEXT.set(None) + + +def get_current_engine() -> ExecutionEngine: + """Get the current execution engine. Regarding the order of the logic + please read :func:`~.fugue.execution.factory.make_execution_engine` + """ + return make_execution_engine() + + def run_engine_function( func: Callable[[ExecutionEngine], Any], engine: Any = None, @@ -356,7 +423,7 @@ def _join(e: ExecutionEngine): res = e.join(res, e.to_df(odf), how=how, on=on) return res - run_engine_function( + return run_engine_function( _join, engine=engine, engine_conf=engine_conf, @@ -397,7 +464,7 @@ def _union(e: ExecutionEngine): res = e.union(res, e.to_df(odf), distinct=distinct) return res - run_engine_function( + return run_engine_function( _union, engine=engine, engine_conf=engine_conf, @@ -438,7 +505,7 @@ def _subtract(e: ExecutionEngine): res = e.subtract(res, e.to_df(odf), distinct=distinct) return res - run_engine_function( + return run_engine_function( _subtract, engine=engine, engine_conf=engine_conf, @@ -479,7 +546,7 @@ def _intersect(e: ExecutionEngine): res = e.intersect(res, e.to_df(odf), distinct=distinct) return res - run_engine_function( + return run_engine_function( _intersect, engine=engine, engine_conf=engine_conf, diff --git a/fugue/execution/execution_engine.py b/fugue/execution/execution_engine.py index 9d480a49..0e83a11e 100644 --- a/fugue/execution/execution_engine.py +++ b/fugue/execution/execution_engine.py @@ -32,6 +32,27 @@ _CONTEXT_LOCK = SerializableRLock() +class _GlobalExecutionEngineContext: + def __init__(self): + self._engine: Optional["ExecutionEngine"] = None + + def set(self, engine: Optional["ExecutionEngine"]): + with _CONTEXT_LOCK: + if self._engine is not None: + self._engine._is_global = False + self._engine._ctx_count -= 1 + self._engine = engine + if engine is not None: + engine._is_global = True + engine._ctx_count += 1 + + def get(self) -> Optional["ExecutionEngine"]: + return self._engine + + +_FUGUE_GLOBAL_EXECUTION_ENGINE_CONTEXT = _GlobalExecutionEngineContext() + + class ExecutionEngineFacet: """The base class for different factes of the execution engines. 
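A minimal usage sketch of the context/global engine layers wired up above (assuming the ``fugue.api`` alias ``fa`` and the ``"native"``/``"duckdb"`` engine names exercised in the new tests; illustrative only, not part of the patch):

.. code-block:: python

    import fugue.api as fa

    # set a global engine; it also counts as an in-context engine
    e = fa.set_global_engine("native")
    assert e.is_global and e.in_context
    assert fa.get_current_engine() is e

    with fa.engine_context("duckdb") as e2:
        # a context engine temporarily shadows the global engine
        assert fa.get_current_engine() is e2

    # after the context exits, resolution falls back to the global engine
    assert fa.get_current_engine() is e
    fa.clear_global_engine()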
@@ -155,6 +176,7 @@ def __init__(self, conf: Any): self._sql_engine: Optional[SQLEngine] = None self._map_engine: Optional[MapEngine] = None self._ctx_count = 0 + self._is_global = False @contextmanager def as_context(self) -> Iterator["ExecutionEngine"]: @@ -173,9 +195,38 @@ def as_context(self) -> Iterator["ExecutionEngine"]: @property def in_context(self) -> bool: + """Whether this engine is being used as a context engine""" with _CONTEXT_LOCK: return self._ctx_count > 0 + def set_global(self) -> "ExecutionEngine": + """Set this execution engine to be the global execution engine. + + .. note:: + Global engine is also considered as a context engine, so + :meth:`~.ExecutionEngine.in_context` will also become true + for the global engine. + + .. admonition:: Examples + + .. code-block:: python + + engine1.set_global(): + transform(df, func) # will use engine1 in this transformation + + with engine2.as_context(): + transform(df, func) # will use engine2 + + transform(df, func) # will use engine1 + """ + _FUGUE_GLOBAL_EXECUTION_ENGINE_CONTEXT.set(self) + return self + + @property + def is_global(self) -> bool: + """Whether this engine is being used as THE global engine""" + return self._is_global + def stop(self) -> None: """Stop this execution engine, do not override You should customize :meth:`~.stop_engine` if necessary. diff --git a/fugue/execution/factory.py b/fugue/execution/factory.py index 4d581c0d..c2320b66 100644 --- a/fugue/execution/factory.py +++ b/fugue/execution/factory.py @@ -8,6 +8,7 @@ from ..exceptions import FuguePluginsRegistrationError from .execution_engine import ( _FUGUE_EXECUTION_ENGINE_CONTEXT, + _FUGUE_GLOBAL_EXECUTION_ENGINE_CONTEXT, ExecutionEngine, SQLEngine, ) @@ -250,6 +251,9 @@ def make_execution_engine( * If ``engine`` is None, it first try to see if there is any defined context engine to use (=> engine) + * If ``engine`` is still empty, then it will try to get the global execution + engine. 
See + :meth:`~fugue.execution.execution_engine.ExecutionEngine.set_global` * If ``engine`` is still empty, then if ``infer_by`` is given, it will try to infer the execution engine (=> engine) * If ``engine`` is still empty, then it will construct the default @@ -291,13 +295,20 @@ def make_execution_engine( # assume object e2_df can infer E2 engine make_execution_engine(infer_by=[e2_df]) # an E2 engine + # global + e_global = E1(conf) + e_global.set_global() + make_execution_engine() # e_global + # context with E2(conf).as_context() as ec: make_execution_engine() # ec - make_execution_engine() # the default execution engine + make_execution_engine() # e_global """ if engine is None: engine = _FUGUE_EXECUTION_ENGINE_CONTEXT.get() + if engine is None: + engine = _FUGUE_GLOBAL_EXECUTION_ENGINE_CONTEXT.get() if engine is None and infer_by is not None: engine = infer_execution_engine(infer_by) diff --git a/tests/fugue/execution/test_api.py b/tests/fugue/execution/test_api.py new file mode 100644 index 00000000..15890d17 --- /dev/null +++ b/tests/fugue/execution/test_api.py @@ -0,0 +1,25 @@ +import fugue.api as fa +from fugue import NativeExecutionEngine + + +def test_engine_operations(): + e = fa.set_global_engine("native") + assert isinstance(e, NativeExecutionEngine) + assert e.in_context and e.is_global + assert fa.get_current_engine() is e + with fa.engine_context("duckdb") as e2: + assert fa.get_current_engine() is e2 + assert not e2.is_global and e2.in_context + with e.as_context(): + assert not e2.is_global and e2.in_context + assert e.in_context and e.is_global + assert fa.get_current_engine() is e + assert e.in_context and e.is_global + assert fa.get_current_engine() is e2 + assert not e2.is_global and not e2.in_context + assert e.in_context and e.is_global + e3 = fa.set_global_engine("duckdb") + assert not e.in_context and not e.is_global + assert e3.in_context and e3.is_global + fa.clear_global_engine() + assert not e3.in_context and not e3.is_global From 58a6b9a8cdd7551f3ae3d847e02a7ca2b244988f Mon Sep 17 00:00:00 2001 From: Han Wang Date: Fri, 23 Dec 2022 08:30:38 +0000 Subject: [PATCH 13/30] update type annotations and docs --- fugue/__init__.py | 10 +- fugue/dataframe/__init__.py | 1 + fugue/dataframe/api.py | 59 +++++---- fugue/dataframe/dataframe.py | 9 +- fugue/execution/__init__.py | 2 +- fugue/execution/api.py | 184 +++++++++++++++++++--------- fugue/execution/execution_engine.py | 14 ++- 7 files changed, 194 insertions(+), 85 deletions(-) diff --git a/fugue/__init__.py b/fugue/__init__.py index d73af6cb..69e97d46 100644 --- a/fugue/__init__.py +++ b/fugue/__init__.py @@ -2,6 +2,7 @@ from triad.collections import Schema from triad.collections.fs import FileSystem +from fugue.api import out_transform, transform from fugue.bag.array_bag import ArrayBag from fugue.bag.bag import Bag, BagDisplay from fugue.collections.partition import PartitionCursor, PartitionSpec @@ -10,6 +11,7 @@ from fugue.dataframe.array_dataframe import ArrayDataFrame from fugue.dataframe.arrow_dataframe import ArrowDataFrame from fugue.dataframe.dataframe import ( + AnyDataFrame, DataFrame, DataFrameDisplay, LocalBoundedDataFrame, @@ -21,7 +23,12 @@ from fugue.dataframe.pandas_dataframe import PandasDataFrame from fugue.dataframe.utils import to_local_bounded_df, to_local_df from fugue.dataset import Dataset, DatasetDisplay, as_fugue_dataset, get_dataset_display -from fugue.execution.execution_engine import ExecutionEngine, MapEngine, SQLEngine +from fugue.execution.execution_engine import ( + 
AnyExecutionEngine, + ExecutionEngine, + MapEngine, + SQLEngine, +) from fugue.execution.factory import ( is_pandas_or, make_execution_engine, @@ -52,7 +59,6 @@ register_transformer, transformer, ) -from fugue.api import out_transform, transform from fugue.registry import _register from fugue.rpc import ( EmptyRPCHandler, diff --git a/fugue/dataframe/__init__.py b/fugue/dataframe/__init__.py index cd88f2b0..d47a1580 100644 --- a/fugue/dataframe/__init__.py +++ b/fugue/dataframe/__init__.py @@ -3,6 +3,7 @@ from .array_dataframe import ArrayDataFrame from .arrow_dataframe import ArrowDataFrame from .dataframe import ( + AnyDataFrame, DataFrame, LocalBoundedDataFrame, LocalDataFrame, diff --git a/fugue/dataframe/api.py b/fugue/dataframe/api.py index 1ed27cce..bced84e5 100644 --- a/fugue/dataframe/api.py +++ b/fugue/dataframe/api.py @@ -9,10 +9,10 @@ from fugue.dataset.api import as_fugue_dataset from .._utils.registry import fugue_plugin -from .dataframe import DataFrame +from .dataframe import DataFrame, AnyDataFrame -def as_fugue_df(df: Any) -> DataFrame: +def as_fugue_df(df: AnyDataFrame) -> DataFrame: """Wrap the object as a Fugue DataFrame. This is a wrapper of :func:`~fugue.dataset.as_fugue_dataset` @@ -27,12 +27,12 @@ def as_fugue_df(df: Any) -> DataFrame: @fugue_plugin -def is_df(df: Any) -> bool: +def is_df(df: AnyDataFrame) -> bool: """Whether ``df`` is a DataFrame like object""" return isinstance(df, DataFrame) -def get_native_as_df(df: Any) -> Any: +def get_native_as_df(df: AnyDataFrame) -> AnyDataFrame: """Return the dataframe form of the input ``df``. If ``df`` is a :class:`~.DataFrame`, then call the :meth:`~.DataFrame.native_as_df`, otherwise, it depends on whether there is @@ -46,7 +46,7 @@ def get_native_as_df(df: Any) -> Any: @fugue_plugin -def get_schema(df: Any) -> Schema: +def get_schema(df: AnyDataFrame) -> Schema: """Get the schema of the ``df`` :param df: the object that can be recognized as a dataframe by Fugue @@ -56,7 +56,7 @@ def get_schema(df: Any) -> Schema: @fugue_plugin -def as_pandas(df: Any) -> pd.DataFrame: +def as_pandas(df: AnyDataFrame) -> pd.DataFrame: """Convert ``df`` to a Pandas DataFrame :param df: the object that can be recognized as a dataframe by Fugue @@ -66,7 +66,7 @@ def as_pandas(df: Any) -> pd.DataFrame: @fugue_plugin -def as_arrow(df: Any) -> pa.Table: +def as_arrow(df: AnyDataFrame) -> pa.Table: """Convert ``df`` to a PyArrow Table :param df: the object that can be recognized as a dataframe by Fugue @@ -77,7 +77,7 @@ def as_arrow(df: Any) -> pa.Table: @fugue_plugin def as_array( - df: Any, columns: Optional[List[str]] = None, type_safe: bool = False + df: AnyDataFrame, columns: Optional[List[str]] = None, type_safe: bool = False ) -> List[Any]: # pragma: no cover """Convert df to 2-dimensional native python array @@ -96,7 +96,7 @@ def as_array( @fugue_plugin def as_array_iterable( - df: Any, columns: Optional[List[str]] = None, type_safe: bool = False + df: AnyDataFrame, columns: Optional[List[str]] = None, type_safe: bool = False ) -> Iterable[Any]: # pragma: no cover """Convert df to iterable of native python arrays @@ -116,7 +116,7 @@ def as_array_iterable( @fugue_plugin def as_dict_iterable( - df: Any, columns: Optional[List[str]] = None + df: AnyDataFrame, columns: Optional[List[str]] = None ) -> Iterable[Dict[str, Any]]: """Convert df to iterable of native python dicts @@ -132,7 +132,7 @@ def as_dict_iterable( @fugue_plugin -def peek_array(df: Any) -> List[Any]: +def peek_array(df: AnyDataFrame) -> List[Any]: """Peek the first row of 
the dataframe as an array :param df: the object that can be recognized as a dataframe by Fugue @@ -142,7 +142,7 @@ def peek_array(df: Any) -> List[Any]: @fugue_plugin -def peek_dict(df: Any) -> Dict[str, Any]: +def peek_dict(df: AnyDataFrame) -> Dict[str, Any]: """Peek the first row of the dataframe as a array :param df: the object that can be recognized as a dataframe by Fugue @@ -153,8 +153,11 @@ def peek_dict(df: Any) -> Dict[str, Any]: @fugue_plugin def head( - df: Any, n: int, columns: Optional[List[str]] = None, as_fugue: bool = False -) -> Any: + df: AnyDataFrame, + n: int, + columns: Optional[List[str]] = None, + as_fugue: bool = False, +) -> AnyDataFrame: """Get first n rows of the dataframe as a new local bounded dataframe :param n: number of rows @@ -167,11 +170,13 @@ def head( res = as_fugue_df(df).head(n=n, columns=columns) if as_fugue or isinstance(df, DataFrame): return res - return res.as_pandas() + return res.native_as_df() @fugue_plugin -def alter_columns(df: Any, columns: Any, as_fugue: bool = False) -> Any: +def alter_columns( + df: AnyDataFrame, columns: Any, as_fugue: bool = False +) -> AnyDataFrame: """Change column types :param df: the object that can be recognized as a dataframe by Fugue @@ -187,7 +192,9 @@ def alter_columns(df: Any, columns: Any, as_fugue: bool = False) -> Any: @fugue_plugin -def drop_columns(df: Any, columns: List[str], as_fugue: bool = False) -> Any: +def drop_columns( + df: AnyDataFrame, columns: List[str], as_fugue: bool = False +) -> AnyDataFrame: """Drop certain columns and return a new dataframe :param df: the object that can be recognized as a dataframe by Fugue @@ -201,7 +208,9 @@ def drop_columns(df: Any, columns: List[str], as_fugue: bool = False) -> Any: @fugue_plugin -def select_columns(df: Any, columns: List[Any], as_fugue: bool = False) -> Any: +def select_columns( + df: AnyDataFrame, columns: List[Any], as_fugue: bool = False +) -> AnyDataFrame: """Select certain columns and return a new dataframe :param df: the object that can be recognized as a dataframe by Fugue @@ -215,7 +224,7 @@ def select_columns(df: Any, columns: List[Any], as_fugue: bool = False) -> Any: @fugue_plugin -def get_column_names(df: Any) -> List[Any]: # pragma: no cover +def get_column_names(df: AnyDataFrame) -> List[Any]: # pragma: no cover """A generic function to get column names of any dataframe :param df: the dataframe object @@ -236,7 +245,9 @@ def _get_pyarrow_dataframe_columns(df: pa.Table) -> List[Any]: @fugue_plugin -def rename(df: Any, columns: Dict[str, Any], as_fugue: bool = False) -> Any: +def rename( + df: AnyDataFrame, columns: Dict[str, Any], as_fugue: bool = False +) -> AnyDataFrame: """A generic function to rename column names of any dataframe :param df: the dataframe object @@ -268,7 +279,7 @@ def _rename_pandas_dataframe( return _adjust_df(df, as_fugue_df(df).rename(columns), as_fugue=as_fugue) -def normalize_column_names(df: Any) -> Tuple[Any, Dict[str, Any]]: +def normalize_column_names(df: AnyDataFrame) -> Tuple[AnyDataFrame, Dict[str, Any]]: """A generic function to normalize any dataframe's column names to follow Fugue naming rules @@ -302,7 +313,9 @@ def normalize_column_names(df: Any) -> Tuple[Any, Dict[str, Any]]: return (rename(df, names), undo) -def _adjust_df(input_df: Any, output_df: DataFrame, as_fugue: bool) -> Any: +def _adjust_df( + input_df: AnyDataFrame, output_df: DataFrame, as_fugue: bool +) -> AnyDataFrame: if as_fugue or isinstance(input_df, DataFrame): return output_df - return output_df.native # type: ignore + return 
output_df.native_as_df() diff --git a/fugue/dataframe/dataframe.py b/fugue/dataframe/dataframe.py index 3f8fde94..828a89ff 100644 --- a/fugue/dataframe/dataframe.py +++ b/fugue/dataframe/dataframe.py @@ -1,6 +1,6 @@ import json from abc import abstractmethod -from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, TypeVar import pandas as pd import pyarrow as pa @@ -16,6 +16,9 @@ from ..exceptions import FugueDataFrameOperationError +AnyDataFrame = TypeVar("AnyDataFrame", "DataFrame", object) + + class DataFrame(Dataset): """Base class of Fugue DataFrame. Please read |DataFrameTutorial| to understand the concept @@ -57,7 +60,7 @@ def schema(self) -> Schema: return self._schema @abstractmethod - def native_as_df(self) -> Any: # pragma: no cover + def native_as_df(self) -> AnyDataFrame: # pragma: no cover """The dataframe form of the native object this Dataset class wraps. Dataframe form means the object contains schema information. For example the native an ArrayDataFrame is a python array, it doesn't contain schema @@ -290,7 +293,7 @@ class LocalDataFrame(DataFrame): implementing a new :class:`~fugue.execution.execution_engine.ExecutionEngine` """ - def native_as_df(self) -> Any: + def native_as_df(self) -> AnyDataFrame: return self.as_pandas() @property diff --git a/fugue/execution/__init__.py b/fugue/execution/__init__.py index 95cbb7cd..f82c646e 100644 --- a/fugue/execution/__init__.py +++ b/fugue/execution/__init__.py @@ -1,6 +1,6 @@ # flake8: noqa from .api import * -from .execution_engine import ExecutionEngine, MapEngine, SQLEngine +from .execution_engine import AnyExecutionEngine, ExecutionEngine, MapEngine, SQLEngine from .factory import ( infer_execution_engine, make_execution_engine, diff --git a/fugue/execution/api.py b/fugue/execution/api.py index 15861c68..ac161075 100644 --- a/fugue/execution/api.py +++ b/fugue/execution/api.py @@ -1,15 +1,23 @@ from contextlib import contextmanager from typing import Any, Callable, Iterator, List, Optional, Union + from triad import assert_or_throw + from ..collections.partition import PartitionSpec -from ..dataframe.dataframe import DataFrame -from .execution_engine import ExecutionEngine, _FUGUE_GLOBAL_EXECUTION_ENGINE_CONTEXT +from ..dataframe.dataframe import AnyDataFrame, DataFrame +from .execution_engine import ( + _FUGUE_GLOBAL_EXECUTION_ENGINE_CONTEXT, + AnyExecutionEngine, + ExecutionEngine, +) from .factory import make_execution_engine @contextmanager def engine_context( - engine: Any = None, engine_conf: Any = None, infer_by: Optional[List[Any]] = None + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + infer_by: Optional[List[Any]] = None, ) -> Iterator[ExecutionEngine]: """Make an execution engine and set it as the context engine. This function is thread safe and async safe. 
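A short sketch of how the typed dataframe helpers above are meant to be called directly on native objects (a pandas DataFrame here, assuming the ``fa`` alias for ``fugue.api``); unless ``as_fugue=True`` is passed, outputs stay in the input's native format:

.. code-block:: python

    import pandas as pd
    import fugue.api as fa

    df = pd.DataFrame({"a": [0, 1], "b": ["x", "y"]})

    fa.get_column_names(df)            # ['a', 'b']
    res = fa.rename(df, {"a": "aa"})   # still a pandas DataFrame
    res = fa.drop_columns(res, ["b"])  # drop by name
    fa.get_schema(res)                 # Fugue Schema of the remaining columns
    fdf = fa.as_fugue_df(df)           # explicitly wrap as a Fugue DataFrame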
@@ -37,7 +45,9 @@ def engine_context( return e._as_context() -def set_global_engine(engine: Any, engine_conf: Any = None) -> ExecutionEngine: +def set_global_engine( + engine: AnyExecutionEngine, engine_conf: Any = None +) -> ExecutionEngine: """Make an execution engine and set it as the global execution engine :param engine: an engine like object, must not be None @@ -84,11 +94,24 @@ def get_current_engine() -> ExecutionEngine: def run_engine_function( func: Callable[[ExecutionEngine], Any], - engine: Any = None, + engine: AnyExecutionEngine = None, engine_conf: Any = None, as_fugue: bool = False, infer_by: Optional[List[Any]] = None, ) -> Any: + """Run a lambda function based on the engine provided + + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame + :param infer_by: a list of objects to infer the engine, defaults to None + + :return: None or a Fugue :class:`~.fugue.dataframe.dataframe.DataFrame` if + ``as_fugue`` is True, otherwise if ``infer_by`` contains any + Fugue DataFrame, then return the Fugue DataFrame, otherwise + it returns the underlying dataframe using + :meth:`~.fugue.dataframe.dataframe.DataFrame.native_as_df` + """ with engine_context(engine, engine_conf, infer_by=infer_by) as e: res = func(e) @@ -100,16 +123,19 @@ def run_engine_function( def repartition( - df: Any, + df: AnyDataFrame, partition_spec: PartitionSpec, - engine: Any = None, + engine: AnyExecutionEngine = None, engine_conf: Any = None, as_fugue: bool = False, -) -> Any: +) -> AnyDataFrame: """Partition the input dataframe using ``partition_spec``. :param df: an input dataframe that can be recognized by Fugue :param partition_spec: how you want to partition the dataframe + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame :return: the repartitioned dataframe """ return run_engine_function( @@ -122,14 +148,17 @@ def repartition( def broadcast( - df: Any, - engine: Any = None, + df: AnyDataFrame, + engine: AnyExecutionEngine = None, engine_conf: Any = None, as_fugue: bool = False, -) -> Any: +) -> AnyDataFrame: """Broadcast the dataframe to all workers for a distributed computing framework :param df: an input dataframe that can be recognized by Fugue + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame :return: the broadcasted dataframe """ return run_engine_function( @@ -142,13 +171,13 @@ def broadcast( def persist( - df: Any, + df: AnyDataFrame, lazy: bool = False, - engine: Any = None, + engine: AnyExecutionEngine = None, engine_conf: Any = None, as_fugue: bool = False, **kwargs: Any, -) -> Any: +) -> AnyDataFrame: """Force materializing and caching the dataframe :param df: an input dataframe that can be recognized by Fugue @@ -156,6 +185,9 @@ def persist( to happen; ``False`` (eager): persist is forced to happend immediately. 
Default to ``False`` :param kwargs: parameter to pass to the underlying persist implementation + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame :return: the persisted dataframe """ return run_engine_function( @@ -168,12 +200,18 @@ def persist( def distinct( - df: Any, engine: Any = None, engine_conf: Any = None, as_fugue: bool = False -) -> Any: + df: AnyDataFrame, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, +) -> AnyDataFrame: """Equivalent to ``SELECT DISTINCT * FROM df`` :param df: an input dataframe that can be recognized by Fugue - :return: [description] + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame + :return: the result with distinct rows """ return run_engine_function( lambda e: e.distinct(e.to_df(df)), @@ -185,14 +223,14 @@ def distinct( def dropna( - df: Any, + df: AnyDataFrame, how: str = "any", thresh: int = None, subset: List[str] = None, - engine: Any = None, + engine: AnyExecutionEngine = None, engine_conf: Any = None, as_fugue: bool = False, -) -> Any: +) -> AnyDataFrame: """Drop NA recods from dataframe :param df: an input dataframe that can be recognized by Fugue @@ -200,6 +238,9 @@ def dropna( 'all' drops rows that contain all nulls. :param thresh: int, drops rows that have less than thresh non-null values :param subset: list of columns to operate on + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame :return: DataFrame with NA records dropped """ @@ -213,13 +254,13 @@ def dropna( def fillna( - df: Any, + df: AnyDataFrame, value: Any, subset: List[str] = None, - engine: Any = None, + engine: AnyExecutionEngine = None, engine_conf: Any = None, as_fugue: bool = False, -) -> Any: +) -> AnyDataFrame: """ Fill ``NULL``, ``NAN``, ``NAT`` values in a dataframe @@ -229,6 +270,9 @@ def fillna( values as the replacement values. :param subset: list of columns to operate on. ignored if value is a dictionary + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame :return: DataFrame with NA records filled """ @@ -242,15 +286,15 @@ def fillna( def sample( - df: Any, + df: AnyDataFrame, n: Optional[int] = None, frac: Optional[float] = None, replace: bool = False, seed: Optional[int] = None, - engine: Any = None, + engine: AnyExecutionEngine = None, engine_conf: Any = None, as_fugue: bool = False, -) -> Any: +) -> AnyDataFrame: """ Sample dataframe by number of rows or by fraction @@ -262,6 +306,9 @@ def sample( :param replace: whether replacement is allowed. 
With replacement, there may be duplicated rows in the result, defaults to False :param seed: seed for randomness, defaults to None + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame :return: the sampled dataframe """ @@ -275,15 +322,15 @@ def sample( def take( - df: Any, + df: AnyDataFrame, n: int, presort: str, na_position: str = "last", partition_spec: Optional[PartitionSpec] = None, - engine: Any = None, + engine: AnyExecutionEngine = None, engine_conf: Any = None, as_fugue: bool = False, -) -> Any: +) -> AnyDataFrame: """ Get the first n rows of a DataFrame per partition. If a presort is defined, use the presort before applying take. presort overrides partition_spec.presort. @@ -298,6 +345,9 @@ def take( can accept ``first`` or ``last`` :param partition_spec: PartitionSpec to apply the take operation, defaults to None + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame :return: n rows of DataFrame per partition """ @@ -321,11 +371,11 @@ def load( path: Union[str, List[str]], format_hint: Any = None, columns: Any = None, - engine: Any = None, + engine: AnyExecutionEngine = None, engine_conf: Any = None, as_fugue: bool = False, **kwargs: Any, -) -> Any: +) -> AnyDataFrame: """Load dataframe from persistent storage :param path: the path to the dataframe @@ -333,6 +383,10 @@ def load( defaults to None, meaning to infer :param columns: list of columns or a |SchemaLikeObject|, defaults to None :param kwargs: parameters to pass to the underlying framework + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame + :return: an engine compatible dataframe For more details and examples, read |ZipComap|. @@ -348,13 +402,13 @@ def load( def save( - df: Any, + df: AnyDataFrame, path: str, format_hint: Any = None, mode: str = "overwrite", partition_spec: Optional[PartitionSpec] = None, force_single: bool = False, - engine: Any = None, + engine: AnyExecutionEngine = None, engine_conf: Any = None, **kwargs: Any, ) -> None: @@ -370,6 +424,8 @@ def save( defaults to empty :param force_single: force the output as a single file, defaults to False :param kwargs: parameters to pass to the underlying framework + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None For more details and examples, read |LoadSave|. """ @@ -390,15 +446,15 @@ def save( def join( - df1: Any, - df2: Any, - *dfs: Any, + df1: AnyDataFrame, + df2: AnyDataFrame, + *dfs: AnyDataFrame, how: str, on: Optional[List[str]] = None, - engine: Any = None, + engine: AnyExecutionEngine = None, engine_conf: Any = None, as_fugue: bool = False, -) -> Any: +) -> AnyDataFrame: """Join two dataframes :param df1: the first dataframe @@ -408,6 +464,10 @@ def join( ``inner``, ``left_outer``, ``right_outer``, ``full_outer``, ``cross`` :param on: it can always be inferred, but if you provide, it will be validated against the inferred keys. + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame + :return: the joined dataframe .. 
note:: @@ -433,14 +493,14 @@ def _join(e: ExecutionEngine): def union( - df1: Any, - df2: Any, - *dfs: Any, + df1: AnyDataFrame, + df2: AnyDataFrame, + *dfs: AnyDataFrame, distinct: bool = True, - engine: Any = None, + engine: AnyExecutionEngine = None, engine_conf: Any = None, as_fugue: bool = False, -) -> Any: +) -> AnyDataFrame: """Join two dataframes :param df1: the first dataframe @@ -448,6 +508,10 @@ def union( :param dfs: more dataframes to union :param distinct: ``true`` for ``UNION`` (== ``UNION DISTINCT``), ``false`` for ``UNION ALL`` + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame + :return: the unioned dataframe .. note:: @@ -474,14 +538,14 @@ def _union(e: ExecutionEngine): def subtract( - df1: Any, - df2: Any, - *dfs: Any, + df1: AnyDataFrame, + df2: AnyDataFrame, + *dfs: AnyDataFrame, distinct: bool = True, - engine: Any = None, + engine: AnyExecutionEngine = None, engine_conf: Any = None, as_fugue: bool = False, -) -> Any: +) -> AnyDataFrame: """``df1 - df2`` :param df1: the first dataframe @@ -489,6 +553,10 @@ def subtract( :param dfs: more dataframes to subtract :param distinct: ``true`` for ``EXCEPT`` (== ``EXCEPT DISTINCT``), ``false`` for ``EXCEPT ALL`` + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame + :return: the unioned dataframe .. note:: @@ -515,14 +583,14 @@ def _subtract(e: ExecutionEngine): def intersect( - df1: Any, - df2: Any, - *dfs: Any, + df1: AnyDataFrame, + df2: AnyDataFrame, + *dfs: AnyDataFrame, distinct: bool = True, # pylint: disable-all - engine: Any = None, + engine: AnyExecutionEngine = None, engine_conf: Any = None, as_fugue: bool = False, -) -> Any: +) -> AnyDataFrame: """Intersect ``df1`` and ``df2`` :param df1: the first dataframe @@ -530,6 +598,10 @@ def intersect( :param dfs: more dataframes to intersect with :param distinct: ``true`` for ``INTERSECT`` (== ``INTERSECT DISTINCT``), ``false`` for ``INTERSECT ALL`` + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame + :return: the unioned dataframe .. 
note:: @@ -555,7 +627,9 @@ def _intersect(e: ExecutionEngine): ) -def _adjust_df(input_dfs: Any, output_df: DataFrame, as_fugue: bool) -> Any: +def _adjust_df( + input_dfs: List[AnyDataFrame], output_df: DataFrame, as_fugue: bool +) -> AnyDataFrame: if as_fugue or any(isinstance(x, DataFrame) for x in input_dfs): return output_df - return output_df.native # type: ignore + return output_df.native_as_df() diff --git a/fugue/execution/execution_engine.py b/fugue/execution/execution_engine.py index 0e83a11e..b579240e 100644 --- a/fugue/execution/execution_engine.py +++ b/fugue/execution/execution_engine.py @@ -2,7 +2,17 @@ from abc import ABC, abstractmethod from contextlib import contextmanager from contextvars import ContextVar -from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Union +from typing import ( + Any, + Callable, + Dict, + Iterable, + Iterator, + List, + Optional, + TypeVar, + Union, +) from uuid import uuid4 from triad import ParamDict, Schema, SerializableRLock, assert_or_throw @@ -25,6 +35,8 @@ from fugue.dataframe.utils import deserialize_df, serialize_df from fugue.exceptions import FugueBug +AnyExecutionEngine = TypeVar("AnyExecutionEngine", object, None) + _FUGUE_EXECUTION_ENGINE_CONTEXT = ContextVar( "_FUGUE_EXECUTION_ENGINE_CONTEXT", default=None ) From c283a247d66b7c92193e765807df01ddd3a1073a Mon Sep 17 00:00:00 2001 From: Han Wang Date: Fri, 23 Dec 2022 08:33:02 +0000 Subject: [PATCH 14/30] lint --- fugue/execution/api.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fugue/execution/api.py b/fugue/execution/api.py index ac161075..9fff8c8c 100644 --- a/fugue/execution/api.py +++ b/fugue/execution/api.py @@ -111,6 +111,10 @@ def run_engine_function( Fugue DataFrame, then return the Fugue DataFrame, otherwise it returns the underlying dataframe using :meth:`~.fugue.dataframe.dataframe.DataFrame.native_as_df` + + .. note:: + + This function is for deveopment use. Users should not need it. """ with engine_context(engine, engine_conf, infer_by=infer_by) as e: res = func(e) @@ -136,7 +140,12 @@ def repartition( :param engine: an engine like object, defaults to None :param engine_conf: the configs for the engine, defaults to None :param as_fugue: whether to force return a Fugue DataFrame + :return: the repartitioned dataframe + + .. caution:: + + This function is experimental, and may be removed in the future. 
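For the engine-aware set operations introduced above (``join``, ``union``, ``subtract``, ``intersect``), a minimal sketch of the intended call pattern, assuming the ``fa`` alias and pandas inputs; the engine is resolved by ``make_execution_engine`` from the explicit ``engine`` argument, the context/global engine, or inference from the inputs:

.. code-block:: python

    import pandas as pd
    import fugue.api as fa

    df1 = pd.DataFrame({"a": [0, 1]})
    df2 = pd.DataFrame({"a": [1, 2]})

    fa.union(df1, df2)        # UNION DISTINCT, pandas output
    fa.intersect(df1, df2)    # rows present in both
    fa.subtract(df1, df2)     # rows in df1 but not in df2
    fa.join(df1, df2, how="inner", on=["a"], as_fugue=True)  # force Fugue output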
""" return run_engine_function( lambda e: e.repartition(e.to_df(df), partition_spec=partition_spec), @@ -159,6 +168,7 @@ def broadcast( :param engine: an engine like object, defaults to None :param engine_conf: the configs for the engine, defaults to None :param as_fugue: whether to force return a Fugue DataFrame + :return: the broadcasted dataframe """ return run_engine_function( @@ -188,6 +198,7 @@ def persist( :param engine: an engine like object, defaults to None :param engine_conf: the configs for the engine, defaults to None :param as_fugue: whether to force return a Fugue DataFrame + :return: the persisted dataframe """ return run_engine_function( @@ -211,6 +222,7 @@ def distinct( :param engine: an engine like object, defaults to None :param engine_conf: the configs for the engine, defaults to None :param as_fugue: whether to force return a Fugue DataFrame + :return: the result with distinct rows """ return run_engine_function( From cb89e0f1487cc37e39bd379a7ec253c5339df1a5 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Fri, 23 Dec 2022 09:34:38 +0000 Subject: [PATCH 15/30] top api docs --- docs/index.rst | 1 + docs/top_api.rst | 119 +++++++++++++++++++++++++++++++++++++++++ fugue/dataframe/api.py | 2 +- 3 files changed, 121 insertions(+), 1 deletion(-) create mode 100644 docs/top_api.rst diff --git a/docs/index.rst b/docs/index.rst index a627874a..5f2912a0 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -12,5 +12,6 @@ Fugue Documentation introduction tutorials community + top_api api diff --git a/docs/top_api.rst b/docs/top_api.rst new file mode 100644 index 00000000..1aa87c68 --- /dev/null +++ b/docs/top_api.rst @@ -0,0 +1,119 @@ +Top Level User API Reference +============================ + +.. |SchemaLikeObject| replace:: :ref:`Schema like object ` +.. |ParamsLikeObject| replace:: :ref:`Parameters like object ` +.. |DataFrameLikeObject| replace:: :ref:`DataFrame like object ` +.. |DataFramesLikeObject| replace:: :ref:`DataFrames like object ` +.. |PartitionLikeObject| replace:: :ref:`Partition like object ` +.. |RPCHandlerLikeObject| replace:: :ref:`RPChandler like object ` + +.. |ExecutionEngine| replace:: :class:`~fugue.execution.execution_engine.ExecutionEngine` +.. |NativeExecutionEngine| replace:: :class:`~fugue.execution.native_execution_engine.NativeExecutionEngine` +.. |FugueWorkflow| replace:: :class:`~fugue.workflow.workflow.FugueWorkflow` + +.. |ReadJoin| replace:: Read Join tutorials on :ref:`workflow ` and :ref:`engine ` for details +.. |FugueConfig| replace:: :doc:`the Fugue Configuration Tutorial ` +.. |PartitionTutorial| replace:: :doc:`the Partition Tutorial ` +.. |FugueSQLTutorial| replace:: :doc:`the Fugue SQL Tutorial ` +.. |DataFrameTutorial| replace:: :ref:`the DataFrame Tutorial ` +.. |ExecutionEngineTutorial| replace:: :doc:`the ExecutionEngine Tutorial ` +.. |ZipComap| replace:: :ref:`Zip & Comap ` +.. |LoadSave| replace:: :ref:`Load & Save ` +.. |AutoPersist| replace:: :ref:`Auto Persist ` +.. |TransformerTutorial| replace:: :doc:`the Transformer Tutorial ` +.. |CoTransformer| replace:: :ref:`CoTransformer ` +.. |CoTransformerTutorial| replace:: :doc:`the CoTransformer Tutorial ` +.. |FugueDataTypes| replace:: :doc:`Fugue Data Types ` + +IO +~~ + +.. autofunction:: fugue.api.as_fugue_dataset + +.. autofunction:: fugue.api.as_fugue_df +.. autofunction:: fugue.api.load +.. autofunction:: fugue.api.save + + + +Information +~~~~~~~~~~~ + +.. autofunction:: fugue.api.count +.. autofunction:: fugue.api.is_bounded +.. autofunction:: fugue.api.is_empty +.. 
autofunction:: fugue.api.is_local +.. autofunction:: fugue.api.show + +.. autofunction:: fugue.api.get_column_names +.. autofunction:: fugue.api.get_schema +.. autofunction:: fugue.api.is_df +.. autofunction:: fugue.api.peek_array +.. autofunction:: fugue.api.peek_dict + + + +Transformation +~~~~~~~~~~~~~~ + +.. autofunction:: fugue.api.alter_columns +.. autofunction:: fugue.api.drop_columns +.. autofunction:: fugue.api.head +.. autofunction:: fugue.api.normalize_column_names +.. autofunction:: fugue.api.rename +.. autofunction:: fugue.api.select_columns + +.. autofunction:: fugue.api.distinct +.. autofunction:: fugue.api.dropna +.. autofunction:: fugue.api.fillna +.. autofunction:: fugue.api.sample +.. autofunction:: fugue.api.take + +.. autofunction:: fugue.api.join +.. autofunction:: fugue.api.union +.. autofunction:: fugue.api.intersect +.. autofunction:: fugue.api.subtract + +.. autofunction:: fugue.api.transform +.. autofunction:: fugue.api.out_transform + +SQL +~~~ + +.. autofunction:: fugue.api.raw_sql + +Conversion +~~~~~~~~~~ + +.. autofunction:: fugue.api.as_array +.. autofunction:: fugue.api.as_array_iterable +.. autofunction:: fugue.api.as_arrow +.. autofunction:: fugue.api.as_dict_iterable +.. autofunction:: fugue.api.as_pandas +.. autofunction:: fugue.api.get_native_as_df + +ExecutionEngine +~~~~~~~~~~~~~~~ +.. autofunction:: fugue.api.engine_context +.. autofunction:: fugue.api.set_global_engine +.. autofunction:: fugue.api.clear_global_engine +.. autofunction:: fugue.api.get_current_engine + + +Big Data Operations +~~~~~~~~~~~~~~~~~~~ +.. autofunction:: fugue.api.broadcast +.. autofunction:: fugue.api.persist +.. autofunction:: fugue.api.repartition + + +Development +~~~~~~~~~~~ + +.. autofunction:: fugue.api.run_engine_function + + + + + diff --git a/fugue/dataframe/api.py b/fugue/dataframe/api.py index bced84e5..8ae9d728 100644 --- a/fugue/dataframe/api.py +++ b/fugue/dataframe/api.py @@ -27,7 +27,7 @@ def as_fugue_df(df: AnyDataFrame) -> DataFrame: @fugue_plugin -def is_df(df: AnyDataFrame) -> bool: +def is_df(df: Any) -> bool: """Whether ``df`` is a DataFrame like object""" return isinstance(df, DataFrame) From b3e4b503cd4cac6cb3983e9150c93ba7ec8bd145 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sat, 24 Dec 2022 02:17:19 +0000 Subject: [PATCH 16/30] Refactor ibis, add fugue sql api --- docs/api/fugue.sql.rst | 8 ++ docs/top_api.rst | 2 + fugue/__init__.py | 3 +- fugue/api.py | 1 + fugue/sql/api.py | 163 +++++++++++++++++++++++++ fugue/sql/workflow.py | 110 ----------------- fugue/workflow/workflow.py | 16 ++- fugue_dask/__init__.py | 5 - fugue_dask/ibis_engine.py | 25 ++-- fugue_duckdb/ibis_engine.py | 26 ++-- fugue_ibis/__init__.py | 9 +- fugue_ibis/execution/ibis_engine.py | 31 ++--- fugue_ibis/execution/pandas_backend.py | 16 ++- fugue_ibis/extensions.py | 9 +- fugue_spark/__init__.py | 5 - fugue_spark/ibis_engine.py | 25 ++-- setup.py | 3 + tests/fugue/workflow/test_workflow.py | 6 + tests/fugue_dask/test_ibis.py | 4 +- tests/fugue_ibis/test_extensions.py | 16 ++- tests/fugue_spark/test_ibis.py | 7 +- 21 files changed, 265 insertions(+), 225 deletions(-) create mode 100644 fugue/sql/api.py diff --git a/docs/api/fugue.sql.rst b/docs/api/fugue.sql.rst index 209ed804..320724bf 100644 --- a/docs/api/fugue.sql.rst +++ b/docs/api/fugue.sql.rst @@ -27,6 +27,14 @@ fugue.sql .. |FugueDataTypes| replace:: :doc:`Fugue Data Types ` +fugue.sql.api +------------- + +.. 
automodule:: fugue.sql.api + :members: + :undoc-members: + :show-inheritance: + fugue.sql.workflow ------------------ diff --git a/docs/top_api.rst b/docs/top_api.rst index 1aa87c68..af6201f8 100644 --- a/docs/top_api.rst +++ b/docs/top_api.rst @@ -81,6 +81,8 @@ Transformation SQL ~~~ +.. autofunction:: fugue.api.fugue_sql +.. autofunction:: fugue.api.fugue_sql_flow .. autofunction:: fugue.api.raw_sql Conversion diff --git a/fugue/__init__.py b/fugue/__init__.py index 69e97d46..bddd314a 100644 --- a/fugue/__init__.py +++ b/fugue/__init__.py @@ -69,7 +69,8 @@ make_rpc_server, to_rpc_handler, ) -from fugue.sql.workflow import FugueSQLWorkflow, fsql +from fugue.sql.api import fsql +from fugue.sql.workflow import FugueSQLWorkflow from fugue.workflow._workflow_context import FugueWorkflowContext from fugue.workflow.module import module from fugue.workflow.workflow import FugueWorkflow, WorkflowDataFrame, WorkflowDataFrames diff --git a/fugue/api.py b/fugue/api.py index 846cb959..0dd4ef0d 100644 --- a/fugue/api.py +++ b/fugue/api.py @@ -42,4 +42,5 @@ take, union, ) +from .sql.api import fugue_sql, fugue_sql_flow from .workflow.api import out_transform, raw_sql, transform diff --git a/fugue/sql/api.py b/fugue/sql/api.py new file mode 100644 index 00000000..a6c00134 --- /dev/null +++ b/fugue/sql/api.py @@ -0,0 +1,163 @@ +from typing import Any, Dict, Tuple + +from triad.utils.convert import get_caller_global_local_vars + +from fugue.dataframe import DataFrame +from fugue.exceptions import FugueSQLError +from fugue.execution import AnyExecutionEngine +from fugue.workflow.workflow import FugueWorkflowResult + +from ..constants import FUGUE_CONF_SQL_IGNORE_CASE +from .workflow import FugueSQLWorkflow + + +def fugue_sql_flow( + query: str, + *args: Any, + fsql_ignore_case: bool = False, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + **kwargs: Any, +) -> FugueWorkflowResult: + dag = _build_dag(query, fsql_ignore_case=fsql_ignore_case, args=args, kwargs=kwargs) + return dag.run(engine, engine_conf) + + +def fugue_sql( + query: str, + *args: Any, + fsql_ignore_case: bool = False, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, + **kwargs: Any, +) -> DataFrame: + dag = _build_dag(query, fsql_ignore_case=fsql_ignore_case, args=args, kwargs=kwargs) + if dag.last_df is not None: + dag.last_df.yield_dataframe_as("result", as_local=as_local) + else: + raise FugueSQLError(f"no dataframe to output from\n{query}") + res = dag.run(engine, engine_conf) + return res["result"] if as_fugue else res["result"].native_as_df() + + +def fsql( + query: str, *args: Any, fsql_ignore_case: bool = False, **kwargs: Any +) -> FugueSQLWorkflow: + """Fugue SQL functional interface + + :param query: the Fugue SQL string (can be a jinja template) + :param args: variables related to the SQL string + :param fsql_ignore_case: whether to ignore case when parsing the SQL string + defaults to False. + :param kwargs: variables related to the SQL string + :return: the translated Fugue workflow + + .. 
code-block:: python + + # Basic case + fsql(''' + CREATE [[0]] SCHEMA a:int + PRINT + ''').run() + + # With external data sources + df = pd.DataFrame([[0],[1]], columns=["a"]) + fsql(''' + SELECT * FROM df WHERE a=0 + PRINT + ''').run() + + # With external variables + df = pd.DataFrame([[0],[1]], columns=["a"]) + t = 1 + fsql(''' + SELECT * FROM df WHERE a={{t}} + PRINT + ''').run() + + # The following is the explicit way to specify variables and datafrems + # (recommended) + df = pd.DataFrame([[0],[1]], columns=["a"]) + t = 1 + fsql(''' + SELECT * FROM df WHERE a={{t}} + PRINT + ''', df=df, t=t).run() + + # Using extensions + def dummy(df:pd.DataFrame) -> pd.DataFrame: + return df + + fsql(''' + CREATE [[0]] SCHEMA a:int + TRANSFORM USING dummy SCHEMA * + PRINT + ''').run() + + # It's recommended to provide full path of the extension inside + # Fugue SQL, so the SQL definition and exeuction can be more + # independent from the extension definition. + + # Run with different execution engines + sql = ''' + CREATE [[0]] SCHEMA a:int + TRANSFORM USING dummy SCHEMA * + PRINT + ''' + + fsql(sql).run(user_defined_spark_session()) + fsql(sql).run(SparkExecutionEngine, {"spark.executor.instances":10}) + fsql(sql).run(DaskExecutionEngine) + + # Passing dataframes between fsql calls + result = fsql(''' + CREATE [[0]] SCHEMA a:int + YIELD DATAFRAME AS x + + CREATE [[1]] SCHEMA a:int + YIELD DATAFRAME AS y + ''').run(DaskExecutionEngine) + + fsql(''' + SELECT * FROM x + UNION + SELECT * FROM y + UNION + SELECT * FROM z + + PRINT + ''', result, z=pd.DataFrame([[2]], columns=["z"])).run() + + # Get framework native dataframes + result["x"].native # Dask dataframe + result["y"].native # Dask dataframe + result["x"].as_pandas() # Pandas dataframe + + # Use lower case fugue sql + df = pd.DataFrame([[0],[1]], columns=["a"]) + t = 1 + fsql(''' + select * from df where a={{t}} + print + ''', df=df, t=t, fsql_ignore_case=True).run() + """ + dag = _build_dag(query, fsql_ignore_case=fsql_ignore_case, args=args, kwargs=kwargs) + return dag + + +def _build_dag( + query: str, + fsql_ignore_case: bool, + args: Tuple[Any, ...], + kwargs: Dict[str, Any], + level: int = -2, +) -> FugueSQLWorkflow: + global_vars, local_vars = get_caller_global_local_vars(start=level, end=level) + dag = FugueSQLWorkflow(compile_conf={FUGUE_CONF_SQL_IGNORE_CASE: fsql_ignore_case}) + try: + dag._sql(query, global_vars, local_vars, *args, **kwargs) + except SyntaxError as ex: + raise SyntaxError(str(ex)).with_traceback(None) from None + return dag diff --git a/fugue/sql/workflow.py b/fugue/sql/workflow.py index 33fcfe73..ca09cecd 100644 --- a/fugue/sql/workflow.py +++ b/fugue/sql/workflow.py @@ -76,113 +76,3 @@ def _split_params( else: p[k] = v return p, dfs - - -def fsql( - sql: str, *args: Any, fsql_ignore_case: bool = False, **kwargs: Any -) -> FugueSQLWorkflow: - """Fugue SQL functional interface - - :param sql: the Fugue SQL string (can be a jinja template) - :param args: variables related to the SQL string - :param fsql_ignore_case: whether to ignore case when parsing the SQL string - defaults to False. - :param kwargs: variables related to the SQL string - :return: the translated Fugue workflow - - .. 
code-block:: python - - # Basic case - fsql(''' - CREATE [[0]] SCHEMA a:int - PRINT - ''').run() - - # With external data sources - df = pd.DataFrame([[0],[1]], columns=["a"]) - fsql(''' - SELECT * FROM df WHERE a=0 - PRINT - ''').run() - - # With external variables - df = pd.DataFrame([[0],[1]], columns=["a"]) - t = 1 - fsql(''' - SELECT * FROM df WHERE a={{t}} - PRINT - ''').run() - - # The following is the explicit way to specify variables and datafrems - # (recommended) - df = pd.DataFrame([[0],[1]], columns=["a"]) - t = 1 - fsql(''' - SELECT * FROM df WHERE a={{t}} - PRINT - ''', df=df, t=t).run() - - # Using extensions - def dummy(df:pd.DataFrame) -> pd.DataFrame: - return df - - fsql(''' - CREATE [[0]] SCHEMA a:int - TRANSFORM USING dummy SCHEMA * - PRINT - ''').run() - - # It's recommended to provide full path of the extension inside - # Fugue SQL, so the SQL definition and exeuction can be more - # independent from the extension definition. - - # Run with different execution engines - sql = ''' - CREATE [[0]] SCHEMA a:int - TRANSFORM USING dummy SCHEMA * - PRINT - ''' - - fsql(sql).run(user_defined_spark_session()) - fsql(sql).run(SparkExecutionEngine, {"spark.executor.instances":10}) - fsql(sql).run(DaskExecutionEngine) - - # Passing dataframes between fsql calls - result = fsql(''' - CREATE [[0]] SCHEMA a:int - YIELD DATAFRAME AS x - - CREATE [[1]] SCHEMA a:int - YIELD DATAFRAME AS y - ''').run(DaskExecutionEngine) - - fsql(''' - SELECT * FROM x - UNION - SELECT * FROM y - UNION - SELECT * FROM z - - PRINT - ''', result, z=pd.DataFrame([[2]], columns=["z"])).run() - - # Get framework native dataframes - result["x"].native # Dask dataframe - result["y"].native # Dask dataframe - result["x"].as_pandas() # Pandas dataframe - - # Use lower case fugue sql - df = pd.DataFrame([[0],[1]], columns=["a"]) - t = 1 - fsql(''' - select * from df where a={{t}} - print - ''', df=df, t=t, fsql_ignore_case=True).run() - """ - global_vars, local_vars = get_caller_global_local_vars() - dag = FugueSQLWorkflow(compile_conf={FUGUE_CONF_SQL_IGNORE_CASE: fsql_ignore_case}) - try: - dag._sql(sql, global_vars, local_vars, *args, **kwargs) - except SyntaxError as ex: - raise SyntaxError(str(ex)).with_traceback(None) from None - return dag diff --git a/fugue/workflow/workflow.py b/fugue/workflow/workflow.py index 70d3227d..e1a02ce0 100644 --- a/fugue/workflow/workflow.py +++ b/fugue/workflow/workflow.py @@ -1469,6 +1469,7 @@ def __init__(self, compile_conf: Any = None): self._compile_conf = ParamDict( {**_FUGUE_GLOBAL_CONF, **ParamDict(compile_conf)} ) + self._last_df: Optional[WorkflowDataFrame] = None @property def conf(self) -> ParamDict: @@ -1548,6 +1549,10 @@ def run( def yields(self) -> Dict[str, Yielded]: return self._yields + @property + def last_df(self) -> Optional[WorkflowDataFrame]: + return self._last_df + def __enter__(self): return self @@ -1593,7 +1598,9 @@ def create( :return: result dataframe """ task = Create(creator=using, schema=schema, params=params) - return self.add(task) + res = self.add(task) + self._last_df = res + return res def process( self, @@ -1633,9 +1640,11 @@ def process( input_names=None if not _dfs.has_key else list(_dfs.keys()), ) if _dfs.has_key: - return self.add(task, **_dfs) + res = self.add(task, **_dfs) else: - return self.add(task, *_dfs.values()) + res = self.add(task, *_dfs.values()) + self._last_df = res + return res def output( self, *dfs: Any, using: Any, params: Any = None, pre_partition: Any = None @@ -1701,6 +1710,7 @@ def create_data( "schema must be 
None when data is WorkflowDataFrame" ), ) + self._last_df = data return data if ( (isinstance(data, (List, Iterable)) and not isinstance(data, str)) diff --git a/fugue_dask/__init__.py b/fugue_dask/__init__.py index 49cbec1d..296f6e5c 100644 --- a/fugue_dask/__init__.py +++ b/fugue_dask/__init__.py @@ -3,8 +3,3 @@ from fugue_dask.dataframe import DaskDataFrame from fugue_dask.execution_engine import DaskExecutionEngine - -try: - from fugue_dask.ibis_engine import DaskIbisEngine -except Exception: # pragma: no cover - pass diff --git a/fugue_dask/ibis_engine.py b/fugue_dask/ibis_engine.py index 52970484..cab934af 100644 --- a/fugue_dask/ibis_engine.py +++ b/fugue_dask/ibis_engine.py @@ -1,16 +1,16 @@ -from typing import Any, Callable, Optional +from typing import Any, Callable import dask.dataframe as dd import ibis -from fugue import DataFrame, DataFrames, ExecutionEngine -from fugue_ibis import IbisTable -from fugue_ibis._utils import to_ibis_schema, to_schema -from fugue_ibis.execution.ibis_engine import IbisEngine, register_ibis_engine from ibis.backends.dask import Backend from triad.utils.assertion import assert_or_throw +from fugue import DataFrame, DataFrames, ExecutionEngine from fugue_dask.dataframe import DaskDataFrame from fugue_dask.execution_engine import DaskExecutionEngine +from fugue_ibis import IbisTable +from fugue_ibis._utils import to_ibis_schema, to_schema +from fugue_ibis.execution.ibis_engine import IbisEngine, parse_ibis_engine class DaskIbisEngine(IbisEngine): @@ -42,13 +42,11 @@ def select( return DaskDataFrame(result, schema=schema) -def _to_dask_ibis_engine( - engine: ExecutionEngine, ibis_engine: Any -) -> Optional[IbisEngine]: - if isinstance(engine, DaskExecutionEngine): - if ibis_engine is None: - return DaskIbisEngine(engine) - return None # pragma: no cover +@parse_ibis_engine.candidate( + lambda obj, *args, **kwargs: isinstance(obj, DaskExecutionEngine) +) +def _to_dask_ibis_engine(obj: Any, engine: ExecutionEngine) -> IbisEngine: + return DaskIbisEngine(engine) class _BackendWrapper(Backend): @@ -62,6 +60,3 @@ def table(self, name: str, schema: Any = None): if schema is None and name in self._schemas else schema, ) - - -register_ibis_engine(0, _to_dask_ibis_engine) diff --git a/fugue_duckdb/ibis_engine.py b/fugue_duckdb/ibis_engine.py index 2de268b0..66fbd3cc 100644 --- a/fugue_duckdb/ibis_engine.py +++ b/fugue_duckdb/ibis_engine.py @@ -1,13 +1,13 @@ from typing import Any, Callable, Optional import ibis -from fugue import DataFrame, DataFrames, ExecutionEngine -from fugue_ibis import IbisTable -from fugue_ibis._utils import to_ibis_schema -from fugue_ibis.execution.ibis_engine import IbisEngine, register_ibis_engine from ibis.backends.pandas import Backend +from fugue import DataFrame, DataFrames, ExecutionEngine from fugue_duckdb.execution_engine import DuckDBEngine, DuckExecutionEngine +from fugue_ibis import IbisTable +from fugue_ibis._utils import to_ibis_schema +from fugue_ibis.execution.ibis_engine import IbisEngine, parse_ibis_engine class DuckDBIbisEngine(IbisEngine): @@ -24,15 +24,12 @@ def select( return engine.select(dfs, sql) -def _to_duckdb_ibis_engine( - engine: ExecutionEngine, ibis_engine: Any -) -> Optional[IbisEngine]: - if isinstance(ibis_engine, str) and ibis_engine in ["duck", "duckdb"]: - return DuckDBIbisEngine(engine) - if isinstance(engine, DuckExecutionEngine): - if ibis_engine is None: - return DuckDBIbisEngine(engine) - return None # pragma: no cover +@parse_ibis_engine.candidate( + lambda obj, *args, **kwargs: 
isinstance(obj, DuckExecutionEngine) + or (isinstance(obj, str) and obj in ["duck", "duckdb"]) +) +def _to_duck_ibis_engine(obj: Any, engine: ExecutionEngine) -> Optional[IbisEngine]: + return DuckDBIbisEngine(engine) class _BackendWrapper(Backend): @@ -41,6 +38,3 @@ def set_schemas(self, dfs: DataFrames) -> None: def table(self, name: str, schema: Any = None): return ibis.table(self._schemas[name], name=name) - - -register_ibis_engine(0, _to_duckdb_ibis_engine) diff --git a/fugue_ibis/__init__.py b/fugue_ibis/__init__.py index 22d8ea89..1366c0fb 100644 --- a/fugue_ibis/__init__.py +++ b/fugue_ibis/__init__.py @@ -3,12 +3,7 @@ from ._compat import IbisTable from .dataframe import IbisDataFrame -from .execution.ibis_engine import IbisEngine, register_ibis_engine -from .execution.pandas_backend import _to_pandas_ibis_engine +from .execution.ibis_engine import IbisEngine, parse_ibis_engine +from .execution.pandas_backend import PandasIbisEngine from .execution_engine import IbisExecutionEngine from .extensions import as_fugue, as_ibis, run_ibis - - -@run_at_def -def register(): - register_ibis_engine(1, _to_pandas_ibis_engine) diff --git a/fugue_ibis/execution/ibis_engine.py b/fugue_ibis/execution/ibis_engine.py index 3108001a..20cbedaa 100644 --- a/fugue_ibis/execution/ibis_engine.py +++ b/fugue_ibis/execution/ibis_engine.py @@ -1,34 +1,21 @@ from abc import abstractmethod -from typing import Any, Callable, List, Optional, Tuple +from typing import Any, Callable import ibis + from fugue import DataFrame, DataFrames, ExecutionEngine +from fugue._utils.registry import fugue_plugin from .._compat import IbisTable -_ENGINE_FUNC: List[ - Tuple[int, int, Callable[[ExecutionEngine, Any], Optional["IbisEngine"]]] -] = [] - - -def register_ibis_engine( - priority: int, func: Callable[[ExecutionEngine, Any], Optional["IbisEngine"]] -) -> None: - _ENGINE_FUNC.append((priority, len(_ENGINE_FUNC), func)) - _ENGINE_FUNC.sort() - -def to_ibis_engine( - execution_engine: ExecutionEngine, ibis_engine: Any = None -) -> "IbisEngine": - if isinstance(ibis_engine, IbisEngine): - return ibis_engine - for _, _, f in _ENGINE_FUNC: - e = f(execution_engine, ibis_engine) - if e is not None: - return e +@fugue_plugin +def parse_ibis_engine(obj: Any, engine: ExecutionEngine) -> "IbisEngine": + if isinstance(obj, IbisEngine): + return obj raise NotImplementedError( - f"can't get ibis engine from {execution_engine}, {ibis_engine}" + f"Ibis execution engine can't be parsed from {obj}." + " You may need to register a parser for it." 
) diff --git a/fugue_ibis/execution/pandas_backend.py b/fugue_ibis/execution/pandas_backend.py index e93b64a5..03787528 100644 --- a/fugue_ibis/execution/pandas_backend.py +++ b/fugue_ibis/execution/pandas_backend.py @@ -1,4 +1,4 @@ -from typing import Any, Callable, Optional +from typing import Any, Callable import ibis import pandas as pd @@ -10,7 +10,7 @@ PandasDataFrame, ) from fugue_ibis._utils import to_ibis_schema, to_schema -from fugue_ibis.execution.ibis_engine import IbisEngine +from .ibis_engine import IbisEngine, parse_ibis_engine from ibis.backends.pandas import Backend from triad.utils.assertion import assert_or_throw @@ -33,13 +33,11 @@ def select( return PandasDataFrame(result, schema=schema) -def _to_pandas_ibis_engine( - engine: ExecutionEngine, ibis_engine: Any -) -> Optional[IbisEngine]: - if isinstance(engine, NativeExecutionEngine): - if ibis_engine is None: - return PandasIbisEngine(engine) - return None # pragma: no cover +@parse_ibis_engine.candidate( + lambda obj, *args, **kwargs: isinstance(obj, NativeExecutionEngine) +) +def _pd_to_ibis_engine(obj: Any, engine: ExecutionEngine) -> IbisEngine: + return PandasIbisEngine(engine) class _BackendWrapper(Backend): diff --git a/fugue_ibis/extensions.py b/fugue_ibis/extensions.py index 6268696c..ffed50a6 100644 --- a/fugue_ibis/extensions.py +++ b/fugue_ibis/extensions.py @@ -6,8 +6,8 @@ from fugue.workflow.workflow import WorkflowDataFrames from triad import assert_or_throw, extension_method -from fugue_ibis._utils import LazyIbisObject, _materialize -from fugue_ibis.execution.ibis_engine import to_ibis_engine +from ._utils import LazyIbisObject, _materialize +from .execution.ibis_engine import parse_ibis_engine from ._compat import IbisTable @@ -196,5 +196,8 @@ class _IbisProcessor(Processor): def process(self, dfs: DataFrames) -> DataFrame: ibis_func = self.params.get_or_throw("ibis_func", Callable) ibis_engine = self.params.get_or_none("ibis_engine", object) - ie = to_ibis_engine(self.execution_engine, ibis_engine) + ie = parse_ibis_engine( + self.execution_engine if ibis_engine is None else ibis_engine, + self.execution_engine, + ) return ie.select(dfs, ibis_func) diff --git a/fugue_spark/__init__.py b/fugue_spark/__init__.py index d3cac0c3..7d74f40d 100644 --- a/fugue_spark/__init__.py +++ b/fugue_spark/__init__.py @@ -3,8 +3,3 @@ from fugue_spark.dataframe import SparkDataFrame from fugue_spark.execution_engine import SparkExecutionEngine - -try: - from fugue_spark.ibis_engine import SparkIbisEngine -except Exception: # pragma: no cover - pass diff --git a/fugue_spark/ibis_engine.py b/fugue_spark/ibis_engine.py index 166a8dd8..26b3adda 100644 --- a/fugue_spark/ibis_engine.py +++ b/fugue_spark/ibis_engine.py @@ -1,13 +1,13 @@ -from typing import Any, Callable, Optional +from typing import Any, Callable import ibis -from fugue import DataFrame, DataFrames, ExecutionEngine -from fugue_ibis import IbisTable -from fugue_ibis._utils import to_schema -from fugue_ibis.execution.ibis_engine import IbisEngine, register_ibis_engine from pyspark.sql import DataFrame as PySparkDataFrame from triad.utils.assertion import assert_or_throw +from fugue import DataFrame, DataFrames, ExecutionEngine +from fugue_ibis import IbisTable +from fugue_ibis._utils import to_schema +from fugue_ibis.execution.ibis_engine import IbisEngine, parse_ibis_engine from fugue_spark.dataframe import SparkDataFrame from fugue_spark.execution_engine import SparkExecutionEngine @@ -38,13 +38,8 @@ def select( return SparkDataFrame(result, schema=schema) 
-def _to_spark_ibis_engine( - engine: ExecutionEngine, ibis_engine: Any -) -> Optional[IbisEngine]: - if isinstance(engine, SparkExecutionEngine): - if ibis_engine is None: - return SparkIbisEngine(engine) - return None # pragma: no cover - - -register_ibis_engine(0, _to_spark_ibis_engine) +@parse_ibis_engine.candidate( + lambda obj, *args, **kwargs: isinstance(obj, SparkExecutionEngine) +) +def _spark_to_ibis_engine(obj: Any, engine: ExecutionEngine) -> IbisEngine: + return SparkIbisEngine(engine) diff --git a/setup.py b/setup.py index 4065bd0e..1b9e1820 100644 --- a/setup.py +++ b/setup.py @@ -89,8 +89,11 @@ def get_version() -> str: "fugue.plugins": [ "ibis = fugue_ibis[ibis]", "duckdb = fugue_duckdb.registry[duckdb]", + "duckdb_ibis = fugue_duckdb.ibis_engine[duckdb,ibis]", "spark = fugue_spark.registry[spark]", + "spark_ibis = fugue_spark.ibis_engine[spark,ibis]", "dask = fugue_dask.registry[dask]", + "dask_ibis = fugue_dask.ibis_engine[dask,ibis]", "ray = fugue_ray.registry[ray]", ] }, diff --git a/tests/fugue/workflow/test_workflow.py b/tests/fugue/workflow/test_workflow.py index 7429f77c..bd9a721e 100644 --- a/tests/fugue/workflow/test_workflow.py +++ b/tests/fugue/workflow/test_workflow.py @@ -67,6 +67,7 @@ def test_workflow(): builder = FugueWorkflow() a = builder.create_data([[0], [0], [1]], "a:int") + assert builder.last_df is a raises(InvalidOperationError, lambda: a._task.copy()) raises(InvalidOperationError, lambda: copy.copy(a._task)) raises(InvalidOperationError, lambda: copy.deepcopy(a._task)) @@ -77,11 +78,16 @@ def test_workflow(): b = a.transform(mock_tf1, "*,b:int", pre_partition=dict(by=["a"])) b.show() + assert builder.last_df is b builder.create_data([[0], [1]], "b:int").show() + assert builder.last_df is not b c = ArrayDataFrame([[100]], "a:int") builder.show(a, b, c) b = a.partition(by=["a"]).transform(mock_tf2).persist().broadcast() b.show() + assert builder.last_df is b + c = builder.df(a) + assert builder.last_df is a builder.run() df_eq(a.result, [[0], [0], [1]], "a:int") diff --git a/tests/fugue_dask/test_ibis.py b/tests/fugue_dask/test_ibis.py index 1141e68d..2274ddd9 100644 --- a/tests/fugue_dask/test_ibis.py +++ b/tests/fugue_dask/test_ibis.py @@ -1,11 +1,11 @@ import pytest ibis = pytest.importorskip("ibis") +from fugue_dask import DaskExecutionEngine +from fugue_dask.ibis_engine import DaskIbisEngine from fugue_ibis import IbisEngine from fugue_test.ibis_suite import IbisTests -from fugue_dask import DaskExecutionEngine, DaskIbisEngine - class DaskIbisTests(IbisTests.Tests): def make_engine(self): diff --git a/tests/fugue_ibis/test_extensions.py b/tests/fugue_ibis/test_extensions.py index 1a01afb1..5e55b216 100644 --- a/tests/fugue_ibis/test_extensions.py +++ b/tests/fugue_ibis/test_extensions.py @@ -1,21 +1,19 @@ import pytest ibis = pytest.importorskip("ibis") -from fugue import FugueWorkflow, NativeExecutionEngine - -from fugue_ibis import as_fugue, as_ibis, run_ibis -from fugue_ibis.execution.ibis_engine import to_ibis_engine -from fugue_ibis.execution.pandas_backend import PandasIbisEngine from pytest import raises +from fugue import FugueWorkflow, NativeExecutionEngine +from fugue_ibis import PandasIbisEngine, as_fugue, as_ibis, parse_ibis_engine, run_ibis + -def test_to_ibis_engine(): +def test_parse_ibis_engine(): e = NativeExecutionEngine() ie = PandasIbisEngine(e) - assert isinstance(to_ibis_engine(e, None), PandasIbisEngine) - assert isinstance(to_ibis_engine(e, ie), PandasIbisEngine) + assert isinstance(parse_ibis_engine(e, e), 
PandasIbisEngine) + assert isinstance(parse_ibis_engine(ie, e), PandasIbisEngine) with raises(NotImplementedError): - to_ibis_engine(e, "dummy") + parse_ibis_engine("dummy", e) def test_run_ibis(): diff --git a/tests/fugue_spark/test_ibis.py b/tests/fugue_spark/test_ibis.py index 664461c7..cf9561ad 100644 --- a/tests/fugue_spark/test_ibis.py +++ b/tests/fugue_spark/test_ibis.py @@ -1,11 +1,12 @@ import pytest ibis = pytest.importorskip("ibis") -from fugue_ibis import IbisEngine -from fugue_test.ibis_suite import IbisTests from pyspark.sql import SparkSession -from fugue_spark import SparkExecutionEngine, SparkIbisEngine +from fugue_ibis import IbisEngine +from fugue_spark import SparkExecutionEngine +from fugue_spark.ibis_engine import SparkIbisEngine +from fugue_test.ibis_suite import IbisTests class SparkIbisTests(IbisTests.Tests): From e9ebb23b21fb1ec70f5734eec6a1ea6400440dae Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sat, 24 Dec 2022 21:06:14 +0000 Subject: [PATCH 17/30] make duckdb columns encoded --- fugue_duckdb/_utils.py | 10 +++++ fugue_duckdb/execution_engine.py | 68 +++++++++++++++++++++++--------- 2 files changed, 59 insertions(+), 19 deletions(-) diff --git a/fugue_duckdb/_utils.py b/fugue_duckdb/_utils.py index f42beef0..83d062f5 100644 --- a/fugue_duckdb/_utils.py +++ b/fugue_duckdb/_utils.py @@ -7,6 +7,7 @@ import pyarrow as pa from duckdb import __version__ as _DUCKDB_VERSION # type: ignore from triad.utils.pyarrow import TRIAD_DEFAULT_TIMESTAMP +from triad import Schema _LEGACY_DUCKDB = _DUCKDB_VERSION < "0.3.3" @@ -36,6 +37,15 @@ def encode_column_name(name: str) -> str: return '"' + name.replace('"', '""') + '"' +def encode_column_names(names: Iterable[str]) -> Iterable[str]: + for name in names: + yield encode_column_name(name) + + +def encode_schema_names(schema: Schema) -> Iterable[str]: + return encode_column_names(schema.names) + + def encode_value_to_expr(value: Any) -> str: # noqa: C901 if isinstance(value, list): return "[" + ", ".join(encode_value_to_expr(x) for x in value) + "]" diff --git a/fugue_duckdb/execution_engine.py b/fugue_duckdb/execution_engine.py index d39fccfc..bfe9c2c5 100644 --- a/fugue_duckdb/execution_engine.py +++ b/fugue_duckdb/execution_engine.py @@ -24,9 +24,16 @@ PandasDataFrame, ) from fugue.dataframe.utils import get_join_schemas -from fugue_duckdb._io import DuckDBIO -from fugue_duckdb._utils import encode_value_to_expr, get_temp_df_name -from fugue_duckdb.dataframe import DuckDataFrame + +from ._io import DuckDBIO +from ._utils import ( + encode_column_name, + encode_column_names, + encode_schema_names, + encode_value_to_expr, + get_temp_df_name, +) +from .dataframe import DuckDataFrame _FUGUE_DUCKDB_PRAGMA_CONFIG_PREFIX = "fugue.duckdb.pragma." 
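The join/dropna/fillna/take changes below all route identifiers through the quoting helpers added in fugue_duckdb/_utils.py: each name is wrapped in double quotes with any embedded quotes doubled, so the generated DuckDB SQL stays valid for column names containing spaces or special characters. A minimal illustrative sketch of the helpers' behavior (not part of the patch itself), assuming they are imported from fugue_duckdb._utils:

    from fugue_duckdb._utils import encode_column_name, encode_column_names

    # a name with a space and an embedded double quote is safely quoted
    encode_column_name('my "odd" col')            # -> '"my ""odd"" col"'

    # encoding several names for a SELECT clause
    ", ".join(encode_column_names(["a", "b c"]))  # -> '"a", "b c"'
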
@@ -170,17 +177,24 @@ def join( get_temp_df_name(), get_temp_df_name(), ) - on_fields = " AND ".join(f"{t1}.{k}={t2}.{k}" for k in key_schema) + on_fields = " AND ".join( + f"{t1}.{encode_column_name(k)}={t2}.{encode_column_name(k)}" + for k in key_schema + ) join_type = self._how_to_join(how) if how.lower() == "cross": select_fields = ",".join( - f"{t1}.{k}" if k in df1.schema else f"{t2}.{k}" + f"{t1}.{encode_column_name(k)}" + if k in df1.schema + else f"{t2}.{encode_column_name(k)}" for k in output_schema.names ) sql = f"SELECT {select_fields} FROM {t1} {join_type} {t2}" elif how.lower() == "right_outer": select_fields = ",".join( - f"{t2}.{k}" if k in df2.schema else f"{t1}.{k}" + f"{t2}.{encode_column_name(k)}" + if k in df2.schema + else f"{t1}.{encode_column_name(k)}" for k in output_schema.names ) sql = ( @@ -188,20 +202,29 @@ def join( ) elif how.lower() == "full_outer": select_fields = ",".join( - f"COALESCE({t1}.{k},{t2}.{k}) AS {k}" if k in key_schema else k + f"COALESCE({t1}.{encode_column_name(k)},{t2}.{encode_column_name(k)}) " + f"AS {encode_column_name(k)}" + if k in key_schema + else encode_column_name(k) for k in output_schema.names ) sql = f"SELECT {select_fields} FROM {t1} {join_type} {t2} ON {on_fields}" elif how.lower() in ["semi", "left_semi"]: - keys = ",".join(key_schema.names) - on_fields = " AND ".join(f"{t1}.{k}={t3}.{k}" for k in key_schema) + keys = ",".join(encode_schema_names(key_schema)) + on_fields = " AND ".join( + f"{t1}.{encode_column_name(k)}={t3}.{encode_column_name(k)}" + for k in key_schema + ) sql = ( f"SELECT {t1}.* FROM {t1} INNER JOIN (SELECT DISTINCT {keys} " f"FROM {t2}) AS {t3} ON {on_fields}" ) elif how.lower() in ["anti", "left_anti"]: - keys = ",".join(key_schema.names) - on_fields = " AND ".join(f"{t1}.{k}={t3}.{k}" for k in key_schema) + keys = ",".join(encode_schema_names(key_schema)) + on_fields = " AND ".join( + f"{t1}.{encode_column_name(k)}={t3}.{encode_column_name(k)}" + for k in key_schema + ) sql = ( f"SELECT {t1}.* FROM {t1} LEFT OUTER JOIN " f"(SELECT DISTINCT {keys}, 1 AS __contain__ FROM {t2}) AS {t3} " @@ -209,7 +232,9 @@ def join( ) else: select_fields = ",".join( - f"{t1}.{k}" if k in df1.schema else f"{t2}.{k}" + f"{t1}.{encode_column_name(k)}" + if k in df1.schema + else f"{t2}.{encode_column_name(k)}" for k in output_schema.names ) sql = f"SELECT {select_fields} FROM {t1} {join_type} {t2} ON {on_fields}" @@ -273,7 +298,10 @@ def dropna( thr = thresh or len(schema) else: # pragma: no cover raise ValueError(f"{how} is not one of any and all") - cw = [f"CASE WHEN {f} IS NULL THEN 0 ELSE 1 END" for f in schema.names] + cw = [ + f"CASE WHEN {encode_column_name(f)} IS NULL THEN 0 ELSE 1 END" + for f in schema.names + ] expr = " + ".join(cw) + f" >= {thr}" return DuckDataFrame(self._to_duck_df(df).native.filter(expr)) @@ -297,7 +325,9 @@ def _build_value_dict(names: List[str]) -> Dict[str, str]: ValueError("fillna value can not be None or contain None"), ) cols = [ - f"COALESCE({f}, {vd[f]}) AS {f}" if f in names else f + f"COALESCE({encode_column_name(f)}, {vd[f]}) AS {encode_column_name(f)}" + if f in names + else encode_column_name(f) for f in df.schema.names ] return DuckDataFrame(self._to_duck_df(df).native.project(", ".join(cols))) @@ -349,8 +379,8 @@ def take( if len(_presort) == 0: if len(partition_spec.partition_by) == 0: return DuckDataFrame(self._to_duck_df(df).native.limit(n)) - cols = ", ".join(df.schema.names) - pcols = ", ".join(partition_spec.partition_by) + cols = ", ".join(encode_schema_names(df.schema)) + 
pcols = ", ".join(encode_column_names(partition_spec.partition_by)) sql = ( f"SELECT *, ROW_NUMBER() OVER (PARTITION BY {pcols}) " f"AS __fugue_take_param FROM {tb}" @@ -360,7 +390,7 @@ def take( sorts: List[str] = [] for k, v in _presort.items(): - s = k + s = encode_column_name(k) if not v: s += " DESC" s += " NULLS FIRST" if na_position == "first" else " NULLS LAST" @@ -371,8 +401,8 @@ def take( sql = f"SELECT * FROM {tb} {sort_expr} LIMIT {n}" return self._sql(sql, {tb: df}) - cols = ", ".join(df.schema.names) - pcols = ", ".join(partition_spec.partition_by) + cols = ", ".join(encode_schema_names(df.schema)) + pcols = ", ".join(encode_column_names(partition_spec.partition_by)) sql = ( f"SELECT *, ROW_NUMBER() OVER (PARTITION BY {pcols} {sort_expr}) " f"AS __fugue_take_param FROM {tb}" From 946560196044ee77bd8c841ced380d9c7b5cc8bb Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sat, 24 Dec 2022 22:17:48 +0000 Subject: [PATCH 18/30] improve test coverage --- fugue/execution/api.py | 8 - fugue/execution/native_execution_engine.py | 9 +- fugue_test/execution_suite.py | 406 ++++++++++++--------- 3 files changed, 232 insertions(+), 191 deletions(-) diff --git a/fugue/execution/api.py b/fugue/execution/api.py index 9fff8c8c..cfbd7d84 100644 --- a/fugue/execution/api.py +++ b/fugue/execution/api.py @@ -637,11 +637,3 @@ def _intersect(e: ExecutionEngine): as_fugue=as_fugue, infer_by=[df1, df2, *dfs], ) - - -def _adjust_df( - input_dfs: List[AnyDataFrame], output_df: DataFrame, as_fugue: bool -) -> AnyDataFrame: - if as_fugue or any(isinstance(x, DataFrame) for x in input_dfs): - return output_df - return output_df.native_as_df() diff --git a/fugue/execution/native_execution_engine.py b/fugue/execution/native_execution_engine.py index 5e9997f6..7027b3fc 100644 --- a/fugue/execution/native_execution_engine.py +++ b/fugue/execution/native_execution_engine.py @@ -7,7 +7,7 @@ from qpd_pandas import run_sql_on_pandas from qpd_pandas.engine import PandasUtils from sqlalchemy import create_engine -from triad.collections import Schema +from triad import Schema from triad.collections.dict import IndexedOrderedDict from triad.collections.fs import FileSystem from triad.utils.assertion import assert_or_throw @@ -32,11 +32,8 @@ to_local_bounded_df, ) from fugue.dataframe.utils import get_join_schemas, to_local_df -from fugue.execution.execution_engine import ( - ExecutionEngine, - MapEngine, - SQLEngine, -) + +from .execution_engine import ExecutionEngine, MapEngine, SQLEngine class SqliteEngine(SQLEngine): diff --git a/fugue_test/execution_suite.py b/fugue_test/execution_suite.py index 002ba7ab..baea8134 100644 --- a/fugue_test/execution_suite.py +++ b/fugue_test/execution_suite.py @@ -6,9 +6,14 @@ from datetime import datetime from unittest import TestCase -import fugue.column.functions as ff import pandas as pd import pytest +from pytest import raises +from triad.collections.fs import FileSystem +from triad.exceptions import InvalidOperationError + +import fugue.api as fa +import fugue.column.functions as ff from fugue import ( ArrayDataFrame, DataFrames, @@ -20,10 +25,6 @@ from fugue.column import SelectColumns, col, lit from fugue.dataframe.utils import _df_eq as df_eq from fugue.execution.native_execution_engine import NativeExecutionEngine -from pytest import raises -from triad.collections.fs import FileSystem -from triad.exceptions import InvalidOperationError - from fugue_test._utils import skip_spark2 @@ -38,6 +39,7 @@ class Tests(TestCase): def setUpClass(cls): register_default_sql_engine(lambda 
engine: engine.sql_engine) cls._engine = cls.make_engine(cls) + fa.set_global_engine(cls._engine) @property def engine(self) -> ExecutionEngine: @@ -45,6 +47,7 @@ def engine(self) -> ExecutionEngine: @classmethod def tearDownClass(cls): + fa.clear_global_engine() cls._engine.stop() def make_engine(self) -> ExecutionEngine: # pragma: no cover @@ -374,38 +377,52 @@ def test_map_with_binary(self): ) df_eq(expected, c, no_pandas=True, check_order=True, throw=True) + def test_join_multiple(self): + e = self.engine + a = e.to_df([[1, 2], [3, 4]], "a:int,b:int") + b = e.to_df([[1, 20], [3, 40]], "a:int,c:int") + c = e.to_df([[1, 200], [3, 400]], "a:int,d:int") + d = fa.join(a, b, c, how="inner") + df_eq( + d, + [[1, 2, 20, 200], [3, 4, 40, 400]], + "a:int,b:int,c:int,d:int", + throw=True, + ) + def test__join_cross(self): e = self.engine a = e.to_df([[1, 2], [3, 4]], "a:int,b:int") b = e.to_df([[6], [7]], "c:int") - c = e.join(a, b, how="Cross") + c = fa.join(a, b, how="Cross") df_eq( c, [[1, 2, 6], [1, 2, 7], [3, 4, 6], [3, 4, 7]], "a:int,b:int,c:int", + throw=True, ) b = e.to_df([], "c:int") - c = e.join(a, b, how="Cross") + c = fa.join(a, b, how="Cross") df_eq(c, [], "a:int,b:int,c:int", throw=True) a = e.to_df([], "a:int,b:int") b = e.to_df([], "c:int") - c = e.join(a, b, how="Cross") + c = fa.join(a, b, how="Cross") df_eq(c, [], "a:int,b:int,c:int", throw=True) def test__join_inner(self): e = self.engine a = e.to_df([[1, 2], [3, 4]], "a:int,b:int") b = e.to_df([[6, 1], [2, 7]], "c:int,a:int") - c = e.join(a, b, how="INNER", on=["a"]) + c = fa.join(a, b, how="INNER", on=["a"]) df_eq(c, [[1, 2, 6]], "a:int,b:int,c:int", throw=True) - c = e.join(b, a, how="INNER", on=["a"]) + c = fa.join(b, a, how="INNER", on=["a"]) df_eq(c, [[6, 1, 2]], "c:int,a:int,b:int", throw=True) a = e.to_df([], "a:int,b:int") b = e.to_df([], "c:int,a:int") - c = e.join(a, b, how="INNER", on=["a"]) + c = fa.join(a, b, how="INNER", on=["a"]) df_eq(c, [], "a:int,b:int,c:int", throw=True) def test__join_outer(self): @@ -413,33 +430,33 @@ def test__join_outer(self): a = e.to_df([], "a:int,b:int") b = e.to_df([], "c:str,a:int") - c = e.join(a, b, how="left_outer", on=["a"]) + c = fa.join(a, b, how="left_outer", on=["a"]) df_eq(c, [], "a:int,b:int,c:str", throw=True) a = e.to_df([], "a:int,b:str") b = e.to_df([], "c:int,a:int") - c = e.join(a, b, how="right_outer", on=["a"]) + c = fa.join(a, b, how="right_outer", on=["a"]) df_eq(c, [], "a:int,b:str,c:int", throw=True) a = e.to_df([], "a:int,b:str") b = e.to_df([], "c:str,a:int") - c = e.join(a, b, how="full_outer", on=["a"]) + c = fa.join(a, b, how="full_outer", on=["a"]) df_eq(c, [], "a:int,b:str,c:str", throw=True) a = e.to_df([[1, "2"], [3, "4"]], "a:int,b:str") b = e.to_df([["6", 1], ["2", 7]], "c:str,a:int") - c = e.join(a, b, how="left_OUTER", on=["a"]) + c = fa.join(a, b, how="left_OUTER", on=["a"]) df_eq(c, [[1, "2", "6"], [3, "4", None]], "a:int,b:str,c:str", throw=True) - c = e.join(b, a, how="left_outer", on=["a"]) + c = fa.join(b, a, how="left_outer", on=["a"]) df_eq(c, [["6", 1, "2"], ["2", 7, None]], "c:str,a:int,b:str", throw=True) a = e.to_df([[1, "2"], [3, "4"]], "a:int,b:str") b = e.to_df([[6, 1], [2, 7]], "c:double,a:int") - c = e.join(a, b, how="left_OUTER", on=["a"]) + c = fa.join(a, b, how="left_OUTER", on=["a"]) df_eq( c, [[1, "2", 6.0], [3, "4", None]], "a:int,b:str,c:double", throw=True ) - c = e.join(b, a, how="left_outer", on=["a"]) + c = fa.join(b, a, how="left_outer", on=["a"]) # assert c.as_pandas().values.tolist()[1][2] is None df_eq( c, [[6.0, 
1, "2"], [2.0, 7, None]], "c:double,a:int,b:str", throw=True @@ -447,11 +464,11 @@ def test__join_outer(self): a = e.to_df([[1, "2"], [3, "4"]], "a:int,b:str") b = e.to_df([["6", 1], ["2", 7]], "c:str,a:int") - c = e.join(a, b, how="right_outer", on=["a"]) + c = fa.join(a, b, how="right_outer", on=["a"]) # assert c.as_pandas().values.tolist()[1][1] is None df_eq(c, [[1, "2", "6"], [7, None, "2"]], "a:int,b:str,c:str", throw=True) - c = e.join(a, b, how="full_outer", on=["a"]) + c = fa.join(a, b, how="full_outer", on=["a"]) df_eq( c, [[1, "2", "6"], [3, "4", None], [7, None, "2"]], @@ -464,21 +481,21 @@ def test__join_outer_pandas_incompatible(self): a = e.to_df([[1, "2"], [3, "4"]], "a:int,b:str") b = e.to_df([[6, 1], [2, 7]], "c:int,a:int") - c = e.join(a, b, how="left_OUTER", on=["a"]) + c = fa.join(a, b, how="left_OUTER", on=["a"]) df_eq( c, [[1, "2", 6], [3, "4", None]], "a:int,b:str,c:int", throw=True, ) - c = e.join(b, a, how="left_outer", on=["a"]) + c = fa.join(b, a, how="left_outer", on=["a"]) df_eq(c, [[6, 1, "2"], [2, 7, None]], "c:int,a:int,b:str", throw=True) a = e.to_df([[1, "2"], [3, "4"]], "a:int,b:str") b = e.to_df([[True, 1], [False, 7]], "c:bool,a:int") - c = e.join(a, b, how="left_OUTER", on=["a"]) + c = fa.join(a, b, how="left_OUTER", on=["a"]) df_eq(c, [[1, "2", True], [3, "4", None]], "a:int,b:str,c:bool", throw=True) - c = e.join(b, a, how="left_outer", on=["a"]) + c = fa.join(b, a, how="left_outer", on=["a"]) df_eq( c, [[True, 1, "2"], [False, 7, None]], "c:bool,a:int,b:str", throw=True ) @@ -487,36 +504,36 @@ def test__join_semi(self): e = self.engine a = e.to_df([[1, 2], [3, 4]], "a:int,b:int") b = e.to_df([[6, 1], [2, 7]], "c:int,a:int") - c = e.join(a, b, how="semi", on=["a"]) + c = fa.join(a, b, how="semi", on=["a"]) df_eq(c, [[1, 2]], "a:int,b:int", throw=True) - c = e.join(b, a, how="semi", on=["a"]) + c = fa.join(b, a, how="semi", on=["a"]) df_eq(c, [[6, 1]], "c:int,a:int", throw=True) b = e.to_df([], "c:int,a:int") - c = e.join(a, b, how="semi", on=["a"]) + c = fa.join(a, b, how="semi", on=["a"]) df_eq(c, [], "a:int,b:int", throw=True) a = e.to_df([], "a:int,b:int") b = e.to_df([], "c:int,a:int") - c = e.join(a, b, how="semi", on=["a"]) + c = fa.join(a, b, how="semi", on=["a"]) df_eq(c, [], "a:int,b:int", throw=True) def test__join_anti(self): e = self.engine a = e.to_df([[1, 2], [3, 4]], "a:int,b:int") b = e.to_df([[6, 1], [2, 7]], "c:int,a:int") - c = e.join(a, b, how="anti", on=["a"]) + c = fa.join(a, b, how="anti", on=["a"]) df_eq(c, [[3, 4]], "a:int,b:int", throw=True) - c = e.join(b, a, how="anti", on=["a"]) + c = fa.join(b, a, how="anti", on=["a"]) df_eq(c, [[2, 7]], "c:int,a:int", throw=True) b = e.to_df([], "c:int,a:int") - c = e.join(a, b, how="anti", on=["a"]) + c = fa.join(a, b, how="anti", on=["a"]) df_eq(c, [[1, 2], [3, 4]], "a:int,b:int", throw=True) a = e.to_df([], "a:int,b:int") b = e.to_df([], "c:int,a:int") - c = e.join(a, b, how="anti", on=["a"]) + c = fa.join(a, b, how="anti", on=["a"]) df_eq(c, [], "a:int,b:int", throw=True) def test__join_with_null_keys(self): @@ -524,41 +541,66 @@ def test__join_with_null_keys(self): e = self.engine a = e.to_df([[1, 2, 3], [4, None, 6]], "a:double,b:double,c:int") b = e.to_df([[1, 2, 33], [4, None, 63]], "a:double,b:double,d:int") - c = e.join(a, b, how="INNER") + c = fa.join(a, b, how="INNER") df_eq(c, [[1, 2, 3, 33]], "a:double,b:double,c:int,d:int", throw=True) def test_union(self): e = self.engine a = e.to_df([[1, 2, 3], [4, None, 6]], "a:double,b:double,c:int") b = e.to_df([[1, 2, 33], [4, 
None, 6]], "a:double,b:double,c:int") - c = e.union(a, b) + c = fa.union(a, b) df_eq( c, [[1, 2, 3], [4, None, 6], [1, 2, 33]], "a:double,b:double,c:int", throw=True, ) - c = e.union(a, b, distinct=False) + c = fa.union(a, b, distinct=False) df_eq( c, [[1, 2, 3], [4, None, 6], [1, 2, 33], [4, None, 6]], "a:double,b:double,c:int", throw=True, ) + d = fa.union(a, b, c, distinct=False) + df_eq( + d, + [ + [1, 2, 3], + [4, None, 6], + [1, 2, 33], + [4, None, 6], + [1, 2, 3], + [4, None, 6], + [1, 2, 33], + [4, None, 6], + ], + "a:double,b:double,c:int", + throw=True, + ) def test_subtract(self): e = self.engine a = e.to_df([[1, 2, 3], [1, 2, 3], [4, None, 6]], "a:double,b:double,c:int") b = e.to_df([[1, 2, 33], [4, None, 6]], "a:double,b:double,c:int") - c = e.subtract(a, b) + c = fa.subtract(a, b) df_eq( c, [[1, 2, 3]], "a:double,b:double,c:int", throw=True, ) + x = e.to_df([[1, 2, 33]], "a:double,b:double,c:int") + y = e.to_df([[4, None, 6]], "a:double,b:double,c:int") + z = fa.subtract(a, x, y) + df_eq( + z, + [[1, 2, 3]], + "a:double,b:double,c:int", + throw=True, + ) # TODO: EXCEPT ALL is not implemented (QPD issue) - # c = e.subtract(a, b, distinct=False) + # c = fa.subtract(a, b, distinct=False) # df_eq( # c, # [[1, 2, 3], [1, 2, 3]], @@ -575,15 +617,30 @@ def test_intersect(self): [[1, 2, 33], [4, None, 6], [4, None, 6], [4, None, 6]], "a:double,b:double,c:int", ) - c = e.intersect(a, b) + c = fa.intersect(a, b) df_eq( c, [[4, None, 6]], "a:double,b:double,c:int", throw=True, ) + x = e.to_df( + [[1, 2, 33]], + "a:double,b:double,c:int", + ) + y = e.to_df( + [[4, None, 6], [4, None, 6], [4, None, 6]], + "a:double,b:double,c:int", + ) + z = fa.intersect(a, x, y) + df_eq( + z, + [], + "a:double,b:double,c:int", + throw=True, + ) # TODO: INTERSECT ALL is not implemented (QPD issue) - # c = e.intersect(a, b, distinct=False) + # c = fa.intersect(a, b, distinct=False) # df_eq( # c, # [[4, None, 6], [4, None, 6]], @@ -596,7 +653,7 @@ def test_distinct(self): a = e.to_df( [[4, None, 6], [1, 2, 3], [4, None, 6]], "a:double,b:double,c:int" ) - c = e.distinct(a) + c = fa.distinct(a) df_eq( c, [[4, None, 6], [1, 2, 3]], @@ -609,11 +666,11 @@ def test_dropna(self): a = e.to_df( [[4, None, 6], [1, 2, 3], [4, None, None]], "a:double,b:double,c:double" ) - c = e.dropna(a) # default - d = e.dropna(a, how="all") - f = e.dropna(a, how="any", thresh=2) - g = e.dropna(a, how="any", subset=["a", "c"]) - h = e.dropna(a, how="any", thresh=1, subset=["a", "c"]) + c = fa.dropna(a) # default + d = fa.dropna(a, how="all") + f = fa.dropna(a, how="any", thresh=2) + g = fa.dropna(a, how="any", subset=["a", "c"]) + h = fa.dropna(a, how="any", thresh=1, subset=["a", "c"]) df_eq( c, [[1, 2, 3]], @@ -644,10 +701,10 @@ def test_fillna(self): a = e.to_df( [[4, None, 6], [1, 2, 3], [4, None, None]], "a:double,b:double,c:double" ) - c = e.fillna(a, value=1) - d = e.fillna(a, {"b": 99, "c": -99}) - f = e.fillna(a, value=-99, subset=["c"]) - g = e.fillna(a, {"b": 99, "c": -99}, subset=["c"]) # subset ignored + c = fa.fillna(a, value=1) + d = fa.fillna(a, {"b": 99, "c": -99}) + f = fa.fillna(a, value=-99, subset=["c"]) + g = fa.fillna(a, {"b": 99, "c": -99}, subset=["c"]) # subset ignored df_eq( c, [[4, 1, 6], [1, 2, 3], [4, 1, 1]], @@ -667,24 +724,24 @@ def test_fillna(self): throw=True, ) df_eq(g, d, throw=True) - raises(ValueError, lambda: e.fillna(a, {"b": None, c: "99"})) - raises(ValueError, lambda: e.fillna(a, None)) - # raises(ValueError, lambda: e.fillna(a, ["b"])) + raises(ValueError, lambda: fa.fillna(a, {"b": None, 
c: "99"})) + raises(ValueError, lambda: fa.fillna(a, None)) + # raises(ValueError, lambda: fa.fillna(a, ["b"])) def test_sample(self): engine = self.engine a = engine.to_df([[x] for x in range(100)], "a:int") with raises(ValueError): - engine.sample(a) # must set one + fa.sample(a) # must set one with raises(ValueError): - engine.sample(a, n=90, frac=0.9) # can't set both + fa.sample(a, n=90, frac=0.9) # can't set both - f = engine.sample(a, frac=0.8, replace=False) - g = engine.sample(a, frac=0.8, replace=True) - h = engine.sample(a, frac=0.8, seed=1) - h2 = engine.sample(a, frac=0.8, seed=1) - i = engine.sample(a, frac=0.8, seed=2) + f = fa.sample(a, frac=0.8, replace=False) + g = fa.sample(a, frac=0.8, replace=True) + h = fa.sample(a, frac=0.8, seed=1) + h2 = fa.sample(a, frac=0.8, seed=1) + i = fa.sample(a, frac=0.8, seed=2) assert not df_eq(f, g, throw=False) df_eq(h, h2, throw=True) assert not df_eq(h, i, throw=False) @@ -705,12 +762,12 @@ def test_take(self): ], "a:str,b:int,c:long", ) - b = e.take(a, n=1, presort="b desc") - c = e.take(a, n=2, presort="a desc", na_position="first") - d = e.take(a, n=1, presort="a asc, b desc", partition_spec=ps) - f = e.take(a, n=1, presort=None, partition_spec=ps2) - g = e.take(a, n=2, presort="a desc", na_position="last") - h = e.take(a, n=2, presort="a", na_position="first") + b = fa.take(a, n=1, presort="b desc") + c = fa.take(a, n=2, presort="a desc", na_position="first") + d = fa.take(a, n=1, presort="a asc, b desc", partition_spec=ps) + f = fa.take(a, n=1, presort=None, partition_spec=ps2) + g = fa.take(a, n=2, presort="a desc", na_position="last") + h = fa.take(a, n=2, presort="a", na_position="first") df_eq( b, [[None, 4, 2]], @@ -750,17 +807,17 @@ def test_take(self): "a:str,b:int,c:long", throw=True, ) - raises(ValueError, lambda: e.take(a, n=0.5, presort=None)) + raises(ValueError, lambda: fa.take(a, n=0.5, presort=None)) def test_sample_n(self): engine = self.engine a = engine.to_df([[x] for x in range(100)], "a:int") - b = engine.sample(a, n=90, replace=False) - c = engine.sample(a, n=90, replace=True) - d = engine.sample(a, n=90, seed=1) - d2 = engine.sample(a, n=90, seed=1) - e = engine.sample(a, n=90, seed=2) + b = fa.sample(a, n=90, replace=False) + c = fa.sample(a, n=90, replace=True) + d = fa.sample(a, n=90, seed=1) + d2 = fa.sample(a, n=90, seed=1) + e = fa.sample(a, n=90, seed=2) assert not df_eq(b, c, throw=False) df_eq(d, d2, throw=True) assert not df_eq(d, e, throw=False) @@ -773,9 +830,9 @@ def test__serialize_by_partition(self): a, PartitionSpec(by=["a"], presort="b"), df_name="_0" ) assert s.count() == 2 - s = e.persist(e._serialize_by_partition(a, PartitionSpec(), df_name="_0")) + s = fa.persist(e._serialize_by_partition(a, PartitionSpec(), df_name="_0")) assert s.count() == 1 - s = e.persist( + s = fa.persist( e._serialize_by_partition(a, PartitionSpec(by=["x"]), df_name="_0") ) assert s.count() == 1 @@ -788,10 +845,10 @@ def test_zip(self): sa = e._serialize_by_partition(a, ps, df_name="_0") sb = e._serialize_by_partition(b, ps, df_name="_1") # test zip with serialized dfs - z1 = e.persist(e.zip(sa, sb, how="inner", partition_spec=ps)) + z1 = fa.persist(e.zip(sa, sb, how="inner", partition_spec=ps)) assert 1 == z1.count() assert not z1.metadata.get("serialized_has_name", False) - z2 = e.persist(e.zip(sa, sb, how="left_outer", partition_spec=ps)) + z2 = fa.persist(e.zip(sa, sb, how="left_outer", partition_spec=ps)) assert 2 == z2.count() # can't have duplicated keys @@ -816,24 +873,24 @@ def test_zip(self): ) # test 
zip with unserialized dfs - z3 = e.persist(e.zip(a, b, partition_spec=ps)) + z3 = fa.persist(e.zip(a, b, partition_spec=ps)) df_eq(z1, z3, throw=True) - z3 = e.persist(e.zip(a, sb, partition_spec=ps)) + z3 = fa.persist(e.zip(a, sb, partition_spec=ps)) df_eq(z1, z3, throw=True) - z3 = e.persist(e.zip(sa, b, partition_spec=ps)) + z3 = fa.persist(e.zip(sa, b, partition_spec=ps)) df_eq(z1, z3, throw=True) - z4 = e.persist(e.zip(a, b, how="left_outer", partition_spec=ps)) + z4 = fa.persist(e.zip(a, b, how="left_outer", partition_spec=ps)) df_eq(z2, z4, throw=True) - z4 = e.persist(e.zip(a, sb, how="left_outer", partition_spec=ps)) + z4 = fa.persist(e.zip(a, sb, how="left_outer", partition_spec=ps)) df_eq(z2, z4, throw=True) - z4 = e.persist(e.zip(sa, b, how="left_outer", partition_spec=ps)) + z4 = fa.persist(e.zip(sa, b, how="left_outer", partition_spec=ps)) df_eq(z2, z4, throw=True) - z5 = e.persist(e.zip(a, b, how="cross")) + z5 = fa.persist(e.zip(a, b, how="cross")) assert z5.count() == 1 assert len(z5.schema) == 2 - z6 = e.persist(e.zip(sa, b, how="cross")) + z6 = fa.persist(e.zip(sa, b, how="cross")) assert z6.count() == 2 assert len(z6.schema) == 3 @@ -844,15 +901,15 @@ def test_zip(self): def test_zip_all(self): e = self.engine a = e.to_df([[1, 2], [3, 4], [1, 5]], "a:int,b:int") - z = e.persist(e.zip_all(DataFrames(a))) + z = fa.persist(e.zip_all(DataFrames(a))) assert 1 == z.count() assert z.metadata.get("serialized", False) assert not z.metadata.get("serialized_has_name", False) - z = e.persist(e.zip_all(DataFrames(x=a))) + z = fa.persist(e.zip_all(DataFrames(x=a))) assert 1 == z.count() assert z.metadata.get("serialized", False) assert z.metadata.get("serialized_has_name", False) - z = e.persist( + z = fa.persist( e.zip_all(DataFrames(x=a), partition_spec=PartitionSpec(by=["a"])) ) assert 2 == z.count() @@ -861,23 +918,23 @@ def test_zip_all(self): b = e.to_df([[6, 1], [2, 7]], "c:int,a:int") c = e.to_df([[6, 1], [2, 7]], "d:int,a:int") - z = e.persist(e.zip_all(DataFrames(a, b, c))) + z = fa.persist(e.zip_all(DataFrames(a, b, c))) assert 1 == z.count() assert not z.metadata.get("serialized_has_name", False) - z = e.persist(e.zip_all(DataFrames(x=a, y=b, z=c))) + z = fa.persist(e.zip_all(DataFrames(x=a, y=b, z=c))) assert 1 == z.count() assert z.metadata.get("serialized_has_name", False) - z = e.persist(e.zip_all(DataFrames(b, b))) + z = fa.persist(e.zip_all(DataFrames(b, b))) assert 2 == z.count() assert not z.metadata.get("serialized_has_name", False) assert ["a", "c"] in z.schema - z = e.persist(e.zip_all(DataFrames(x=b, y=b))) + z = fa.persist(e.zip_all(DataFrames(x=b, y=b))) assert 2 == z.count() assert z.metadata.get("serialized_has_name", False) assert ["a", "c"] in z.schema - z = e.persist( + z = fa.persist( e.zip_all(DataFrames(b, b), partition_spec=PartitionSpec(by=["a"])) ) assert 2 == z.count() @@ -889,12 +946,12 @@ def test_comap(self): e = self.engine a = e.to_df([[1, 2], [3, 4], [1, 5]], "a:int,b:int") b = e.to_df([[6, 1], [2, 7]], "c:int,a:int") - z1 = e.persist(e.zip(a, b)) - z2 = e.persist(e.zip(a, b, partition_spec=ps, how="left_outer")) - z3 = e.persist( + z1 = fa.persist(e.zip(a, b)) + z2 = fa.persist(e.zip(a, b, partition_spec=ps, how="left_outer")) + z3 = fa.persist( e._serialize_by_partition(a, partition_spec=ps, df_name="_x") ) - z4 = e.persist(e.zip(a, b, partition_spec=ps, how="cross")) + z4 = fa.persist(e.zip(a, b, partition_spec=ps, how="cross")) def comap(cursor, dfs): assert not dfs.has_key @@ -938,9 +995,9 @@ def test_comap_with_key(self): a = 
e.to_df([[1, 2], [3, 4], [1, 5]], "a:int,b:int") b = e.to_df([[6, 1], [2, 7]], "c:int,a:int") c = e.to_df([[6, 1]], "c:int,a:int") - z1 = e.persist(e.zip(a, b, df1_name="x", df2_name="y")) - z2 = e.persist(e.zip_all(DataFrames(x=a, y=b, z=b))) - z3 = e.persist( + z1 = fa.persist(e.zip(a, b, df1_name="x", df2_name="y")) + z2 = fa.persist(e.zip_all(DataFrames(x=a, y=b, z=b))) + z3 = fa.persist( e.zip_all(DataFrames(z=c), partition_spec=PartitionSpec(by=["a"])) ) @@ -994,48 +1051,47 @@ def test_save_single_and_load_parquet(self): path = os.path.join(self.tmpdir, "a", "b") e.fs.makedirs(path, recreate=True) # over write folder with single file - e.save_df(b, path, format_hint="parquet", force_single=True) + fa.save(b, path, format_hint="parquet", force_single=True) assert e.fs.isfile(path) - c = e.load_df(path, format_hint="parquet", columns=["a", "c"]) + c = fa.load(path, format_hint="parquet", columns=["a", "c"], as_fugue=True) df_eq(c, [[1, 6], [7, 2]], "a:long,c:int", throw=True) # overwirte single with folder (if applicable) b = ArrayDataFrame([[60, 1], [20, 7]], "c:int,a:long") - e.save_df(b, path, format_hint="parquet", mode="overwrite") - c = e.load_df(path, format_hint="parquet", columns=["a", "c"]) + fa.save(b, path, format_hint="parquet", mode="overwrite") + c = fa.load(path, format_hint="parquet", columns=["a", "c"], as_fugue=True) df_eq(c, [[1, 60], [7, 20]], "a:long,c:int", throw=True) def test_save_and_load_parquet(self): - e = self.engine b = ArrayDataFrame([[6, 1], [2, 7]], "c:int,a:long") path = os.path.join(self.tmpdir, "a", "b") - e.save_df(b, path, format_hint="parquet") - c = e.load_df(path, format_hint="parquet", columns=["a", "c"]) + fa.save(b, path, format_hint="parquet") + c = fa.load(path, format_hint="parquet", columns=["a", "c"], as_fugue=True) df_eq(c, [[1, 6], [7, 2]], "a:long,c:int", throw=True) def test_load_parquet_folder(self): - e = self.engine native = NativeExecutionEngine() a = ArrayDataFrame([[6, 1]], "c:int,a:long") b = ArrayDataFrame([[2, 7], [4, 8]], "c:int,a:long") path = os.path.join(self.tmpdir, "a", "b") - native.save_df(a, os.path.join(path, "a.parquet")) - native.save_df(b, os.path.join(path, "b.parquet")) + fa.save(a, os.path.join(path, "a.parquet"), engine=native) + fa.save(b, os.path.join(path, "b.parquet"), engine=native) FileSystem().touch(os.path.join(path, "_SUCCESS")) - c = e.load_df(path, format_hint="parquet", columns=["a", "c"]) + c = fa.load(path, format_hint="parquet", columns=["a", "c"], as_fugue=True) df_eq(c, [[1, 6], [7, 2], [8, 4]], "a:long,c:int", throw=True) def test_load_parquet_files(self): - e = self.engine native = NativeExecutionEngine() a = ArrayDataFrame([[6, 1]], "c:int,a:long") b = ArrayDataFrame([[2, 7], [4, 8]], "c:int,a:long") path = os.path.join(self.tmpdir, "a", "b") f1 = os.path.join(path, "a.parquet") f2 = os.path.join(path, "b.parquet") - native.save_df(a, f1) - native.save_df(b, f2) - c = e.load_df([f1, f2], format_hint="parquet", columns=["a", "c"]) + fa.save(a, f1, engine=native) + fa.save(b, f2, engine=native) + c = fa.load( + [f1, f2], format_hint="parquet", columns=["a", "c"], as_fugue=True + ) df_eq(c, [[1, 6], [7, 2], [8, 4]], "a:long,c:int", throw=True) @skip_spark2 @@ -1046,39 +1102,37 @@ def test_save_single_and_load_avro(self): path = os.path.join(self.tmpdir, "a", "b") e.fs.makedirs(path, recreate=True) # over write folder with single file - e.save_df(b, path, format_hint="avro", force_single=True) + fa.save(b, path, format_hint="avro", force_single=True) assert e.fs.isfile(path) - c = 
e.load_df(path, format_hint="avro", columns=["a", "c"]) + c = fa.load(path, format_hint="avro", columns=["a", "c"], as_fugue=True) df_eq(c, [[1, 6], [7, 2]], "a:long,c:long", throw=True) # overwirte single with folder (if applicable) b = ArrayDataFrame([[60, 1], [20, 7]], "c:long,a:long") - e.save_df(b, path, format_hint="avro", mode="overwrite") - c = e.load_df(path, format_hint="avro", columns=["a", "c"]) + fa.save(b, path, format_hint="avro", mode="overwrite") + c = fa.load(path, format_hint="avro", columns=["a", "c"], as_fugue=True) df_eq(c, [[1, 60], [7, 20]], "a:long,c:long", throw=True) @skip_spark2 def test_save_and_load_avro(self): # TODO: switch to c:int,a:long when we can preserve schema to avro - e = self.engine b = ArrayDataFrame([[6, 1], [2, 7]], "c:long,a:long") path = os.path.join(self.tmpdir, "a", "b") - e.save_df(b, path, format_hint="avro") - c = e.load_df(path, format_hint="avro", columns=["a", "c"]) + fa.save(b, path, format_hint="avro") + c = fa.load(path, format_hint="avro", columns=["a", "c"], as_fugue=True) df_eq(c, [[1, 6], [7, 2]], "a:long,c:long", throw=True) @skip_spark2 def test_load_avro_folder(self): # TODO: switch to c:int,a:long when we can preserve schema to avro - e = self.engine native = NativeExecutionEngine() a = ArrayDataFrame([[6, 1]], "c:long,a:long") b = ArrayDataFrame([[2, 7], [4, 8]], "c:long,a:long") path = os.path.join(self.tmpdir, "a", "b") - native.save_df(a, os.path.join(path, "a.avro")) - native.save_df(b, os.path.join(path, "b.avro")) + fa.save(a, os.path.join(path, "a.avro"), engine=native) + fa.save(b, os.path.join(path, "b.avro"), engine=native) FileSystem().touch(os.path.join(path, "_SUCCESS")) - c = e.load_df(path, format_hint="avro", columns=["a", "c"]) + c = fa.load(path, format_hint="avro", columns=["a", "c"], as_fugue=True) df_eq(c, [[1, 6], [7, 2], [8, 4]], "a:long,c:long", throw=True) def test_save_single_and_load_csv(self): @@ -1087,60 +1141,58 @@ def test_save_single_and_load_csv(self): path = os.path.join(self.tmpdir, "a", "b") e.fs.makedirs(path, recreate=True) # over write folder with single file - e.save_df(b, path, format_hint="csv", header=True, force_single=True) + fa.save(b, path, format_hint="csv", header=True, force_single=True) assert e.fs.isfile(path) - c = e.load_df( - path, - format_hint="csv", - header=True, - infer_schema=False, + c = fa.load( + path, format_hint="csv", header=True, infer_schema=False, as_fugue=True ) df_eq(c, [["6.1", "1.1"], ["2.1", "7.1"]], "c:str,a:str", throw=True) - c = e.load_df( - path, - format_hint="csv", - header=True, - infer_schema=True, + c = fa.load( + path, format_hint="csv", header=True, infer_schema=True, as_fugue=True ) df_eq(c, [[6.1, 1.1], [2.1, 7.1]], "c:double,a:double", throw=True) with raises(ValueError): - c = e.load_df( + c = fa.load( path, format_hint="csv", header=True, infer_schema=True, columns="c:str,a:str", # invalid to set schema when infer schema + as_fugue=True, ) - c = e.load_df( + c = fa.load( path, format_hint="csv", header=True, infer_schema=False, columns=["a", "c"], + as_fugue=True, ) df_eq(c, [["1.1", "6.1"], ["7.1", "2.1"]], "a:str,c:str", throw=True) - c = e.load_df( + c = fa.load( path, format_hint="csv", header=True, infer_schema=False, columns="a:double,c:double", + as_fugue=True, ) df_eq(c, [[1.1, 6.1], [7.1, 2.1]], "a:double,c:double", throw=True) # overwirte single with folder (if applicable) b = ArrayDataFrame([[60.1, 1.1], [20.1, 7.1]], "c:double,a:double") - e.save_df(b, path, format_hint="csv", header=True, mode="overwrite") - c = 
e.load_df( + fa.save(b, path, format_hint="csv", header=True, mode="overwrite") + c = fa.load( path, format_hint="csv", header=True, infer_schema=False, columns=["a", "c"], + as_fugue=True, ) df_eq(c, [["1.1", "60.1"], ["7.1", "20.1"]], "a:str,c:str", throw=True) @@ -1150,87 +1202,100 @@ def test_save_single_and_load_csv_no_header(self): path = os.path.join(self.tmpdir, "a", "b") e.fs.makedirs(path, recreate=True) # over write folder with single file - e.save_df(b, path, format_hint="csv", header=False, force_single=True) + fa.save(b, path, format_hint="csv", header=False, force_single=True) assert e.fs.isfile(path) with raises(ValueError): - c = e.load_df( + c = fa.load( path, format_hint="csv", header=False, infer_schema=False, + as_fugue=True # when header is False, must set columns ) - c = e.load_df( + c = fa.load( path, format_hint="csv", header=False, infer_schema=False, columns=["c", "a"], + as_fugue=True, ) df_eq(c, [["6.1", "1.1"], ["2.1", "7.1"]], "c:str,a:str", throw=True) - c = e.load_df( + c = fa.load( path, format_hint="csv", header=False, infer_schema=True, columns=["c", "a"], + as_fugue=True, ) df_eq(c, [[6.1, 1.1], [2.1, 7.1]], "c:double,a:double", throw=True) with raises(ValueError): - c = e.load_df( + c = fa.load( path, format_hint="csv", header=False, infer_schema=True, columns="c:double,a:double", + as_fugue=True, ) - c = e.load_df( + c = fa.load( path, format_hint="csv", header=False, infer_schema=False, columns="c:double,a:str", + as_fugue=True, ) df_eq(c, [[6.1, "1.1"], [2.1, "7.1"]], "c:double,a:str", throw=True) def test_save_and_load_csv(self): - e = self.engine b = ArrayDataFrame([[6.1, 1.1], [2.1, 7.1]], "c:double,a:double") path = os.path.join(self.tmpdir, "a", "b") - e.save_df(b, path, format_hint="csv", header=True) - c = e.load_df( + fa.save(b, path, format_hint="csv", header=True) + c = fa.load( path, format_hint="csv", header=True, infer_schema=True, columns=["a", "c"], + as_fugue=True, ) df_eq(c, [[1.1, 6.1], [7.1, 2.1]], "a:double,c:double", throw=True) def test_load_csv_folder(self): - e = self.engine native = NativeExecutionEngine() a = ArrayDataFrame([[6.1, 1.1]], "c:double,a:double") b = ArrayDataFrame([[2.1, 7.1], [4.1, 8.1]], "c:double,a:double") path = os.path.join(self.tmpdir, "a", "b") - native.save_df( - a, os.path.join(path, "a.csv"), format_hint="csv", header=True + fa.save( + a, + os.path.join(path, "a.csv"), + format_hint="csv", + header=True, + engine=native, ) - native.save_df( - b, os.path.join(path, "b.csv"), format_hint="csv", header=True + fa.save( + b, + os.path.join(path, "b.csv"), + format_hint="csv", + header=True, + engine=native, ) FileSystem().touch(os.path.join(path, "_SUCCESS")) - c = e.load_df( + c = fa.load( path, format_hint="csv", header=True, infer_schema=True, columns=["a", "c"], + as_fugue=True, ) df_eq( c, [[1.1, 6.1], [7.1, 2.1], [8.1, 4.1]], "a:double,c:double", throw=True @@ -1242,53 +1307,40 @@ def test_save_single_and_load_json(self): path = os.path.join(self.tmpdir, "a", "b") e.fs.makedirs(path, recreate=True) # over write folder with single file - e.save_df(b, path, format_hint="json", force_single=True) + fa.save(b, path, format_hint="json", force_single=True) assert e.fs.isfile(path) - c = e.load_df( - path, - format_hint="json", - columns=["a", "c"], - ) + c = fa.load(path, format_hint="json", columns=["a", "c"], as_fugue=True) df_eq(c, [[1, 6], [7, 2]], "a:long,c:long", throw=True) # overwirte single with folder (if applicable) b = ArrayDataFrame([[60, 1], [20, 7]], "c:long,a:long") - e.save_df(b, path, 
format_hint="json", mode="overwrite") - c = e.load_df(path, format_hint="json", columns=["a", "c"]) + fa.save(b, path, format_hint="json", mode="overwrite") + c = fa.load(path, format_hint="json", columns=["a", "c"], as_fugue=True) df_eq(c, [[1, 60], [7, 20]], "a:long,c:long", throw=True) def test_save_and_load_json(self): e = self.engine b = ArrayDataFrame([[6, 1], [3, 4], [2, 7], [4, 8], [6, 7]], "c:int,a:long") path = os.path.join(self.tmpdir, "a", "b") - e.save_df( + fa.save( e.repartition(e.to_df(b), PartitionSpec(num=2)), path, format_hint="json", ) - c = e.load_df( - path, - format_hint="json", - columns=["a", "c"], - ) + c = fa.load(path, format_hint="json", columns=["a", "c"], as_fugue=True) df_eq( c, [[1, 6], [7, 2], [4, 3], [8, 4], [7, 6]], "a:long,c:long", throw=True ) def test_load_json_folder(self): - e = self.engine native = NativeExecutionEngine() a = ArrayDataFrame([[6, 1], [3, 4]], "c:int,a:long") b = ArrayDataFrame([[2, 7], [4, 8]], "c:int,a:long") path = os.path.join(self.tmpdir, "a", "b") - native.save_df(a, os.path.join(path, "a.json"), format_hint="json") - native.save_df(b, os.path.join(path, "b.json"), format_hint="json") + fa.save(a, os.path.join(path, "a.json"), format_hint="json", engine=native) + fa.save(b, os.path.join(path, "b.json"), format_hint="json", engine=native) FileSystem().touch(os.path.join(path, "_SUCCESS")) - c = e.load_df( - path, - format_hint="json", - columns=["a", "c"], - ) + c = fa.load(path, format_hint="json", columns=["a", "c"], as_fugue=True) df_eq(c, [[1, 6], [7, 2], [8, 4], [4, 3]], "a:long,c:long", throw=True) From ce24c9fc013e0741c5a470878402d6f2d2d64601 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 26 Dec 2022 09:17:05 +0000 Subject: [PATCH 19/30] fix tests --- docs/top_api.rst | 2 + fugue/__init__.py | 9 +++- fugue/api.py | 11 ++++- fugue/dataframe/api.py | 19 +------- fugue/dataframe/array_dataframe.py | 6 +++ fugue/dataframe/arrow_dataframe.py | 26 +++++++++-- fugue/dataframe/dataframe.py | 41 ++++++++++++++-- fugue/dataframe/pandas_dataframe.py | 26 +++++++++-- fugue/dataset/__init__.py | 2 +- fugue/dataset/api.py | 52 +++++++++++++++------ fugue/dataset/dataset.py | 5 +- fugue/plugins.py | 2 + fugue_dask/dataframe.py | 11 ++++- fugue_dask/execution_engine.py | 15 +++--- fugue_dask/registry.py | 8 ++-- fugue_duckdb/dask.py | 11 +++-- fugue_duckdb/dataframe.py | 7 ++- fugue_duckdb/execution_engine.py | 1 + fugue_duckdb/registry.py | 8 ++-- fugue_ibis/dataframe.py | 5 +- fugue_ibis/execution/ibis_engine.py | 12 +---- fugue_ray/dataframe.py | 48 +++++++++++++------ fugue_ray/execution_engine.py | 2 +- fugue_ray/registry.py | 8 ++-- fugue_spark/dataframe.py | 14 +++++- fugue_spark/registry.py | 6 +-- fugue_test/dataframe_suite.py | 26 +++++++++++ tests/fugue/dataframe/test_dataframe.py | 3 ++ tests/fugue_dask/test_execution_engine.py | 3 ++ tests/fugue_duckdb/test_dask.py | 22 ++++----- tests/fugue_duckdb/test_execution_engine.py | 13 +++++- tests/fugue_ibis/mock/dataframe.py | 15 ++++-- tests/fugue_ray/test_execution_engine.py | 3 ++ 33 files changed, 322 insertions(+), 120 deletions(-) diff --git a/docs/top_api.rst b/docs/top_api.rst index af6201f8..1891c650 100644 --- a/docs/top_api.rst +++ b/docs/top_api.rst @@ -88,6 +88,8 @@ SQL Conversion ~~~~~~~~~~ +.. autofunction:: fugue.api.as_local +.. autofunction:: fugue.api.as_local_bounded .. autofunction:: fugue.api.as_array .. autofunction:: fugue.api.as_array_iterable .. 
autofunction:: fugue.api.as_arrow diff --git a/fugue/__init__.py b/fugue/__init__.py index bddd314a..dd5b389d 100644 --- a/fugue/__init__.py +++ b/fugue/__init__.py @@ -22,10 +22,17 @@ from fugue.dataframe.iterable_dataframe import IterableDataFrame from fugue.dataframe.pandas_dataframe import PandasDataFrame from fugue.dataframe.utils import to_local_bounded_df, to_local_df -from fugue.dataset import Dataset, DatasetDisplay, as_fugue_dataset, get_dataset_display +from fugue.dataset import ( + AnyDataset, + Dataset, + DatasetDisplay, + as_fugue_dataset, + get_dataset_display, +) from fugue.execution.execution_engine import ( AnyExecutionEngine, ExecutionEngine, + ExecutionEngineFacet, MapEngine, SQLEngine, ) diff --git a/fugue/api.py b/fugue/api.py index 0dd4ef0d..1d75e9f3 100644 --- a/fugue/api.py +++ b/fugue/api.py @@ -20,7 +20,16 @@ rename, select_columns, ) -from .dataset.api import as_fugue_dataset, count, is_bounded, is_empty, is_local, show +from .dataset.api import ( + as_fugue_dataset, + as_local, + as_local_bounded, + count, + is_bounded, + is_empty, + is_local, + show, +) from .execution.api import ( broadcast, clear_global_engine, diff --git a/fugue/dataframe/api.py b/fugue/dataframe/api.py index 8ae9d728..481f98d8 100644 --- a/fugue/dataframe/api.py +++ b/fugue/dataframe/api.py @@ -3,27 +3,10 @@ import pandas as pd import pyarrow as pa from triad.collections.schema import Schema -from triad.utils.assertion import assert_or_throw from triad.utils.rename import normalize_names -from fugue.dataset.api import as_fugue_dataset - from .._utils.registry import fugue_plugin -from .dataframe import DataFrame, AnyDataFrame - - -def as_fugue_df(df: AnyDataFrame) -> DataFrame: - """Wrap the object as a Fugue DataFrame. This is a wrapper - of :func:`~fugue.dataset.as_fugue_dataset` - - :param df: the object to wrap - """ - res = as_fugue_dataset(df) - assert_or_throw( - isinstance(res, DataFrame), - TypeError(f"{type(df)} can't be converted to a Fugue DataFrame"), - ) - return res # type: ignore +from .dataframe import AnyDataFrame, DataFrame, as_fugue_df @fugue_plugin diff --git a/fugue/dataframe/array_dataframe.py b/fugue/dataframe/array_dataframe.py index 48ca9ee5..5fc2727b 100644 --- a/fugue/dataframe/array_dataframe.py +++ b/fugue/dataframe/array_dataframe.py @@ -4,6 +4,7 @@ DataFrame, LocalBoundedDataFrame, _get_schema_change, + as_fugue_dataset, ) from fugue.exceptions import FugueDataFrameOperationError from triad.utils.assertion import assert_or_throw @@ -120,3 +121,8 @@ def _iter_cols(self, pos: List[int]) -> Iterable[List[Any]]: else: for row in self.native: yield [row[p] for p in pos] + + +@as_fugue_dataset.candidate(lambda df, **kwargs: isinstance(df, list), priority=0.9) +def _arr_to_fugue(df: List[Any], **kwargs: Any) -> ArrayDataFrame: + return ArrayDataFrame(df, **kwargs) diff --git a/fugue/dataframe/arrow_dataframe.py b/fugue/dataframe/arrow_dataframe.py index 44a25454..a58c8ac6 100644 --- a/fugue/dataframe/arrow_dataframe.py +++ b/fugue/dataframe/arrow_dataframe.py @@ -6,7 +6,15 @@ from triad.exceptions import InvalidOperationError from triad.utils.assertion import assert_or_throw -from fugue.dataset.api import as_fugue_dataset, count, is_bounded, is_empty, is_local +from fugue.dataset.api import ( + as_fugue_dataset, + as_local, + as_local_bounded, + count, + is_bounded, + is_empty, + is_local, +) from fugue.exceptions import FugueDataFrameOperationError from .api import ( @@ -232,9 +240,19 @@ def as_array_iterable( yield list(arr) -@as_fugue_dataset.candidate(lambda 
df: isinstance(df, pa.Table)) -def _pa_table_as_fugue_df(df: pa.Table) -> "ArrowDataFrame": - return ArrowDataFrame(df) +@as_local.candidate(lambda df: isinstance(df, pa.Table)) +def _pa_table_as_local(df: pa.Table) -> pa.Table: + return df + + +@as_local_bounded.candidate(lambda df: isinstance(df, pa.Table)) +def _pa_table_as_local_bounded(df: pa.Table) -> pa.Table: + return df + + +@as_fugue_dataset.candidate(lambda df, **kwargs: isinstance(df, pa.Table)) +def _pa_table_as_fugue_df(df: pa.Table, **kwargs: Any) -> "ArrowDataFrame": + return ArrowDataFrame(df, **kwargs) @is_df.candidate(lambda df: isinstance(df, pa.Table)) diff --git a/fugue/dataframe/dataframe.py b/fugue/dataframe/dataframe.py index 828a89ff..a6adf1dd 100644 --- a/fugue/dataframe/dataframe.py +++ b/fugue/dataframe/dataframe.py @@ -1,6 +1,6 @@ import json from abc import abstractmethod -from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, TypeVar +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, TypeVar, Union import pandas as pd import pyarrow as pa @@ -12,10 +12,16 @@ from .._utils.display import PrettyTable from ..collections.yielded import Yielded -from ..dataset import Dataset, DatasetDisplay, get_dataset_display +from ..dataset import ( + Dataset, + DatasetDisplay, + as_local, + as_local_bounded, + get_dataset_display, + as_fugue_dataset, +) from ..exceptions import FugueDataFrameOperationError - AnyDataFrame = TypeVar("AnyDataFrame", "DataFrame", object) @@ -426,11 +432,40 @@ def show( print("") +def as_fugue_df(df: AnyDataFrame, **kwargs: Any) -> DataFrame: + """Wrap the object as a Fugue DataFrame. + + :param df: the object to wrap + """ + ds = as_fugue_dataset(df, **kwargs) + if isinstance(ds, DataFrame): + return ds + raise TypeError(f"{type(df)} {kwargs} is not recognized as a Fugue DataFrame: {ds}") + + @get_dataset_display.candidate(lambda ds: isinstance(ds, DataFrame), priority=0.1) def _get_dataframe_display(ds: DataFrame): return DataFrameDisplay(ds) +@as_local.candidate(lambda df: isinstance(df, DataFrame) and not df.is_local) +def _df_to_local(df: DataFrame) -> DataFrame: + return df.as_local() + + +@as_local_bounded.candidate( + lambda df: isinstance(df, DataFrame) and not (df.is_local and df.is_bounded), + priority=0.9, +) +def _df_to_local_bounded(df: DataFrame) -> DataFrame: + res: DataFrame = df.as_local() + if not res.is_bounded: + res = as_fugue_df(res.as_array(), schema=df.schema) + if res is not df and df.has_metadata: + res.reset_metadata(df.metadata) + return res + + def _get_schema_change( orig_schema: Optional[Schema], schema: Any ) -> Tuple[Schema, List[int]]: diff --git a/fugue/dataframe/pandas_dataframe.py b/fugue/dataframe/pandas_dataframe.py index e2b456b8..f8ece0e0 100644 --- a/fugue/dataframe/pandas_dataframe.py +++ b/fugue/dataframe/pandas_dataframe.py @@ -6,7 +6,15 @@ from triad.utils.assertion import assert_or_throw from triad.utils.pandas_like import PD_UTILS -from fugue.dataset.api import as_fugue_dataset, count, is_bounded, is_empty, is_local +from fugue.dataset.api import ( + as_fugue_dataset, + as_local, + as_local_bounded, + count, + is_bounded, + is_empty, + is_local, +) from fugue.exceptions import FugueDataFrameOperationError from .api import ( @@ -187,9 +195,19 @@ def _apply_schema( return PD_UTILS.enforce_type(pdf, schema.pa_schema, null_safe=True), schema -@as_fugue_dataset.candidate(lambda df: isinstance(df, pd.DataFrame)) -def _pd_as_fugue_df(df: pd.DataFrame) -> "PandasDataFrame": - return PandasDataFrame(df) 
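# --- illustrative sketch (not part of the patch) ----------------------------
# A rough example of how the candidates above are expected to behave once
# registered; the sample data and variable names below are made up.
import pandas as pd
import pyarrow as pa

import fugue.api as fa

pdf = pd.DataFrame({"a": [1, 2]})
adf = pa.Table.from_pandas(pdf)

# pandas DataFrames and pyarrow Tables are already local and bounded, so the
# as_local / as_local_bounded candidates simply return the input unchanged
assert fa.as_local(pdf) is pdf
assert fa.as_local_bounded(adf) is adf

# as_fugue_df dispatches through as_fugue_dataset and raises TypeError when
# the matched candidate does not produce a DataFrame
fdf = fa.as_fugue_df(adf)
print(type(fdf).__name__)  # expected: ArrowDataFrame
# -----------------------------------------------------------------------------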
+@as_local.candidate(lambda df: isinstance(df, pd.DataFrame)) +def _pd_as_local(df: pd.DataFrame) -> pd.DataFrame: + return df + + +@as_local_bounded.candidate(lambda df: isinstance(df, pd.DataFrame)) +def _pd_as_local_bounded(df: pd.DataFrame) -> pd.DataFrame: + return df + + +@as_fugue_dataset.candidate(lambda df, **kwargs: isinstance(df, pd.DataFrame)) +def _pd_as_fugue_df(df: pd.DataFrame, **kwargs: Any) -> "PandasDataFrame": + return PandasDataFrame(df, **kwargs) @is_df.candidate(lambda df: isinstance(df, pd.DataFrame)) diff --git a/fugue/dataset/__init__.py b/fugue/dataset/__init__.py index 4b140844..d70704e5 100644 --- a/fugue/dataset/__init__.py +++ b/fugue/dataset/__init__.py @@ -1,3 +1,3 @@ # flake8: noqa -from .dataset import Dataset, DatasetDisplay, get_dataset_display from .api import * +from .dataset import AnyDataset, Dataset, DatasetDisplay, get_dataset_display diff --git a/fugue/dataset/api.py b/fugue/dataset/api.py index af3754b4..94e57ed0 100644 --- a/fugue/dataset/api.py +++ b/fugue/dataset/api.py @@ -1,26 +1,26 @@ from typing import Any, Optional from .._utils.registry import fugue_plugin -from .dataset import Dataset +from .dataset import AnyDataset, Dataset @fugue_plugin -def as_fugue_dataset(data: Any) -> Dataset: +def as_fugue_dataset(data: AnyDataset, **kwargs: Any) -> Dataset: """Wrap the input as a :class:`~.Dataset` - :param data: the data to be wrapped + :param data: the dataset to be wrapped """ - if isinstance(data, Dataset): + if isinstance(data, Dataset) and len(kwargs) == 0: return data raise NotImplementedError(f"no registered dataset conversion for {type(data)}") def show( - data: Any, n: int = 10, with_count: bool = False, title: Optional[str] = None + data: AnyDataset, n: int = 10, with_count: bool = False, title: Optional[str] = None ) -> None: """Display the Dataset - :param data: the data that can be recognized by Fugue + :param data: the dataset that can be recognized by Fugue :param n: number of rows to print, defaults to 10 :param with_count: whether to show dataset count, defaults to False :param title: title of the dataset, defaults to None @@ -36,36 +36,60 @@ def show( @fugue_plugin -def is_local(data: Any) -> bool: +def as_local(data: AnyDataset) -> AnyDataset: + """Convert the dataset to a local dataset + + :param data: the dataset that can be recognized by Fugue + """ + if isinstance(data, Dataset) and data.is_local: + return data + return as_local_bounded(data) + + +@fugue_plugin +def as_local_bounded(data: AnyDataset) -> AnyDataset: + """Convert the dataset to a local bounded dataset + + :param data: the dataset that can be recognized by Fugue + """ + if isinstance(data, Dataset) and data.is_local and data.is_bounded: + return data + raise NotImplementedError( + f"no registered function to convert {type(data)} to a local bounded dataset" + ) + + +@fugue_plugin +def is_local(data: AnyDataset) -> bool: """Whether the dataset is local - :param data: the data that can be recognized by Fugue + :param data: the dataset that can be recognized by Fugue """ return as_fugue_dataset(data).is_local @fugue_plugin -def is_bounded(data: Any) -> bool: +def is_bounded(data: AnyDataset) -> bool: """Whether the dataset is local - :param data: the data that can be recognized by Fugue + :param data: the dataset that can be recognized by Fugue """ return as_fugue_dataset(data).is_bounded @fugue_plugin -def is_empty(data: Any) -> bool: +def is_empty(data: AnyDataset) -> bool: """Whether the dataset is empty - :param data: the data that can be recognized by 
Fugue + :param data: the dataset that can be recognized by Fugue """ return as_fugue_dataset(data).empty @fugue_plugin -def count(data: Any) -> int: +def count(data: AnyDataset) -> int: """The number of elements in the dataset - :param data: the data that can be recognized by Fugue + :param data: the dataset that can be recognized by Fugue """ return as_fugue_dataset(data).count() diff --git a/fugue/dataset/dataset.py b/fugue/dataset/dataset.py index 9be2875b..24f3dcdf 100644 --- a/fugue/dataset/dataset.py +++ b/fugue/dataset/dataset.py @@ -1,6 +1,6 @@ import html from abc import ABC, abstractmethod -from typing import Any, Optional +from typing import Any, Optional, TypeVar from triad import ParamDict, SerializableRLock, assert_or_throw @@ -8,6 +8,9 @@ from ..exceptions import FugueDatasetEmptyError +AnyDataset = TypeVar("AnyDataset", "Dataset", object) + + class Dataset(ABC): """The base class of Fugue :class:`~.fugue.dataframe.dataframe.DataFrame` and :class:`~.fugue.bag.bag.Bag`. diff --git a/fugue/plugins.py b/fugue/plugins.py index 9866c2e0..6a7b8aa1 100644 --- a/fugue/plugins.py +++ b/fugue/plugins.py @@ -19,6 +19,8 @@ ) from fugue.dataset import ( as_fugue_dataset, + as_local, + as_local_bounded, count, get_dataset_display, is_bounded, diff --git a/fugue_dask/dataframe.py b/fugue_dask/dataframe.py index 28710143..383a98e0 100644 --- a/fugue_dask/dataframe.py +++ b/fugue_dask/dataframe.py @@ -17,6 +17,7 @@ from fugue.dataframe.dataframe import _input_schema from fugue.exceptions import FugueDataFrameOperationError from fugue.plugins import ( + as_local_bounded, count, drop_columns, get_column_names, @@ -104,7 +105,10 @@ def is_local(self) -> bool: return False def as_local(self) -> LocalDataFrame: - return PandasDataFrame(self.as_pandas(), self.schema) + res = PandasDataFrame(self.as_pandas(), self.schema) + if self.has_metadata: + res.reset_metadata(self.metadata) + return res @property def is_bounded(self) -> bool: @@ -265,6 +269,11 @@ def _dd_is_local(df: dd.DataFrame) -> bool: return False +@as_local_bounded.candidate(lambda df: isinstance(df, dd.DataFrame)) +def _dd_as_local(df: dd.DataFrame) -> bool: + return df.compute() + + @get_column_names.candidate(lambda df: isinstance(df, dd.DataFrame)) def _get_dask_dataframe_columns(df: dd.DataFrame) -> List[Any]: return list(df.columns) diff --git a/fugue_dask/execution_engine.py b/fugue_dask/execution_engine.py index c384e0ea..3374d1df 100644 --- a/fugue_dask/execution_engine.py +++ b/fugue_dask/execution_engine.py @@ -192,14 +192,17 @@ def to_df(self, df: Any, schema: Any = None) -> DaskDataFrame: if isinstance(df, DaskDataFrame): return df if isinstance(df, PandasDataFrame): - return DaskDataFrame( + res = DaskDataFrame( df.native, df.schema, num_partitions=default_partitions ) - return DaskDataFrame( - df.as_array(type_safe=True), - df.schema, - num_partitions=default_partitions, - ) + else: + res = DaskDataFrame( + df.as_array(type_safe=True), + df.schema, + num_partitions=default_partitions, + ) + res.reset_metadata(df.metadata) + return res return DaskDataFrame(df, schema, num_partitions=default_partitions) def repartition( diff --git a/fugue_dask/registry.py b/fugue_dask/registry.py index 0aaeddc3..516cccd1 100644 --- a/fugue_dask/registry.py +++ b/fugue_dask/registry.py @@ -12,7 +12,7 @@ SimpleAnnotationConverter, register_annotation_converter, ) -from fugue.plugins import infer_execution_engine, as_fugue_dataset +from fugue.plugins import as_fugue_dataset, infer_execution_engine from fugue.workflow import 
register_raw_df_type from fugue_dask._utils import DASK_UTILS from fugue_dask.dataframe import DaskDataFrame @@ -26,9 +26,9 @@ def _infer_dask_client(objs: Any) -> Any: return DASK_UTILS.get_or_create_client() -@as_fugue_dataset.candidate(lambda df: isinstance(df, dd.DataFrame)) -def _dask_as_fugue_df(df: dd.DataFrame) -> DaskDataFrame: - return DaskDataFrame(df) +@as_fugue_dataset.candidate(lambda df, **kwargs: isinstance(df, dd.DataFrame)) +def _dask_as_fugue_df(df: dd.DataFrame, **kwargs: Any) -> DaskDataFrame: + return DaskDataFrame(df, **kwargs) def _register_raw_dataframes() -> None: diff --git a/fugue_duckdb/dask.py b/fugue_duckdb/dask.py index 78004b42..472eccdf 100644 --- a/fugue_duckdb/dask.py +++ b/fugue_duckdb/dask.py @@ -44,11 +44,14 @@ def to_df(self, df: Any, schema: Any = None) -> DuckDataFrame: if isinstance(df, (dd.DataFrame, DaskDataFrame)): ddf = self._to_dask_df(df, schema) if all(not pa.types.is_nested(f.type) for f in ddf.schema.fields): - return DuckDataFrame(self.connection.from_df(ddf.as_pandas())) + res = DuckDataFrame(self.connection.from_df(ddf.as_pandas())) else: - return DuckDataFrame( + res = DuckDataFrame( duckdb.arrow(ddf.as_arrow(), connection=self.connection) ) + if ddf.has_metadata: + res.reset_metadata(ddf.metadata) + return res return super().to_df(df, schema) def repartition(self, df: DataFrame, partition_spec: PartitionSpec) -> DataFrame: @@ -123,5 +126,7 @@ def _to_auto_df( def _to_dask_df(self, df: Any, schema: Any = None) -> DaskDataFrame: if isinstance(df, DuckDataFrame): - return self._dask_engine.to_df(df.as_pandas(), df.schema) + res = self._dask_engine.to_df(df.as_pandas(), df.schema) + res.reset_metadata(df.metadata if df.has_metadata else None) + return res return self._dask_engine.to_df(df, schema) diff --git a/fugue_duckdb/dataframe.py b/fugue_duckdb/dataframe.py index 8a49e1b7..9861c2e8 100644 --- a/fugue_duckdb/dataframe.py +++ b/fugue_duckdb/dataframe.py @@ -13,7 +13,7 @@ LocalDataFrame, ) from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError -from fugue.plugins import get_column_names, is_df +from fugue.plugins import get_column_names, is_df, as_local_bounded from ._utils import encode_column_name, to_duck_type, to_pa_type @@ -146,6 +146,11 @@ def _duck_is_df(df: DuckDBPyRelation) -> bool: return True +@as_local_bounded.candidate(lambda df: isinstance(df, DuckDBPyRelation)) +def _duck_as_local(df: DuckDBPyRelation) -> DuckDBPyRelation: + return df + + @get_column_names.candidate(lambda df: isinstance(df, DuckDBPyRelation)) def _get_duckdb_columns(df: DuckDBPyRelation) -> List[Any]: return list(df.columns) diff --git a/fugue_duckdb/execution_engine.py b/fugue_duckdb/execution_engine.py index bfe9c2c5..42caca18 100644 --- a/fugue_duckdb/execution_engine.py +++ b/fugue_duckdb/execution_engine.py @@ -469,6 +469,7 @@ def _to_duck_df(self, df: Any, schema: Any = None) -> DuckDataFrame: rdf = DuckDataFrame( duckdb.arrow(df.as_arrow(), connection=self.connection) ) + rdf.reset_metadata(df.metadata if df.has_metadata else None) return rdf tdf = ArrowDataFrame(df, schema) return DuckDataFrame(duckdb.arrow(tdf.native, connection=self.connection)) diff --git a/fugue_duckdb/registry.py b/fugue_duckdb/registry.py index a63ed7d7..b9fea78b 100644 --- a/fugue_duckdb/registry.py +++ b/fugue_duckdb/registry.py @@ -17,7 +17,7 @@ SimpleAnnotationConverter, register_annotation_converter, ) -from fugue.plugins import infer_execution_engine, as_fugue_dataset +from fugue.plugins import as_fugue_dataset, infer_execution_engine from 
fugue.workflow import register_raw_df_type from fugue_duckdb.dataframe import DuckDataFrame from fugue_duckdb.execution_engine import DuckDBEngine, DuckExecutionEngine @@ -30,9 +30,9 @@ def _infer_duckdb_client(objs: Any) -> Any: return "duckdb" -@as_fugue_dataset.candidate(lambda df: isinstance(df, DuckDBPyRelation)) -def _duckdb_as_fugue_df(df: DuckDBPyRelation) -> DuckDataFrame: - return DuckDataFrame(df) +@as_fugue_dataset.candidate(lambda df, **kwargs: isinstance(df, DuckDBPyRelation)) +def _duckdb_as_fugue_df(df: DuckDBPyRelation, **kwargs: Any) -> DuckDataFrame: + return DuckDataFrame(df, **kwargs) def _register_raw_dataframes() -> None: diff --git a/fugue_ibis/dataframe.py b/fugue_ibis/dataframe.py index bd6aeef8..717f21d8 100644 --- a/fugue_ibis/dataframe.py +++ b/fugue_ibis/dataframe.py @@ -115,7 +115,10 @@ def as_pandas(self) -> pd.DataFrame: return self.as_local().as_pandas() def as_local(self) -> LocalDataFrame: - return self._to_local_df(self._table, schema=self.schema) + res = self._to_local_df(self._table, schema=self.schema) + if res is not self and self.has_metadata: + res.reset_metadata(self.metadata) + return res def as_array( self, columns: Optional[List[str]] = None, type_safe: bool = False diff --git a/fugue_ibis/execution/ibis_engine.py b/fugue_ibis/execution/ibis_engine.py index 20cbedaa..e0d2f90a 100644 --- a/fugue_ibis/execution/ibis_engine.py +++ b/fugue_ibis/execution/ibis_engine.py @@ -3,7 +3,7 @@ import ibis -from fugue import DataFrame, DataFrames, ExecutionEngine +from fugue import DataFrame, DataFrames, ExecutionEngine, ExecutionEngineFacet from fugue._utils.registry import fugue_plugin from .._compat import IbisTable @@ -19,20 +19,12 @@ def parse_ibis_engine(obj: Any, engine: ExecutionEngine) -> "IbisEngine": ) -class IbisEngine: +class IbisEngine(ExecutionEngineFacet): """The abstract base class for different ibis execution implementations. 
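# --- illustrative sketch (not part of the patch) ----------------------------
# This patch gives as_local the same metadata-preserving behavior across the
# backends (Dask, DuckDB, Ibis, Ray, Spark).  A minimal example of the
# behavior being standardized, using made-up data:
import fugue.api as fa

fdf = fa.as_fugue_df([[1.0, "x"]], schema="a:double,b:str")
fdf.reset_metadata({"source": "demo"})

ldf = fa.as_local(fdf)            # conversion keeps the metadata
assert fa.is_local(ldf)
assert ldf.metadata == {"source": "demo"}

lbdf = fa.as_local_bounded(fdf)   # so does the bounded variant
assert fa.is_local(lbdf) and fa.is_bounded(lbdf)
# -----------------------------------------------------------------------------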
:param execution_engine: the execution engine this ibis engine will run on """ - def __init__(self, execution_engine: ExecutionEngine) -> None: - self._execution_engine = execution_engine - - @property - def execution_engine(self) -> ExecutionEngine: - """the execution engine this ibis engine will run on""" - return self._execution_engine - @abstractmethod def select( self, dfs: DataFrames, ibis_func: Callable[[ibis.BaseBackend], IbisTable] diff --git a/fugue_ray/dataframe.py b/fugue_ray/dataframe.py index c14dc5e8..553259fc 100644 --- a/fugue_ray/dataframe.py +++ b/fugue_ray/dataframe.py @@ -14,9 +14,9 @@ ) from fugue.dataframe.dataframe import _input_schema from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError -from fugue.plugins import get_column_names, rename, is_df +from fugue.plugins import get_column_names, rename, is_df, as_local_bounded -from ._utils.dataframe import _build_empty_arrow, build_empty, get_dataset_format +from ._utils.dataframe import build_empty, get_dataset_format class RayDataFrame(DataFrame): @@ -41,6 +41,7 @@ def __init__( # noqa: C901 schema: Any = None, internal_schema: bool = False, ): + metadata: Any = None if internal_schema: schema = _input_schema(schema).assert_not_empty() if df is None: @@ -71,6 +72,7 @@ def __init__( # noqa: C901 rdf = df._native if schema is None: schema = df.schema + metadata = None if not df.has_metadata else df.metadata elif isinstance(df, (pd.DataFrame, pd.Series)): if isinstance(df, pd.Series): df = df.to_frame() @@ -86,11 +88,14 @@ def __init__( # noqa: C901 rdf = rd.from_arrow(df.as_arrow(type_safe=True)) if schema is None: schema = df.schema + metadata = None if not df.has_metadata else df.metadata else: raise ValueError(f"{df} is incompatible with DaskDataFrame") rdf, schema = self._apply_schema(rdf, schema, internal_schema) super().__init__(schema) self._native = rdf + if metadata is not None: + self.reset_metadata(metadata) @property def native(self) -> rd.Dataset: @@ -107,8 +112,12 @@ def is_local(self) -> bool: def as_local(self) -> LocalDataFrame: adf = self.as_arrow() if adf.shape[0] == 0: - return ArrowDataFrame([], self.schema) - return ArrowDataFrame(adf) + res = ArrowDataFrame([], self.schema) + else: + res = ArrowDataFrame(adf) + if self.has_metadata: + res.reset_metadata(self.metadata) + return res @property def is_bounded(self) -> bool: @@ -150,17 +159,7 @@ def count(self) -> int: return self.native.count() def as_arrow(self, type_safe: bool = False) -> pa.Table: - def get_tables() -> Iterable[pa.Table]: - empty = True - for block in self.native.get_internal_block_refs(): - tb = ray.get(block) - if tb.shape[0] > 0: - yield tb - empty = False - if empty: - yield _build_empty_arrow(self.schema) - - return pa.concat_tables(get_tables()) + return pa.concat_tables(_get_arrow_tables(self.native)) def as_pandas(self) -> pd.DataFrame: return self.as_arrow().to_pandas() @@ -244,6 +243,11 @@ def _rd_is_df(df: rd.Dataset) -> bool: return True +@as_local_bounded.candidate(lambda df: isinstance(df, rd.Dataset)) +def _rd_as_local(df: rd.Dataset) -> bool: + return pa.concat_tables(_get_arrow_tables(df)) + + @get_column_names.candidate(lambda df: isinstance(df, rd.Dataset)) def _get_ray_dataframe_columns(df: rd.Dataset) -> List[Any]: fmt = get_dataset_format(df) @@ -264,3 +268,17 @@ def _rename_ray_dataframe(df: rd.Dataset, columns: Dict[str, Any]) -> rd.Dataset raise FugueDataFrameOperationError("found nonexistent columns: {missing}") new_cols = [columns.get(name, name) for name in cols] return 
df.map_batches(lambda b: b.rename_columns(new_cols), batch_format="pyarrow") + + +def _get_arrow_tables(df: rd.Dataset) -> Iterable[pa.Table]: + last_empty: Any = None + empty = True + for block in df.get_internal_block_refs(): + tb = ray.get(block) + if tb.shape[0] > 0: + yield tb + empty = False + else: + last_empty = tb + if empty: + yield last_empty diff --git a/fugue_ray/execution_engine.py b/fugue_ray/execution_engine.py index e8a24a4b..de8a51e0 100644 --- a/fugue_ray/execution_engine.py +++ b/fugue_ray/execution_engine.py @@ -214,7 +214,7 @@ def persist( df = self._to_auto_df(df) if isinstance(df, RayDataFrame): return df.persist(**kwargs) - return df + return df # pragma: no cover def convert_yield_dataframe(self, df: DataFrame, as_local: bool) -> DataFrame: if isinstance(df, RayDataFrame): diff --git a/fugue_ray/registry.py b/fugue_ray/registry.py index 8ec17013..771c6358 100644 --- a/fugue_ray/registry.py +++ b/fugue_ray/registry.py @@ -11,7 +11,7 @@ SimpleAnnotationConverter, register_annotation_converter, ) -from fugue.plugins import infer_execution_engine, as_fugue_dataset +from fugue.plugins import as_fugue_dataset, infer_execution_engine from fugue.workflow import register_raw_df_type from .dataframe import RayDataFrame @@ -25,9 +25,9 @@ def _infer_ray_client(objs: Any) -> Any: return "ray" -@as_fugue_dataset.candidate(lambda df: isinstance(df, rd.Dataset)) -def _ray_as_fugue_df(df: rd.Dataset) -> RayDataFrame: - return RayDataFrame(df) +@as_fugue_dataset.candidate(lambda df, **kwargs: isinstance(df, rd.Dataset)) +def _ray_as_fugue_df(df: rd.Dataset, **kwargs: Any) -> RayDataFrame: + return RayDataFrame(df, **kwargs) def _register_raw_dataframes() -> None: diff --git a/fugue_spark/dataframe.py b/fugue_spark/dataframe.py index 090479c4..fea74d06 100644 --- a/fugue_spark/dataframe.py +++ b/fugue_spark/dataframe.py @@ -18,6 +18,7 @@ ) from fugue.exceptions import FugueDataFrameOperationError from fugue.plugins import ( + as_local_bounded, count, drop_columns, get_column_names, @@ -87,8 +88,12 @@ def is_bounded(self) -> bool: def as_local(self) -> LocalDataFrame: if any(pa.types.is_nested(t) for t in self.schema.types): data = list(to_type_safe_input(self.native.collect(), self.schema)) - return ArrayDataFrame(data, self.schema) - return PandasDataFrame(self.native.toPandas(), self.schema) + res: LocalDataFrame = ArrayDataFrame(data, self.schema) + else: + res = PandasDataFrame(self.native.toPandas(), self.schema) + if self.has_metadata: + res.reset_metadata(self.metadata) + return res @property def num_partitions(self) -> int: @@ -198,6 +203,11 @@ def _spark_df_is_local(df: ps.DataFrame) -> bool: return False +@as_local_bounded.candidate(lambda df: isinstance(df, ps.DataFrame)) +def _spark_df_as_local(df: ps.DataFrame) -> pd.DataFrame: + return df.toPandas() + + @get_column_names.candidate(lambda df: isinstance(df, ps.DataFrame)) def _get_spark_df_columns(df: ps.DataFrame) -> List[Any]: return df.columns diff --git a/fugue_spark/registry.py b/fugue_spark/registry.py index 8700f802..61c95ce7 100644 --- a/fugue_spark/registry.py +++ b/fugue_spark/registry.py @@ -35,9 +35,9 @@ def _infer_spark_client(obj: Any) -> Any: return SparkSession.builder.getOrCreate() -@as_fugue_dataset.candidate(lambda df: isinstance(df, ps.DataFrame)) -def _spark_as_fugue_df(df: ps.DataFrame) -> SparkDataFrame: - return SparkDataFrame(df) +@as_fugue_dataset.candidate(lambda df, **kwargs: isinstance(df, ps.DataFrame)) +def _spark_as_fugue_df(df: ps.DataFrame, **kwargs: Any) -> SparkDataFrame: + return 
SparkDataFrame(df, **kwargs) @parse_creator.candidate(lambda obj: _is_sparksql(obj)) diff --git a/fugue_test/dataframe_suite.py b/fugue_test/dataframe_suite.py index aff9cd34..4ad2c36b 100644 --- a/fugue_test/dataframe_suite.py +++ b/fugue_test/dataframe_suite.py @@ -64,6 +64,26 @@ def test_as_pandas(self): assert [] == pdf.values.tolist() assert fi.is_local(pdf) + def test_as_local(self): + with raises(NotImplementedError): + fi.as_local(10) + with raises(NotImplementedError): + fi.as_local_bounded(10) + + df = self.df([["a", 1.0], ["b", 2.0]], "x:str,y:double") + ldf = fi.as_local(df) + assert fi.is_local(ldf) + lbdf = fi.as_local_bounded(df) + assert fi.is_local(lbdf) and fi.is_bounded(lbdf) + + fdf = fi.as_fugue_df(df) + fdf.reset_metadata({"a": 1}) + ldf = fi.as_local(fdf) + assert ldf.metadata == {"a": 1} + lbdf = fi.as_local_bounded(fdf) + assert fi.is_local(lbdf) and fi.is_bounded(lbdf) + assert ldf.metadata == {"a": 1} + def test_drop_columns(self): df = fi.drop_columns(self.df([], "a:str,b:int"), ["a"]) assert fi.get_schema(df) == "b:int" @@ -122,6 +142,12 @@ def test_rename(self): assert fi.get_schema(df) == "a:str,b:int" df_eq(fi.as_fugue_df(df2), data, "aa:str,b:int", throw=True) + for data in [[["a", 1]], []]: + df = self.df(data, "a:str,b:int") + df3 = fi.rename(df, columns={}) + assert fi.get_schema(df3) == "a:str,b:int" + df_eq(fi.as_fugue_df(df3), data, "a:str,b:int", throw=True) + def test_rename_invalid(self): df = self.df([["a", 1]], "a:str,b:int") raises( diff --git a/tests/fugue/dataframe/test_dataframe.py b/tests/fugue/dataframe/test_dataframe.py index 35cf5d2a..0d7cb7b6 100644 --- a/tests/fugue/dataframe/test_dataframe.py +++ b/tests/fugue/dataframe/test_dataframe.py @@ -6,11 +6,14 @@ from fugue.dataframe import ArrayDataFrame, DataFrame from fugue.api import as_fugue_df, get_native_as_df +from fugue.bag.array_bag import ArrayBag def test_as_fugue_df(): with raises(NotImplementedError): as_fugue_df(10) + with raises(TypeError): + as_fugue_df(ArrayBag([1, 2])) df = pd.DataFrame([[0]], columns=["a"]) assert isinstance(as_fugue_df(df), DataFrame) diff --git a/tests/fugue_dask/test_execution_engine.py b/tests/fugue_dask/test_execution_engine.py index 79e4dc5f..2273eb00 100644 --- a/tests/fugue_dask/test_execution_engine.py +++ b/tests/fugue_dask/test_execution_engine.py @@ -6,6 +6,7 @@ import pandas as pd from dask.distributed import Client +import fugue.api as fa from fugue import transform from fugue.collections.partition import PartitionSpec from fugue.dataframe.pandas_dataframe import PandasDataFrame @@ -29,9 +30,11 @@ class DaskExecutionEngineTests(ExecutionEngineTests.Tests): @classmethod def setUpClass(cls): cls._engine = cls.make_engine(cls) + fa.set_global_engine(cls._engine) @classmethod def tearDownClass(cls): + fa.clear_global_engine() cls._engine.dask_client.close() def make_engine(self): diff --git a/tests/fugue_duckdb/test_dask.py b/tests/fugue_duckdb/test_dask.py index 85081e49..747eb009 100644 --- a/tests/fugue_duckdb/test_dask.py +++ b/tests/fugue_duckdb/test_dask.py @@ -4,17 +4,17 @@ import duckdb import pandas as pd import pyarrow as pa -from fugue import ArrowDataFrame, DataFrame, FugueWorkflow, PartitionSpec +from dask.distributed import Client +from pytest import raises + +import fugue.api as fa +from fugue import ArrowDataFrame, DataFrame, FugueWorkflow, PartitionSpec, fsql from fugue.dataframe.utils import _df_eq as df_eq from fugue_dask import DaskDataFrame -from fugue import fsql -from fugue_test.builtin_suite import BuiltInTests -from 
fugue_test.execution_suite import ExecutionEngineTests -from pytest import raises -from dask.distributed import Client from fugue_duckdb import DuckDaskExecutionEngine from fugue_duckdb.dataframe import DuckDataFrame - +from fugue_test.builtin_suite import BuiltInTests +from fugue_test.execution_suite import ExecutionEngineTests _CONF = { "fugue.rpc.server": "fugue.rpc.flask.FlaskRPCServer", @@ -29,9 +29,11 @@ class DuckDaskExecutionEngineTests(ExecutionEngineTests.Tests): def setUpClass(cls): cls._con = duckdb.connect() cls._engine = cls.make_engine(cls) + fa.set_global_engine(cls._engine) @classmethod def tearDownClass(cls): + fa.clear_global_engine() cls._con.close() cls._engine.dask_client.close() @@ -118,13 +120,11 @@ def test_yield_2(self): def assert_data(df: DataFrame) -> None: assert df.schema == "a:datetime,b:bytes,c:[long]" - df = pd.DataFrame( - [[1,2,3]], columns=list("abc") - ) + df = pd.DataFrame([[1, 2, 3]], columns=list("abc")) with FugueWorkflow() as dag: x = dag.df(df) result = dag.select("SELECT * FROM ", x) result.yield_dataframe_as("x") res = dag.run(self.engine) - assert res["x"].as_array() == [[1,2,3]] + assert res["x"].as_array() == [[1, 2, 3]] diff --git a/tests/fugue_duckdb/test_execution_engine.py b/tests/fugue_duckdb/test_execution_engine.py index f01d63bf..49830e14 100644 --- a/tests/fugue_duckdb/test_execution_engine.py +++ b/tests/fugue_duckdb/test_execution_engine.py @@ -5,14 +5,14 @@ import pyarrow as pa from pytest import raises +import fugue.api as fa from fugue import ArrowDataFrame, DataFrame, FugueWorkflow, fsql -from fugue.dataframe.utils import _df_eq as df_eq +from fugue.api import engine_context from fugue.plugins import infer_execution_engine from fugue_duckdb import DuckExecutionEngine from fugue_duckdb.dataframe import DuckDataFrame from fugue_test.builtin_suite import BuiltInTests from fugue_test.execution_suite import ExecutionEngineTests -from fugue.api import engine_context class DuckExecutionEngineTests(ExecutionEngineTests.Tests): @@ -20,9 +20,11 @@ class DuckExecutionEngineTests(ExecutionEngineTests.Tests): def setUpClass(cls): cls._con = duckdb.connect() cls._engine = cls.make_engine(cls) + fa.set_global_engine(cls._engine) @classmethod def tearDownClass(cls): + fa.clear_global_engine() cls._con.close() def make_engine(self): @@ -31,6 +33,13 @@ def make_engine(self): ) return e + def test_duck_to_df(self): + e = self.engine + a = e.to_df([[1, 2, 3]], "a:double,b:double,c:int") + assert isinstance(a, DuckDataFrame) + b = e.to_df(a.native_as_df()) + assert isinstance(b, DuckDataFrame) + def test_intersect_all(self): e = self.engine a = e.to_df([[1, 2, 3], [4, None, 6], [4, None, 6]], "a:double,b:double,c:int") diff --git a/tests/fugue_ibis/mock/dataframe.py b/tests/fugue_ibis/mock/dataframe.py index 66f14667..1d8e442b 100644 --- a/tests/fugue_ibis/mock/dataframe.py +++ b/tests/fugue_ibis/mock/dataframe.py @@ -1,9 +1,8 @@ from typing import Any from fugue import ArrowDataFrame, DataFrame, LocalDataFrame +from fugue.plugins import as_fugue_dataset, as_local_bounded from fugue_ibis import IbisDataFrame, IbisTable -from fugue_ibis._utils import to_schema -from fugue.plugins import as_fugue_dataset class MockDuckDataFrame(IbisDataFrame): @@ -18,6 +17,12 @@ def _to_iterable_df(self, table: IbisTable, schema: Any = None) -> LocalDataFram # should also check the df._findbackend is duckdb -@as_fugue_dataset.candidate(lambda df: isinstance(df, IbisTable)) -def _ibis_as_fugue(df: IbisTable) -> bool: - return MockDuckDataFrame(df) 
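# --- illustrative sketch (not part of the patch) ----------------------------
# The as_fugue_dataset registrations in this patch share a convention: a
# matcher lambda that also accepts **kwargs, and a converter that forwards
# them.  The same pattern for a hypothetical container type (MyRows is
# invented purely for illustration):
from typing import Any, List

import fugue.api as fa
from fugue import ArrayDataFrame
from fugue.plugins import as_fugue_dataset


class MyRows:  # hypothetical third-party type, not a real library class
    def __init__(self, rows: List[List[Any]]):
        self.rows = rows


@as_fugue_dataset.candidate(lambda df, **kwargs: isinstance(df, MyRows))
def _my_rows_as_fugue(df: MyRows, **kwargs: Any) -> ArrayDataFrame:
    # keyword arguments such as schema are forwarded, matching the convention
    return ArrayDataFrame(df.rows, **kwargs)


fdf = fa.as_fugue_df(MyRows([[1, "a"]]), schema="x:long,y:str")
assert fa.get_schema(fdf) == "x:long,y:str"
# -----------------------------------------------------------------------------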
+@as_fugue_dataset.candidate(lambda df, **kwargs: isinstance(df, IbisTable)) +def _ibis_as_fugue(df: IbisTable, **kwargs: Any) -> bool: + return MockDuckDataFrame(df, **kwargs) + + +# should also check the df._findbackend is duckdb +@as_local_bounded.candidate(lambda df, **kwargs: isinstance(df, IbisTable)) +def _ibis_as_local(df: IbisTable, **kwargs: Any) -> bool: + return df.execute() diff --git a/tests/fugue_ray/test_execution_engine.py b/tests/fugue_ray/test_execution_engine.py index 5ca8ded8..be91014e 100644 --- a/tests/fugue_ray/test_execution_engine.py +++ b/tests/fugue_ray/test_execution_engine.py @@ -7,6 +7,7 @@ from pytest import raises from triad import FileSystem +import fugue.api as fa from fugue import ArrayDataFrame, DataFrame, FugueWorkflow, fsql, transform from fugue.dataframe.utils import _df_eq as df_eq from fugue.plugins import infer_execution_engine @@ -28,9 +29,11 @@ def setUpClass(cls): ray.init(num_cpus=2) cls._con = duckdb.connect() cls._engine = cls.make_engine(cls) + fa.set_global_engine(cls._engine) @classmethod def tearDownClass(cls): + fa.clear_global_engine() cls._con.close() ray.shutdown() From 3412839136be65999142abf7242a9a2bfe027537 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Wed, 28 Dec 2022 07:58:59 +0000 Subject: [PATCH 20/30] refactor SQLEngine --- fugue/_utils/sql.py | 32 +++++ fugue/column/sql.py | 39 ++++-- fugue/execution/api.py | 22 ++-- fugue/execution/execution_engine.py | 46 +++++-- fugue/execution/native_execution_engine.py | 19 +-- fugue/extensions/_builtins/processors.py | 2 +- fugue/registry.py | 11 -- fugue/sql/api.py | 3 +- fugue/sql/workflow.py | 10 +- fugue/workflow/__init__.py | 2 +- fugue/workflow/input.py | 32 +---- fugue/workflow/workflow.py | 59 ++++++--- fugue_dask/execution_engine.py | 9 +- fugue_dask/registry.py | 6 - fugue_duckdb/_io.py | 19 +-- fugue_duckdb/_utils.py | 7 +- fugue_duckdb/dataframe.py | 11 +- fugue_duckdb/execution_engine.py | 134 +++++++++++---------- fugue_duckdb/ibis_engine.py | 32 +++-- fugue_duckdb/registry.py | 13 +- fugue_ibis/execution_engine.py | 6 +- fugue_ray/registry.py | 11 +- fugue_spark/execution_engine.py | 2 +- fugue_spark/registry.py | 7 +- fugue_test/builtin_suite.py | 47 ++++++++ fugue_test/dataframe_suite.py | 2 +- fugue_test/execution_suite.py | 22 +++- tests/fugue/column/test_sql.py | 61 ++++++---- tests/fugue/utils/test_sql.py | 23 ++++ tests/fugue_duckdb/test_utils.py | 11 +- 30 files changed, 429 insertions(+), 271 deletions(-) create mode 100644 fugue/_utils/sql.py create mode 100644 tests/fugue/utils/test_sql.py diff --git a/fugue/_utils/sql.py b/fugue/_utils/sql.py new file mode 100644 index 00000000..efeff1a6 --- /dev/null +++ b/fugue/_utils/sql.py @@ -0,0 +1,32 @@ +from typing import Iterable, Tuple +from uuid import uuid4 + + +class TempTableName: + def __init__(self): + self.key = "_" + str(uuid4())[:5] + + def __repr__(self) -> str: + return f"" + + +def get_temp_tb_name() -> TempTableName: + return TempTableName() + + +def parse_sql( + sql: str, prefix: str = " Iterable[Tuple[bool, str]]: + p = 0 + while p < len(sql): + b = sql.find(prefix, p) + if b >= 0: + if b > p: + yield (False, sql[p:b]) + b += len(prefix) + e = sql.find(suffix, b) + yield (True, sql[b:e]) + p = e + len(suffix) + else: + yield (False, sql[p:]) + return diff --git a/fugue/column/sql.py b/fugue/column/sql.py index 76960cd9..4264a7dd 100644 --- a/fugue/column/sql.py +++ b/fugue/column/sql.py @@ -1,4 +1,4 @@ -from typing import Any, Callable, Dict, Iterable, List, Optional, Set +from typing import Any, 
Callable, Dict, Iterable, List, Optional, Set, Tuple import pyarrow as pa from fugue.column.expressions import ( @@ -238,7 +238,7 @@ def __init__(self, enable_cast: bool = True): self._enable_cast = enable_cast self._func_handler: Dict[str, Callable[[_FuncExpr], Iterable[str]]] = {} - def where(self, condition: ColumnExpr, table: str) -> str: + def where(self, condition: ColumnExpr, table: str) -> Iterable[Tuple[bool, str]]: """Generate a ``SELECT *`` statement with the given where clause :param condition: column expression for ``WHERE`` @@ -261,7 +261,9 @@ def where(self, condition: ColumnExpr, table: str) -> str: lambda: ValueError(f"{condition} has aggregation functions"), ) cond = self.generate(condition.alias("")) - return f"SELECT * FROM {table} WHERE {cond}" + yield (False, "SELECT * FROM ") + yield (True, table) + yield (False, f"WHERE {cond}") def select( self, @@ -269,7 +271,7 @@ def select( table: str, where: Optional[ColumnExpr] = None, having: Optional[ColumnExpr] = None, - ) -> str: + ) -> Iterable[Tuple[bool, str]]: """Construct the full ``SELECT`` statement on a single table :param columns: columns to select, it may contain aggregations, if @@ -290,30 +292,39 @@ def _where() -> str: not is_agg(where), lambda: ValueError(f"{where} has aggregation functions"), ) - return " WHERE " + self.generate(where.alias("")) + return "WHERE " + self.generate(where.alias("")) def _having(as_where: bool = False) -> str: if having is None: return "" - pre = " WHERE " if as_where else " HAVING " + pre = "WHERE " if as_where else "HAVING " return pre + self.generate(having.alias("")) distinct = "" if not columns.is_distinct else "DISTINCT " if not columns.has_agg: expr = ", ".join(self.generate(x) for x in columns.all_cols) - return f"SELECT {distinct}{expr} FROM {table}{_where()}" + yield (False, f"SELECT {distinct}{expr} FROM") + yield (True, table) + yield (False, _where()) + return columns.assert_no_wildcard() if len(columns.literals) == 0: expr = ", ".join(self.generate(x) for x in columns.all_cols) if len(columns.group_keys) == 0: - return f"SELECT {distinct}{expr} FROM {table}{_where()}{_having()}" + yield (False, f"SELECT {distinct}{expr} FROM ") + yield (True, table) + yield (False, _where()) + yield (False, _having()) + return else: keys = ", ".join(self.generate(x) for x in columns.group_keys) - return ( - f"SELECT {distinct}{expr} FROM " - f"{table}{_where()} GROUP BY {keys}{_having()}" - ) + yield (False, f"SELECT {distinct}{expr} FROM ") + yield (True, table) + yield (False, _where()) + yield (False, f"GROUP BY {keys}") + yield (False, _having()) + return else: no_lit = [ x for x in columns.all_cols if not isinstance(x, _LiteralColumnExpr) @@ -324,7 +335,9 @@ def _having(as_where: bool = False) -> str: for x in columns.all_cols ] expr = ", ".join(names) - return f"SELECT {expr} FROM ({sub})" + yield (False, f"SELECT {expr} FROM (") + yield from sub + yield (False, ")") def generate(self, expr: ColumnExpr) -> str: """Convert :class:`~fugue.column.expressions.ColumnExpr` to diff --git a/fugue/execution/api.py b/fugue/execution/api.py index cfbd7d84..a920100d 100644 --- a/fugue/execution/api.py +++ b/fugue/execution/api.py @@ -128,15 +128,15 @@ def run_engine_function( def repartition( df: AnyDataFrame, - partition_spec: PartitionSpec, + partition: PartitionSpec, engine: AnyExecutionEngine = None, engine_conf: Any = None, as_fugue: bool = False, ) -> AnyDataFrame: - """Partition the input dataframe using ``partition_spec``. + """Partition the input dataframe using ``partition``. 
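# --- illustrative sketch (not part of the patch) ----------------------------
# Usage of the renamed argument: `partition` is simply passed through
# PartitionSpec, so anything PartitionSpec can parse (e.g. a dict) should
# work.  With the default native engine repartition is effectively a no-op;
# distributed engines honor it.
import pandas as pd

import fugue.api as fa

df = pd.DataFrame({"a": [0, 0, 1, 1], "b": [1, 2, 3, 4]})
res = fa.repartition(df, {"by": ["a"], "num": 2})
fa.show(res)
# -----------------------------------------------------------------------------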
:param df: an input dataframe that can be recognized by Fugue - :param partition_spec: how you want to partition the dataframe + :param partition: how you want to partition the dataframe :param engine: an engine like object, defaults to None :param engine_conf: the configs for the engine, defaults to None :param as_fugue: whether to force return a Fugue DataFrame @@ -148,7 +148,7 @@ def repartition( This function is experimental, and may be removed in the future. """ return run_engine_function( - lambda e: e.repartition(e.to_df(df), partition_spec=partition_spec), + lambda e: e.repartition(e.to_df(df), partition_spec=PartitionSpec(partition)), engine=engine, engine_conf=engine_conf, infer_by=[df], @@ -338,7 +338,7 @@ def take( n: int, presort: str, na_position: str = "last", - partition_spec: Optional[PartitionSpec] = None, + partition: Any = None, engine: AnyExecutionEngine = None, engine_conf: Any = None, as_fugue: bool = False, @@ -355,7 +355,7 @@ def take( :param presort: presort expression similar to partition presort :param na_position: position of null values during the presort. can accept ``first`` or ``last`` - :param partition_spec: PartitionSpec to apply the take operation, + :param partition: PartitionSpec to apply the take operation, defaults to None :param engine: an engine like object, defaults to None :param engine_conf: the configs for the engine, defaults to None @@ -370,7 +370,7 @@ def take( n=n, presort=presort, na_position=na_position, - partition_spec=partition_spec, + partition_spec=None if partition is None else PartitionSpec(partition), ), engine=engine, engine_conf=engine_conf, @@ -418,7 +418,7 @@ def save( path: str, format_hint: Any = None, mode: str = "overwrite", - partition_spec: Optional[PartitionSpec] = None, + partition: Any = None, force_single: bool = False, engine: AnyExecutionEngine = None, engine_conf: Any = None, @@ -432,8 +432,8 @@ def save( defaults to None, meaning to infer :param mode: can accept ``overwrite``, ``append``, ``error``, defaults to "overwrite" - :param partition_spec: how to partition the dataframe before saving, - defaults to empty + :param partition: how to partition the dataframe before saving, + defaults to None :param force_single: force the output as a single file, defaults to False :param kwargs: parameters to pass to the underlying framework :param engine: an engine like object, defaults to None @@ -447,7 +447,7 @@ def save( path=path, format_hint=format_hint, mode=mode, - partition_spec=partition_spec, + partition_spec=None if partition is None else PartitionSpec(partition), force_single=force_single, **kwargs, ), diff --git a/fugue/execution/execution_engine.py b/fugue/execution/execution_engine.py index b579240e..8207570f 100644 --- a/fugue/execution/execution_engine.py +++ b/fugue/execution/execution_engine.py @@ -10,6 +10,7 @@ Iterator, List, Optional, + Tuple, TypeVar, Union, ) @@ -27,6 +28,7 @@ PartitionCursor, PartitionSpec, ) +from fugue._utils.sql import get_temp_tb_name from fugue.column import ColumnExpr, SelectColumns, SQLExpressionGenerator, col, is_agg from fugue.constants import _FUGUE_GLOBAL_CONF from fugue.dataframe import DataFrame, DataFrames @@ -88,18 +90,44 @@ class SQLEngine(ExecutionEngineFacet, ABC): :param execution_engine: the execution engine this sql engine will run on """ + def __init__(self, execution_engine: "ExecutionEngine") -> None: + super().__init__(execution_engine) + self._uid = "_" + str(uuid4())[:5] + "_" + + def encode_name(self, name: str) -> str: + return self._uid + name + + def 
encode( + self, dfs: DataFrames, statement: List[Tuple[bool, str]] + ) -> Tuple[DataFrames, str]: + d = DataFrames({self.encode_name(k): v for k, v in dfs.items()}) + s = " ".join(self.encode_name(tp[1]) if tp[0] else tp[1] for tp in statement) + print(s) + return d, s + @abstractmethod - def select(self, dfs: DataFrames, statement: str) -> DataFrame: # pragma: no cover + def select( + self, dfs: DataFrames, statement: List[Tuple[bool, str]] + ) -> DataFrame: # pragma: no cover """Execute select statement on the sql engine. :param dfs: a collection of dataframes that must have keys - :param statement: the ``SELECT`` statement using the ``dfs`` keys as tables + :param statement: the ``SELECT`` statement using the ``dfs`` keys as tables. + In each tuple, the first value indicates whether the second value is a + dataframe name reference (True), or just a part of the statement (False) :return: result of the ``SELECT`` statement .. admonition:: Examples - >>> dfs = DataFrames(a=df1, b=df2) - >>> sql_engine.select(dfs, "SELECT * FROM a UNION SELECT * FROM b") + .. code-block:: python + + dfs = DataFrames(a=df1, b=df2) + sql_engine.select( + dfs, + [(False, "SELECT * FROM "), + (True,"a"), + (False," UNION SELECT * FROM "), + (True,"b")]) .. note:: @@ -644,9 +672,9 @@ def select( ) """ gen = SQLExpressionGenerator(enable_cast=False) - df_name = _get_temp_df_name() - sql = gen.select(cols, df_name, where=where, having=having) - res = self.sql_engine.select(DataFrames({df_name: self.to_df(df)}), sql) + df_name = get_temp_tb_name() + sql = list(gen.select(cols, df_name.key, where=where, having=having)) + res = self.sql_engine.select(DataFrames({df_name.key: self.to_df(df)}), sql) diff = gen.correct_select_schema(df.schema, cols, res.schema) return res if diff is None else res.alter_columns(diff) @@ -1208,7 +1236,3 @@ def _generate_comap_empty_dfs(schemas: Any, named: bool) -> DataFrames: return DataFrames({k: ArrayDataFrame([], v) for k, v in schemas.items()}) else: return DataFrames([ArrayDataFrame([], v) for v in schemas.values()]) - - -def _get_temp_df_name() -> str: - return "_" + str(uuid4())[:5] diff --git a/fugue/execution/native_execution_engine.py b/fugue/execution/native_execution_engine.py index 7027b3fc..dfc4eeb6 100644 --- a/fugue/execution/native_execution_engine.py +++ b/fugue/execution/native_execution_engine.py @@ -1,7 +1,7 @@ import inspect import logging import os -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import pandas as pd from qpd_pandas import run_sql_on_pandas @@ -42,11 +42,12 @@ class SqliteEngine(SQLEngine): :param execution_engine: the execution engine this sql engine will run on """ - def select(self, dfs: DataFrames, statement: str) -> DataFrame: + def select(self, dfs: DataFrames, statement: List[Tuple[bool, str]]) -> DataFrame: + _dfs, _sql = self.encode(dfs, statement) sql_engine = create_engine("sqlite:///:memory:") - for k, v in dfs.items(): + for k, v in _dfs.items(): v.as_pandas().to_sql(k, sql_engine, if_exists="replace", index=False) - df = pd.read_sql_query(statement, sql_engine) + df = pd.read_sql_query(_sql, sql_engine) return PandasDataFrame(df) @@ -56,12 +57,14 @@ class QPDPandasEngine(SQLEngine): :param execution_engine: the execution engine this sql engine will run on """ - def select(self, dfs: DataFrames, statement: str) -> DataFrame: - _dfs = { + def select(self, dfs: DataFrames, statement: List[Tuple[bool, str]]) -> DataFrame: + _dfs, _sql = self.encode(dfs, 
statement) + _dd = { k: self.execution_engine.to_df(v).as_pandas() # type: ignore - for k, v in dfs.items() + for k, v in _dfs.items() } - df = run_sql_on_pandas(statement, _dfs, ignore_case=True) + + df = run_sql_on_pandas(_sql, _dd, ignore_case=True) return self.execution_engine.to_df(df) diff --git a/fugue/extensions/_builtins/processors.py b/fugue/extensions/_builtins/processors.py index 724b24eb..b24da83d 100644 --- a/fugue/extensions/_builtins/processors.py +++ b/fugue/extensions/_builtins/processors.py @@ -147,7 +147,7 @@ def process(self, dfs: DataFrames) -> DataFrame: class RunSQLSelect(Processor): def process(self, dfs: DataFrames) -> DataFrame: - statement = self.params.get_or_throw("statement", str) + statement = self.params.get_or_throw("statement", object) engine = self.params.get_or_none("sql_engine", object) engine_params = self.params.get("sql_engine_params", ParamDict()) sql_engine = make_sql_engine(engine, self.execution_engine, **engine_params) diff --git a/fugue/registry.py b/fugue/registry.py index eb6310a3..003c3625 100644 --- a/fugue/registry.py +++ b/fugue/registry.py @@ -1,7 +1,6 @@ import inspect from typing import Any, Optional -import pandas as pd import pyarrow as pa from fugue._utils.interfaceless import ( @@ -9,7 +8,6 @@ SimpleAnnotationConverter, register_annotation_converter, ) -from fugue.collections.yielded import Yielded from fugue.dataframe import ArrowDataFrame, DataFrame from fugue.execution.factory import register_execution_engine, register_sql_engine from fugue.execution.native_execution_engine import ( @@ -17,7 +15,6 @@ QPDPandasEngine, SqliteEngine, ) -from fugue.workflow import register_raw_df_type def _register() -> None: @@ -29,18 +26,10 @@ def _register() -> None: >>> import fugue """ - _register_raw_dataframes() _register_engines() _register_annotation_converters() -def _register_raw_dataframes() -> None: - register_raw_df_type(Yielded) - register_raw_df_type(pd.DataFrame) - register_raw_df_type(DataFrame) - register_raw_df_type(pa.Table) - - def _register_engines() -> None: register_execution_engine( "native", lambda conf: NativeExecutionEngine(conf), on_dup="ignore" diff --git a/fugue/sql/api.py b/fugue/sql/api.py index a6c00134..8aadb948 100644 --- a/fugue/sql/api.py +++ b/fugue/sql/api.py @@ -36,7 +36,8 @@ def fugue_sql( dag = _build_dag(query, fsql_ignore_case=fsql_ignore_case, args=args, kwargs=kwargs) if dag.last_df is not None: dag.last_df.yield_dataframe_as("result", as_local=as_local) - else: + else: # pragma: no cover + # impossible case raise FugueSQLError(f"no dataframe to output from\n{query}") res = dag.run(engine, engine_conf) return res["result"] if as_fugue else res["result"].native_as_df() diff --git a/fugue/sql/workflow.py b/fugue/sql/workflow.py index ca09cecd..a972dcc8 100644 --- a/fugue/sql/workflow.py +++ b/fugue/sql/workflow.py @@ -7,13 +7,9 @@ from ..collections.yielded import Yielded from ..constants import FUGUE_CONF_SQL_IGNORE_CASE +from ..dataframe.api import is_df from ..dataframe.dataframe import DataFrame -from ..workflow.workflow import ( - FugueWorkflow, - WorkflowDataFrame, - WorkflowDataFrames, - is_acceptable_raw_df, -) +from ..workflow.workflow import FugueWorkflow, WorkflowDataFrame, WorkflowDataFrames from ._utils import LazyWorkflowDataFrame, fill_sql_template from ._visitors import FugueSQLHooks, _Extensions @@ -71,7 +67,7 @@ def _split_params( for k, v in params.items(): if isinstance(v, (int, str, float, bool)): p[k] = v - elif isinstance(v, (DataFrame, Yielded)) or is_acceptable_raw_df(v): + elif 
isinstance(v, (DataFrame, Yielded)) or is_df(v): dfs[k] = LazyWorkflowDataFrame(k, v, self) else: p[k] = v diff --git a/fugue/workflow/__init__.py b/fugue/workflow/__init__.py index bbb6a7e2..2a3af140 100644 --- a/fugue/workflow/__init__.py +++ b/fugue/workflow/__init__.py @@ -2,6 +2,6 @@ from ._workflow_context import FugueWorkflowContext from .api import * -from .input import is_acceptable_raw_df, register_raw_df_type +from .input import register_raw_df_type from .module import module from .workflow import FugueWorkflow, WorkflowDataFrame, WorkflowDataFrames diff --git a/fugue/workflow/input.py b/fugue/workflow/input.py index ce520a49..8f87cd35 100644 --- a/fugue/workflow/input.py +++ b/fugue/workflow/input.py @@ -1,32 +1,10 @@ -from typing import Any, Set, Type - -from fugue.extensions._builtins import CreateData -from fugue.extensions.creator import parse_creator - -_VALID_RAW_DF_TYPES: Set[Type] = set() +from typing import Type def register_raw_df_type(df_type: Type) -> None: - """Register a base type of dataframe that can be recognized by - :class:`~fugue.workflow.workflow.FugueWorkflow` and converted to - :class:`~fugue.workflow.workflow.WorkflowDataFrame` - - :param df_type: dataframe type, for example ``dask.dataframe.DataFrame`` - """ + """TODO: This function is to be removed before 0.9.0 - _VALID_RAW_DF_TYPES.add(df_type) - - @parse_creator.candidate(lambda x: isinstance(x, df_type), priority=0.5) - def _parse(x: Any) -> Any: - return CreateData(x) - - -def is_acceptable_raw_df(df: Any) -> bool: - """Whether the input ``df`` can be converted to - :class:`~fugue.workflow.workflow.WorkflowDataFrame` - :param df: input raw dataframe - :return: whether this dataframe is convertible + .. deprecated:: 3.1 + Register using :func:`fugue.api.is_df` instead. 
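# --- illustrative sketch (not part of the patch) ----------------------------
# The deprecation above points at fugue.api.is_df: an is_df candidate is how
# the reworked FugueWorkflow now decides whether an object is a raw
# dataframe.  A hypothetical registration (MyTable is invented here):
from fugue.plugins import is_df


class MyTable:  # hypothetical raw dataframe type
    pass


@is_df.candidate(lambda df: isinstance(df, MyTable))
def _my_table_is_df(df: MyTable) -> bool:
    return True

# Note: turning MyTable into an actual Fugue DataFrame presumably still needs
# a matching as_fugue_dataset candidate or engine support; is_df only covers
# the recognition step.
# -----------------------------------------------------------------------------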
""" - import fugue._utils.register # pylint: disable=W0611 # noqa: F401 - - return any(isinstance(df, t) for t in _VALID_RAW_DF_TYPES) + raise DeprecationWarning("use fugue.api.is_df to register the dataframe") diff --git a/fugue/workflow/workflow.py b/fugue/workflow/workflow.py index e1a02ce0..704aee13 100644 --- a/fugue/workflow/workflow.py +++ b/fugue/workflow/workflow.py @@ -1,9 +1,28 @@ import sys from collections import defaultdict -from typing import Any, Callable, Dict, Iterable, List, Optional, Set, TypeVar, Union +from typing import ( + Any, + Callable, + Dict, + Iterable, + List, + Optional, + Set, + Tuple, + TypeVar, + Union, +) from uuid import uuid4 from adagio.specs import WorkflowSpec +from triad import ( + ParamDict, + Schema, + SerializableRLock, + assert_or_throw, + extensible_class, +) + from fugue._utils.exception import modify_traceback from fugue.collections.partition import PartitionSpec from fugue.collections.yielded import Yielded @@ -19,6 +38,7 @@ FUGUE_CONF_WORKFLOW_EXCEPTION_OPTIMIZE, ) from fugue.dataframe import DataFrame, LocalBoundedDataFrame, YieldedDataFrame +from fugue.dataframe.api import is_df from fugue.dataframe.dataframes import DataFrames from fugue.exceptions import FugueWorkflowCompileError, FugueWorkflowError from fugue.execution.factory import make_execution_engine @@ -56,14 +76,6 @@ from fugue.workflow._checkpoint import FileCheckpoint, WeakCheckpoint from fugue.workflow._tasks import Create, FugueTask, Output, Process from fugue.workflow._workflow_context import FugueWorkflowContext -from fugue.workflow.input import is_acceptable_raw_df -from triad import ( - ParamDict, - Schema, - SerializableRLock, - assert_or_throw, - extensible_class, -) _DEFAULT_IGNORE_ERRORS: List[Any] = [] @@ -1597,7 +1609,13 @@ def create( :meth:`~fugue.extensions.context.ExtensionContext.partition_spec` :return: result dataframe """ - task = Create(creator=using, schema=schema, params=params) + task = Create( + creator=CreateData(using) + if is_df(using) or isinstance(using, Yielded) + else using, + schema=schema, + params=params, + ) res = self.add(task) self._last_df = res return res @@ -1715,7 +1733,7 @@ def create_data( if ( (isinstance(data, (List, Iterable)) and not isinstance(data, str)) or isinstance(data, Yielded) - or is_acceptable_raw_df(data) + or is_df(data) ): return self.create( using=CreateData( @@ -2071,20 +2089,23 @@ def select( Please read :ref:`this ` for more examples """ - s_str: List[str] = [] + sql: List[Tuple[bool, str]] = [] dfs: Dict[str, DataFrame] = {} for s in statements: if isinstance(s, str): - s_str.append(s) + sql.append((False, s)) else: ws = self.df(s) dfs[ws.name] = ws - s_str.append(ws.name) - sql = " ".join(s_str).strip() - if not sql[:10].upper().startswith("SELECT") and not sql[ - :10 - ].upper().startswith("WITH"): - sql = "SELECT " + sql + sql.append((True, ws.name)) + if sql[0][0]: # starts with reference + sql.insert(0, (False, "SELECT")) + else: # start with string but without select + start = sql[0][1].strip() + if not start[:10].upper().startswith("SELECT") and not start[ + :10 + ].upper().startswith("WITH"): + sql[0] = (False, "SELECT " + start) return self.process( dfs, using=RunSQLSelect, diff --git a/fugue_dask/execution_engine.py b/fugue_dask/execution_engine.py index 3374d1df..81307090 100644 --- a/fugue_dask/execution_engine.py +++ b/fugue_dask/execution_engine.py @@ -1,6 +1,6 @@ import logging import os -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, 
Optional, Union, Tuple import dask.dataframe as dd from distributed import Client @@ -45,12 +45,13 @@ def __init__(self, execution_engine: ExecutionEngine): ) super().__init__(execution_engine) - def select(self, dfs: DataFrames, statement: str) -> DataFrame: + def select(self, dfs: DataFrames, statement: List[Tuple[bool, str]]) -> DataFrame: + _dfs, _sql = self.encode(dfs, statement) dask_dfs = { k: self.execution_engine.to_df(v).native # type: ignore - for k, v in dfs.items() + for k, v in _dfs.items() } - df = run_sql_on_dask(statement, dask_dfs, ignore_case=True) + df = run_sql_on_dask(_sql, dask_dfs, ignore_case=True) return DaskDataFrame(df) diff --git a/fugue_dask/registry.py b/fugue_dask/registry.py index 516cccd1..17c1c084 100644 --- a/fugue_dask/registry.py +++ b/fugue_dask/registry.py @@ -13,7 +13,6 @@ register_annotation_converter, ) from fugue.plugins import as_fugue_dataset, infer_execution_engine -from fugue.workflow import register_raw_df_type from fugue_dask._utils import DASK_UTILS from fugue_dask.dataframe import DaskDataFrame from fugue_dask.execution_engine import DaskExecutionEngine @@ -31,10 +30,6 @@ def _dask_as_fugue_df(df: dd.DataFrame, **kwargs: Any) -> DaskDataFrame: return DaskDataFrame(df, **kwargs) -def _register_raw_dataframes() -> None: - register_raw_df_type(dd.DataFrame) - - def _register_engines() -> None: register_execution_engine( "dask", @@ -103,6 +98,5 @@ def _register() -> None: >>> import fugue_dask """ - _register_raw_dataframes() _register_engines() _register_annotation_converters() diff --git a/fugue_duckdb/_io.py b/fugue_duckdb/_io.py index c299a50f..74bf8867 100644 --- a/fugue_duckdb/_io.py +++ b/fugue_duckdb/_io.py @@ -2,13 +2,14 @@ from typing import Any, Iterable, List, Optional, Union from duckdb import DuckDBPyConnection -from fugue._utils.io import FileParser, load_df, save_df -from fugue.dataframe import ArrowDataFrame, LocalBoundedDataFrame from triad import ParamDict, Schema from triad.collections.fs import FileSystem from triad.utils.assertion import assert_or_throw -from fugue_duckdb._utils import encode_value_to_expr, get_temp_df_name, to_duck_type +from fugue._utils.io import FileParser, load_df, save_df +from fugue._utils.sql import get_temp_tb_name +from fugue.dataframe import ArrowDataFrame, LocalBoundedDataFrame +from fugue_duckdb._utils import encode_value_to_expr, to_duck_type from fugue_duckdb.dataframe import DuckDataFrame @@ -91,15 +92,15 @@ def save_df( self._format_save[p.file_format](df, p, **kwargs) def _save_csv(self, df: DuckDataFrame, p: FileParser, **kwargs: Any): - dn = get_temp_df_name() - df.native.create_view(dn) + dn = get_temp_tb_name() + df.native.create_view(dn.key) kw = ParamDict({k.lower(): v for k, v in kwargs.items()}) kw["header"] = 1 if kw.pop("header", False) else 0 params: List[str] = [] for k, v in kw.items(): params.append(f"{k.upper()} " + encode_value_to_expr(v)) pm = ", ".join(params) - query = f"COPY {dn} TO {encode_value_to_expr(p.uri)} WITH ({pm})" + query = f"COPY {dn.key} TO {encode_value_to_expr(p.uri)} WITH ({pm})" self._con.execute(query) def _load_csv( # noqa: C901 @@ -176,15 +177,15 @@ def _load_csv( # noqa: C901 return DuckDataFrame(self._con.from_query(query)) def _save_parquet(self, df: DuckDataFrame, p: FileParser, **kwargs: Any): - dn = get_temp_df_name() - df.native.create_view(dn) + dn = get_temp_tb_name() + df.native.create_view(dn.key) kw = ParamDict({k.lower(): v for k, v in kwargs.items()}) kw["format"] = "parquet" params: List[str] = [] for k, v in kw.items(): 
params.append(f"{k.upper()} " + encode_value_to_expr(v)) pm = ", ".join(params) - query = f"COPY {dn} TO {encode_value_to_expr(p.uri)}" + query = f"COPY {dn.key} TO {encode_value_to_expr(p.uri)}" if len(params) > 0: query += f" WITH ({pm})" self._con.execute(query) diff --git a/fugue_duckdb/_utils.py b/fugue_duckdb/_utils.py index 83d062f5..6245414d 100644 --- a/fugue_duckdb/_utils.py +++ b/fugue_duckdb/_utils.py @@ -1,13 +1,12 @@ from datetime import date, datetime from typing import Any, Dict, Iterable, Optional, Tuple -from uuid import uuid4 import numpy as np import pandas as pd import pyarrow as pa from duckdb import __version__ as _DUCKDB_VERSION # type: ignore -from triad.utils.pyarrow import TRIAD_DEFAULT_TIMESTAMP from triad import Schema +from triad.utils.pyarrow import TRIAD_DEFAULT_TIMESTAMP _LEGACY_DUCKDB = _DUCKDB_VERSION < "0.3.3" @@ -78,10 +77,6 @@ def encode_value_to_expr(value: Any) -> str: # noqa: C901 raise NotImplementedError(value) -def get_temp_df_name() -> str: - return "_" + str(uuid4())[:5] - - def to_duck_type(tp: pa.DataType) -> str: if _LEGACY_DUCKDB: # pragma: no cover return _to_duck_type_legacy(tp) diff --git a/fugue_duckdb/dataframe.py b/fugue_duckdb/dataframe.py index 9861c2e8..7f4dbca2 100644 --- a/fugue_duckdb/dataframe.py +++ b/fugue_duckdb/dataframe.py @@ -13,7 +13,7 @@ LocalDataFrame, ) from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError -from fugue.plugins import get_column_names, is_df, as_local_bounded +from fugue.plugins import as_fugue_dataset, as_local_bounded, get_column_names, is_df from ._utils import encode_column_name, to_duck_type, to_pa_type @@ -36,6 +36,10 @@ def _get_schema(self) -> Schema: ] ) + @property + def alias(self) -> str: + return "_" + str(id(self._rel)) # DuckDBPyRelation.alias is not always unique + @property def native(self) -> DuckDBPyRelation: """DuckDB relation object""" @@ -141,6 +145,11 @@ def to_list(row: Any) -> List[Any]: return [to_list(x) for x in rel.fetchall()] +@as_fugue_dataset.candidate(lambda df, **kwargs: isinstance(df, DuckDBPyRelation)) +def _duckdb_as_fugue_df(df: DuckDBPyRelation, **kwargs: Any) -> DuckDataFrame: + return DuckDataFrame(df, **kwargs) + + @is_df.candidate(lambda df: isinstance(df, DuckDBPyRelation)) def _duck_is_df(df: DuckDBPyRelation) -> bool: return True diff --git a/fugue_duckdb/execution_engine.py b/fugue_duckdb/execution_engine.py index 42caca18..05456f43 100644 --- a/fugue_duckdb/execution_engine.py +++ b/fugue_duckdb/execution_engine.py @@ -1,5 +1,5 @@ import logging -from typing import Any, Dict, Iterable, List, Optional, Union +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union import duckdb import pyarrow as pa @@ -16,6 +16,7 @@ PandasMapEngine, SQLEngine, ) +from fugue._utils.sql import get_temp_tb_name, parse_sql from fugue.collections.partition import PartitionSpec, parse_presort_exp from fugue.dataframe import ( DataFrame, @@ -31,7 +32,6 @@ encode_column_names, encode_schema_names, encode_value_to_expr, - get_temp_df_name, ) from .dataframe import DuckDataFrame @@ -44,27 +44,24 @@ class DuckDBEngine(SQLEngine): :param execution_engine: the execution engine this sql engine will run on """ - def __init__(self, execution_engine: ExecutionEngine) -> None: - super().__init__(execution_engine) - self._cache: Dict[str, int] = {} - - def select(self, dfs: DataFrames, statement: str) -> DataFrame: + def select(self, dfs: DataFrames, statement: List[Tuple[bool, str]]) -> DataFrame: if isinstance(self.execution_engine, 
DuckExecutionEngine): return self._duck_select(dfs, statement) - return self._other_select(dfs, statement) + else: + _dfs, _sql = self.encode(dfs, statement) + return self._other_select(_dfs, _sql) - def _duck_select(self, dfs: DataFrames, statement: str) -> DataFrame: + def _duck_select( + self, dfs: DataFrames, statement: List[Tuple[bool, str]] + ) -> DataFrame: + name_map: Dict[str, str] = {} for k, v in dfs.items(): - tdf: Any = self.execution_engine._to_duck_df(v) # type: ignore - if k not in self._cache or self._cache[k] != id(tdf.native): - tdf.native.create_view(k, replace=True) - # TODO: remove the following hack, if it is stable - # kk = k + get_temp_df_name() - # tdf.native.query( - # kk, f"CREATE OR REPLACE TEMP VIEW {k} AS SELECT * FROM {kk}" - # ) - self._cache[k] = id(tdf.native) - result = self.execution_engine.connection.query(statement) # type: ignore + tdf: DuckDataFrame = self.execution_engine._to_duck_df( # type: ignore + v, create_view=True + ) + name_map[k] = tdf.alias + query = " ".join(name_map.get(p[1], p[1]) if p[0] else p[1] for p in statement) + result = self.execution_engine.connection.query(query) # type: ignore return DuckDataFrame(result) def _other_select(self, dfs: DataFrames, statement: str) -> DataFrame: @@ -93,6 +90,7 @@ def __init__( self._con = connection or duckdb.connect() self._external_con = connection is not None self._context_lock = SerializableRLock() + self._registered_dfs: Dict[str, DuckDataFrame] = {} try: for pg in list(self._get_pragmas()): # transactional @@ -173,9 +171,9 @@ def join( ) -> DataFrame: key_schema, output_schema = get_join_schemas(df1, df2, how=how, on=on) t1, t2, t3 = ( - get_temp_df_name(), - get_temp_df_name(), - get_temp_df_name(), + get_temp_tb_name(), + get_temp_tb_name(), + get_temp_tb_name(), ) on_fields = " AND ".join( f"{t1}.{encode_column_name(k)}={t2}.{encode_column_name(k)}" @@ -238,7 +236,7 @@ def join( for k in output_schema.names ) sql = f"SELECT {select_fields} FROM {t1} {join_type} {t2} ON {on_fields}" - return self._sql(sql, {t1: df1, t2: df2}) + return self._sql(sql, {t1.key: df1, t2.key: df2}) def _how_to_join(self, how: str): return how.upper().replace("_", " ") + " JOIN" @@ -248,9 +246,9 @@ def union(self, df1: DataFrame, df2: DataFrame, distinct: bool = True) -> DataFr df1.schema == df2.schema, ValueError(f"{df1.schema} != {df2.schema}") ) if distinct: - t1, t2 = get_temp_df_name(), get_temp_df_name() + t1, t2 = get_temp_tb_name(), get_temp_tb_name() sql = f"SELECT * FROM {t1} UNION SELECT * FROM {t2}" - return self._sql(sql, {t1: df1, t2: df2}) + return self._sql(sql, {t1.key: df1, t2.key: df2}) return DuckDataFrame( self._to_duck_df(df1).native.union(self._to_duck_df(df2).native) ) @@ -259,9 +257,9 @@ def subtract( self, df1: DataFrame, df2: DataFrame, distinct: bool = True ) -> DataFrame: # pragma: no cover if distinct: - t1, t2 = get_temp_df_name(), get_temp_df_name() + t1, t2 = get_temp_tb_name(), get_temp_tb_name() sql = f"SELECT * FROM {t1} EXCEPT SELECT * FROM {t2}" - return self._sql(sql, {t1: df1, t2: df2}) + return self._sql(sql, {t1.key: df1, t2.key: df2}) return DuckDataFrame( self._to_duck_df(df1).native.except_(self._to_duck_df(df2).native) ) @@ -270,9 +268,9 @@ def intersect( self, df1: DataFrame, df2: DataFrame, distinct: bool = True ) -> DataFrame: if distinct: - t1, t2 = get_temp_df_name(), get_temp_df_name() + t1, t2 = get_temp_tb_name(), get_temp_tb_name() sql = f"SELECT * FROM {t1} INTERSECT DISTINCT SELECT * FROM {t2}" - return self._sql(sql, {t1: df1, t2: df2}) + return 
self._sql(sql, {t1.key: df1, t2.key: df2}) raise NotImplementedError( "DuckDB doesn't have consist behavior on INTERSECT ALL," " so Fugue doesn't support it" @@ -347,14 +345,14 @@ def sample( f"one and only one of n and frac should be non-negative, {n}, {frac}" ), ) - tb = get_temp_df_name() + tb = get_temp_tb_name() if frac is not None: sql = f"SELECT * FROM {tb} USING SAMPLE bernoulli({frac*100} PERCENT)" else: sql = f"SELECT * FROM {tb} USING SAMPLE reservoir({n} ROWS)" if seed is not None: sql += f" REPEATABLE ({seed})" - return self._sql(sql, {tb: df}) + return self._sql(sql, {tb.key: df}) def take( self, @@ -374,7 +372,7 @@ def take( _presort = parse_presort_exp(presort) else: _presort = partition_spec.presort - tb = get_temp_df_name() + tb = get_temp_tb_name() if len(_presort) == 0: if len(partition_spec.partition_by) == 0: @@ -386,7 +384,7 @@ def take( f"AS __fugue_take_param FROM {tb}" ) sql = f"SELECT {cols} FROM ({sql}) WHERE __fugue_take_param<={n}" - return self._sql(sql, {tb: df}) + return self._sql(sql, {tb.key: df}) sorts: List[str] = [] for k, v in _presort.items(): @@ -399,7 +397,7 @@ def take( if len(partition_spec.partition_by) == 0: sql = f"SELECT * FROM {tb} {sort_expr} LIMIT {n}" - return self._sql(sql, {tb: df}) + return self._sql(sql, {tb.key: df}) cols = ", ".join(encode_schema_names(df.schema)) pcols = ", ".join(encode_column_names(partition_spec.partition_by)) @@ -408,7 +406,7 @@ def take( f"AS __fugue_take_param FROM {tb}" ) sql = f"SELECT {cols} FROM ({sql}) WHERE __fugue_take_param<={n}" - return self._sql(sql, {tb: df}) + return self._sql(sql, {tb.key: df}) def load_df( self, @@ -443,33 +441,45 @@ def convert_yield_dataframe(self, df: DataFrame, as_local: bool) -> DataFrame: def _sql(self, sql: str, dfs: Dict[str, DataFrame]) -> DuckDataFrame: with self._context_lock: - df = self.sql_engine.select(DataFrames(dfs), sql) + df = self.sql_engine.select(DataFrames(dfs), list(parse_sql(sql))) return DuckDataFrame(df.native) # type: ignore - def _to_duck_df(self, df: Any, schema: Any = None) -> DuckDataFrame: - if isinstance(df, DuckDBPyRelation): - assert_or_throw( - schema is None, - ValueError("schema must be None when df is a DuckDBPyRelation"), - ) - return DuckDataFrame(df) - if isinstance(df, DataFrame): - assert_or_throw( - schema is None, - ValueError("schema must be None when df is a DataFrame"), - ) - if isinstance(df, DuckDataFrame): - return df - - if isinstance(df, PandasDataFrame) and all( - not pa.types.is_nested(f.type) for f in df.schema.fields - ): - rdf = DuckDataFrame(self.connection.from_df(df.as_pandas())) - else: - rdf = DuckDataFrame( - duckdb.arrow(df.as_arrow(), connection=self.connection) + def _to_duck_df( + self, df: Any, schema: Any = None, create_view: bool = False + ) -> DuckDataFrame: + def _gen_duck() -> DuckDataFrame: + if isinstance(df, DuckDBPyRelation): + assert_or_throw( + schema is None, + ValueError("schema must be None when df is a DuckDBPyRelation"), ) - rdf.reset_metadata(df.metadata if df.has_metadata else None) - return rdf - tdf = ArrowDataFrame(df, schema) - return DuckDataFrame(duckdb.arrow(tdf.native, connection=self.connection)) + return DuckDataFrame(df) + if isinstance(df, DataFrame): + assert_or_throw( + schema is None, + ValueError("schema must be None when df is a DataFrame"), + ) + if isinstance(df, DuckDataFrame): + return df + + if isinstance(df, PandasDataFrame) and all( + not pa.types.is_nested(f.type) for f in df.schema.fields + ): + rdf = DuckDataFrame(self.connection.from_df(df.as_pandas())) + else: + 
rdf = DuckDataFrame( + duckdb.arrow(df.as_arrow(), connection=self.connection) + ) + rdf.reset_metadata(df.metadata if df.has_metadata else None) + return rdf + tdf = ArrowDataFrame(df, schema) + return DuckDataFrame(duckdb.arrow(tdf.native, connection=self.connection)) + + res = _gen_duck() + if create_view: + with self._context_lock: + if res.alias not in self._registered_dfs: + res.native.create_view(res.alias, replace=True) + # must hold the reference of the df so the id will not be reused + self._registered_dfs[res.alias] = res + return res diff --git a/fugue_duckdb/ibis_engine.py b/fugue_duckdb/ibis_engine.py index 66fbd3cc..888e022e 100644 --- a/fugue_duckdb/ibis_engine.py +++ b/fugue_duckdb/ibis_engine.py @@ -1,14 +1,16 @@ -from typing import Any, Callable, Optional +from typing import Any, Callable, Dict, Optional, Tuple import ibis from ibis.backends.pandas import Backend from fugue import DataFrame, DataFrames, ExecutionEngine -from fugue_duckdb.execution_engine import DuckDBEngine, DuckExecutionEngine +from fugue._utils.sql import TempTableName, get_temp_tb_name, parse_sql from fugue_ibis import IbisTable from fugue_ibis._utils import to_ibis_schema from fugue_ibis.execution.ibis_engine import IbisEngine, parse_ibis_engine +from .execution_engine import DuckDBEngine, DuckExecutionEngine + class DuckDBIbisEngine(IbisEngine): def select( @@ -17,11 +19,20 @@ def select( be = _BackendWrapper().connect({}) be.set_schemas(dfs) expr = ibis_func(be) - sql = str( - ibis.postgres.compile(expr).compile(compile_kwargs={"literal_binds": True}) + sql = list( + parse_sql( + str( + ibis.postgres.compile(expr).compile( + compile_kwargs={"literal_binds": True} + ) + ), + prefix='" Optional[IbisEngi class _BackendWrapper(Backend): def set_schemas(self, dfs: DataFrames) -> None: self._schemas = {k: to_ibis_schema(v.schema) for k, v in dfs.items()} - - def table(self, name: str, schema: Any = None): - return ibis.table(self._schemas[name], name=name) + self._name_map: Dict[str, Tuple[TempTableName, IbisTable]] = {} + + def table(self, name: str, schema: Any = None) -> IbisTable: + if name not in self._name_map: + tn = get_temp_tb_name() + tb = ibis.table(self._schemas[name], name=(str(tn))) + self._name_map[name] = (tn, tb) + return self._name_map[name][1] diff --git a/fugue_duckdb/registry.py b/fugue_duckdb/registry.py index b9fea78b..b6560447 100644 --- a/fugue_duckdb/registry.py +++ b/fugue_duckdb/registry.py @@ -17,8 +17,7 @@ SimpleAnnotationConverter, register_annotation_converter, ) -from fugue.plugins import as_fugue_dataset, infer_execution_engine -from fugue.workflow import register_raw_df_type +from fugue.plugins import infer_execution_engine from fugue_duckdb.dataframe import DuckDataFrame from fugue_duckdb.execution_engine import DuckDBEngine, DuckExecutionEngine @@ -30,15 +29,6 @@ def _infer_duckdb_client(objs: Any) -> Any: return "duckdb" -@as_fugue_dataset.candidate(lambda df, **kwargs: isinstance(df, DuckDBPyRelation)) -def _duckdb_as_fugue_df(df: DuckDBPyRelation, **kwargs: Any) -> DuckDataFrame: - return DuckDataFrame(df, **kwargs) - - -def _register_raw_dataframes() -> None: - register_raw_df_type(DuckDBPyRelation) - - def _register_engines() -> None: register_execution_engine( "duck", @@ -136,6 +126,5 @@ def _register() -> None: >>> import fugue_duckdb """ - _register_raw_dataframes() _register_engines() _register_annotation_converters() diff --git a/fugue_ibis/execution_engine.py b/fugue_ibis/execution_engine.py index 2fd951e9..244cc4f6 100644 --- 
a/fugue_ibis/execution_engine.py +++ b/fugue_ibis/execution_engine.py @@ -1,5 +1,5 @@ import itertools -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Tuple import ibis from ibis import BaseBackend @@ -36,9 +36,9 @@ def __init__(self, execution_engine: ExecutionEngine) -> None: super().__init__(execution_engine) self._ibis_engine: IbisExecutionEngine = execution_engine # type: ignore - def select(self, dfs: DataFrames, statement: str) -> DataFrame: + def select(self, dfs: DataFrames, statement: List[Tuple[bool, str]]) -> DataFrame: return self._ibis_engine._to_ibis_dataframe( - self._ibis_engine._raw_select(statement, dfs) + self._ibis_engine._raw_select(" ".join(x[1] for x in statement), dfs) ) diff --git a/fugue_ray/registry.py b/fugue_ray/registry.py index 771c6358..aaed5e67 100644 --- a/fugue_ray/registry.py +++ b/fugue_ray/registry.py @@ -12,7 +12,7 @@ register_annotation_converter, ) from fugue.plugins import as_fugue_dataset, infer_execution_engine -from fugue.workflow import register_raw_df_type + from .dataframe import RayDataFrame from .execution_engine import RayExecutionEngine @@ -30,15 +30,9 @@ def _ray_as_fugue_df(df: rd.Dataset, **kwargs: Any) -> RayDataFrame: return RayDataFrame(df, **kwargs) -def _register_raw_dataframes() -> None: - register_raw_df_type(rd.Dataset) - - def _register_engines() -> None: register_execution_engine( - "ray", - lambda conf, **kwargs: RayExecutionEngine(conf=conf), - on_dup="ignore", + "ray", lambda conf, **kwargs: RayExecutionEngine(conf=conf), on_dup="ignore" ) @@ -86,6 +80,5 @@ def count(self, df: DataFrame) -> int: # pragma: no cover @run_at_def def _register() -> None: """Register Ray Execution Engine""" - _register_raw_dataframes() _register_engines() _register_annotation_converters() diff --git a/fugue_spark/execution_engine.py b/fugue_spark/execution_engine.py index 4aaed7e9..2f4e8188 100644 --- a/fugue_spark/execution_engine.py +++ b/fugue_spark/execution_engine.py @@ -77,7 +77,7 @@ def __init__(self, execution_engine: ExecutionEngine): ) super().__init__(execution_engine) - def select(self, dfs: DataFrames, statement: str) -> DataFrame: + def select(self, dfs: DataFrames, statement: List[Tuple[bool, str]]) -> DataFrame: for k, v in dfs.items(): self.execution_engine.register(v, k) # type: ignore return SparkDataFrame( diff --git a/fugue_spark/registry.py b/fugue_spark/registry.py index 61c95ce7..85d062df 100644 --- a/fugue_spark/registry.py +++ b/fugue_spark/registry.py @@ -15,7 +15,7 @@ register_annotation_converter, ) from fugue.plugins import as_fugue_dataset, infer_execution_engine, parse_creator -from fugue.workflow import register_raw_df_type + from fugue_spark.dataframe import SparkDataFrame from fugue_spark.execution_engine import SparkExecutionEngine @@ -48,10 +48,6 @@ def _run_sql(spark: SparkSession) -> ps.DataFrame: return _run_sql -def _register_raw_dataframes() -> None: - register_raw_df_type(ps.DataFrame) - - def _register_engines() -> None: register_execution_engine( "spark", @@ -186,6 +182,5 @@ def _register() -> None: >>> import fugue_spark """ - _register_raw_dataframes() _register_engines() _register_annotation_converters() diff --git a/fugue_test/builtin_suite.py b/fugue_test/builtin_suite.py index 1c4fcd43..deb38680 100644 --- a/fugue_test/builtin_suite.py +++ b/fugue_test/builtin_suite.py @@ -53,6 +53,7 @@ FugueWorkflowRuntimeValidationError, ) from pytest import raises +import fugue.api as fa from triad import SerializableRLock @@ -843,6 +844,7 @@ def select(self, 
dfs, statement): a = dag.df([[1, 10], [2, 20], [3, 30]], "x:long,y:long") b = dag.df([[2, 20, 40], [3, 30, 90]], "x:long,y:long,z:long") dag.select("* FROM", a).assert_eq(a) + dag.select(a, ".* FROM", a).assert_eq(a) dag.select("SELECT *,x*y AS z FROM", a, "WHERE x>=2").assert_eq(b) c = dag.df([[2, 20, 40], [3, 30, 90]], "x:long,y:long,zb:long") @@ -1571,6 +1573,51 @@ def t5(df: pd.DataFrame, c: Callable) -> List[List[Any]]: assert 4 == cb3.n + def test_sql_api(self): + def tr(df: pd.DataFrame, n=1) -> pd.DataFrame: + return df + n + + with fa.engine_context(self.engine): + df1 = fa.as_fugue_df([[0, 1], [2, 3], [4, 5]], schema="a:long,b:int") + df2 = pd.DataFrame([[0, 10], [1, 100]], columns=["a", "c"]) + sdf1 = fa.raw_sql( # noqa + "SELECT ", df1, ".a, b FROM ", df1, " WHERE a<4" + ) + sdf2 = fa.raw_sql("SELECT * FROM ", df2, " WHERE a<1") # noqa + + sdf3 = fa.fugue_sql( + """ + SELECT sdf1.a,sdf1.b,c FROM sdf1 INNER JOIN sdf2 ON sdf1.a=sdf2.a + TRANSFORM USING tr SCHEMA * + """ + ) + res = fa.fugue_sql_flow( + """ + TRANSFORM x USING tr(n=2) SCHEMA * + YIELD LOCAL DATAFRAME AS res + PRINT sdf1 + """, + x=sdf3, + ) + df_eq( + res["res"], + [[3, 4, 13]], + schema="a:long,b:int,c:long", + check_schema=False, + throw=True, + ) + + sdf4 = fa.fugue_sql( + """ + SELECT sdf1.a,b,c FROM sdf1 INNER JOIN sdf2 ON sdf1.a=sdf2.a + TRANSFORM USING tr SCHEMA * + """, + as_fugue=False, + as_local=True, + ) + assert not isinstance(sdf4, DataFrame) + assert fa.is_local(sdf4) + def mock_creator(p: int) -> DataFrame: return ArrayDataFrame([[p]], "a:int") diff --git a/fugue_test/dataframe_suite.py b/fugue_test/dataframe_suite.py index 4ad2c36b..c371778e 100644 --- a/fugue_test/dataframe_suite.py +++ b/fugue_test/dataframe_suite.py @@ -456,7 +456,7 @@ def to_native_df(self, pdf: pd.DataFrame) -> Any: # pragma: no cover def test_get_altered_schema(self): pass - def _test_get_column_names(self): + def test_get_column_names(self): df = self.to_native_df(pd.DataFrame([[0, 1, 2]], columns=["0", "1", "2"])) assert fi.get_column_names(df) == ["0", "1", "2"] diff --git a/fugue_test/execution_suite.py b/fugue_test/execution_suite.py index baea8134..6150ba98 100644 --- a/fugue_test/execution_suite.py +++ b/fugue_test/execution_suite.py @@ -21,6 +21,7 @@ PandasDataFrame, PartitionSpec, register_default_sql_engine, + DataFrame, ) from fugue.column import SelectColumns, col, lit from fugue.dataframe.utils import _df_eq as df_eq @@ -749,8 +750,8 @@ def test_sample(self): def test_take(self): e = self.engine - ps = PartitionSpec(by=["a"], presort="b DESC,c DESC") - ps2 = PartitionSpec(by=["c"], presort="b ASC") + ps = dict(by=["a"], presort="b DESC,c DESC") + ps2 = dict(by=["c"], presort="b ASC") a = e.to_df( [ ["a", 2, 3], @@ -764,8 +765,8 @@ def test_take(self): ) b = fa.take(a, n=1, presort="b desc") c = fa.take(a, n=2, presort="a desc", na_position="first") - d = fa.take(a, n=1, presort="a asc, b desc", partition_spec=ps) - f = fa.take(a, n=1, presort=None, partition_spec=ps2) + d = fa.take(a, n=1, presort="a asc, b desc", partition=ps) + f = fa.take(a, n=1, presort=None, partition=ps2) g = fa.take(a, n=2, presort="a desc", na_position="last") h = fa.take(a, n=2, presort="a", na_position="first") df_eq( @@ -1343,6 +1344,19 @@ def test_load_json_folder(self): c = fa.load(path, format_hint="json", columns=["a", "c"], as_fugue=True) df_eq(c, [[1, 6], [7, 2], [8, 4], [4, 3]], "a:long,c:long", throw=True) + def test_engine_api(self): + # complimentary tests not covered by the other tests + with fa.engine_context(self.engine): 
+ df1 = fa.as_fugue_df([[0, 1], [2, 3]], schema="a:long,b:long") + df1 = fa.repartition(df1, {"num": 2}) + df1 = fa.get_native_as_df(fa.broadcast(df1)) + df2 = pd.DataFrame([[0, 1], [2, 3]], columns=["a", "b"]) + df3 = fa.union(df1, df2, as_fugue=False) + assert fa.is_df(df3) and not isinstance(df3, DataFrame) + df4 = fa.union(df1, df2, as_fugue=True) + assert isinstance(df4, DataFrame) + df_eq(df4, fa.as_pandas(df3), throw=True) + def select_top(cursor, data): return ArrayDataFrame([cursor.row], cursor.row_schema) diff --git a/tests/fugue/column/test_sql.py b/tests/fugue/column/test_sql.py index 8e9b37c1..b8c9c10b 100644 --- a/tests/fugue/column/test_sql.py +++ b/tests/fugue/column/test_sql.py @@ -140,11 +140,13 @@ def dummy(expr): def test_where(): gen = SQLExpressionGenerator() - assert "SELECT * FROM x WHERE (a<5) AND b IS NULL" == gen.where( - (col("a") < 5) & col("b").is_null(), "x" + assert "SELECT * FROM !x! WHERE (a<5) AND b IS NULL" == _to_sql( + gen.where((col("a") < 5) & col("b").is_null(), "x") ) - assert "SELECT * FROM x WHERE a<5" == gen.where((col("a") < 5).alias("x"), "x") - raises(ValueError, lambda: gen.where(f.max(col("a")), "x")) + assert "SELECT * FROM !x! WHERE a<5" == _to_sql( + gen.where((col("a") < 5).alias("x"), "x") + ) + raises(ValueError, lambda: list(gen.where(f.max(col("a")), "x"))) def test_select(): @@ -152,33 +154,33 @@ def test_select(): # no aggregation cols = SelectColumns(col("*")) - assert "SELECT * FROM x" == gen.select(cols, "x") + assert "SELECT * FROM !x!" == _to_sql(gen.select(cols, "x")) cols = SelectColumns(col("a"), lit(1).alias("b"), (col("b") + col("c")).alias("x")) where = (col("a") > 5).alias("aa") - assert "SELECT a, 1 AS b, b+c AS x FROM t WHERE a>5" == gen.select( - cols, "t", where=where + assert "SELECT a, 1 AS b, b+c AS x FROM !t! WHERE a>5" == _to_sql( + gen.select(cols, "t", where=where) ) # aggregation without literals cols = SelectColumns(f.max(col("c")).alias("c"), col("a", "aa"), col("b")) - assert "SELECT MAX(c) AS c, a AS aa, b FROM t GROUP BY a, b" == gen.select( - cols, "t" + assert "SELECT MAX(c) AS c, a AS aa, b FROM !t! GROUP BY a, b" == _to_sql( + gen.select(cols, "t") ) where = col("a") < 10 having = (f.max(col("a")) > 5).alias("aaa") assert ( - "SELECT MAX(c) AS c, a AS aa, b FROM t WHERE a<10 GROUP BY a, b HAVING MAX(a)>5" - == gen.select(cols, "t", where=where, having=having) + "SELECT MAX(c) AS c, a AS aa, b FROM !t! WHERE a<10 GROUP BY a, b HAVING MAX(a)>5" + == _to_sql(gen.select(cols, "t", where=where, having=having)) ) cols = SelectColumns( f.min(col("c") + 1).alias("c"), f.avg(col("d") + col("e")).cast(int).alias("d"), ) - assert "SELECT MIN(c+1) AS c, CAST(AVG(d+e) AS long) AS d FROM t" == gen.select( - cols, "t" + assert "SELECT MIN(c+1) AS c, CAST(AVG(d+e) AS long) AS d FROM !t!" == _to_sql( + gen.select(cols, "t") ) # aggregation with literals @@ -186,19 +188,19 @@ def test_select(): lit(1, "k"), f.max(col("c")).alias("c"), lit(2, "j"), col("a", "aa"), col("b") ) assert ( - "SELECT 1 AS k, c, 2 AS j, aa, b FROM (SELECT MAX(c) AS c, a AS aa, b FROM t GROUP BY a, b)" - == gen.select(cols, "t") + "SELECT 1 AS k, c, 2 AS j, aa, b FROM ( SELECT MAX(c) AS c, a AS aa, b FROM !t! GROUP BY a, b )" + == _to_sql(gen.select(cols, "t")) ) cols = SelectColumns(lit(1, "k"), f.max(col("c")).alias("c"), lit(2, "j")) - assert "SELECT 1 AS k, c, 2 AS j FROM (SELECT MAX(c) AS c FROM t)" == gen.select( - cols, "t" + assert "SELECT 1 AS k, c, 2 AS j FROM ( SELECT MAX(c) AS c FROM !t! 
)" == _to_sql( + gen.select(cols, "t") ) cols = SelectColumns(lit(1, "k"), col("a"), f.max(col("c")).alias("c"), lit(2, "j")) assert ( - "SELECT 1 AS k, a, c, 2 AS j FROM (SELECT a, MAX(c) AS c FROM t GROUP BY a)" - == gen.select(cols, "t") + "SELECT 1 AS k, a, c, 2 AS j FROM ( SELECT a, MAX(c) AS c FROM !t! GROUP BY a )" + == _to_sql(gen.select(cols, "t")) ) # cast @@ -207,8 +209,8 @@ def test_select(): f.avg(col("d") + col("e")).cast(int).alias("d"), ) assert ( - "SELECT CAST(c AS double) AS c, CAST(AVG(d+e) AS long) AS d FROM t GROUP BY c" - == gen.select(cols, "t") + "SELECT CAST(c AS double) AS c, CAST(AVG(d+e) AS long) AS d FROM !t! GROUP BY c" + == _to_sql(gen.select(cols, "t")) ) # infer alias @@ -219,7 +221,8 @@ def test_select(): ) assert ( "SELECT CAST(-c AS double) AS c, CAST(MAX(e) AS long) AS e, " - "CAST(AVG(d+e) AS long) AS d FROM t GROUP BY -c" == gen.select(cols, "t") + "CAST(AVG(d+e) AS long) AS d FROM !t! GROUP BY -c" + == _to_sql(gen.select(cols, "t")) ) @@ -252,6 +255,16 @@ def test_no_cast(): cols = SelectColumns( f.max(col("c")).cast("long").alias("c"), col("a", "aa"), col("b") ) - assert "SELECT MAX(c) AS c, a AS aa, b FROM t GROUP BY a, b" == gen.select( - cols, "t" + assert "SELECT MAX(c) AS c, a AS aa, b FROM !t! GROUP BY a, b" == _to_sql( + gen.select(cols, "t") ) + + +def _to_sql(parts): + return ( + " ".join( + "!" + x[1].strip() + "!" if x[0] else x[1].strip() + for x in parts + if x[1].strip() != "" + ) + ).strip() diff --git a/tests/fugue/utils/test_sql.py b/tests/fugue/utils/test_sql.py new file mode 100644 index 00000000..4d11e076 --- /dev/null +++ b/tests/fugue/utils/test_sql.py @@ -0,0 +1,23 @@ +from fugue._utils.sql import get_temp_tb_name, parse_sql + + +def test_parse_sql(): + def parse(sql): + parts = parse_sql(sql) + return "".join([p[1] if not p[0] else "!" + p[1] + "!" for p in parts]) + + t1 = get_temp_tb_name() + t2 = get_temp_tb_name() + assert parse("") == "" + assert parse(f"{t1}") == f"!{t1.key}!" + assert parse(f" {t1} ") == f" !{t1.key}! " + assert parse(f"SELECT * FROM {t1}") == f"SELECT * FROM !{t1.key}!" + assert ( + parse(f"SELECT * FROM {t1} NATURAL JOIN {t2}") + == f"SELECT * FROM !{t1.key}! NATURAL JOIN !{t2.key}!" + ) + assert ( + parse(f"SELECT {t1}.* FROM {t1} NATURAL JOIN {t2} WHERE {t2}.x<1") + == f"SELECT !{t1.key}!.* FROM !{t1.key}! " + f"NATURAL JOIN !{t2.key}! 
WHERE !{t2.key}!.x<1" + ) diff --git a/tests/fugue_duckdb/test_utils.py b/tests/fugue_duckdb/test_utils.py index fa709903..c515ece5 100644 --- a/tests/fugue_duckdb/test_utils.py +++ b/tests/fugue_duckdb/test_utils.py @@ -1,10 +1,11 @@ -from fugue_duckdb._utils import to_pa_type, to_duck_type, encode_value_to_expr -import pyarrow as pa import duckdb -from triad.utils.pyarrow import TRIAD_DEFAULT_TIMESTAMP -from pytest import raises -import pandas as pd import numpy as np +import pandas as pd +import pyarrow as pa +from pytest import raises +from triad.utils.pyarrow import TRIAD_DEFAULT_TIMESTAMP + +from fugue_duckdb._utils import encode_value_to_expr, to_duck_type, to_pa_type def test_encode_value_to_expr(): From 49d37249fd74dddddc94795a5474a31f3fbfdd45 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Wed, 28 Dec 2022 08:43:36 +0000 Subject: [PATCH 21/30] fix ray tests and coverage --- fugue/column/sql.py | 6 +++--- fugue/workflow/input.py | 2 +- fugue_ray/dataframe.py | 2 +- fugue_ray/execution_engine.py | 8 +++++++- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/fugue/column/sql.py b/fugue/column/sql.py index 4264a7dd..cb843f1c 100644 --- a/fugue/column/sql.py +++ b/fugue/column/sql.py @@ -261,7 +261,7 @@ def where(self, condition: ColumnExpr, table: str) -> Iterable[Tuple[bool, str]] lambda: ValueError(f"{condition} has aggregation functions"), ) cond = self.generate(condition.alias("")) - yield (False, "SELECT * FROM ") + yield (False, "SELECT * FROM") yield (True, table) yield (False, f"WHERE {cond}") @@ -312,14 +312,14 @@ def _having(as_where: bool = False) -> str: if len(columns.literals) == 0: expr = ", ".join(self.generate(x) for x in columns.all_cols) if len(columns.group_keys) == 0: - yield (False, f"SELECT {distinct}{expr} FROM ") + yield (False, f"SELECT {distinct}{expr} FROM") yield (True, table) yield (False, _where()) yield (False, _having()) return else: keys = ", ".join(self.generate(x) for x in columns.group_keys) - yield (False, f"SELECT {distinct}{expr} FROM ") + yield (False, f"SELECT {distinct}{expr} FROM") yield (True, table) yield (False, _where()) yield (False, f"GROUP BY {keys}") diff --git a/fugue/workflow/input.py b/fugue/workflow/input.py index 8f87cd35..e68124a0 100644 --- a/fugue/workflow/input.py +++ b/fugue/workflow/input.py @@ -1,7 +1,7 @@ from typing import Type -def register_raw_df_type(df_type: Type) -> None: +def register_raw_df_type(df_type: Type) -> None: # pragma: no cover """TODO: This function is to be removed before 0.9.0 .. 
deprecated:: 3.1 diff --git a/fugue_ray/dataframe.py b/fugue_ray/dataframe.py index 553259fc..974fd383 100644 --- a/fugue_ray/dataframe.py +++ b/fugue_ray/dataframe.py @@ -90,7 +90,7 @@ def __init__( # noqa: C901 schema = df.schema metadata = None if not df.has_metadata else df.metadata else: - raise ValueError(f"{df} is incompatible with DaskDataFrame") + raise ValueError(f"{df} is incompatible with RayDataFrame") rdf, schema = self._apply_schema(rdf, schema, internal_schema) super().__init__(schema) self._native = rdf diff --git a/fugue_ray/execution_engine.py b/fugue_ray/execution_engine.py index de8a51e0..304db4c3 100644 --- a/fugue_ray/execution_engine.py +++ b/fugue_ray/execution_engine.py @@ -1,7 +1,7 @@ from typing import Any, Callable, Dict, List, Optional, Union import pyarrow as pa -from duckdb import DuckDBPyConnection +from duckdb import DuckDBPyConnection, DuckDBPyRelation from triad import Schema, assert_or_throw, to_uuid from triad.utils.threading import RunOnce @@ -269,6 +269,12 @@ def _to_auto_df(self, df: Any, schema: Any = None) -> DataFrame: ValueError("schema must be None when df is a DataFrame"), ) return df + if isinstance(df, DuckDBPyRelation): + assert_or_throw( + schema is None, + ValueError("schema must be None when df is a DuckDBPyRelation"), + ) + return DuckDataFrame(df) return RayDataFrame(df, schema) def _get_remote_args(self) -> Dict[str, Any]: From d79d914a5ac795ad4068c5f2ac9c2ea416c47170 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Thu, 29 Dec 2022 06:27:11 +0000 Subject: [PATCH 22/30] fix tests, add fugue.default.partitions --- docs/top_api.rst | 4 +- fugue/api.py | 2 + fugue/constants.py | 2 + fugue/dataframe/arrow_dataframe.py | 6 + fugue/dataframe/pandas_dataframe.py | 6 + fugue/dataset/api.py | 9 + fugue/execution/api.py | 14 ++ fugue/execution/execution_engine.py | 5 + fugue/execution/native_execution_engine.py | 3 + fugue/plugins.py | 1 + fugue/workflow/input.py | 2 +- fugue_dask/_constants.py | 8 +- fugue_dask/_utils.py | 14 ++ fugue_dask/dataframe.py | 21 +- fugue_dask/execution_engine.py | 30 +-- fugue_duckdb/dask.py | 3 + fugue_duckdb/dataframe.py | 13 +- fugue_duckdb/execution_engine.py | 3 + fugue_ibis/execution_engine.py | 3 + fugue_ray/_constants.py | 6 +- fugue_ray/_utils/cluster.py | 16 ++ fugue_ray/dataframe.py | 15 +- fugue_ray/execution_engine.py | 47 +++-- fugue_spark/dataframe.py | 12 +- fugue_spark/execution_engine.py | 230 ++++++++++++--------- fugue_test/execution_suite.py | 3 + tests/fugue_dask/test_execution_engine.py | 6 +- tests/fugue_duckdb/test_dask.py | 6 +- tests/fugue_ray/test_execution_engine.py | 3 + tests/fugue_spark/test_execution_engine.py | 4 + 30 files changed, 340 insertions(+), 157 deletions(-) create mode 100644 fugue_ray/_utils/cluster.py diff --git a/docs/top_api.rst b/docs/top_api.rst index 1891c650..3d458310 100644 --- a/docs/top_api.rst +++ b/docs/top_api.rst @@ -47,13 +47,13 @@ Information .. autofunction:: fugue.api.show .. autofunction:: fugue.api.get_column_names +.. autofunction:: fugue.api.get_num_partitions .. autofunction:: fugue.api.get_schema .. autofunction:: fugue.api.is_df .. autofunction:: fugue.api.peek_array .. autofunction:: fugue.api.peek_dict - Transformation ~~~~~~~~~~~~~~ @@ -99,10 +99,12 @@ Conversion ExecutionEngine ~~~~~~~~~~~~~~~ + .. autofunction:: fugue.api.engine_context .. autofunction:: fugue.api.set_global_engine .. autofunction:: fugue.api.clear_global_engine .. autofunction:: fugue.api.get_current_engine +.. 
autofunction:: get_current_parallelism Big Data Operations diff --git a/fugue/api.py b/fugue/api.py index 1d75e9f3..eae905bf 100644 --- a/fugue/api.py +++ b/fugue/api.py @@ -25,6 +25,7 @@ as_local, as_local_bounded, count, + get_num_partitions, is_bounded, is_empty, is_local, @@ -38,6 +39,7 @@ engine_context, fillna, get_current_engine, + get_current_parallelism, intersect, join, load, diff --git a/fugue/constants.py b/fugue/constants.py index eae62ac8..cc9f6a51 100644 --- a/fugue/constants.py +++ b/fugue/constants.py @@ -13,6 +13,7 @@ FUGUE_CONF_WORKFLOW_EXCEPTION_INJECT = "fugue.workflow.exception.inject" FUGUE_CONF_WORKFLOW_EXCEPTION_OPTIMIZE = "fugue.workflow.exception.optimize" FUGUE_CONF_SQL_IGNORE_CASE = "fugue.sql.compile.ignore_case" +FUGUE_CONF_DEFAULT_PARTITIONS = "fugue.default.partitions" FUGUE_COMPILE_TIME_CONFIGS = set( [ @@ -35,6 +36,7 @@ FUGUE_CONF_WORKFLOW_EXCEPTION_INJECT: 3, FUGUE_CONF_WORKFLOW_EXCEPTION_OPTIMIZE: True, FUGUE_CONF_SQL_IGNORE_CASE: False, + FUGUE_CONF_DEFAULT_PARTITIONS: -1, } ) diff --git a/fugue/dataframe/arrow_dataframe.py b/fugue/dataframe/arrow_dataframe.py index a58c8ac6..56def554 100644 --- a/fugue/dataframe/arrow_dataframe.py +++ b/fugue/dataframe/arrow_dataframe.py @@ -11,6 +11,7 @@ as_local, as_local_bounded, count, + get_num_partitions, is_bounded, is_empty, is_local, @@ -280,6 +281,11 @@ def _pa_table_is_local(df: pa.Table) -> bool: return True +@get_num_partitions.candidate(lambda df: isinstance(df, pa.Table)) +def _pa_table_get_num_partitions(df: pa.Table) -> int: + return 1 + + @get_column_names.candidate(lambda df: isinstance(df, pa.Table)) def _get_pyarrow_table_columns(df: pa.Table) -> List[Any]: return [f.name for f in df.schema] diff --git a/fugue/dataframe/pandas_dataframe.py b/fugue/dataframe/pandas_dataframe.py index f8ece0e0..63e17bd4 100644 --- a/fugue/dataframe/pandas_dataframe.py +++ b/fugue/dataframe/pandas_dataframe.py @@ -11,6 +11,7 @@ as_local, as_local_bounded, count, + get_num_partitions, is_bounded, is_empty, is_local, @@ -235,6 +236,11 @@ def _pd_is_local(df: pd.DataFrame) -> bool: return True +@get_num_partitions.candidate(lambda df: isinstance(df, pd.DataFrame)) +def _get_pandas_num_partitions(df: pd.DataFrame) -> int: + return 1 + + @get_column_names.candidate(lambda df: isinstance(df, pd.DataFrame)) def _get_pandas_dataframe_columns(df: pd.DataFrame) -> List[Any]: return list(df.columns) diff --git a/fugue/dataset/api.py b/fugue/dataset/api.py index 94e57ed0..e3c9e5e4 100644 --- a/fugue/dataset/api.py +++ b/fugue/dataset/api.py @@ -93,3 +93,12 @@ def count(data: AnyDataset) -> int: :param data: the dataset that can be recognized by Fugue """ return as_fugue_dataset(data).count() + + +@fugue_plugin +def get_num_partitions(data: AnyDataset) -> bool: + """Get the number of partitions of the dataset + + :param data: the dataset that can be recognized by Fugue + """ + return as_fugue_dataset(data).num_partitions diff --git a/fugue/execution/api.py b/fugue/execution/api.py index a920100d..1654e7cb 100644 --- a/fugue/execution/api.py +++ b/fugue/execution/api.py @@ -92,6 +92,20 @@ def get_current_engine() -> ExecutionEngine: return make_execution_engine() +def get_current_parallelism( + engine: AnyExecutionEngine = None, engine_conf: Any = None +) -> int: + """Get the current parallelism of the engine + + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + + :return: the size of the parallelism + """ + with engine_context(engine, engine_conf) as e: + 
return e.get_current_parallelism() + + def run_engine_function( func: Callable[[ExecutionEngine], Any], engine: AnyExecutionEngine = None, diff --git a/fugue/execution/execution_engine.py b/fugue/execution/execution_engine.py index 8207570f..e89b7e3f 100644 --- a/fugue/execution/execution_engine.py +++ b/fugue/execution/execution_engine.py @@ -340,6 +340,11 @@ def create_default_sql_engine(self) -> SQLEngine: # pragma: no cover """Default SQLEngine if user doesn't specify""" raise NotImplementedError + @abstractmethod + def get_current_parallelism(self) -> int: # pragma: no cover + """Get the current number of parallelism of this engine""" + raise NotImplementedError + @abstractmethod def to_df(self, data: Any, schema: Any = None) -> DataFrame: # pragma: no cover """Convert a data structure to this engine compatible DataFrame diff --git a/fugue/execution/native_execution_engine.py b/fugue/execution/native_execution_engine.py index dfc4eeb6..b319a1cd 100644 --- a/fugue/execution/native_execution_engine.py +++ b/fugue/execution/native_execution_engine.py @@ -153,6 +153,9 @@ def create_default_sql_engine(self) -> SQLEngine: def create_default_map_engine(self) -> MapEngine: return PandasMapEngine(self) + def get_current_parallelism(self) -> int: + return 1 + @property def pl_utils(self) -> PandasUtils: """Pandas-like dataframe utils""" diff --git a/fugue/plugins.py b/fugue/plugins.py index 6a7b8aa1..dee330d3 100644 --- a/fugue/plugins.py +++ b/fugue/plugins.py @@ -23,6 +23,7 @@ as_local_bounded, count, get_dataset_display, + get_num_partitions, is_bounded, is_empty, is_local, diff --git a/fugue/workflow/input.py b/fugue/workflow/input.py index e68124a0..a80f271a 100644 --- a/fugue/workflow/input.py +++ b/fugue/workflow/input.py @@ -4,7 +4,7 @@ def register_raw_df_type(df_type: Type) -> None: # pragma: no cover """TODO: This function is to be removed before 0.9.0 - .. deprecated:: 3.1 + .. deprecated:: 0.8.0 Register using :func:`fugue.api.is_df` instead. 
""" raise DeprecationWarning("use fugue.api.is_df to register the dataframe") diff --git a/fugue_dask/_constants.py b/fugue_dask/_constants.py index e97d8a2e..ee2c43d7 100644 --- a/fugue_dask/_constants.py +++ b/fugue_dask/_constants.py @@ -1,8 +1,4 @@ from typing import Any, Dict -from dask.system import CPU_COUNT - -FUGUE_DASK_CONF_DATAFRAME_DEFAULT_PARTITIONS = "fugue.dask.dataframe.default.partitions" -FUGUE_DASK_DEFAULT_CONF: Dict[str, Any] = { - FUGUE_DASK_CONF_DATAFRAME_DEFAULT_PARTITIONS: CPU_COUNT * 2 -} +FUGUE_DASK_CONF_DEFAULT_PARTITIONS = "fugue.dask.default.partitions" +FUGUE_DASK_DEFAULT_CONF: Dict[str, Any] = {FUGUE_DASK_CONF_DEFAULT_PARTITIONS: -1} diff --git a/fugue_dask/_utils.py b/fugue_dask/_utils.py index 2cc22876..9dc1f140 100644 --- a/fugue_dask/_utils.py +++ b/fugue_dask/_utils.py @@ -8,6 +8,20 @@ from qpd_dask.engine import DaskUtils as DaskUtilsBase from triad.utils.pyarrow import to_pandas_dtype, to_single_pandas_dtype +import fugue.api as fa +from fugue.constants import FUGUE_CONF_DEFAULT_PARTITIONS + +from ._constants import FUGUE_DASK_CONF_DEFAULT_PARTITIONS + + +def get_default_partitions() -> int: + engine = fa.get_current_engine() + n = engine.conf.get( + FUGUE_DASK_CONF_DEFAULT_PARTITIONS, + engine.conf.get(FUGUE_CONF_DEFAULT_PARTITIONS, -1), + ) + return n if n > 0 else engine.get_current_parallelism() * 2 + class DaskUtils(DaskUtilsBase): def get_or_create_client(self, client: Optional[Client] = None): diff --git a/fugue_dask/dataframe.py b/fugue_dask/dataframe.py index 383a98e0..02dcc500 100644 --- a/fugue_dask/dataframe.py +++ b/fugue_dask/dataframe.py @@ -21,6 +21,7 @@ count, drop_columns, get_column_names, + get_num_partitions, head, is_bounded, is_df, @@ -29,11 +30,8 @@ rename, select_columns, ) -from fugue_dask._constants import ( - FUGUE_DASK_CONF_DATAFRAME_DEFAULT_PARTITIONS, - FUGUE_DASK_DEFAULT_CONF, -) -from fugue_dask._utils import DASK_UTILS + +from ._utils import DASK_UTILS, get_default_partitions class DaskDataFrame(DataFrame): @@ -45,7 +43,7 @@ class DaskDataFrame(DataFrame): :param schema: |SchemaLikeObject| or :class:`spark:pyspark.sql.types.StructType`, defaults to None. :param num_partitions: initial number of partitions for the dask dataframe - defaults to 0 to get the value from `fugue.dask.dataframe.default.partitions` + defaults to 0 to get the value from `fugue.dask.default.partitions` :param type_safe: whether to cast input data to ensure type safe, defaults to True .. 
note:: @@ -61,9 +59,7 @@ def __init__( # noqa: C901 type_safe=True, ): if num_partitions <= 0: - num_partitions = FUGUE_DASK_DEFAULT_CONF[ - FUGUE_DASK_CONF_DATAFRAME_DEFAULT_PARTITIONS - ] + num_partitions = get_default_partitions() if df is None: schema = _input_schema(schema).assert_not_empty() df = [] @@ -120,7 +116,7 @@ def empty(self) -> bool: @property def num_partitions(self) -> int: - return self.native.npartitions + return _dd_get_num_partitions(self.native) def _drop_cols(self, cols: List[str]) -> DataFrame: cols = (self.schema - cols).names @@ -249,6 +245,11 @@ def _dd_is_df(df: dd.DataFrame) -> bool: return True +@get_num_partitions.candidate(lambda df: isinstance(df, dd.DataFrame)) +def _dd_get_num_partitions(df: dd.DataFrame) -> int: + return df.npartitions + + @count.candidate(lambda df: isinstance(df, dd.DataFrame)) def _dd_count(df: dd.DataFrame) -> int: return df.shape[0].compute() diff --git a/fugue_dask/execution_engine.py b/fugue_dask/execution_engine.py index 81307090..7fd35724 100644 --- a/fugue_dask/execution_engine.py +++ b/fugue_dask/execution_engine.py @@ -1,6 +1,6 @@ import logging import os -from typing import Any, Callable, Dict, List, Optional, Union, Tuple +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import dask.dataframe as dd from distributed import Client @@ -22,11 +22,7 @@ from fugue.dataframe.utils import get_join_schemas from fugue.execution.execution_engine import ExecutionEngine, MapEngine, SQLEngine from fugue.execution.native_execution_engine import NativeExecutionEngine -from fugue_dask._constants import ( - CPU_COUNT, - FUGUE_DASK_CONF_DATAFRAME_DEFAULT_PARTITIONS, - FUGUE_DASK_DEFAULT_CONF, -) +from fugue_dask._constants import FUGUE_DASK_DEFAULT_CONF from fugue_dask._io import load_df, save_df from fugue_dask._utils import DASK_UTILS, DaskUtils from fugue_dask.dataframe import DaskDataFrame @@ -158,6 +154,10 @@ def create_default_sql_engine(self) -> SQLEngine: def create_default_map_engine(self) -> MapEngine: return DaskMapEngine(self) + def get_current_parallelism(self) -> int: + res = dict(self.dask_client.nthreads()) + return sum(res.values()) + @property def pl_utils(self) -> DaskUtils: """Pandas-like dataframe utils""" @@ -182,9 +182,7 @@ def to_df(self, df: Any, schema: Any = None) -> DaskDataFrame: * all other methods in the engine can take arbitrary dataframes and call this method to convert before doing anything """ - default_partitions = self.conf.get_or_throw( - FUGUE_DASK_CONF_DATAFRAME_DEFAULT_PARTITIONS, int - ) + if isinstance(df, DataFrame): assert_or_throw( schema is None, @@ -193,18 +191,12 @@ def to_df(self, df: Any, schema: Any = None) -> DaskDataFrame: if isinstance(df, DaskDataFrame): return df if isinstance(df, PandasDataFrame): - res = DaskDataFrame( - df.native, df.schema, num_partitions=default_partitions - ) + res = DaskDataFrame(df.native, df.schema) else: - res = DaskDataFrame( - df.as_array(type_safe=True), - df.schema, - num_partitions=default_partitions, - ) + res = DaskDataFrame(df.as_array(type_safe=True), df.schema) res.reset_metadata(df.metadata) return res - return DaskDataFrame(df, schema, num_partitions=default_partitions) + return DaskDataFrame(df, schema) def repartition( self, df: DataFrame, partition_spec: PartitionSpec @@ -217,7 +209,7 @@ def repartition( p = partition_spec.get_num_partitions( **{ KEYWORD_ROWCOUNT: lambda: df.persist().count(), # type: ignore - KEYWORD_CORECOUNT: lambda: CPU_COUNT, + KEYWORD_CORECOUNT: lambda: self.get_current_parallelism(), } ) if p > 0: 
diff --git a/fugue_duckdb/dask.py b/fugue_duckdb/dask.py index 472eccdf..1c0f8859 100644 --- a/fugue_duckdb/dask.py +++ b/fugue_duckdb/dask.py @@ -36,6 +36,9 @@ def __init__( def create_default_map_engine(self) -> MapEngine: return DaskMapEngine(self._dask_engine) + def get_current_parallelism(self) -> int: + return self._dask_engine.get_current_parallelism() + @property def dask_client(self) -> Client: return self._dask_engine.dask_client diff --git a/fugue_duckdb/dataframe.py b/fugue_duckdb/dataframe.py index 7f4dbca2..7453fe67 100644 --- a/fugue_duckdb/dataframe.py +++ b/fugue_duckdb/dataframe.py @@ -13,7 +13,13 @@ LocalDataFrame, ) from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError -from fugue.plugins import as_fugue_dataset, as_local_bounded, get_column_names, is_df +from fugue.plugins import ( + as_fugue_dataset, + as_local_bounded, + get_column_names, + get_num_partitions, + is_df, +) from ._utils import encode_column_name, to_duck_type, to_pa_type @@ -155,6 +161,11 @@ def _duck_is_df(df: DuckDBPyRelation) -> bool: return True +@get_num_partitions.candidate(lambda df: isinstance(df, DuckDBPyRelation)) +def _duckdb_num_partitions(df: DuckDBPyRelation) -> int: + return 1 + + @as_local_bounded.candidate(lambda df: isinstance(df, DuckDBPyRelation)) def _duck_as_local(df: DuckDBPyRelation) -> DuckDBPyRelation: return df diff --git a/fugue_duckdb/execution_engine.py b/fugue_duckdb/execution_engine.py index 05456f43..5704e2f5 100644 --- a/fugue_duckdb/execution_engine.py +++ b/fugue_duckdb/execution_engine.py @@ -135,6 +135,9 @@ def create_default_sql_engine(self) -> SQLEngine: def create_default_map_engine(self) -> MapEngine: return PandasMapEngine(self._native_engine) + def get_current_parallelism(self) -> int: + return 1 + def to_df(self, df: Any, schema: Any = None) -> DataFrame: return self._to_duck_df(df, schema=schema) diff --git a/fugue_ibis/execution_engine.py b/fugue_ibis/execution_engine.py index 244cc4f6..a4a74cbb 100644 --- a/fugue_ibis/execution_engine.py +++ b/fugue_ibis/execution_engine.py @@ -52,6 +52,9 @@ class IbisExecutionEngine(ExecutionEngine): def create_default_sql_engine(self) -> SQLEngine: return IbisSQLEngine(self) + def get_current_parallelism(self) -> int: + return 1 + @property def backend(self) -> BaseBackend: # pragma: no cover raise NotImplementedError diff --git a/fugue_ray/_constants.py b/fugue_ray/_constants.py index d94eede5..2aa1738c 100644 --- a/fugue_ray/_constants.py +++ b/fugue_ray/_constants.py @@ -1,5 +1,9 @@ from typing import Dict, Any FUGUE_RAY_CONF_SHUFFLE_PARTITIONS = "fugue.ray.shuffle.partitions" +FUGUE_RAY_DEFAULT_PARTITIONS = "fugue.ray.default.partitions" -FUGUE_RAY_DEFAULT_CONF: Dict[str, Any] = {FUGUE_RAY_CONF_SHUFFLE_PARTITIONS: -1} +FUGUE_RAY_DEFAULT_CONF: Dict[str, Any] = { + FUGUE_RAY_CONF_SHUFFLE_PARTITIONS: -1, + FUGUE_RAY_DEFAULT_PARTITIONS: 0, +} diff --git a/fugue_ray/_utils/cluster.py b/fugue_ray/_utils/cluster.py new file mode 100644 index 00000000..06d35d37 --- /dev/null +++ b/fugue_ray/_utils/cluster.py @@ -0,0 +1,16 @@ +from fugue import ExecutionEngine + +from .._constants import FUGUE_RAY_CONF_SHUFFLE_PARTITIONS, FUGUE_RAY_DEFAULT_PARTITIONS +from fugue.constants import FUGUE_CONF_DEFAULT_PARTITIONS + + +def get_default_partitions(engine: ExecutionEngine) -> int: + n = engine.conf.get( + FUGUE_RAY_DEFAULT_PARTITIONS, engine.conf.get(FUGUE_CONF_DEFAULT_PARTITIONS, -1) + ) + return n if n >= 0 else engine.get_current_parallelism() * 2 + + +def get_default_shuffle_partitions(engine: 
ExecutionEngine) -> int: + n = engine.conf.get(FUGUE_RAY_CONF_SHUFFLE_PARTITIONS, -1) + return n if n >= 0 else get_default_partitions(engine) diff --git a/fugue_ray/dataframe.py b/fugue_ray/dataframe.py index 974fd383..dd3d19c0 100644 --- a/fugue_ray/dataframe.py +++ b/fugue_ray/dataframe.py @@ -14,7 +14,13 @@ ) from fugue.dataframe.dataframe import _input_schema from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError -from fugue.plugins import get_column_names, rename, is_df, as_local_bounded +from fugue.plugins import ( + as_local_bounded, + get_column_names, + get_num_partitions, + is_df, + rename, +) from ._utils.dataframe import build_empty, get_dataset_format @@ -129,7 +135,7 @@ def empty(self) -> bool: @property def num_partitions(self) -> int: - return self.native.num_blocks() + return _rd_num_partitions(self.native) def _drop_cols(self, cols: List[str]) -> DataFrame: cols = (self.schema - cols).names @@ -243,6 +249,11 @@ def _rd_is_df(df: rd.Dataset) -> bool: return True +@get_num_partitions.candidate(lambda df: isinstance(df, rd.Dataset)) +def _rd_num_partitions(df: rd.Dataset) -> int: + return df.num_blocks() + + @as_local_bounded.candidate(lambda df: isinstance(df, rd.Dataset)) def _rd_as_local(df: rd.Dataset) -> bool: return pa.concat_tables(_get_arrow_tables(df)) diff --git a/fugue_ray/execution_engine.py b/fugue_ray/execution_engine.py index 304db4c3..e16a7733 100644 --- a/fugue_ray/execution_engine.py +++ b/fugue_ray/execution_engine.py @@ -1,6 +1,7 @@ from typing import Any, Callable, Dict, List, Optional, Union import pyarrow as pa +import ray from duckdb import DuckDBPyConnection, DuckDBPyRelation from triad import Schema, assert_or_throw, to_uuid from triad.utils.threading import RunOnce @@ -18,7 +19,7 @@ from fugue_duckdb.dataframe import DuckDataFrame from fugue_duckdb.execution_engine import DuckExecutionEngine -from ._constants import FUGUE_RAY_CONF_SHUFFLE_PARTITIONS +from ._utils.cluster import get_default_partitions, get_default_shuffle_partitions from ._utils.dataframe import add_partition_key from ._utils.io import RayIO from .dataframe import RayDataFrame @@ -94,12 +95,15 @@ def _udf(adf: pa.Table) -> pa.Table: # pragma: no cover output_df = map_func(cursor, input_df) return output_df.as_arrow() - _df = self.execution_engine._to_ray_df(df) # type: ignore + _df: RayDataFrame = self.execution_engine._to_ray_df(df) # type: ignore if partition_spec.num_partitions != "0": _df = self.execution_engine.repartition(_df, partition_spec) # type: ignore else: - n = self.execution_engine.conf.get(FUGUE_RAY_CONF_SHUFFLE_PARTITIONS, -1) - if n > 1: + n = get_default_shuffle_partitions(self.execution_engine) + if n > 0 and n != _df.num_partitions: + # if n==0 or same as the current dataframe partitions + # then no repartition will be done by fugue + # otherwise, repartition the dataset _df = self.execution_engine.repartition( # type: ignore _df, PartitionSpec(num=n) ) @@ -152,6 +156,15 @@ def _udf(adf: pa.Table) -> pa.Table: # pragma: no cover rdf = self.execution_engine.repartition( # type: ignore rdf, partition_spec=partition_spec ) + elif rdf.num_partitions <= 1: + n = get_default_partitions(self.execution_engine) + if n > 0 and n != rdf.num_partitions: + # if n==0 or same as the current dataframe partitions + # then no repartition will be done by fugue + # otherwise, repartition the dataset + rdf = self.execution_engine.repartition( # type: ignore + rdf, PartitionSpec(num=n) + ) sdf = rdf.native.map_batches( _udf, batch_format="pyarrow", @@ 
-171,12 +184,24 @@ class RayExecutionEngine(DuckExecutionEngine): def __init__( self, conf: Any = None, connection: Optional[DuckDBPyConnection] = None ): + if not ray.is_initialized(): # pragma: no cover + ray.init() super().__init__(conf, connection) self._io = RayIO(self) + def __repr__(self) -> str: + return "RayExecutionEngine" + def create_default_map_engine(self) -> MapEngine: return RayMapEngine(self) + def get_current_parallelism(self) -> int: + res = ray.cluster_resources() + n = res.get("CPU", 0) + if n == 0: # pragma: no cover + res.get("cpu", 0) + return int(n) + def to_df(self, df: Any, schema: Any = None) -> DataFrame: return self._to_ray_df(df, schema=schema) @@ -189,17 +214,15 @@ def _persist_and_count(df: RayDataFrame) -> int: num_funcs = {KEYWORD_ROWCOUNT: lambda: _persist_and_count(rdf)} num = partition_spec.get_num_partitions(**num_funcs) + pdf = rdf.native - if partition_spec.algo in ["hash", "even"]: - pdf = rdf.native - if num > 0: + if num > 0: + if partition_spec.algo in ["hash", "even"]: pdf = pdf.repartition(num) - elif partition_spec.algo == "rand": - pdf = rdf.native - if num > 0: + elif partition_spec.algo == "rand": pdf = pdf.repartition(num, shuffle=True) - else: # pragma: no cover - raise NotImplementedError(partition_spec.algo + " is not supported") + else: # pragma: no cover + raise NotImplementedError(partition_spec.algo + " is not supported") return RayDataFrame(pdf, schema=rdf.schema, internal_schema=True) def broadcast(self, df: DataFrame) -> DataFrame: diff --git a/fugue_spark/dataframe.py b/fugue_spark/dataframe.py index fea74d06..574c9cc0 100644 --- a/fugue_spark/dataframe.py +++ b/fugue_spark/dataframe.py @@ -22,6 +22,7 @@ count, drop_columns, get_column_names, + get_num_partitions, head, is_bounded, is_df, @@ -66,6 +67,10 @@ def __init__(self, df: Any = None, schema: Any = None): # noqa: C901 schema = to_schema(schema).assert_not_empty() raise ValueError(f"{df} is incompatible with SparkDataFrame") + @property + def alias(self) -> str: + return "_" + str(id(self.native)) + @property def native(self) -> ps.DataFrame: """The wrapped Spark DataFrame @@ -97,7 +102,7 @@ def as_local(self) -> LocalDataFrame: @property def num_partitions(self) -> int: - return self.native.rdd.getNumPartitions() + return _spark_num_partitions(self.native) @property def empty(self) -> bool: @@ -183,6 +188,11 @@ def _spark_is_df(df: ps.DataFrame) -> bool: return True +@get_num_partitions.candidate(lambda df: isinstance(df, ps.DataFrame)) +def _spark_num_partitions(df: ps.DataFrame) -> int: + return df.rdd.getNumPartitions() + + @count.candidate(lambda df: isinstance(df, ps.DataFrame)) def _spark_df_count(df: ps.DataFrame) -> int: return df.count() diff --git a/fugue_spark/execution_engine.py b/fugue_spark/execution_engine.py index 2f4e8188..2c0d45d9 100644 --- a/fugue_spark/execution_engine.py +++ b/fugue_spark/execution_engine.py @@ -17,6 +17,7 @@ from triad.utils.iter import EmptyAwareIterable from triad.utils.pandas_like import PD_UTILS from triad.utils.threading import RunOnce +from triad import SerializableRLock from fugue.collections.partition import ( PartitionCursor, @@ -78,10 +79,13 @@ def __init__(self, execution_engine: ExecutionEngine): super().__init__(execution_engine) def select(self, dfs: DataFrames, statement: List[Tuple[bool, str]]) -> DataFrame: + _map: Dict[str, str] = {} for k, v in dfs.items(): - self.execution_engine.register(v, k) # type: ignore + df = self.execution_engine._to_spark_df(v, create_view=True) # type: ignore + _map[k] = df.alias + 
_sql = " ".join(_map.get(p[1], p[1]) if p[0] else p[1] for p in statement) return SparkDataFrame( - self.execution_engine.spark_session.sql(statement) # type: ignore + self.execution_engine.spark_session.sql(_sql) # type: ignore ) @@ -258,16 +262,15 @@ def __init__(self, spark_session: Optional[SparkSession] = None, conf: Any = Non cf.update({x[0]: x[1] for x in spark_session.sparkContext.getConf().getAll()}) cf.update(ParamDict(conf)) super().__init__(cf) + self._lock = SerializableRLock() self._fs = FileSystem() self._log = logging.getLogger() self._broadcast_func = RunOnce( self._broadcast, lambda *args, **kwargs: id(args[0]) ) self._persist_func = RunOnce(self._persist, lambda *args, **kwargs: id(args[0])) - self._register_func = RunOnce( - self._register, lambda *args, **kwargs: (id(args[0]), id(args[1])) - ) self._io = SparkIO(self.spark_session, self.fs) + self._registered_dfs: Dict[str, SparkDataFrame] = {} def __repr__(self) -> str: return "SparkExecutionEngine" @@ -297,6 +300,15 @@ def create_default_sql_engine(self) -> SQLEngine: def create_default_map_engine(self) -> MapEngine: return SparkMapEngine(self) + def get_current_parallelism(self) -> int: + spark = self.spark_session + e_cores = int(spark.conf.get("spark.executor.cores", "1")) + tc = int(spark.conf.get("spark.task.cpus", "1")) + sc = spark._jsc.sc() + nodes = len(list(sc.statusTracker().getExecutorInfos())) + workers = 1 if nodes <= 1 else nodes - 1 + return max(workers * (e_cores // tc), 1) + def to_df(self, df: Any, schema: Any = None) -> SparkDataFrame: # noqa: C901 """Convert a data structure to :class:`~fugue_spark.dataframe.SparkDataFrame` @@ -318,76 +330,14 @@ def to_df(self, df: Any, schema: Any = None) -> SparkDataFrame: # noqa: C901 * all other methods in the engine can take arbitrary dataframes and call this method to convert before doing anything """ - if isinstance(df, DataFrame): - assert_or_throw( - schema is None, - ValueError("schema must be None when df is a DataFrame"), - ) - if isinstance(df, SparkDataFrame): - return df - if isinstance(df, ArrowDataFrame): - sdf = self.spark_session.createDataFrame( - df.as_array(), to_spark_schema(df.schema) - ) - return SparkDataFrame(sdf, df.schema) - if isinstance(df, (ArrayDataFrame, IterableDataFrame)): - adf = ArrowDataFrame(df.as_array(type_safe=False), df.schema) - sdf = self.spark_session.createDataFrame( - adf.as_array(), to_spark_schema(df.schema) - ) - return SparkDataFrame(sdf, df.schema) - if any(pa.types.is_struct(t) for t in df.schema.types): - sdf = self.spark_session.createDataFrame( - df.as_array(type_safe=True), to_spark_schema(df.schema) - ) - else: - sdf = self.spark_session.createDataFrame( - df.as_pandas(), to_spark_schema(df.schema) - ) - return SparkDataFrame(sdf, df.schema) - if isinstance(df, ps.DataFrame): - return SparkDataFrame(df, None if schema is None else to_schema(schema)) - if isinstance(df, RDD): - assert_arg_not_none(schema, "schema") - sdf = self.spark_session.createDataFrame(df, to_spark_schema(schema)) - return SparkDataFrame(sdf, to_schema(schema)) - if isinstance(df, pd.DataFrame): - if PD_UTILS.empty(df): - temp_schema = to_spark_schema(PD_UTILS.to_schema(df)) - sdf = self.spark_session.createDataFrame([], temp_schema) - else: - sdf = self.spark_session.createDataFrame(df) - return SparkDataFrame(sdf, schema) - - # use arrow dataframe here to handle nulls in int cols - assert_or_throw( - schema is not None, FugueDataFrameInitError("schema can't be None") - ) - adf = ArrowDataFrame(df, to_schema(schema)) - map_pos = [i 
for i, t in enumerate(adf.schema.types) if pa.types.is_map(t)] - if len(map_pos) == 0: - sdf = self.spark_session.createDataFrame( - adf.as_array(), to_spark_schema(adf.schema) - ) - else: - - def to_dict(rows: Iterable[List[Any]]) -> Iterable[List[Any]]: - for row in rows: - for p in map_pos: - row[p] = dict(row[p]) - yield row - - sdf = self.spark_session.createDataFrame( - to_dict(adf.as_array_iterable()), to_spark_schema(adf.schema) - ) - return SparkDataFrame(sdf, adf.schema) + return self._to_spark_df(df, schema=schema) def repartition(self, df: DataFrame, partition_spec: PartitionSpec) -> DataFrame: def _persist_and_count(df: DataFrame) -> int: df = self.persist(df) return df.count() - df = self.to_df(df) + df = self._to_spark_df(df) num_funcs = {KEYWORD_ROWCOUNT: lambda: _persist_and_count(df)} num = partition_spec.get_num_partitions(**num_funcs) @@ -411,10 +361,10 @@ def _persist_and_count(df: DataFrame) -> int: sdf = sdf.sortWithinPartitions( *sorts.keys(), ascending=list(sorts.values()) ) - return self.to_df(sdf, df.schema) + return self._to_spark_df(sdf, df.schema) def broadcast(self, df: DataFrame) -> SparkDataFrame: - res = self._broadcast_func(self.to_df(df)) + res = self._broadcast_func(self._to_spark_df(df)) res.reset_metadata(df.metadata) return res @@ -425,13 +375,15 @@ def persist( **kwargs: Any, ) -> SparkDataFrame: res = self._persist_func( - self.to_df(df), lazy=lazy, level=kwargs.get("level", None) + self._to_spark_df(df), lazy=lazy, level=kwargs.get("level", None) ) res.reset_metadata(df.metadata) return res def register(self, df: DataFrame, name: str) -> SparkDataFrame: - return self._register_func(self.to_df(df), name) + sdf = self._to_spark_df(df) + sdf.native.createOrReplaceTempView(name) + return sdf def join( self, @@ -447,14 +399,14 @@ def join( ValueError(f"{how} is not supported as a join type"), ) how = _TO_SPARK_JOIN_MAP[how] - d1 = self.to_df(df1).native - d2 = self.to_df(df2).native + d1 = self._to_spark_df(df1).native + d2 = self._to_spark_df(df2).native cols = [col(n) for n in output_schema.names] if how == "cross": res = d1.crossJoin(d2).select(*cols) else: res = d1.join(d2, on=key_schema.names, how=how).select(*cols) - return self.to_df(res, output_schema) + return self._to_spark_df(res, output_schema) def union( self, @@ -466,12 +418,12 @@ def union( df1.schema == df2.schema, lambda: ValueError(f"{df1.schema} != {df2.schema}"), ) - d1 = self.to_df(df1).native - d2 = self.to_df(df2).native + d1 = self._to_spark_df(df1).native + d2 = self._to_spark_df(df2).native d = d1.union(d2) if distinct: d = d.distinct() - return self.to_df(d, df1.schema) + return self._to_spark_df(d, df1.schema) def subtract( self, df1: DataFrame, df2: DataFrame, distinct: bool = True @@ -480,13 +432,13 @@ def subtract( df1.schema == df2.schema, lambda: ValueError(f"{df1.schema} != {df2.schema}"), ) - d1 = self.to_df(df1).native - d2 = self.to_df(df2).native + d1 = self._to_spark_df(df1).native + d2 = self._to_spark_df(df2).native if distinct: d: Any = d1.subtract(d2) else: # pragma: no cover d = d1.exceptAll(d2) - return self.to_df(d, df1.schema) + return self._to_spark_df(d, df1.schema) def intersect( self, df1: DataFrame, df2: DataFrame, distinct: bool = True @@ -495,17 +447,17 @@ def intersect( df1.schema == df2.schema, lambda: ValueError(f"{df1.schema} != {df2.schema}"), ) - d1 = self.to_df(df1).native - d2 = self.to_df(df2).native + d1 = self._to_spark_df(df1).native + d2 = self._to_spark_df(df2).native if distinct: d: Any = d1.intersect(d2) else: # pragma: no cover d = 
d1.intersectAll(d2) - return self.to_df(d, df1.schema) + return self._to_spark_df(d, df1.schema) def distinct(self, df: DataFrame) -> DataFrame: - d = self.to_df(df).native.distinct() - return self.to_df(d, df.schema) + d = self._to_spark_df(df).native.distinct() + return self._to_spark_df(d, df.schema) def dropna( self, @@ -514,8 +466,8 @@ def dropna( thresh: int = None, subset: List[str] = None, ) -> DataFrame: - d = self.to_df(df).native.dropna(how=how, thresh=thresh, subset=subset) - return self.to_df(d, df.schema) + d = self._to_spark_df(df).native.dropna(how=how, thresh=thresh, subset=subset) + return self._to_spark_df(d, df.schema) def fillna(self, df: DataFrame, value: Any, subset: List[str] = None) -> DataFrame: assert_or_throw( @@ -534,8 +486,8 @@ def fillna(self, df: DataFrame, value: Any, subset: List[str] = None) -> DataFra # If subset is none, apply to all columns subset = subset or df.schema.names mapping = {col: value for col in subset} - d = self.to_df(df).native.fillna(mapping) - return self.to_df(d, df.schema) + d = self._to_spark_df(df).native.fillna(mapping) + return self._to_spark_df(d, df.schema) def sample( self, @@ -550,10 +502,10 @@ def sample( ValueError("one and only one of n and frac should be set"), ) if frac is not None: - d = self.to_df(df).native.sample( + d = self._to_spark_df(df).native.sample( fraction=frac, withReplacement=replace, seed=seed ) - return self.to_df(d, df.schema) + return self._to_spark_df(d, df.schema) else: assert_or_throw( seed is None, @@ -566,11 +518,11 @@ def sample( ), ) temp_name = "__temp_" + str(uuid4()).split("-")[-1] - self.to_df(df).native.createOrReplaceTempView(temp_name) + self._to_spark_df(df).native.createOrReplaceTempView(temp_name) d = self.spark_session.sql( f"SELECT * FROM {temp_name} TABLESAMPLE({n} ROWS)" ) - return self.to_df(d, df.schema) + return self._to_spark_df(d, df.schema) def take( self, @@ -585,7 +537,7 @@ def take( isinstance(n, int), ValueError("n needs to be an integer"), ) - d = self.to_df(df).native + d = self._to_spark_df(df).native nulls_last = bool(na_position == "last") if presort: @@ -631,7 +583,7 @@ def _presort_to_col(_col: str, _asc: bool) -> Any: .drop("__row_number__") ) - return self.to_df(d, df.schema) + return self._to_spark_df(d, df.schema) def load_df( self, @@ -655,7 +607,7 @@ def save_df( **kwargs: Any, ) -> None: partition_spec = partition_spec or PartitionSpec() - df = self.to_df(df) + df = self._to_spark_df(df) self._io.save_df( df, uri=path, @@ -683,9 +635,85 @@ def _persist(self, df: SparkDataFrame, lazy: bool, level: Any) -> SparkDataFrame return df raise ValueError(f"{level} is not supported persist type") # pragma: no cover - def _register(self, df: SparkDataFrame, name: str) -> SparkDataFrame: - df.native.createOrReplaceTempView(name) - return df + def _to_spark_df( # noqa: C901 + self, df: Any, schema: Any = None, create_view: bool = False + ) -> SparkDataFrame: + def _to_df() -> SparkDataFrame: + if isinstance(df, DataFrame): + assert_or_throw( + schema is None, + ValueError("schema must be None when df is a DataFrame"), + ) + if isinstance(df, SparkDataFrame): + return df + if isinstance(df, ArrowDataFrame): + sdf = self.spark_session.createDataFrame( + df.as_array(), to_spark_schema(df.schema) + ) + return SparkDataFrame(sdf, df.schema) + if isinstance(df, (ArrayDataFrame, IterableDataFrame)): + adf = ArrowDataFrame(df.as_array(type_safe=False), df.schema) + sdf = self.spark_session.createDataFrame( + adf.as_array(), to_spark_schema(df.schema) + ) + return 
SparkDataFrame(sdf, df.schema) + if any(pa.types.is_struct(t) for t in df.schema.types): + sdf = self.spark_session.createDataFrame( + df.as_array(type_safe=True), to_spark_schema(df.schema) + ) + else: + sdf = self.spark_session.createDataFrame( + df.as_pandas(), to_spark_schema(df.schema) + ) + return SparkDataFrame(sdf, df.schema) + if isinstance(df, ps.DataFrame): + return SparkDataFrame(df, None if schema is None else to_schema(schema)) + if isinstance(df, RDD): + assert_arg_not_none(schema, "schema") + sdf = self.spark_session.createDataFrame(df, to_spark_schema(schema)) + return SparkDataFrame(sdf, to_schema(schema)) + if isinstance(df, pd.DataFrame): + if PD_UTILS.empty(df): + temp_schema = to_spark_schema(PD_UTILS.to_schema(df)) + sdf = self.spark_session.createDataFrame([], temp_schema) + else: + sdf = self.spark_session.createDataFrame(df) + return SparkDataFrame(sdf, schema) + + # use arrow dataframe here to handle nulls in int cols + assert_or_throw( + schema is not None, FugueDataFrameInitError("schema can't be None") + ) + adf = ArrowDataFrame(df, to_schema(schema)) + map_pos = [i for i, t in enumerate(adf.schema.types) if pa.types.is_map(t)] + if len(map_pos) == 0: + sdf = self.spark_session.createDataFrame( + adf.as_array(), to_spark_schema(adf.schema) + ) + else: + + def to_dict(rows: Iterable[List[Any]]) -> Iterable[List[Any]]: + for row in rows: + for p in map_pos: + row[p] = dict(row[p]) + yield row + + sdf = self.spark_session.createDataFrame( + to_dict(adf.as_array_iterable()), to_spark_schema(adf.schema) + ) + return SparkDataFrame(sdf, adf.schema) + + res = _to_df() + if res is not df and isinstance(df, DataFrame) and df.has_metadata: + res.reset_metadata(df.metadata) + + if create_view: + with self._lock: + if res.alias not in self._registered_dfs: + res.native.createOrReplaceTempView(res.alias) + self._registered_dfs[res.alias] = res + + return res class _Mapper(object): # pragma: no cover diff --git a/fugue_test/execution_suite.py b/fugue_test/execution_suite.py index 6150ba98..2787e33d 100644 --- a/fugue_test/execution_suite.py +++ b/fugue_test/execution_suite.py @@ -61,6 +61,9 @@ def test_init(self): assert copy.copy(self.engine) is self.engine assert copy.deepcopy(self.engine) is self.engine + def test_get_parallelism(self): + assert fa.get_current_parallelism(self.engine) == 1 + def test_to_df_general(self): e = self.engine o = ArrayDataFrame( diff --git a/tests/fugue_dask/test_execution_engine.py b/tests/fugue_dask/test_execution_engine.py index 2273eb00..abcfdfb9 100644 --- a/tests/fugue_dask/test_execution_engine.py +++ b/tests/fugue_dask/test_execution_engine.py @@ -38,9 +38,13 @@ def tearDownClass(cls): cls._engine.dask_client.close() def make_engine(self): - e = DaskExecutionEngine(conf=dict(test=True, **_CONF)) + client = Client(processes=True, n_workers=3, threads_per_worker=1) + e = DaskExecutionEngine(client, conf=dict(test=True, **_CONF)) return e + def test_get_parallelism(self): + assert fa.get_current_parallelism(self.engine) == 3 + def test__join_outer_pandas_incompatible(self): return diff --git a/tests/fugue_duckdb/test_dask.py b/tests/fugue_duckdb/test_dask.py index 747eb009..d498dce8 100644 --- a/tests/fugue_duckdb/test_dask.py +++ b/tests/fugue_duckdb/test_dask.py @@ -38,13 +38,17 @@ def tearDownClass(cls): cls._engine.dask_client.close() def make_engine(self): + client = Client(processes=True, n_workers=2, threads_per_worker=1) e = DuckDaskExecutionEngine( conf={"test": True, "fugue.duckdb.pragma.threads": 2}, connection=self._con, - 
dask_client=Client(), + dask_client=client, ) return e + def test_get_parallelism(self): + assert fa.get_current_parallelism(self.engine) == 2 + def test_to_df_dask(self): pdf = pd.DataFrame([[1.1]], columns=["a"]) df = dd.from_pandas(pdf, npartitions=2) diff --git a/tests/fugue_ray/test_execution_engine.py b/tests/fugue_ray/test_execution_engine.py index be91014e..9008980a 100644 --- a/tests/fugue_ray/test_execution_engine.py +++ b/tests/fugue_ray/test_execution_engine.py @@ -44,6 +44,9 @@ def make_engine(self): ) return e + def test_get_parallelism(self): + assert fa.get_current_parallelism(self.engine) == 2 + def test_repartitioning(self): # schema: * def t(df: pd.DataFrame) -> pd.DataFrame: diff --git a/tests/fugue_spark/test_execution_engine.py b/tests/fugue_spark/test_execution_engine.py index e4a4b80a..b4fff0ec 100644 --- a/tests/fugue_spark/test_execution_engine.py +++ b/tests/fugue_spark/test_execution_engine.py @@ -12,6 +12,7 @@ from pytest import raises from triad import Schema +import fugue.api as fa from fugue import transform from fugue.collections.partition import PartitionSpec from fugue.dataframe import ( @@ -42,6 +43,9 @@ def make_engine(self): ) return e + def test_get_parallelism(self): + assert fa.get_current_parallelism(self.engine) == 4 + def test_not_using_pandas_udf(self): assert not self.engine.create_default_map_engine()._should_use_pandas_udf( Schema("a:int") From f0f04682253fa1976becd48379a7c8ee8c49cb6f Mon Sep 17 00:00:00 2001 From: Han Wang Date: Thu, 29 Dec 2022 07:41:45 +0000 Subject: [PATCH 23/30] update docs --- fugue/__init__.py | 2 +- fugue/dataframe/api.py | 10 +- fugue/sql/api.py | 125 ++++++++++++++--- fugue/workflow/api.py | 152 ++++++++++++--------- tests/fugue_spark/test_execution_engine.py | 3 + 5 files changed, 204 insertions(+), 88 deletions(-) diff --git a/fugue/__init__.py b/fugue/__init__.py index dd5b389d..9d10aab1 100644 --- a/fugue/__init__.py +++ b/fugue/__init__.py @@ -76,7 +76,7 @@ make_rpc_server, to_rpc_handler, ) -from fugue.sql.api import fsql +from fugue.sql.api import fugue_sql_flow as fsql from fugue.sql.workflow import FugueSQLWorkflow from fugue.workflow._workflow_context import FugueWorkflowContext from fugue.workflow.module import module diff --git a/fugue/dataframe/api.py b/fugue/dataframe/api.py index 481f98d8..265af619 100644 --- a/fugue/dataframe/api.py +++ b/fugue/dataframe/api.py @@ -171,7 +171,7 @@ def alter_columns( :return: a new dataframe with altered columns, the order of the original schema will not change """ - return _adjust_df(df, as_fugue_df(df).alter_columns(columns), as_fugue=as_fugue) + return _convert_df(df, as_fugue_df(df).alter_columns(columns), as_fugue=as_fugue) @fugue_plugin @@ -187,7 +187,7 @@ def drop_columns( then it will return the underlying DataFrame object. :return: a new dataframe removing the columns """ - return _adjust_df(df, as_fugue_df(df).drop(columns), as_fugue=as_fugue) + return _convert_df(df, as_fugue_df(df).drop(columns), as_fugue=as_fugue) @fugue_plugin @@ -203,7 +203,7 @@ def select_columns( then it will return the underlying DataFrame object. 
:return: a new dataframe with the selected the columns """ - return _adjust_df(df, as_fugue_df(df)[columns], as_fugue=as_fugue) + return _convert_df(df, as_fugue_df(df)[columns], as_fugue=as_fugue) @fugue_plugin @@ -259,7 +259,7 @@ def _rename_pandas_dataframe( """ if len(columns) == 0: return df - return _adjust_df(df, as_fugue_df(df).rename(columns), as_fugue=as_fugue) + return _convert_df(df, as_fugue_df(df).rename(columns), as_fugue=as_fugue) def normalize_column_names(df: AnyDataFrame) -> Tuple[AnyDataFrame, Dict[str, Any]]: @@ -296,7 +296,7 @@ def normalize_column_names(df: AnyDataFrame) -> Tuple[AnyDataFrame, Dict[str, An return (rename(df, names), undo) -def _adjust_df( +def _convert_df( input_df: AnyDataFrame, output_df: DataFrame, as_fugue: bool ) -> AnyDataFrame: if as_fugue or isinstance(input_df, DataFrame): diff --git a/fugue/sql/api.py b/fugue/sql/api.py index 8aadb948..28d900e9 100644 --- a/fugue/sql/api.py +++ b/fugue/sql/api.py @@ -2,27 +2,14 @@ from triad.utils.convert import get_caller_global_local_vars -from fugue.dataframe import DataFrame +from fugue.dataframe import AnyDataFrame from fugue.exceptions import FugueSQLError from fugue.execution import AnyExecutionEngine -from fugue.workflow.workflow import FugueWorkflowResult from ..constants import FUGUE_CONF_SQL_IGNORE_CASE from .workflow import FugueSQLWorkflow -def fugue_sql_flow( - query: str, - *args: Any, - fsql_ignore_case: bool = False, - engine: AnyExecutionEngine = None, - engine_conf: Any = None, - **kwargs: Any, -) -> FugueWorkflowResult: - dag = _build_dag(query, fsql_ignore_case=fsql_ignore_case, args=args, kwargs=kwargs) - return dag.run(engine, engine_conf) - - def fugue_sql( query: str, *args: Any, @@ -32,7 +19,71 @@ def fugue_sql( as_fugue: bool = False, as_local: bool = False, **kwargs: Any, -) -> DataFrame: +) -> AnyDataFrame: + """Simplified Fugue SQL interface. This function can still take multiple dataframe + inputs but will always return the last generated dataframe in the SQL workflow. And + ``YIELD`` should NOT be used with this function. If you want to use Fugue SQL to + represent the full workflow, or want to see more Fugue SQL examples, + please read :func:`~.fugue_sql_flow`. + + :param query: the Fugue SQL string (can be a jinja template) + :param args: variables related to the SQL string + :param fsql_ignore_case: whether to ignore case when parsing the SQL string + defaults to False. + :param kwargs: variables related to the SQL string + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether return a local dataframe, defaults to False + + :return: the result dataframe + + .. note:: + + This function is different from :func:`~fugue.api.raw_sql` which directly + sends the query to the execution engine to run. This function parses the query + based on Fugue SQL syntax, creates a + :class:`~fugue.sql.workflow.FugueSQLWorkflow` which + could contain multiple raw SQLs plus other operations, and runs and returns + the last dataframe generated in the workflow. + + This function allows you to parameterize the SQL in a more elegant way. The + data tables referred in the query can either be automatically extracted from the + local variables or be specified in the arguments. + + .. caution:: + + Currently, we have not unified the dialects of different SQL backends. 
So there + can be some slight syntax differences when you switch between backends. + In addition, we have not unified the UDFs cross different backends, so you + should be careful to use uncommon UDFs belonging to a certain backend. + + That being said, if you keep your SQL part general and leverage Fugue extensions + (transformer, creator, processor, outputter, etc.) appropriately, it should be + easy to write backend agnostic Fugue SQL. + + We are working on unifying the dialects of different SQLs, it should be + available in the future releases. Regarding unifying UDFs, the effort is still + unclear. + + .. code-block:: python + + import pandas as pd + import fugue.api as fa + + def tr(df:pd.DataFrame) -> pd.DataFrame: + return df.assign(c=2) + + input = pd.DataFrame([[0,1],[3.4]], columns=["a","b"]) + + with fa.engine_context("duckdb"): + res = fa.fugue_sql(''' + SELECT * FROM input WHERE a<{{x}} + TRANSFORM USING tr SCHEMA *,c:int + ''', x=2) + assert fa.as_array(res) == [[0,1,2]] + """ + dag = _build_dag(query, fsql_ignore_case=fsql_ignore_case, args=args, kwargs=kwargs) if dag.last_df is not None: dag.last_df.yield_dataframe_as("result", as_local=as_local) @@ -43,10 +94,11 @@ def fugue_sql( return res["result"] if as_fugue else res["result"].native_as_df() -def fsql( +def fugue_sql_flow( query: str, *args: Any, fsql_ignore_case: bool = False, **kwargs: Any ) -> FugueSQLWorkflow: - """Fugue SQL functional interface + """Fugue SQL full functional interface. This function allows full workflow + definition using Fugue SQL, and it allows multiple outputs using ``YIELD``. :param query: the Fugue SQL string (can be a jinja template) :param args: variables related to the SQL string @@ -55,8 +107,39 @@ def fsql( :param kwargs: variables related to the SQL string :return: the translated Fugue workflow + .. note:: + + This function is different from :func:`~fugue.api.raw_sql` which directly + sends the query to the execution engine to run. This function parses the query + based on Fugue SQL syntax, creates a + :class:`~fugue.sql.workflow.FugueSQLWorkflow` which + could contain multiple raw SQLs plus other operations, and runs and returns + the last dataframe generated in the workflow. + + This function allows you to parameterize the SQL in a more elegant way. The + data tables referred in the query can either be automatically extracted from the + local variables or be specified in the arguments. + + .. caution:: + + Currently, we have not unified the dialects of different SQL backends. So there + can be some slight syntax differences when you switch between backends. + In addition, we have not unified the UDFs cross different backends, so you + should be careful to use uncommon UDFs belonging to a certain backend. + + That being said, if you keep your SQL part general and leverage Fugue extensions + (transformer, creator, processor, outputter, etc.) appropriately, it should be + easy to write backend agnostic Fugue SQL. + + We are working on unifying the dialects of different SQLs, it should be + available in the future releases. Regarding unifying UDFs, the effort is still + unclear. + .. 
code-block:: python + import fugue.api.fugue_sql_flow as fsql + import fugue.api as fa + # Basic case fsql(''' CREATE [[0]] SCHEMA a:int @@ -108,9 +191,11 @@ def dummy(df:pd.DataFrame) -> pd.DataFrame: PRINT ''' - fsql(sql).run(user_defined_spark_session()) - fsql(sql).run(SparkExecutionEngine, {"spark.executor.instances":10}) - fsql(sql).run(DaskExecutionEngine) + fsql(sql).run(spark_session) + fsql(sql).run("dask") + + with fa.engine_context("duckdb"): + fsql(sql).run() # Passing dataframes between fsql calls result = fsql(''' diff --git a/fugue/workflow/api.py b/fugue/workflow/api.py index 16eebe4f..3a376692 100644 --- a/fugue/workflow/api.py +++ b/fugue/workflow/api.py @@ -4,7 +4,7 @@ from ..collections.yielded import Yielded from ..constants import FUGUE_CONF_WORKFLOW_EXCEPTION_INJECT -from ..dataframe import DataFrame +from ..dataframe import DataFrame, AnyDataFrame from ..dataframe.api import get_native_as_df from ..exceptions import FugueInterfacelessError, FugueWorkflowCompileError from ..execution import make_execution_engine @@ -58,76 +58,76 @@ def transform( # noqa: C901 Please read |TransformerTutorial| :param df: |DataFrameLikeObject| or :class:`~fugue.workflow.yielded.Yielded` - or a path string to a parquet file + or a path string to a parquet file :param using: transformer-like object, can't be a string expression :param schema: |SchemaLikeObject|, defaults to None. The transformer - will be able to access this value from - :meth:`~fugue.extensions.context.ExtensionContext.output_schema` + will be able to access this value from + :meth:`~fugue.extensions.context.ExtensionContext.output_schema` :param params: |ParamsLikeObject| to run the processor, defaults to None. - The transformer will be able to access this value from - :meth:`~fugue.extensions.context.ExtensionContext.params` + The transformer will be able to access this value from + :meth:`~fugue.extensions.context.ExtensionContext.params` :param partition: |PartitionLikeObject|, defaults to None :param callback: |RPCHandlerLikeObject|, defaults to None :param ignore_errors: list of exception types the transformer can ignore, - defaults to None (empty list) + defaults to None (empty list) :param engine: it can be empty string or null (use the default execution - engine), a string (use the registered execution engine), an - :class:`~fugue.execution.execution_engine.ExecutionEngine` type, or - the :class:`~fugue.execution.execution_engine.ExecutionEngine` instance - , or a tuple of two values where the first value represents execution - engine and the second value represents the sql engine (you can use ``None`` - for either of them to use the default one), defaults to None + engine), a string (use the registered execution engine), an + :class:`~fugue.execution.execution_engine.ExecutionEngine` type, or + the :class:`~fugue.execution.execution_engine.ExecutionEngine` instance + , or a tuple of two values where the first value represents execution + engine and the second value represents the sql engine (you can use ``None`` + for either of them to use the default one), defaults to None :param engine_conf: |ParamsLikeObject|, defaults to None :param as_fugue: If true, the function will always return - a ``FugueDataFrame``, otherwise, if ``df`` is in native dataframe types such - as pandas dataframe, then the output will also in its native format. Defaults - to False + a ``FugueDataFrame``, otherwise, if ``df`` is in native dataframe types such + as pandas dataframe, then the output will also in its native format. 
Defaults + to False :param persist: Whether to persist(materialize) the dataframe before returning :param as_local: If true, the result will be converted to a ``LocalDataFrame`` :param save_path: Whether to save the output to a file (see the note) :param checkpoint: Whether to add a checkpoint for the output (see the note) :return: the transformed dataframe, if ``df`` is a native dataframe (e.g. - pd.DataFrame, spark dataframe, etc), the output will be a native dataframe, - the type is determined by the execution engine you use. But if ``df`` is - of type :class:`~fugue.dataframe.dataframe.DataFrame`, then the output will - also be a :class:`~fugue.dataframe.dataframe.DataFrame` + pd.DataFrame, spark dataframe, etc), the output will be a native dataframe, + the type is determined by the execution engine you use. But if ``df`` is + of type :class:`~fugue.dataframe.dataframe.DataFrame`, then the output will + also be a :class:`~fugue.dataframe.dataframe.DataFrame` .. note:: - This function may be lazy and return the transformed dataframe. + This function may be lazy and return the transformed dataframe. .. note:: - When you use callback in this function, you must be careful that the output - dataframe must be materialized. Otherwise, if the real compute happens out of - the function call, the callback receiver is already shut down. To do that you - can either use ``persist`` or ``as_local``, both will materialize the dataframe - before the callback receiver shuts down. + When you use callback in this function, you must be careful that the output + dataframe must be materialized. Otherwise, if the real compute happens out of + the function call, the callback receiver is already shut down. To do that you + can either use ``persist`` or ``as_local``, both will materialize the dataframe + before the callback receiver shuts down. .. note:: - * When `save_path` is None and `checkpoint` is False, then the output will - not be saved into a file. The return will be a dataframe. - * When `save_path` is None and `checkpoint` is True, then the output will be - saved into the path set by `fugue.workflow.checkpoint.path`, the name will - be randomly chosen, and it is NOT a deterministic checkpoint, so if you run - multiple times, the output will be saved into different files. The return - will be a dataframe. - * When `save_path` is not None and `checkpoint` is False, then the output will - be saved into `save_path`. The return will be the value of `save_path` - * When `save_path` is not None and `checkpoint` is True, then the output will - be saved into `save_path`. The return will be the dataframe from `save_path` - - This function can only take parquet file paths in `df` and `save_path`. - Csv and other file formats are disallowed. - - The checkpoint here is NOT deterministic, so re-run will generate new - checkpoints. - - If you want to read and write other file formats or if you want to use - deterministic checkpoints, please use - :class:`~fugue.workflow.workflow.FugueWorkflow`. + * When `save_path` is None and `checkpoint` is False, then the output will + not be saved into a file. The return will be a dataframe. + * When `save_path` is None and `checkpoint` is True, then the output will be + saved into the path set by `fugue.workflow.checkpoint.path`, the name will + be randomly chosen, and it is NOT a deterministic checkpoint, so if you run + multiple times, the output will be saved into different files. The return + will be a dataframe. 
+ * When `save_path` is not None and `checkpoint` is False, then the output will + be saved into `save_path`. The return will be the value of `save_path` + * When `save_path` is not None and `checkpoint` is True, then the output will + be saved into `save_path`. The return will be the dataframe from `save_path` + + This function can only take parquet file paths in `df` and `save_path`. + Csv and other file formats are disallowed. + + The checkpoint here is NOT deterministic, so re-run will generate new + checkpoints. + + If you want to read and write other file formats or if you want to use + deterministic checkpoints, please use + :class:`~fugue.workflow.workflow.FugueWorkflow`. """ _check_valid_input(df, save_path) @@ -205,31 +205,31 @@ def out_transform( Please read |TransformerTutorial| :param df: |DataFrameLikeObject| or :class:`~fugue.workflow.yielded.Yielded` - or a path string to a parquet file + or a path string to a parquet file :param using: transformer-like object, can't be a string expression :param params: |ParamsLikeObject| to run the processor, defaults to None. - The transformer will be able to access this value from - :meth:`~fugue.extensions.context.ExtensionContext.params` + The transformer will be able to access this value from + :meth:`~fugue.extensions.context.ExtensionContext.params` :param partition: |PartitionLikeObject|, defaults to None. :param callback: |RPCHandlerLikeObject|, defaults to None :param ignore_errors: list of exception types the transformer can ignore, - defaults to None (empty list) + defaults to None (empty list) :param engine: it can be empty string or null (use the default execution - engine), a string (use the registered execution engine), an - :class:`~fugue.execution.execution_engine.ExecutionEngine` type, or - the :class:`~fugue.execution.execution_engine.ExecutionEngine` instance - , or a tuple of two values where the first value represents execution - engine and the second value represents the sql engine (you can use ``None`` - for either of them to use the default one), defaults to None + engine), a string (use the registered execution engine), an + :class:`~fugue.execution.execution_engine.ExecutionEngine` type, or + the :class:`~fugue.execution.execution_engine.ExecutionEngine` instance + , or a tuple of two values where the first value represents execution + engine and the second value represents the sql engine (you can use ``None`` + for either of them to use the default one), defaults to None :param engine_conf: |ParamsLikeObject|, defaults to None .. note:: - This function can only take parquet file paths in `df`. Csv and other file - formats are disallowed. + This function can only take parquet file paths in `df`. Csv and other file + formats are disallowed. 
- This transformation is guaranteed to execute immediately (eager) - and return nothing + This transformation is guaranteed to execute immediately (eager) + and return nothing """ dag = FugueWorkflow(compile_conf={FUGUE_CONF_WORKFLOW_EXCEPTION_INJECT: 0}) try: @@ -256,7 +256,35 @@ def raw_sql( engine_conf: Any = None, as_fugue: bool = False, as_local: bool = False, -): +) -> AnyDataFrame: + """Run raw SQL on the execution engine + + :param statements: a sequence of sub-statements in string + or dataframe-like objects + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether return a local dataframe, defaults to False + + :return: the result dataframe + + .. caution:: + + Currently, only ``SELECT`` statements are supported + + .. admonition:: Examples + + .. code-block:: python + + import pandas as pd + import fugue.api as fa + + with fa.engine_context("duckdb"): + a = fa.as_fugue_df([[0,1]], schema="a:long,b:long") + b = pd.DataFrame([[0,10]], columns=["a","b"]) + c = fa.raw_sql("SELECT * FROM",a,"UNION SELECT * FROM",b) + fa.as_pandas(c) + """ dag = FugueWorkflow(compile_conf={FUGUE_CONF_WORKFLOW_EXCEPTION_INJECT: 0}) sp: List[Any] = [] infer_by: List[Any] = [] diff --git a/tests/fugue_spark/test_execution_engine.py b/tests/fugue_spark/test_execution_engine.py index b4fff0ec..b488547d 100644 --- a/tests/fugue_spark/test_execution_engine.py +++ b/tests/fugue_spark/test_execution_engine.py @@ -126,6 +126,9 @@ def make_engine(self): e = SparkExecutionEngine(session, {"test": True}) return e + def test_get_parallelism(self): + assert fa.get_current_parallelism(self.engine) == 4 + def test__join_outer_pandas_incompatible(self): return From 6db7048c7d8c80e17338826e28e46f5870ff71b5 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Thu, 29 Dec 2022 07:47:22 +0000 Subject: [PATCH 24/30] fix tests --- fugue_test/builtin_suite.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fugue_test/builtin_suite.py b/fugue_test/builtin_suite.py index deb38680..849adebb 100644 --- a/fugue_test/builtin_suite.py +++ b/fugue_test/builtin_suite.py @@ -1598,7 +1598,7 @@ def tr(df: pd.DataFrame, n=1) -> pd.DataFrame: PRINT sdf1 """, x=sdf3, - ) + ).run() df_eq( res["res"], [[3, 4, 13]], From 51bd5eb8013caef0b66b595bf7f86f009f248fb1 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Thu, 29 Dec 2022 08:15:54 +0000 Subject: [PATCH 25/30] fix test coverage --- tests/fugue/dataframe/test_arrow_dataframe.py | 12 +++++------- tests/fugue/dataframe/test_pandas_dataframe.py | 15 +++++++++++---- tests/fugue_duckdb/test_dataframe.py | 11 ++++++----- 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/tests/fugue/dataframe/test_arrow_dataframe.py b/tests/fugue/dataframe/test_arrow_dataframe.py index 5b80c212..8cdfac34 100644 --- a/tests/fugue/dataframe/test_arrow_dataframe.py +++ b/tests/fugue/dataframe/test_arrow_dataframe.py @@ -1,16 +1,11 @@ -import json -from datetime import datetime from typing import Any -import numpy as np import pandas as pd import pyarrow as pa -from fugue.dataframe import ArrowDataFrame, PandasDataFrame -from fugue.dataframe.utils import _df_eq as df_eq +from fugue.dataframe import ArrowDataFrame from fugue_test.dataframe_suite import DataFrameTests from pytest import raises -from triad.collections.schema import Schema, SchemaError -from triad.exceptions import InvalidOperationError +import fugue.api as fa 
class ArrowDataFrameTests(DataFrameTests.Tests): @@ -25,6 +20,9 @@ def df(self, data: Any = None, schema: Any = None) -> pd.DataFrame: def to_native_df(self, pdf: pd.DataFrame) -> Any: # pragma: no cover return pa.Table.from_pandas(pdf) + def test_num_partitions(self): + assert fa.get_num_partitions(self.df([[0, 1]], "a:int,b:int")) == 1 + def test_init(): df = ArrowDataFrame(schema="a:str,b:int") diff --git a/tests/fugue/dataframe/test_pandas_dataframe.py b/tests/fugue/dataframe/test_pandas_dataframe.py index 80d38789..68c9cbc5 100644 --- a/tests/fugue/dataframe/test_pandas_dataframe.py +++ b/tests/fugue/dataframe/test_pandas_dataframe.py @@ -5,19 +5,23 @@ import numpy as np import pandas as pd -from fugue.dataframe import PandasDataFrame, ArrowDataFrame +from pytest import raises +from triad.collections.schema import Schema + +import fugue.api as fa +from fugue.dataframe import ArrowDataFrame, PandasDataFrame from fugue.dataframe.array_dataframe import ArrayDataFrame from fugue.dataframe.utils import _df_eq as df_eq from fugue_test.dataframe_suite import DataFrameTests -from pytest import raises -from triad.collections.schema import Schema, SchemaError -from triad.exceptions import InvalidOperationError class PandasDataFrameTests(DataFrameTests.Tests): def df(self, data: Any = None, schema: Any = None) -> PandasDataFrame: return PandasDataFrame(data, schema) + def test_num_partitions(self): + assert fa.get_num_partitions(self.df([[0, 1]], "a:int,b:int")) == 1 + class NativePandasDataFrameTests(DataFrameTests.NativeTests): def df(self, data: Any = None, schema: Any = None) -> pd.DataFrame: @@ -26,6 +30,9 @@ def df(self, data: Any = None, schema: Any = None) -> pd.DataFrame: def to_native_df(self, pdf: pd.DataFrame) -> Any: # pragma: no cover return pdf + def test_num_partitions(self): + assert fa.get_num_partitions(self.df([[0, 1]], "a:int,b:int")) == 1 + def test_map_type(self): pass diff --git a/tests/fugue_duckdb/test_dataframe.py b/tests/fugue_duckdb/test_dataframe.py index 64d32d26..db25da60 100644 --- a/tests/fugue_duckdb/test_dataframe.py +++ b/tests/fugue_duckdb/test_dataframe.py @@ -2,14 +2,12 @@ from typing import Any import duckdb -import numpy as np import pandas as pd -from fugue import ArrowDataFrame -from fugue.dataframe.utils import _df_eq as df_eq -from fugue_test.dataframe_suite import DataFrameTests -from pytest import raises +import fugue.api as fa +from fugue import ArrowDataFrame from fugue_duckdb.dataframe import DuckDataFrame +from fugue_test.dataframe_suite import DataFrameTests class DuckDataFrameTests(DataFrameTests.Tests): @@ -80,3 +78,6 @@ def df(self, data: Any = None, schema: Any = None) -> DuckDataFrame: def to_native_df(self, pdf: pd.DataFrame) -> Any: return duckdb.from_df(pdf) + + def test_num_partitions(self): + assert fa.get_num_partitions(self.df([[0, 1]], "a:int,b:int")) == 1 From 3621015970f3cf327e9981766d151059394d9ba0 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Thu, 29 Dec 2022 08:51:44 +0000 Subject: [PATCH 26/30] update docs --- RELEASE.md | 18 +++++++++++++++--- docs/index.rst | 2 -- fugue/execution/execution_engine.py | 1 - 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/RELEASE.md b/RELEASE.md index b47dd89c..2838bda1 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,8 +1,20 @@ # Release Notes -## 0.7.4 - -- [340](https://github.com/fugue-project/fugue/issues/340) Migrate to plugin mode (DataFrames & Extensions) +## 0.8.0 + +- [384](https://github.com/fugue-project/fugue/issues/384) Expanding Fugue API +- 
[396](https://github.com/fugue-project/fugue/issues/396) Ray/Dask engines guess optimal default partitions +- [403](https://github.com/fugue-project/fugue/issues/403) Deprecate register_raw_df_type +- [392](https://github.com/fugue-project/fugue/issues/392) Aggregations on Spark dataframes fail intermittently +- [398](https://github.com/fugue-project/fugue/issues/398) Rework API Docs and Favicon +- [393](https://github.com/fugue-project/fugue/issues/393) ExecutionEngine as_context +- [385](https://github.com/fugue-project/fugue/issues/385) Remove DataFrame metadata +- [381](https://github.com/fugue-project/fugue/issues/381) Change SparkExecutionEngine to use pandas udf by default +- [380](https://github.com/fugue-project/fugue/issues/380) Refactor ExecutionEngine (Separate out MapEngine) +- [378](https://github.com/fugue-project/fugue/issues/378) Refactor DataFrame show +- [377](https://github.com/fugue-project/fugue/issues/377) Create bag +- [372](https://github.com/fugue-project/fugue/issues/372) Infer execution engine from input +- [340](https://github.com/fugue-project/fugue/issues/340) Migrate to plugin mode - [369](https://github.com/fugue-project/fugue/issues/369) Remove execution from FugueWorkflow context manager, remove engine from FugueWorkflow - [373](https://github.com/fugue-project/fugue/issues/373) Fixed Spark engine rename slowness when there are a lot of columns diff --git a/docs/index.rst b/docs/index.rst index 41c13886..ad4ed210 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -33,9 +33,7 @@ For contributing, start with the `contributing guide Tuple[DataFrames, str]: d = DataFrames({self.encode_name(k): v for k, v in dfs.items()}) s = " ".join(self.encode_name(tp[1]) if tp[0] else tp[1] for tp in statement) - print(s) return d, s @abstractmethod From 6623c1fb9d73827a6a43340d7e9ca53d380e8f7d Mon Sep 17 00:00:00 2001 From: Han Wang Date: Fri, 30 Dec 2022 06:34:07 +0000 Subject: [PATCH 27/30] add all sql api functions --- docs/top_api.rst | 21 +- fugue/api.py | 4 + fugue/dataframe/utils.py | 3 + fugue/execution/api.py | 346 ++++++++++++++++-- fugue_test/execution_suite.py | 92 ++--- .../fugue/dataframe/test_pandas_dataframe.py | 3 + tests/fugue/dataframe/test_utils.py | 18 +- 7 files changed, 389 insertions(+), 98 deletions(-) diff --git a/docs/top_api.rst b/docs/top_api.rst index 3d458310..d3335a85 100644 --- a/docs/top_api.rst +++ b/docs/top_api.rst @@ -57,6 +57,9 @@ Information Transformation ~~~~~~~~~~~~~~ +.. autofunction:: fugue.api.transform +.. autofunction:: fugue.api.out_transform + .. autofunction:: fugue.api.alter_columns .. autofunction:: fugue.api.drop_columns .. autofunction:: fugue.api.head @@ -70,14 +73,6 @@ Transformation .. autofunction:: fugue.api.sample .. autofunction:: fugue.api.take -.. autofunction:: fugue.api.join -.. autofunction:: fugue.api.union -.. autofunction:: fugue.api.intersect -.. autofunction:: fugue.api.subtract - -.. autofunction:: fugue.api.transform -.. autofunction:: fugue.api.out_transform - SQL ~~~ @@ -85,6 +80,16 @@ SQL .. autofunction:: fugue.api.fugue_sql_flow .. autofunction:: fugue.api.raw_sql +.. autofunction:: fugue.api.join +.. autofunction:: fugue.api.union +.. autofunction:: fugue.api.intersect +.. autofunction:: fugue.api.subtract + +.. autofunction:: fugue.api.assign +.. autofunction:: fugue.api.select +.. autofunction:: fugue.api.filter +.. 
autofunction:: fugue.api.aggregate + Conversion ~~~~~~~~~~ diff --git a/fugue/api.py b/fugue/api.py index eae905bf..1552c020 100644 --- a/fugue/api.py +++ b/fugue/api.py @@ -32,12 +32,15 @@ show, ) from .execution.api import ( + aggregate, + assign, broadcast, clear_global_engine, distinct, dropna, engine_context, fillna, + filter, get_current_engine, get_current_parallelism, intersect, @@ -48,6 +51,7 @@ run_engine_function, sample, save, + select, set_global_engine, subtract, take, diff --git a/fugue/dataframe/utils.py b/fugue/dataframe/utils.py index f90df592..227968e7 100644 --- a/fugue/dataframe/utils.py +++ b/fugue/dataframe/utils.py @@ -15,6 +15,7 @@ from .api import get_column_names, normalize_column_names, rename from .array_dataframe import ArrayDataFrame +from .arrow_dataframe import ArrowDataFrame from .dataframe import DataFrame, LocalBoundedDataFrame, LocalDataFrame from .iterable_dataframe import IterableDataFrame from .pandas_dataframe import PandasDataFrame @@ -135,6 +136,8 @@ def to_local_df(df: Any, schema: Any = None) -> LocalDataFrame: return df.as_local() if isinstance(df, pd.DataFrame): return PandasDataFrame(df, schema) + if isinstance(df, pa.Table): + return ArrowDataFrame(df, schema) if isinstance(df, List): return ArrayDataFrame(df, schema) if isinstance(df, Iterable): diff --git a/fugue/execution/api.py b/fugue/execution/api.py index 1654e7cb..1599b71b 100644 --- a/fugue/execution/api.py +++ b/fugue/execution/api.py @@ -11,6 +11,7 @@ ExecutionEngine, ) from .factory import make_execution_engine +from fugue.column import ColumnExpr, SelectColumns, col, lit @contextmanager @@ -111,13 +112,15 @@ def run_engine_function( engine: AnyExecutionEngine = None, engine_conf: Any = None, as_fugue: bool = False, + as_local: bool = False, infer_by: Optional[List[Any]] = None, ) -> Any: """Run a lambda function based on the engine provided :param engine: an engine like object, defaults to None :param engine_conf: the configs for the engine, defaults to None - :param as_fugue: whether to force return a Fugue DataFrame + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False :param infer_by: a list of objects to infer the engine, defaults to None :return: None or a Fugue :class:`~.fugue.dataframe.dataframe.DataFrame` if @@ -130,14 +133,15 @@ def run_engine_function( This function is for deveopment use. Users should not need it. """ - with engine_context(engine, engine_conf, infer_by=infer_by) as e: - res = func(e) + e = make_execution_engine(engine, engine_conf, infer_by=infer_by) + res = func(e) - if isinstance(res, DataFrame): - if as_fugue or any(isinstance(x, DataFrame) for x in (infer_by or [])): - return res - return res.native_as_df() - return res + if isinstance(res, DataFrame): + res = e.convert_yield_dataframe(res, as_local=as_local) + if as_fugue or any(isinstance(x, DataFrame) for x in (infer_by or [])): + return res + return res.native_as_df() + return res def repartition( @@ -146,6 +150,7 @@ def repartition( engine: AnyExecutionEngine = None, engine_conf: Any = None, as_fugue: bool = False, + as_local: bool = False, ) -> AnyDataFrame: """Partition the input dataframe using ``partition``. 
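A minimal usage sketch of the functional ``repartition``/``persist`` calls being documented in this file, assuming a pandas input and that the Dask backend is available (the ``"dask"`` engine name and the partition count below are illustrative placeholders):

.. code-block:: python

    import pandas as pd

    import fugue.api as fa
    from fugue.collections.partition import PartitionSpec

    pdf = pd.DataFrame({"a": [0, 1, 2], "b": [1, 2, 3]})

    with fa.engine_context("dask"):
        # repartition/persist accept any recognizable dataframe; with a native
        # pandas input the result is expected to come back as an engine-native
        # (here Dask) dataframe rather than a Fugue DataFrame
        ddf = fa.repartition(pdf, PartitionSpec(num=4))
        ddf = fa.persist(ddf)
        fa.show(ddf)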
@@ -153,7 +158,8 @@ def repartition( :param partition: how you want to partition the dataframe :param engine: an engine like object, defaults to None :param engine_conf: the configs for the engine, defaults to None - :param as_fugue: whether to force return a Fugue DataFrame + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False :return: the repartitioned dataframe @@ -167,6 +173,7 @@ def repartition( engine_conf=engine_conf, infer_by=[df], as_fugue=as_fugue, + as_local=as_local, ) @@ -175,13 +182,15 @@ def broadcast( engine: AnyExecutionEngine = None, engine_conf: Any = None, as_fugue: bool = False, + as_local: bool = False, ) -> AnyDataFrame: """Broadcast the dataframe to all workers for a distributed computing framework :param df: an input dataframe that can be recognized by Fugue :param engine: an engine like object, defaults to None :param engine_conf: the configs for the engine, defaults to None - :param as_fugue: whether to force return a Fugue DataFrame + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False :return: the broadcasted dataframe """ @@ -191,6 +200,7 @@ def broadcast( engine_conf=engine_conf, infer_by=[df], as_fugue=as_fugue, + as_local=as_local, ) @@ -200,6 +210,7 @@ def persist( engine: AnyExecutionEngine = None, engine_conf: Any = None, as_fugue: bool = False, + as_local: bool = False, **kwargs: Any, ) -> AnyDataFrame: """Force materializing and caching the dataframe @@ -211,7 +222,8 @@ def persist( :param kwargs: parameter to pass to the underlying persist implementation :param engine: an engine like object, defaults to None :param engine_conf: the configs for the engine, defaults to None - :param as_fugue: whether to force return a Fugue DataFrame + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False :return: the persisted dataframe """ @@ -221,6 +233,7 @@ def persist( engine_conf=engine_conf, infer_by=[df], as_fugue=as_fugue, + as_local=as_local, ) @@ -229,13 +242,15 @@ def distinct( engine: AnyExecutionEngine = None, engine_conf: Any = None, as_fugue: bool = False, + as_local: bool = False, ) -> AnyDataFrame: """Equivalent to ``SELECT DISTINCT * FROM df`` :param df: an input dataframe that can be recognized by Fugue :param engine: an engine like object, defaults to None :param engine_conf: the configs for the engine, defaults to None - :param as_fugue: whether to force return a Fugue DataFrame + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False :return: the result with distinct rows """ @@ -245,6 +260,7 @@ def distinct( engine_conf=engine_conf, infer_by=[df], as_fugue=as_fugue, + as_local=as_local, ) @@ -256,6 +272,7 @@ def dropna( engine: AnyExecutionEngine = None, engine_conf: Any = None, as_fugue: bool = False, + as_local: bool = False, ) -> AnyDataFrame: """Drop NA recods from dataframe @@ -266,7 +283,8 @@ def dropna( :param subset: list of columns to operate on :param engine: an engine like object, defaults to None :param engine_conf: the configs for the engine, defaults to None - :param as_fugue: whether to force return a Fugue DataFrame + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: 
whether to force return a local DataFrame, defaults to False :return: DataFrame with NA records dropped """ @@ -276,6 +294,7 @@ def dropna( engine_conf=engine_conf, infer_by=[df], as_fugue=as_fugue, + as_local=as_local, ) @@ -286,6 +305,7 @@ def fillna( engine: AnyExecutionEngine = None, engine_conf: Any = None, as_fugue: bool = False, + as_local: bool = False, ) -> AnyDataFrame: """ Fill ``NULL``, ``NAN``, ``NAT`` values in a dataframe @@ -298,7 +318,8 @@ def fillna( a dictionary :param engine: an engine like object, defaults to None :param engine_conf: the configs for the engine, defaults to None - :param as_fugue: whether to force return a Fugue DataFrame + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False :return: DataFrame with NA records filled """ @@ -308,6 +329,7 @@ def fillna( engine_conf=engine_conf, infer_by=[df], as_fugue=as_fugue, + as_local=as_local, ) @@ -320,6 +342,7 @@ def sample( engine: AnyExecutionEngine = None, engine_conf: Any = None, as_fugue: bool = False, + as_local: bool = False, ) -> AnyDataFrame: """ Sample dataframe by number of rows or by fraction @@ -334,7 +357,8 @@ def sample( :param seed: seed for randomness, defaults to None :param engine: an engine like object, defaults to None :param engine_conf: the configs for the engine, defaults to None - :param as_fugue: whether to force return a Fugue DataFrame + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False :return: the sampled dataframe """ @@ -344,6 +368,7 @@ def sample( engine_conf=engine_conf, infer_by=[df], as_fugue=as_fugue, + as_local=as_local, ) @@ -356,6 +381,7 @@ def take( engine: AnyExecutionEngine = None, engine_conf: Any = None, as_fugue: bool = False, + as_local: bool = False, ) -> AnyDataFrame: """ Get the first n rows of a DataFrame per partition. If a presort is defined, @@ -373,7 +399,8 @@ def take( defaults to None :param engine: an engine like object, defaults to None :param engine_conf: the configs for the engine, defaults to None - :param as_fugue: whether to force return a Fugue DataFrame + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False :return: n rows of DataFrame per partition """ @@ -390,6 +417,7 @@ def take( engine_conf=engine_conf, infer_by=[df], as_fugue=as_fugue, + as_local=as_local, ) @@ -400,6 +428,7 @@ def load( engine: AnyExecutionEngine = None, engine_conf: Any = None, as_fugue: bool = False, + as_local: bool = False, **kwargs: Any, ) -> AnyDataFrame: """Load dataframe from persistent storage @@ -411,8 +440,8 @@ def load( :param kwargs: parameters to pass to the underlying framework :param engine: an engine like object, defaults to None :param engine_conf: the configs for the engine, defaults to None - :param as_fugue: whether to force return a Fugue DataFrame - + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False :return: an engine compatible dataframe For more details and examples, read |ZipComap|. 
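For illustration, a sketch combining ``load`` with the other functional APIs in this module (the parquet paths and the ``"duckdb"`` engine name are placeholders):

.. code-block:: python

    import fugue.api as fa

    with fa.engine_context("duckdb"):
        df = fa.load("/tmp/input.parquet")
        # keep the 5 rows with the largest values of column a, then save the result
        top = fa.take(df, 5, presort="a desc")
        fa.save(top, "/tmp/top5.parquet")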
@@ -424,6 +453,7 @@ def load( engine=engine, engine_conf=engine_conf, as_fugue=as_fugue, + as_local=as_local, ) @@ -480,6 +510,7 @@ def join( engine: AnyExecutionEngine = None, engine_conf: Any = None, as_fugue: bool = False, + as_local: bool = False, ) -> AnyDataFrame: """Join two dataframes @@ -492,7 +523,8 @@ def join( validated against the inferred keys. :param engine: an engine like object, defaults to None :param engine_conf: the configs for the engine, defaults to None - :param as_fugue: whether to force return a Fugue DataFrame + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False :return: the joined dataframe @@ -514,6 +546,7 @@ def _join(e: ExecutionEngine): engine=engine, engine_conf=engine_conf, as_fugue=as_fugue, + as_local=as_local, infer_by=[df1, df2, *dfs], ) @@ -526,6 +559,7 @@ def union( engine: AnyExecutionEngine = None, engine_conf: Any = None, as_fugue: bool = False, + as_local: bool = False, ) -> AnyDataFrame: """Join two dataframes @@ -536,7 +570,8 @@ def union( ``false`` for ``UNION ALL`` :param engine: an engine like object, defaults to None :param engine_conf: the configs for the engine, defaults to None - :param as_fugue: whether to force return a Fugue DataFrame + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False :return: the unioned dataframe @@ -559,6 +594,7 @@ def _union(e: ExecutionEngine): engine=engine, engine_conf=engine_conf, as_fugue=as_fugue, + as_local=as_local, infer_by=[df1, df2, *dfs], ) @@ -571,6 +607,7 @@ def subtract( engine: AnyExecutionEngine = None, engine_conf: Any = None, as_fugue: bool = False, + as_local: bool = False, ) -> AnyDataFrame: """``df1 - df2`` @@ -581,7 +618,8 @@ def subtract( ``false`` for ``EXCEPT ALL`` :param engine: an engine like object, defaults to None :param engine_conf: the configs for the engine, defaults to None - :param as_fugue: whether to force return a Fugue DataFrame + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False :return: the unioned dataframe @@ -604,6 +642,7 @@ def _subtract(e: ExecutionEngine): engine=engine, engine_conf=engine_conf, as_fugue=as_fugue, + as_local=as_local, infer_by=[df1, df2, *dfs], ) @@ -616,6 +655,7 @@ def intersect( engine: AnyExecutionEngine = None, engine_conf: Any = None, as_fugue: bool = False, + as_local: bool = False, ) -> AnyDataFrame: """Intersect ``df1`` and ``df2`` @@ -626,7 +666,8 @@ def intersect( ``false`` for ``INTERSECT ALL`` :param engine: an engine like object, defaults to None :param engine_conf: the configs for the engine, defaults to None - :param as_fugue: whether to force return a Fugue DataFrame + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False :return: the unioned dataframe @@ -649,5 +690,266 @@ def _intersect(e: ExecutionEngine): engine=engine, engine_conf=engine_conf, as_fugue=as_fugue, + as_local=as_local, infer_by=[df1, df2, *dfs], ) + + +def select( + df: AnyDataFrame, + *columns: Union[str, ColumnExpr], + where: Optional[ColumnExpr] = None, + having: Optional[ColumnExpr] = None, + distinct: bool = False, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, +) -> 
AnyDataFrame: + """The functional interface for SQL select statement + + :param df: the dataframe to be operated on + :param columns: column expressions, for strings they will represent + the column names + :param where: ``WHERE`` condition expression, defaults to None + :param having: ``having`` condition expression, defaults to None. It + is used when ``cols`` contains aggregation columns, defaults to None + :param distinct: whether to return distinct result, defaults to False + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + + :return: the select result as a dataframe + + .. attention:: + + This interface is experimental, it's subjected to change in new versions. + + .. seealso:: + + Please find more expression examples in :mod:`fugue.column.sql` and + :mod:`fugue.column.functions` + + .. admonition:: Examples + + .. code-block:: python + + from fugue.column import col, lit, functions as f + import fugue.api as fa + + with fa.engine_context("duckdb"): + # select existed and new columns + fa.select(df, col("a"),col("b"),lit(1,"another")) + fa.select(df, col("a"),(col("b")+lit(1)).alias("x")) + + # aggregation + # SELECT COUNT(DISTINCT *) AS x FROM df + fa.select( + df, + f.count_distinct(col("*")).alias("x")) + + # SELECT a, MAX(b+1) AS x FROM df GROUP BY a + fa.select( + df, + col("a"),f.max(col("b")+lit(1)).alias("x")) + + # SELECT a, MAX(b+1) AS x FROM df + # WHERE b<2 AND a>1 + # GROUP BY a + # HAVING MAX(b+1)>0 + fa.select( + df, + col("a"),f.max(col("b")+lit(1)).alias("x"), + where=(col("b")<2) & (col("a")>1), + having=f.max(col("b")+lit(1))>0 + ) + """ + cols = SelectColumns( + *[col(x) if isinstance(x, str) else x for x in columns], + arg_distinct=distinct, + ) + + return run_engine_function( + lambda e: e.select(e.to_df(df), cols=cols, where=where, having=having), + engine=engine, + engine_conf=engine_conf, + infer_by=[df], + as_fugue=as_fugue, + as_local=as_local, + ) + + +def filter( + df: AnyDataFrame, + condition: ColumnExpr, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, +) -> AnyDataFrame: + """Filter rows by the given condition + + :param df: the dataframe to be filtered + :param condition: (boolean) column expression + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + + :return: the filtered dataframe + + .. seealso:: + + Please find more expression examples in :mod:`fugue.column.sql` and + :mod:`fugue.column.functions` + + .. admonition:: Examples + + .. 
code-block:: python + + from fugue.column import col, functions as f + import fugue.api as fa + + with fa.engine_context("duckdb"): + fa.filter(df, (col("a")>1) & (col("b")=="x")) + fa.filter(df, f.coalesce(col("a"),col("b"))>1) + """ + return run_engine_function( + lambda e: e.filter(e.to_df(df), condition=condition), + engine=engine, + engine_conf=engine_conf, + infer_by=[df], + as_fugue=as_fugue, + as_local=as_local, + ) + + +def assign( + df: AnyDataFrame, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, + **columns: Any, +) -> AnyDataFrame: + """Update existing columns with new values and add new columns + + :param df: the dataframe to set columns + :param columns: column expressions + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + + :return: the updated dataframe + + .. tip:: + + This can be used to cast data types, alter column values or add new + columns. But you can't use aggregation in columns. + + .. admonition:: New Since + :class: hint + + **0.6.0** + + .. seealso:: + + Please find more expression examples in :mod:`fugue.column.sql` and + :mod:`fugue.column.functions` + + .. admonition:: Examples + + .. code-block:: python + + from fugue.column import col, functions as f + import fugue.api as fa + + # assume df has schema: a:int,b:str + + with fa.engine_context("duckdb"): + # add constant column x + fa.assign(df, x=1) + + # change column b to be a constant integer + fa.assign(df, b=1) + + # add new x to be a+b + fa.assign(df, x=col("a")+col("b")) + + # cast column a data type to double + fa.assign(df, a=col("a").cast(float)) + """ + cols = [ + v.alias(k) if isinstance(v, ColumnExpr) else lit(v).alias(k) + for k, v in columns.items() + ] + return run_engine_function( + lambda e: e.assign(e.to_df(df), columns=cols), + engine=engine, + engine_conf=engine_conf, + infer_by=[df], + as_fugue=as_fugue, + as_local=as_local, + ) + + +def aggregate( + df: AnyDataFrame, + partition_by: Union[None, str, List[str]] = None, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, + **agg_kwcols: ColumnExpr, +) -> AnyDataFrame: + """Aggregate on dataframe + + :param df: the dataframe to aggregate on + :param partition_by: partition key(s), defaults to None + :param agg_kwcols: aggregation expressions + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + + :return: the aggregated result as a dataframe + + .. seealso:: + + Please find more expression examples in :mod:`fugue.column.sql` and + :mod:`fugue.column.functions` + + .. admonition:: Examples + + .. 
code-block:: python + + from fugue.column import col, functions as f + import fugue.api as fa + + with fa.engine_context("duckdb"): + # SELECT MAX(b) AS b FROM df + fa.aggregate(df, f.max(col("b"))) + + # SELECT a, MAX(b) AS x FROM df GROUP BY a + fa.aggregate(df, "a", x=f.max(col("b"))) + """ + cols = [ + v.alias(k) if isinstance(v, ColumnExpr) else lit(v).alias(k) + for k, v in agg_kwcols.items() + ] + return run_engine_function( + lambda e: e.aggregate( + e.to_df(df), + partition_spec=None + if partition_by is None + else PartitionSpec(by=partition_by), + agg_cols=cols, + ), + engine=engine, + engine_conf=engine_conf, + infer_by=[df], + as_fugue=as_fugue, + as_local=as_local, + ) diff --git a/fugue_test/execution_suite.py b/fugue_test/execution_suite.py index 2787e33d..ab94cdd8 100644 --- a/fugue_test/execution_suite.py +++ b/fugue_test/execution_suite.py @@ -23,7 +23,7 @@ register_default_sql_engine, DataFrame, ) -from fugue.column import SelectColumns, col, lit +from fugue.column import col, lit from fugue.dataframe.utils import _df_eq as df_eq from fugue.execution.native_execution_engine import NativeExecutionEngine from fugue_test._utils import skip_spark2 @@ -98,30 +98,24 @@ def test_to_df_general(self): df_eq(o, e.to_df(pdf), throw=True) def test_filter(self): - e = self.engine - o = ArrayDataFrame( + a = ArrayDataFrame( [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]], "a:double,b:int", ) - a = e.to_df(o) - b = e.filter(a, col("a").not_null()) + b = fa.filter(a, col("a").not_null()) df_eq(b, [[1, 2], [3, 4]], "a:double,b:int", throw=True) - c = e.filter(a, col("a").not_null() & (col("b") < 3)) + c = fa.filter(a, col("a").not_null() & (col("b") < 3)) df_eq(c, [[1, 2]], "a:double,b:int", throw=True) - c = e.filter(a, col("a") + col("b") == 3) + c = fa.filter(a, col("a") + col("b") == 3) df_eq(c, [[1, 2]], "a:double,b:int", throw=True) def test_select(self): - e = self.engine - o = ArrayDataFrame( + a = ArrayDataFrame( [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]], "a:double,b:int" ) - a = e.to_df(o) # simple - b = e.select( - a, SelectColumns(col("b"), (col("b") + 1).alias("c").cast(str)) - ) + b = fa.select(a, col("b"), (col("b") + 1).alias("c").cast(str)) df_eq( b, [[2, "3"], [2, "3"], [1, "2"], [4, "5"], [4, "5"]], @@ -130,11 +124,8 @@ def test_select(self): ) # with distinct - b = e.select( - a, - SelectColumns( - col("b"), (col("b") + 1).alias("c").cast(str), arg_distinct=True - ), + b = fa.select( + a, col("b"), (col("b") + 1).alias("c").cast(str), distinct=True ) df_eq( b, @@ -144,21 +135,20 @@ def test_select(self): ) # wildcard - b = e.select(a, SelectColumns(col("*")), where=col("a") + col("b") == 3) + b = fa.select(a, col("*"), where=col("a") + col("b") == 3) df_eq(b, [[1, 2]], "a:double,b:int", throw=True) # aggregation - b = e.select( - a, SelectColumns(col("a"), ff.sum(col("b")).cast(float).alias("b")) - ) + b = fa.select(a, col("a"), ff.sum(col("b")).cast(float).alias("b")) df_eq(b, [[1, 2], [3, 4], [None, 7]], "a:double,b:double", throw=True) # having # https://github.com/fugue-project/fugue/issues/222 col_b = ff.sum(col("b")) - b = e.select( + b = fa.select( a, - SelectColumns(col("a"), col_b.cast(float).alias("c")), + col("a"), + col_b.cast(float).alias("c"), having=(col_b >= 7) | (col("a") == 1), ) df_eq(b, [[1, 2], [None, 7]], "a:double,c:double", throw=True) @@ -166,11 +156,11 @@ def test_select(self): # literal + alias inference # https://github.com/fugue-project/fugue/issues/222 col_b = ff.sum(col("b")) - b = e.select( + b = fa.select( a, - 
SelectColumns( - col("a"), lit(1, "o").cast(str), col_b.cast(float).alias("c") - ), + col("a"), + lit(1, "o").cast(str), + col_b.cast(float).alias("c"), having=(col_b >= 7) | (col("a") == 1), ) df_eq( @@ -178,16 +168,11 @@ def test_select(self): ) def test_assign(self): - e = self.engine - o = ArrayDataFrame( + a = ArrayDataFrame( [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]], "a:double,b:int" ) - a = e.to_df(o) - b = e.assign( - a, - [lit(1, "x"), col("b").cast(str), (col("b") + 1).alias("c").cast(int)], - ) + b = fa.assign(a, x=1, b=col("b").cast(str), c=(col("b") + 1).cast(int)) df_eq( b, [ @@ -202,29 +187,22 @@ def test_assign(self): ) def test_aggregate(self): - e = self.engine - o = ArrayDataFrame( + a = ArrayDataFrame( [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]], "a:double,b:int" ) - a = e.to_df(o) - b = e.aggregate( + b = fa.aggregate( df=a, - partition_spec=None, - agg_cols=[ - ff.max(col("b")), - (ff.max(col("b")) * 2).cast("int32").alias("c"), - ], + b=ff.max(col("b")), + c=(ff.max(col("b")) * 2).cast("int32").alias("c"), ) df_eq(b, [[4, 8]], "b:int,c:int", throw=True) - b = e.aggregate( - df=a, - partition_spec=PartitionSpec(by=["a"]), - agg_cols=[ - ff.max(col("b")), - (ff.max(col("b")) * 2).cast("int32").alias("c"), - ], + b = fa.aggregate( + a, + "a", + b=ff.max(col("b")), + c=(ff.max(col("b")) * 2).cast("int32").alias("c"), ) df_eq( b, @@ -234,18 +212,10 @@ def test_aggregate(self): ) with raises(ValueError): - e.aggregate( - df=a, - partition_spec=PartitionSpec(by=["a"]), - agg_cols=[ff.max(col("b")), lit(1)], - ) + fa.aggregate(a, "a", b=ff.max(col("b")), x=1) with raises(ValueError): - e.aggregate( - df=a, - partition_spec=PartitionSpec(by=["a"]), - agg_cols=[], - ) + fa.aggregate(a, "a") def test_map(self): def noop(cursor, data): diff --git a/tests/fugue/dataframe/test_pandas_dataframe.py b/tests/fugue/dataframe/test_pandas_dataframe.py index 68c9cbc5..38a45399 100644 --- a/tests/fugue/dataframe/test_pandas_dataframe.py +++ b/tests/fugue/dataframe/test_pandas_dataframe.py @@ -22,6 +22,9 @@ def df(self, data: Any = None, schema: Any = None) -> PandasDataFrame: def test_num_partitions(self): assert fa.get_num_partitions(self.df([[0, 1]], "a:int,b:int")) == 1 + def test_api_as_local(self): + assert fa.is_local(self.df([[0, 1]], "a:int,b:int")) + class NativePandasDataFrameTests(DataFrameTests.NativeTests): def df(self, data: Any = None, schema: Any = None) -> pd.DataFrame: diff --git a/tests/fugue/dataframe/test_utils.py b/tests/fugue/dataframe/test_utils.py index 4ee75973..733b620c 100644 --- a/tests/fugue/dataframe/test_utils.py +++ b/tests/fugue/dataframe/test_utils.py @@ -3,10 +3,13 @@ import numpy as np import pandas as pd import pyarrow as pa +from pytest import raises +from triad import FileSystem, Schema +from triad.collections.schema import SchemaError +from triad.exceptions import InvalidOperationError, NoneArgumentError + +from fugue import ArrayDataFrame, ArrowDataFrame, IterableDataFrame, PandasDataFrame from fugue.dataframe import to_local_bounded_df, to_local_df -from fugue.dataframe.array_dataframe import ArrayDataFrame -from fugue.dataframe.iterable_dataframe import IterableDataFrame -from fugue.dataframe.pandas_dataframe import PandasDataFrame from fugue.dataframe.utils import _df_eq as df_eq from fugue.dataframe.utils import ( _schema_eq, @@ -19,10 +22,6 @@ serialize_df, unpickle_df, ) -from pytest import raises -from triad import FileSystem, Schema -from triad.collections.schema import SchemaError -from triad.exceptions import 
InvalidOperationError, NoneArgumentError def test_to_local_df(): @@ -44,11 +43,16 @@ def test_to_local_df(): def test_to_local_bounded_df(): df = ArrayDataFrame([[0, 1]], "a:int,b:int") idf = IterableDataFrame([[0, 1]], "a:int,b:int") + adf = ArrowDataFrame(df.as_array(), "a:int,b:int") assert to_local_bounded_df(df) is df r = to_local_bounded_df(idf) assert r is not idf assert r.as_array() == [[0, 1]] assert r.schema == "a:int,b:int" + r = to_local_bounded_df(adf.native) + assert isinstance(r, ArrowDataFrame) + assert r.as_array() == [[0, 1]] + assert r.schema == "a:int,b:int" def test_schema_eq(): From ba64362862b873eca1f8f41b3750a83eeb80ad3d Mon Sep 17 00:00:00 2001 From: Han Wang Date: Fri, 30 Dec 2022 06:48:28 +0000 Subject: [PATCH 28/30] lint --- .pylintrc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pylintrc b/.pylintrc index bbe26ef6..9ee3ae91 100644 --- a/.pylintrc +++ b/.pylintrc @@ -1,3 +1,3 @@ [MESSAGES CONTROL] -disable = C0103,C0114,C0115,C0116,C0122,C0200,C0201,C0302,C0411,C0415,E0401,E0712,E1130,E5110,R0201,R0205,R0801,R0902,R0903,R0904,R0911,R0912,R0913,R0914,R0915,R1705,R1710,R1718,R1720,R1724,W0102,W0107,W0108,W0201,W0212,W0221,W0223,W0237,W0511,W0613,W0631,W0640,W0703,W0707,W1116 +disable = C0103,C0114,C0115,C0116,C0122,C0200,C0201,C0302,C0411,C0415,E0401,E0712,E1130,E5110,R0201,R0205,R0801,R0902,R0903,R0904,R0911,R0912,R0913,R0914,R0915,R1705,R1710,R1718,R1720,R1724,W0102,W0107,W0108,W0201,W0212,W0221,W0223,W0237,W0511,W0613,W0622,W0631,W0640,W0703,W0707,W1116 # TODO: R0205: inherits from object, can be safely removed From 0cdf0f092a278910e0ef3dccf27740419a5564aa Mon Sep 17 00:00:00 2001 From: Han Wang Date: Fri, 30 Dec 2022 07:22:08 +0000 Subject: [PATCH 29/30] add join functions --- docs/top_api.rst | 8 ++ fugue/api.py | 7 + fugue/execution/api.py | 240 ++++++++++++++++++++++++++++++++++ fugue_test/execution_suite.py | 16 +-- 4 files changed, 263 insertions(+), 8 deletions(-) diff --git a/docs/top_api.rst b/docs/top_api.rst index d3335a85..79d74bca 100644 --- a/docs/top_api.rst +++ b/docs/top_api.rst @@ -81,6 +81,14 @@ SQL .. autofunction:: fugue.api.raw_sql .. autofunction:: fugue.api.join +.. autofunction:: fugue.api.semi_join +.. autofunction:: fugue.api.anti_join +.. autofunction:: fugue.api.inner_join +.. autofunction:: fugue.api.left_outer_join +.. autofunction:: fugue.api.right_outer_join +.. autofunction:: fugue.api.full_outer_join +.. autofunction:: fugue.api.cross_join + .. autofunction:: fugue.api.union .. autofunction:: fugue.api.intersect .. 
autofunction:: fugue.api.subtract diff --git a/fugue/api.py b/fugue/api.py index 1552c020..4c96cf42 100644 --- a/fugue/api.py +++ b/fugue/api.py @@ -33,25 +33,32 @@ ) from .execution.api import ( aggregate, + anti_join, assign, broadcast, clear_global_engine, + cross_join, distinct, dropna, engine_context, fillna, filter, + full_outer_join, get_current_engine, get_current_parallelism, + inner_join, intersect, join, + left_outer_join, load, persist, repartition, + right_outer_join, run_engine_function, sample, save, select, + semi_join, set_global_engine, subtract, take, diff --git a/fugue/execution/api.py b/fugue/execution/api.py index 1599b71b..9470a160 100644 --- a/fugue/execution/api.py +++ b/fugue/execution/api.py @@ -551,6 +551,246 @@ def _join(e: ExecutionEngine): ) +def inner_join( + df1: AnyDataFrame, + df2: AnyDataFrame, + *dfs: AnyDataFrame, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, +) -> AnyDataFrame: + """Inner join two dataframes. + This is a wrapper of :func:`~.join` with ``how="inner"`` + + :param df1: the first dataframe + :param df2: the second dataframe + :param dfs: more dataframes to join + :param how: can accept ``semi``, ``left_semi``, ``anti``, ``left_anti``, + ``inner``, ``left_outer``, ``right_outer``, ``full_outer``, ``cross`` + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + + :return: the joined dataframe + """ + return join( + df1, + df2, + *dfs, + how="inner", + engine=engine, + engine_conf=engine_conf, + as_fugue=as_fugue, + as_local=as_local, + ) + + +def semi_join( + df1: AnyDataFrame, + df2: AnyDataFrame, + *dfs: AnyDataFrame, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, +) -> AnyDataFrame: + """Left semi-join two dataframes. + This is a wrapper of :func:`~.join` with ``how="semi"`` + + :param df1: the first dataframe + :param df2: the second dataframe + :param dfs: more dataframes to join + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + + :return: the joined dataframe + """ + return join( + df1, + df2, + *dfs, + how="semi", + engine=engine, + engine_conf=engine_conf, + as_fugue=as_fugue, + as_local=as_local, + ) + + +def anti_join( + df1: AnyDataFrame, + df2: AnyDataFrame, + *dfs: AnyDataFrame, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, +) -> AnyDataFrame: + """Left anti-join two dataframes. 
+ This is a wrapper of :func:`~.join` with ``how="anti"`` + + :param df1: the first dataframe + :param df2: the second dataframe + :param dfs: more dataframes to join + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + + :return: the joined dataframe + """ + return join( + df1, + df2, + *dfs, + how="anti", + engine=engine, + engine_conf=engine_conf, + as_fugue=as_fugue, + as_local=as_local, + ) + + +def left_outer_join( + df1: AnyDataFrame, + df2: AnyDataFrame, + *dfs: AnyDataFrame, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, +) -> AnyDataFrame: + """Left outer join two dataframes. + This is a wrapper of :func:`~.join` with ``how="left_outer"`` + + :param df1: the first dataframe + :param df2: the second dataframe + :param dfs: more dataframes to join + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + + :return: the joined dataframe + """ + return join( + df1, + df2, + *dfs, + how="left_outer", + engine=engine, + engine_conf=engine_conf, + as_fugue=as_fugue, + as_local=as_local, + ) + + +def right_outer_join( + df1: AnyDataFrame, + df2: AnyDataFrame, + *dfs: AnyDataFrame, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, +) -> AnyDataFrame: + """Right outer join two dataframes. + This is a wrapper of :func:`~.join` with ``how="right_outer"`` + + :param df1: the first dataframe + :param df2: the second dataframe + :param dfs: more dataframes to join + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + + :return: the joined dataframe + """ + return join( + df1, + df2, + *dfs, + how="right_outer", + engine=engine, + engine_conf=engine_conf, + as_fugue=as_fugue, + as_local=as_local, + ) + + +def full_outer_join( + df1: AnyDataFrame, + df2: AnyDataFrame, + *dfs: AnyDataFrame, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, +) -> AnyDataFrame: + """Full outer join two dataframes. 
+ This is a wrapper of :func:`~.join` with ``how="full_outer"`` + + :param df1: the first dataframe + :param df2: the second dataframe + :param dfs: more dataframes to join + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + + :return: the joined dataframe + """ + return join( + df1, + df2, + *dfs, + how="full_outer", + engine=engine, + engine_conf=engine_conf, + as_fugue=as_fugue, + as_local=as_local, + ) + + +def cross_join( + df1: AnyDataFrame, + df2: AnyDataFrame, + *dfs: AnyDataFrame, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, +) -> AnyDataFrame: + """Cross join two dataframes. + This is a wrapper of :func:`~.join` with ``how="cross"`` + + :param df1: the first dataframe + :param df2: the second dataframe + :param dfs: more dataframes to join + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + + :return: the joined dataframe + """ + return join( + df1, + df2, + *dfs, + how="cross", + engine=engine, + engine_conf=engine_conf, + as_fugue=as_fugue, + as_local=as_local, + ) + + def union( df1: AnyDataFrame, df2: AnyDataFrame, diff --git a/fugue_test/execution_suite.py b/fugue_test/execution_suite.py index ab94cdd8..8d087296 100644 --- a/fugue_test/execution_suite.py +++ b/fugue_test/execution_suite.py @@ -356,7 +356,7 @@ def test_join_multiple(self): a = e.to_df([[1, 2], [3, 4]], "a:int,b:int") b = e.to_df([[1, 20], [3, 40]], "a:int,c:int") c = e.to_df([[1, 200], [3, 400]], "a:int,d:int") - d = fa.join(a, b, c, how="inner") + d = fa.inner_join(a, b, c) df_eq( d, [[1, 2, 20, 200], [3, 4, 40, 400]], @@ -377,7 +377,7 @@ def test__join_cross(self): ) b = e.to_df([], "c:int") - c = fa.join(a, b, how="Cross") + c = fa.cross_join(a, b) df_eq(c, [], "a:int,b:int,c:int", throw=True) a = e.to_df([], "a:int,b:int") @@ -391,7 +391,7 @@ def test__join_inner(self): b = e.to_df([[6, 1], [2, 7]], "c:int,a:int") c = fa.join(a, b, how="INNER", on=["a"]) df_eq(c, [[1, 2, 6]], "a:int,b:int,c:int", throw=True) - c = fa.join(b, a, how="INNER", on=["a"]) + c = fa.inner_join(b, a) df_eq(c, [[6, 1, 2]], "c:int,a:int,b:int", throw=True) a = e.to_df([], "a:int,b:int") @@ -404,17 +404,17 @@ def test__join_outer(self): a = e.to_df([], "a:int,b:int") b = e.to_df([], "c:str,a:int") - c = fa.join(a, b, how="left_outer", on=["a"]) + c = fa.left_outer_join(a, b) df_eq(c, [], "a:int,b:int,c:str", throw=True) a = e.to_df([], "a:int,b:str") b = e.to_df([], "c:int,a:int") - c = fa.join(a, b, how="right_outer", on=["a"]) + c = fa.right_outer_join(a, b) df_eq(c, [], "a:int,b:str,c:int", throw=True) a = e.to_df([], "a:int,b:str") b = e.to_df([], "c:str,a:int") - c = fa.join(a, b, how="full_outer", on=["a"]) + c = fa.full_outer_join(a, b) df_eq(c, [], "a:int,b:str,c:str", throw=True) a = e.to_df([[1, "2"], [3, "4"]], "a:int,b:str") @@ -480,7 +480,7 @@ def test__join_semi(self): b = e.to_df([[6, 1], [2, 7]], "c:int,a:int") c = fa.join(a, b, how="semi", on=["a"]) df_eq(c, [[1, 2]], "a:int,b:int", throw=True) - c = fa.join(b, a, how="semi", on=["a"]) + c = fa.semi_join(b, a) df_eq(c, [[6, 1]], 
"c:int,a:int", throw=True) b = e.to_df([], "c:int,a:int") @@ -498,7 +498,7 @@ def test__join_anti(self): b = e.to_df([[6, 1], [2, 7]], "c:int,a:int") c = fa.join(a, b, how="anti", on=["a"]) df_eq(c, [[3, 4]], "a:int,b:int", throw=True) - c = fa.join(b, a, how="anti", on=["a"]) + c = fa.anti_join(b, a) df_eq(c, [[2, 7]], "c:int,a:int", throw=True) b = e.to_df([], "c:int,a:int") From dfdc2642a282b1838ab7047e391e0a3e73fd3507 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Fri, 30 Dec 2022 07:51:29 +0000 Subject: [PATCH 30/30] Make PartitionSpec more flexible --- fugue/collections/partition.py | 24 +++++++---- fugue/execution/api.py | 2 +- tests/fugue/collections/test_partition.py | 50 ++++++++++++++++------- 3 files changed, 51 insertions(+), 25 deletions(-) diff --git a/fugue/collections/partition.py b/fugue/collections/partition.py index bec2db0d..036ad99e 100644 --- a/fugue/collections/partition.py +++ b/fugue/collections/partition.py @@ -84,13 +84,16 @@ class PartitionSpec(object): .. admonition:: Examples >>> PartitionSepc(num=4) + >>> PartitionSepc(4) # == PartitionSepc(num=4) >>> PartitionSepc(num="ROWCOUNT/4 + 3") # It can be an expression >>> PartitionSepc(by=["a","b"]) + >>> PartitionSepc(["a","b"]) # == PartitionSepc(by=["a","b"]) >>> PartitionSpec(by=["a"], presort="b DESC, c ASC") >>> PartitionSpec(algo="even", num=4) >>> p = PartitionSpec(num=4, by=["a"]) >>> p_override = PartitionSpec(p, by=["a","b"], algo="even") >>> PartitionSpec(by="a") # == PartitionSpec(by=["a"]) + >>> PartitionSpec("a") # == PartitionSpec(by=["a"]) >>> PartitionSpec("per_row") # == PartitionSpec(num="ROWCOUNT", algo="even") It's important to understand this concept, please read |PartitionTutorial| @@ -109,15 +112,18 @@ class PartitionSpec(object): def __init__(self, *args: Any, **kwargs: Any): # noqa: C901 p = ParamDict() - if ( - len(args) == 1 - and len(kwargs) == 0 - and isinstance(args[0], str) - and args[0].lower() == "per_row" - ): - p["algo"] = "even" - p["num_partitions"] = "ROWCOUNT" - else: + if len(args) == 1 and len(kwargs) == 0: + if isinstance(args[0], str): + if args[0].lower() == "per_row": + p["algo"] = "even" + p["num_partitions"] = "ROWCOUNT" + elif not args[0].startswith("{"): + p["partition_by"] = [args[0]] + elif isinstance(args[0], int): + p["num_partitions"] = str(args[0]) + elif isinstance(args[0], (list, tuple)): + p["partition_by"] = args[0] + if len(p) == 0: # the first condition had no match for a in args: if a is None: continue diff --git a/fugue/execution/api.py b/fugue/execution/api.py index 9470a160..57a6ba46 100644 --- a/fugue/execution/api.py +++ b/fugue/execution/api.py @@ -1170,7 +1170,7 @@ def aggregate( with fa.engine_context("duckdb"): # SELECT MAX(b) AS b FROM df - fa.aggregate(df, f.max(col("b"))) + fa.aggregate(df, b=f.max(col("b"))) # SELECT a, MAX(b) AS x FROM df GROUP BY a fa.aggregate(df, "a", x=f.max(col("b"))) diff --git a/tests/fugue/collections/test_partition.py b/tests/fugue/collections/test_partition.py index 0665032f..a88a8817 100644 --- a/tests/fugue/collections/test_partition.py +++ b/tests/fugue/collections/test_partition.py @@ -7,31 +7,45 @@ from triad.utils.hash import to_uuid from triad.collections.dict import IndexedOrderedDict + def test_parse_presort_exp(): assert parse_presort_exp(None) == IndexedOrderedDict() - assert parse_presort_exp(IndexedOrderedDict([('c', True)])) == IndexedOrderedDict([('c', True)]) - assert parse_presort_exp("c") == IndexedOrderedDict([('c', True)]) - assert parse_presort_exp(" c") == IndexedOrderedDict([('c', True)]) 
- assert parse_presort_exp("c desc") == IndexedOrderedDict([('c', False)]) - assert parse_presort_exp("b desc, c asc") == IndexedOrderedDict([('b', False), ('c', True)]) - assert parse_presort_exp("DESC DESC, ASC ASC") == IndexedOrderedDict([('DESC', False), ('ASC', True)]) - assert parse_presort_exp([("b", False),("c", True)]) == IndexedOrderedDict([('b', False), ('c', True)]) - assert parse_presort_exp("B DESC, C ASC") == IndexedOrderedDict([('B', False), ('C', True)]) - assert parse_presort_exp("b desc, c asc") == IndexedOrderedDict([('b', False), ('c', True)]) - + assert parse_presort_exp(IndexedOrderedDict([("c", True)])) == IndexedOrderedDict( + [("c", True)] + ) + assert parse_presort_exp("c") == IndexedOrderedDict([("c", True)]) + assert parse_presort_exp(" c") == IndexedOrderedDict([("c", True)]) + assert parse_presort_exp("c desc") == IndexedOrderedDict([("c", False)]) + assert parse_presort_exp("b desc, c asc") == IndexedOrderedDict( + [("b", False), ("c", True)] + ) + assert parse_presort_exp("DESC DESC, ASC ASC") == IndexedOrderedDict( + [("DESC", False), ("ASC", True)] + ) + assert parse_presort_exp([("b", False), ("c", True)]) == IndexedOrderedDict( + [("b", False), ("c", True)] + ) + assert parse_presort_exp("B DESC, C ASC") == IndexedOrderedDict( + [("B", False), ("C", True)] + ) + assert parse_presort_exp("b desc, c asc") == IndexedOrderedDict( + [("b", False), ("c", True)] + ) with raises(SyntaxError): - parse_presort_exp("b dsc, c asc") # mispelling of desc + parse_presort_exp("b dsc, c asc") # mispelling of desc with raises(SyntaxError): - parse_presort_exp("c true") # string format needs desc/asc + parse_presort_exp("c true") # string format needs desc/asc with raises(SyntaxError): - parse_presort_exp("c true, c true") # cannot contain duplicates + parse_presort_exp("c true, c true") # cannot contain duplicates with raises(SyntaxError): - parse_presort_exp([("b", "desc"),("c", "asc")]) # instead of desc and asc, needs to be bool + parse_presort_exp( + [("b", "desc"), ("c", "asc")] + ) # instead of desc and asc, needs to be bool def test_partition_spec(): @@ -86,6 +100,12 @@ def test_partition_spec(): assert PartitionSpec("per_row") == PartitionSpec(num="ROWCOUNT", algo="even") assert PartitionSpec(by="abc") == PartitionSpec(by=["abc"]) + assert PartitionSpec("abc") == PartitionSpec(by=["abc"]) + assert PartitionSpec(["abc"]) == PartitionSpec(by=["abc"]) + assert PartitionSpec(["abc", "def"]) == PartitionSpec(by=["abc", "def"]) + assert PartitionSpec(("abc", "def")) == PartitionSpec(by=["abc", "def"]) + + assert PartitionSpec(4) == PartitionSpec(num=4) # partition by overlaps with presort raises( @@ -105,7 +125,7 @@ def test_partition_spec(): raises(SyntaxError, lambda: PartitionSpec(partition_by=123)) # bad input - raises(TypeError, lambda: PartitionSpec(1)) + raises(TypeError, lambda: PartitionSpec(1.1)) # bad presort raises(SyntaxError, lambda: PartitionSpec(presort="a xsc,e desc"))
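
Taken together, the join helpers and the more permissive ``PartitionSpec`` constructor from the last two commits allow call sites like the following sketch (the sample frames are illustrative; the equivalences mirror the assertions in test_partition.py above):

    import pandas as pd
    import fugue.api as fa
    from fugue import PartitionSpec

    df1 = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    df2 = pd.DataFrame({"a": [1, 2], "c": [5, 6]})

    # equivalent to fa.join(df1, df2, how="inner"); the join key "a" is inferred
    joined = fa.inner_join(df1, df2)

    # shorthand constructors are equivalent to the keyword forms
    assert PartitionSpec(4) == PartitionSpec(num=4)
    assert PartitionSpec("a") == PartitionSpec(by=["a"])
    assert PartitionSpec(["a", "b"]) == PartitionSpec(by=["a", "b"])
    assert PartitionSpec("per_row") == PartitionSpec(num="ROWCOUNT", algo="even")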