diff --git a/.gitignore b/.gitignore index 2d295fc5..9568b79c 100644 --- a/.gitignore +++ b/.gitignore @@ -120,6 +120,7 @@ pythonenv* # mkdocs documentation /site +.virtual_documents # mypy .mypy_cache diff --git a/.pylintrc b/.pylintrc index bbe26ef6..9ee3ae91 100644 --- a/.pylintrc +++ b/.pylintrc @@ -1,3 +1,3 @@ [MESSAGES CONTROL] -disable = C0103,C0114,C0115,C0116,C0122,C0200,C0201,C0302,C0411,C0415,E0401,E0712,E1130,E5110,R0201,R0205,R0801,R0902,R0903,R0904,R0911,R0912,R0913,R0914,R0915,R1705,R1710,R1718,R1720,R1724,W0102,W0107,W0108,W0201,W0212,W0221,W0223,W0237,W0511,W0613,W0631,W0640,W0703,W0707,W1116 +disable = C0103,C0114,C0115,C0116,C0122,C0200,C0201,C0302,C0411,C0415,E0401,E0712,E1130,E5110,R0201,R0205,R0801,R0902,R0903,R0904,R0911,R0912,R0913,R0914,R0915,R1705,R1710,R1718,R1720,R1724,W0102,W0107,W0108,W0201,W0212,W0221,W0223,W0237,W0511,W0613,W0622,W0631,W0640,W0703,W0707,W1116 # TODO: R0205: inherits from object, can be safely removed diff --git a/RELEASE.md b/RELEASE.md index b47dd89c..2838bda1 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,8 +1,20 @@ # Release Notes -## 0.7.4 - -- [340](https://github.com/fugue-project/fugue/issues/340) Migrate to plugin mode (DataFrames & Extensions) +## 0.8.0 + +- [384](https://github.com/fugue-project/fugue/issues/384) Expanding Fugue API +- [396](https://github.com/fugue-project/fugue/issues/396) Ray/Dask engines guess optimal default partitions +- [403](https://github.com/fugue-project/fugue/issues/403) Deprecate register_raw_df_type +- [392](https://github.com/fugue-project/fugue/issues/392) Aggregations on Spark dataframes fail intermittently +- [398](https://github.com/fugue-project/fugue/issues/398) Rework API Docs and Favicon +- [393](https://github.com/fugue-project/fugue/issues/393) ExecutionEngine as_context +- [385](https://github.com/fugue-project/fugue/issues/385) Remove DataFrame metadata +- [381](https://github.com/fugue-project/fugue/issues/381) Change SparkExecutionEngine to use pandas udf by default +- [380](https://github.com/fugue-project/fugue/issues/380) Refactor ExecutionEngine (Separate out MapEngine) +- [378](https://github.com/fugue-project/fugue/issues/378) Refactor DataFrame show +- [377](https://github.com/fugue-project/fugue/issues/377) Create bag +- [372](https://github.com/fugue-project/fugue/issues/372) Infer execution engine from input +- [340](https://github.com/fugue-project/fugue/issues/340) Migrate to plugin mode - [369](https://github.com/fugue-project/fugue/issues/369) Remove execution from FugueWorkflow context manager, remove engine from FugueWorkflow - [373](https://github.com/fugue-project/fugue/issues/373) Fixed Spark engine rename slowness when there are a lot of columns diff --git a/docs/api/fugue.dataframe.rst b/docs/api/fugue.dataframe.rst index 67e2a87b..99b67137 100644 --- a/docs/api/fugue.dataframe.rst +++ b/docs/api/fugue.dataframe.rst @@ -27,6 +27,14 @@ fugue.dataframe .. |FugueDataTypes| replace:: :doc:`Fugue Data Types ` +fugue.dataframe.api +------------------- + +.. automodule:: fugue.dataframe.api + :members: + :undoc-members: + :show-inheritance: + fugue.dataframe.array\_dataframe -------------------------------- diff --git a/docs/api/fugue.dataset.rst b/docs/api/fugue.dataset.rst new file mode 100644 index 00000000..8c4ec9a1 --- /dev/null +++ b/docs/api/fugue.dataset.rst @@ -0,0 +1,45 @@ +fugue.dataset +============== + +.. |SchemaLikeObject| replace:: :ref:`Schema like object ` +.. |ParamsLikeObject| replace:: :ref:`Parameters like object ` +.. 
|DataFrameLikeObject| replace:: :ref:`DataFrame like object ` +.. |DataFramesLikeObject| replace:: :ref:`DataFrames like object ` +.. |PartitionLikeObject| replace:: :ref:`Partition like object ` +.. |RPCHandlerLikeObject| replace:: :ref:`RPChandler like object ` + +.. |ExecutionEngine| replace:: :class:`~fugue.execution.execution_engine.ExecutionEngine` +.. |NativeExecutionEngine| replace:: :class:`~fugue.execution.native_execution_engine.NativeExecutionEngine` +.. |FugueWorkflow| replace:: :class:`~fugue.workflow.workflow.FugueWorkflow` + +.. |ReadJoin| replace:: Read Join tutorials on :ref:`workflow ` and :ref:`engine ` for details +.. |FugueConfig| replace:: :doc:`the Fugue Configuration Tutorial ` +.. |PartitionTutorial| replace:: :doc:`the Partition Tutorial ` +.. |FugueSQLTutorial| replace:: :doc:`the Fugue SQL Tutorial ` +.. |DataFrameTutorial| replace:: :ref:`the DataFrame Tutorial ` +.. |ExecutionEngineTutorial| replace:: :doc:`the ExecutionEngine Tutorial ` +.. |ZipComap| replace:: :ref:`Zip & Comap ` +.. |LoadSave| replace:: :ref:`Load & Save ` +.. |AutoPersist| replace:: :ref:`Auto Persist ` +.. |TransformerTutorial| replace:: :doc:`the Transformer Tutorial ` +.. |CoTransformer| replace:: :ref:`CoTransformer ` +.. |CoTransformerTutorial| replace:: :doc:`the CoTransformer Tutorial ` +.. |FugueDataTypes| replace:: :doc:`Fugue Data Types ` + + +fugue.dataset.api +----------------- + +.. automodule:: fugue.dataset.api + :members: + :undoc-members: + :show-inheritance: + +fugue.dataset.dataset +--------------------- + +.. automodule:: fugue.dataset.dataset + :members: + :undoc-members: + :show-inheritance: + diff --git a/docs/api/fugue.execution.rst b/docs/api/fugue.execution.rst index 6000ace3..d09e526b 100644 --- a/docs/api/fugue.execution.rst +++ b/docs/api/fugue.execution.rst @@ -27,6 +27,14 @@ fugue.execution .. |FugueDataTypes| replace:: :doc:`Fugue Data Types ` +fugue.execution.api +------------------- + +.. automodule:: fugue.execution.api + :members: + :undoc-members: + :show-inheritance: + fugue.execution.execution\_engine --------------------------------- diff --git a/docs/api/fugue.rst b/docs/api/fugue.rst index c22e96c8..c3364baf 100644 --- a/docs/api/fugue.rst +++ b/docs/api/fugue.rst @@ -8,6 +8,7 @@ fugue fugue.collections fugue.column fugue.dataframe + fugue.dataset fugue.execution fugue.extensions fugue.rpc @@ -40,18 +41,18 @@ fugue .. |FugueDataTypes| replace:: :doc:`Fugue Data Types ` -fugue.constants ---------------- +fugue.api +--------- -.. automodule:: fugue.constants +.. automodule:: fugue.api :members: :undoc-members: :show-inheritance: -fugue.dataset -------------- +fugue.constants +--------------- -.. automodule:: fugue.dataset +.. automodule:: fugue.constants :members: :undoc-members: :show-inheritance: @@ -64,10 +65,10 @@ fugue.exceptions :undoc-members: :show-inheritance: -fugue.interfaceless -------------------- +fugue.plugins +------------- -.. automodule:: fugue.interfaceless +.. automodule:: fugue.plugins :members: :undoc-members: :show-inheritance: diff --git a/docs/api/fugue.sql.rst b/docs/api/fugue.sql.rst index 209ed804..320724bf 100644 --- a/docs/api/fugue.sql.rst +++ b/docs/api/fugue.sql.rst @@ -27,6 +27,14 @@ fugue.sql .. |FugueDataTypes| replace:: :doc:`Fugue Data Types ` +fugue.sql.api +------------- + +.. 
automodule:: fugue.sql.api + :members: + :undoc-members: + :show-inheritance: + fugue.sql.workflow ------------------ diff --git a/docs/api/fugue.workflow.rst b/docs/api/fugue.workflow.rst index 2c85cef5..5b97d8d1 100644 --- a/docs/api/fugue.workflow.rst +++ b/docs/api/fugue.workflow.rst @@ -27,6 +27,14 @@ fugue.workflow .. |FugueDataTypes| replace:: :doc:`Fugue Data Types ` +fugue.workflow.api +------------------ + +.. automodule:: fugue.workflow.api + :members: + :undoc-members: + :show-inheritance: + fugue.workflow.input -------------------- diff --git a/docs/index.rst b/docs/index.rst index b275ddf2..ad4ed210 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -33,5 +33,7 @@ For contributing, start with the `contributing guide ` +.. |ParamsLikeObject| replace:: :ref:`Parameters like object ` +.. |DataFrameLikeObject| replace:: :ref:`DataFrame like object ` +.. |DataFramesLikeObject| replace:: :ref:`DataFrames like object ` +.. |PartitionLikeObject| replace:: :ref:`Partition like object ` +.. |RPCHandlerLikeObject| replace:: :ref:`RPChandler like object ` + +.. |ExecutionEngine| replace:: :class:`~fugue.execution.execution_engine.ExecutionEngine` +.. |NativeExecutionEngine| replace:: :class:`~fugue.execution.native_execution_engine.NativeExecutionEngine` +.. |FugueWorkflow| replace:: :class:`~fugue.workflow.workflow.FugueWorkflow` + +.. |ReadJoin| replace:: Read Join tutorials on :ref:`workflow ` and :ref:`engine ` for details +.. |FugueConfig| replace:: :doc:`the Fugue Configuration Tutorial ` +.. |PartitionTutorial| replace:: :doc:`the Partition Tutorial ` +.. |FugueSQLTutorial| replace:: :doc:`the Fugue SQL Tutorial ` +.. |DataFrameTutorial| replace:: :ref:`the DataFrame Tutorial ` +.. |ExecutionEngineTutorial| replace:: :doc:`the ExecutionEngine Tutorial ` +.. |ZipComap| replace:: :ref:`Zip & Comap ` +.. |LoadSave| replace:: :ref:`Load & Save ` +.. |AutoPersist| replace:: :ref:`Auto Persist ` +.. |TransformerTutorial| replace:: :doc:`the Transformer Tutorial ` +.. |CoTransformer| replace:: :ref:`CoTransformer ` +.. |CoTransformerTutorial| replace:: :doc:`the CoTransformer Tutorial ` +.. |FugueDataTypes| replace:: :doc:`Fugue Data Types ` + +IO +~~ + +.. autofunction:: fugue.api.as_fugue_dataset + +.. autofunction:: fugue.api.as_fugue_df +.. autofunction:: fugue.api.load +.. autofunction:: fugue.api.save + + + +Information +~~~~~~~~~~~ + +.. autofunction:: fugue.api.count +.. autofunction:: fugue.api.is_bounded +.. autofunction:: fugue.api.is_empty +.. autofunction:: fugue.api.is_local +.. autofunction:: fugue.api.show + +.. autofunction:: fugue.api.get_column_names +.. autofunction:: fugue.api.get_num_partitions +.. autofunction:: fugue.api.get_schema +.. autofunction:: fugue.api.is_df +.. autofunction:: fugue.api.peek_array +.. autofunction:: fugue.api.peek_dict + + +Transformation +~~~~~~~~~~~~~~ + +.. autofunction:: fugue.api.transform +.. autofunction:: fugue.api.out_transform + +.. autofunction:: fugue.api.alter_columns +.. autofunction:: fugue.api.drop_columns +.. autofunction:: fugue.api.head +.. autofunction:: fugue.api.normalize_column_names +.. autofunction:: fugue.api.rename +.. autofunction:: fugue.api.select_columns + +.. autofunction:: fugue.api.distinct +.. autofunction:: fugue.api.dropna +.. autofunction:: fugue.api.fillna +.. autofunction:: fugue.api.sample +.. autofunction:: fugue.api.take + +SQL +~~~ + +.. autofunction:: fugue.api.fugue_sql +.. autofunction:: fugue.api.fugue_sql_flow +.. autofunction:: fugue.api.raw_sql + +.. autofunction:: fugue.api.join +.. 
autofunction:: fugue.api.semi_join +.. autofunction:: fugue.api.anti_join +.. autofunction:: fugue.api.inner_join +.. autofunction:: fugue.api.left_outer_join +.. autofunction:: fugue.api.right_outer_join +.. autofunction:: fugue.api.full_outer_join +.. autofunction:: fugue.api.cross_join + +.. autofunction:: fugue.api.union +.. autofunction:: fugue.api.intersect +.. autofunction:: fugue.api.subtract + +.. autofunction:: fugue.api.assign +.. autofunction:: fugue.api.select +.. autofunction:: fugue.api.filter +.. autofunction:: fugue.api.aggregate + +Conversion +~~~~~~~~~~ + +.. autofunction:: fugue.api.as_local +.. autofunction:: fugue.api.as_local_bounded +.. autofunction:: fugue.api.as_array +.. autofunction:: fugue.api.as_array_iterable +.. autofunction:: fugue.api.as_arrow +.. autofunction:: fugue.api.as_dict_iterable +.. autofunction:: fugue.api.as_pandas +.. autofunction:: fugue.api.get_native_as_df + +ExecutionEngine +~~~~~~~~~~~~~~~ + +.. autofunction:: fugue.api.engine_context +.. autofunction:: fugue.api.set_global_engine +.. autofunction:: fugue.api.clear_global_engine +.. autofunction:: fugue.api.get_current_engine +.. autofunction:: fugue.api.get_current_parallelism + + +Big Data Operations +~~~~~~~~~~~~~~~~~~~ +.. autofunction:: fugue.api.broadcast +.. autofunction:: fugue.api.persist +.. autofunction:: fugue.api.repartition + + +Development +~~~~~~~~~~~ + +.. autofunction:: fugue.api.run_engine_function + + + + + diff --git a/fugue/__init__.py b/fugue/__init__.py index 57abba71..9d10aab1 100644 --- a/fugue/__init__.py +++ b/fugue/__init__.py @@ -2,6 +2,7 @@ from triad.collections import Schema from triad.collections.fs import FileSystem +from fugue.api import out_transform, transform from fugue.bag.array_bag import ArrayBag from fugue.bag.bag import Bag, BagDisplay from fugue.collections.partition import PartitionCursor, PartitionSpec @@ -10,6 +11,7 @@ from fugue.dataframe.array_dataframe import ArrayDataFrame from fugue.dataframe.arrow_dataframe import ArrowDataFrame from fugue.dataframe.dataframe import ( + AnyDataFrame, DataFrame, DataFrameDisplay, LocalBoundedDataFrame, @@ -20,15 +22,24 @@ from fugue.dataframe.iterable_dataframe import IterableDataFrame from fugue.dataframe.pandas_dataframe import PandasDataFrame from fugue.dataframe.utils import to_local_bounded_df, to_local_df -from fugue.dataset import Dataset, DatasetDisplay, get_dataset_display -from fugue.execution.execution_engine import ExecutionEngine, MapEngine, SQLEngine +from fugue.dataset import ( + AnyDataset, + Dataset, + DatasetDisplay, + as_fugue_dataset, + get_dataset_display, +) +from fugue.execution.execution_engine import ( + AnyExecutionEngine, + ExecutionEngine, + ExecutionEngineFacet, + MapEngine, + SQLEngine, +) from fugue.execution.factory import ( - infer_execution_engine, is_pandas_or, make_execution_engine, make_sql_engine, - parse_execution_engine, - parse_sql_engine, register_default_execution_engine, register_default_sql_engine, register_execution_engine, @@ -40,19 +51,9 @@ QPDPandasEngine, SqliteEngine, ) -from fugue.extensions.creator import Creator, creator, parse_creator, register_creator -from fugue.extensions.outputter import ( - Outputter, - outputter, - parse_outputter, - register_outputter, -) -from fugue.extensions.processor import ( - Processor, - parse_processor, - processor, - register_processor, -) +from fugue.extensions.creator import Creator, creator, register_creator +from fugue.extensions.outputter import Outputter, outputter, register_outputter +from fugue.extensions.processor
import Processor, processor, register_processor from fugue.extensions.transformer import ( CoTransformer, OutputCoTransformer, @@ -61,13 +62,10 @@ cotransformer, output_cotransformer, output_transformer, - parse_output_transformer, - parse_transformer, register_output_transformer, register_transformer, transformer, ) -from fugue.interfaceless import out_transform, transform from fugue.registry import _register from fugue.rpc import ( EmptyRPCHandler, @@ -78,7 +76,8 @@ make_rpc_server, to_rpc_handler, ) -from fugue.sql.workflow import FugueSQLWorkflow, fsql +from fugue.sql.api import fugue_sql_flow as fsql +from fugue.sql.workflow import FugueSQLWorkflow from fugue.workflow._workflow_context import FugueWorkflowContext from fugue.workflow.module import module from fugue.workflow.workflow import FugueWorkflow, WorkflowDataFrame, WorkflowDataFrames diff --git a/fugue/_utils/sql.py b/fugue/_utils/sql.py new file mode 100644 index 00000000..efeff1a6 --- /dev/null +++ b/fugue/_utils/sql.py @@ -0,0 +1,32 @@ +from typing import Iterable, Tuple +from uuid import uuid4 + + +class TempTableName: + def __init__(self): + self.key = "_" + str(uuid4())[:5] + + def __repr__(self) -> str: + return f"<tmpdf:{self.key}>" + + +def get_temp_tb_name() -> TempTableName: + return TempTableName() + + +def parse_sql( + sql: str, prefix: str = "<tmpdf:", suffix: str = ">" +) -> Iterable[Tuple[bool, str]]: + p = 0 + while p < len(sql): + b = sql.find(prefix, p) + if b >= 0: + if b > p: + yield (False, sql[p:b]) + b += len(prefix) + e = sql.find(suffix, b) + yield (True, sql[b:e]) + p = e + len(suffix) + else: + yield (False, sql[p:]) + return diff --git a/fugue/api.py b/fugue/api.py new file mode 100644 index 00000000..4c96cf42 --- /dev/null +++ b/fugue/api.py @@ -0,0 +1,68 @@ +# flake8: noqa +# pylint: disable-all +from .dataframe.api import ( + alter_columns, + as_array, + as_array_iterable, + as_arrow, + as_dict_iterable, + as_fugue_df, + as_pandas, + drop_columns, + get_column_names, + get_native_as_df, + get_schema, + head, + is_df, + normalize_column_names, + peek_array, + peek_dict, + rename, + select_columns, +) +from .dataset.api import ( + as_fugue_dataset, + as_local, + as_local_bounded, + count, + get_num_partitions, + is_bounded, + is_empty, + is_local, + show, +) +from .execution.api import ( + aggregate, + anti_join, + assign, + broadcast, + clear_global_engine, + cross_join, + distinct, + dropna, + engine_context, + fillna, + filter, + full_outer_join, + get_current_engine, + get_current_parallelism, + inner_join, + intersect, + join, + left_outer_join, + load, + persist, + repartition, + right_outer_join, + run_engine_function, + sample, + save, + select, + semi_join, + set_global_engine, + subtract, + take, + union, +) +from .sql.api import fugue_sql, fugue_sql_flow +from .workflow.api import out_transform, raw_sql, transform diff --git a/fugue/collections/partition.py b/fugue/collections/partition.py index acdc02fd..036ad99e 100644 --- a/fugue/collections/partition.py +++ b/fugue/collections/partition.py @@ -84,13 +84,16 @@ class PartitionSpec(object): ..
admonition:: Examples >>> PartitionSepc(num=4) + >>> PartitionSepc(4) # == PartitionSepc(num=4) >>> PartitionSepc(num="ROWCOUNT/4 + 3") # It can be an expression >>> PartitionSepc(by=["a","b"]) + >>> PartitionSepc(["a","b"]) # == PartitionSepc(by=["a","b"]) >>> PartitionSpec(by=["a"], presort="b DESC, c ASC") >>> PartitionSpec(algo="even", num=4) >>> p = PartitionSpec(num=4, by=["a"]) >>> p_override = PartitionSpec(p, by=["a","b"], algo="even") >>> PartitionSpec(by="a") # == PartitionSpec(by=["a"]) + >>> PartitionSpec("a") # == PartitionSpec(by=["a"]) >>> PartitionSpec("per_row") # == PartitionSpec(num="ROWCOUNT", algo="even") It's important to understand this concept, please read |PartitionTutorial| @@ -109,15 +112,18 @@ class PartitionSpec(object): def __init__(self, *args: Any, **kwargs: Any): # noqa: C901 p = ParamDict() - if ( - len(args) == 1 - and len(kwargs) == 0 - and isinstance(args[0], str) - and args[0].lower() == "per_row" - ): - p["algo"] = "even" - p["num_partitions"] = "ROWCOUNT" - else: + if len(args) == 1 and len(kwargs) == 0: + if isinstance(args[0], str): + if args[0].lower() == "per_row": + p["algo"] = "even" + p["num_partitions"] = "ROWCOUNT" + elif not args[0].startswith("{"): + p["partition_by"] = [args[0]] + elif isinstance(args[0], int): + p["num_partitions"] = str(args[0]) + elif isinstance(args[0], (list, tuple)): + p["partition_by"] = args[0] + if len(p) == 0: # the first condition had no match for a in args: if a is None: continue @@ -323,9 +329,6 @@ def _update_dict(self, d: Dict[str, Any], u: Dict[str, Any]) -> None: d[k] = v -EMPTY_PARTITION_SPEC = PartitionSpec() - - class DatasetPartitionCursor: """The cursor pointing at the first item of each logical partition inside a physical partition. diff --git a/fugue/column/sql.py b/fugue/column/sql.py index 76960cd9..cb843f1c 100644 --- a/fugue/column/sql.py +++ b/fugue/column/sql.py @@ -1,4 +1,4 @@ -from typing import Any, Callable, Dict, Iterable, List, Optional, Set +from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple import pyarrow as pa from fugue.column.expressions import ( @@ -238,7 +238,7 @@ def __init__(self, enable_cast: bool = True): self._enable_cast = enable_cast self._func_handler: Dict[str, Callable[[_FuncExpr], Iterable[str]]] = {} - def where(self, condition: ColumnExpr, table: str) -> str: + def where(self, condition: ColumnExpr, table: str) -> Iterable[Tuple[bool, str]]: """Generate a ``SELECT *`` statement with the given where clause :param condition: column expression for ``WHERE`` @@ -261,7 +261,9 @@ def where(self, condition: ColumnExpr, table: str) -> str: lambda: ValueError(f"{condition} has aggregation functions"), ) cond = self.generate(condition.alias("")) - return f"SELECT * FROM {table} WHERE {cond}" + yield (False, "SELECT * FROM") + yield (True, table) + yield (False, f"WHERE {cond}") def select( self, @@ -269,7 +271,7 @@ def select( table: str, where: Optional[ColumnExpr] = None, having: Optional[ColumnExpr] = None, - ) -> str: + ) -> Iterable[Tuple[bool, str]]: """Construct the full ``SELECT`` statement on a single table :param columns: columns to select, it may contain aggregations, if @@ -290,30 +292,39 @@ def _where() -> str: not is_agg(where), lambda: ValueError(f"{where} has aggregation functions"), ) - return " WHERE " + self.generate(where.alias("")) + return "WHERE " + self.generate(where.alias("")) def _having(as_where: bool = False) -> str: if having is None: return "" - pre = " WHERE " if as_where else " HAVING " + pre = "WHERE " if as_where 
else "HAVING " return pre + self.generate(having.alias("")) distinct = "" if not columns.is_distinct else "DISTINCT " if not columns.has_agg: expr = ", ".join(self.generate(x) for x in columns.all_cols) - return f"SELECT {distinct}{expr} FROM {table}{_where()}" + yield (False, f"SELECT {distinct}{expr} FROM") + yield (True, table) + yield (False, _where()) + return columns.assert_no_wildcard() if len(columns.literals) == 0: expr = ", ".join(self.generate(x) for x in columns.all_cols) if len(columns.group_keys) == 0: - return f"SELECT {distinct}{expr} FROM {table}{_where()}{_having()}" + yield (False, f"SELECT {distinct}{expr} FROM") + yield (True, table) + yield (False, _where()) + yield (False, _having()) + return else: keys = ", ".join(self.generate(x) for x in columns.group_keys) - return ( - f"SELECT {distinct}{expr} FROM " - f"{table}{_where()} GROUP BY {keys}{_having()}" - ) + yield (False, f"SELECT {distinct}{expr} FROM") + yield (True, table) + yield (False, _where()) + yield (False, f"GROUP BY {keys}") + yield (False, _having()) + return else: no_lit = [ x for x in columns.all_cols if not isinstance(x, _LiteralColumnExpr) @@ -324,7 +335,9 @@ def _having(as_where: bool = False) -> str: for x in columns.all_cols ] expr = ", ".join(names) - return f"SELECT {expr} FROM ({sub})" + yield (False, f"SELECT {expr} FROM (") + yield from sub + yield (False, ")") def generate(self, expr: ColumnExpr) -> str: """Convert :class:`~fugue.column.expressions.ColumnExpr` to diff --git a/fugue/constants.py b/fugue/constants.py index eae62ac8..cc9f6a51 100644 --- a/fugue/constants.py +++ b/fugue/constants.py @@ -13,6 +13,7 @@ FUGUE_CONF_WORKFLOW_EXCEPTION_INJECT = "fugue.workflow.exception.inject" FUGUE_CONF_WORKFLOW_EXCEPTION_OPTIMIZE = "fugue.workflow.exception.optimize" FUGUE_CONF_SQL_IGNORE_CASE = "fugue.sql.compile.ignore_case" +FUGUE_CONF_DEFAULT_PARTITIONS = "fugue.default.partitions" FUGUE_COMPILE_TIME_CONFIGS = set( [ @@ -35,6 +36,7 @@ FUGUE_CONF_WORKFLOW_EXCEPTION_INJECT: 3, FUGUE_CONF_WORKFLOW_EXCEPTION_OPTIMIZE: True, FUGUE_CONF_SQL_IGNORE_CASE: False, + FUGUE_CONF_DEFAULT_PARTITIONS: -1, } ) diff --git a/fugue/dataframe/__init__.py b/fugue/dataframe/__init__.py index d8eadb34..d47a1580 100644 --- a/fugue/dataframe/__init__.py +++ b/fugue/dataframe/__init__.py @@ -1,7 +1,9 @@ # flake8: noqa +from .api import * from .array_dataframe import ArrayDataFrame from .arrow_dataframe import ArrowDataFrame from .dataframe import ( + AnyDataFrame, DataFrame, LocalBoundedDataFrame, LocalDataFrame, @@ -12,9 +14,9 @@ from .iterable_dataframe import IterableDataFrame from .pandas_dataframe import PandasDataFrame from .utils import ( - get_dataframe_column_names, + get_column_names, normalize_dataframe_column_names, - rename_dataframe_column_names, + rename, to_local_bounded_df, to_local_df, ) diff --git a/fugue/dataframe/api.py b/fugue/dataframe/api.py new file mode 100644 index 00000000..265af619 --- /dev/null +++ b/fugue/dataframe/api.py @@ -0,0 +1,304 @@ +from typing import Any, Dict, Iterable, List, Optional, Tuple + +import pandas as pd +import pyarrow as pa +from triad.collections.schema import Schema +from triad.utils.rename import normalize_names + +from .._utils.registry import fugue_plugin +from .dataframe import AnyDataFrame, DataFrame, as_fugue_df + + +@fugue_plugin +def is_df(df: Any) -> bool: + """Whether ``df`` is a DataFrame like object""" + return isinstance(df, DataFrame) + + +def get_native_as_df(df: AnyDataFrame) -> AnyDataFrame: + """Return the dataframe form of the input ``df``. 
+ If ``df`` is a :class:`~.DataFrame`, then call the + :meth:`~.DataFrame.native_as_df`, otherwise, it depends on whether there is + a correspondent function handling it. + """ + if isinstance(df, DataFrame): + return df.native_as_df() + if is_df(df): + return df + raise NotImplementedError(f"cannot get a dataframe like object from {type(df)}") + + +@fugue_plugin +def get_schema(df: AnyDataFrame) -> Schema: + """Get the schema of the ``df`` + + :param df: the object that can be recognized as a dataframe by Fugue + :return: the Schema object + """ + return as_fugue_df(df).schema + + +@fugue_plugin +def as_pandas(df: AnyDataFrame) -> pd.DataFrame: + """Convert ``df`` to a Pandas DataFrame + + :param df: the object that can be recognized as a dataframe by Fugue + :return: the Pandas DataFrame + """ + return as_fugue_df(df).as_pandas() + + +@fugue_plugin +def as_arrow(df: AnyDataFrame) -> pa.Table: + """Convert ``df`` to a PyArrow Table + + :param df: the object that can be recognized as a dataframe by Fugue + :return: the PyArrow Table + """ + return as_fugue_df(df).as_arrow() + + +@fugue_plugin +def as_array( + df: AnyDataFrame, columns: Optional[List[str]] = None, type_safe: bool = False +) -> List[Any]: # pragma: no cover + """Convert df to 2-dimensional native python array + + :param df: the object that can be recognized as a dataframe by Fugue + :param columns: columns to extract, defaults to None + :param type_safe: whether to ensure output conforms with its schema, + defaults to False + :return: 2-dimensional native python array + + .. note:: + + If ``type_safe`` is False, then the returned values are 'raw' values. + """ + return as_fugue_df(df).as_array(columns=columns, type_safe=type_safe) + + +@fugue_plugin +def as_array_iterable( + df: AnyDataFrame, columns: Optional[List[str]] = None, type_safe: bool = False +) -> Iterable[Any]: # pragma: no cover + """Convert df to iterable of native python arrays + + :param df: the object that can be recognized as a dataframe by Fugue + :param columns: columns to extract, defaults to None + :param type_safe: whether to ensure output conforms with its schema, + defaults to False + :return: iterable of native python arrays + + .. note:: + + If ``type_safe`` is False, then the returned values are 'raw' values. + """ + + return as_fugue_df(df).as_array_iterable(columns=columns, type_safe=type_safe) + + +@fugue_plugin +def as_dict_iterable( + df: AnyDataFrame, columns: Optional[List[str]] = None +) -> Iterable[Dict[str, Any]]: + """Convert df to iterable of native python dicts + + :param df: the object that can be recognized as a dataframe by Fugue + :param columns: columns to extract, defaults to None + :return: iterable of native python dicts + + .. 
note:: + + The default implementation enforces ``type_safe`` True + """ + return as_fugue_df(df).as_dict_iterable(columns=columns) + + +@fugue_plugin +def peek_array(df: AnyDataFrame) -> List[Any]: + """Peek the first row of the dataframe as an array + + :param df: the object that can be recognized as a dataframe by Fugue + :return: the first row as an array + """ + return as_fugue_df(df).peek_array() + + +@fugue_plugin +def peek_dict(df: AnyDataFrame) -> Dict[str, Any]: + """Peek the first row of the dataframe as a dict + + :param df: the object that can be recognized as a dataframe by Fugue + :return: the first row as a dict + """ + return as_fugue_df(df).peek_dict() + + +@fugue_plugin +def head( + df: AnyDataFrame, + n: int, + columns: Optional[List[str]] = None, + as_fugue: bool = False, +) -> AnyDataFrame: + """Get first n rows of the dataframe as a new local bounded dataframe + + :param n: number of rows + :param columns: selected columns, defaults to None (all columns) + :param as_fugue: whether return a Fugue :class:`~.DataFrame`, default to + False. If False, then if the input ``df`` is not a Fugue DataFrame + then it will return the underlying DataFrame object. + :return: a local bounded dataframe + """ + res = as_fugue_df(df).head(n=n, columns=columns) + if as_fugue or isinstance(df, DataFrame): + return res + return res.native_as_df() + + +@fugue_plugin +def alter_columns( + df: AnyDataFrame, columns: Any, as_fugue: bool = False +) -> AnyDataFrame: + """Change column types + + :param df: the object that can be recognized as a dataframe by Fugue + :param columns: |SchemaLikeObject|, + all columns should be contained by the dataframe schema + :param as_fugue: whether return a Fugue :class:`~.DataFrame`, default to + False. If False, then if the input ``df`` is not a Fugue DataFrame + then it will return the underlying DataFrame object. + :return: a new dataframe with altered columns, the order of the + original schema will not change + """ + return _convert_df(df, as_fugue_df(df).alter_columns(columns), as_fugue=as_fugue) + + +@fugue_plugin +def drop_columns( + df: AnyDataFrame, columns: List[str], as_fugue: bool = False +) -> AnyDataFrame: + """Drop certain columns and return a new dataframe + + :param df: the object that can be recognized as a dataframe by Fugue + :param columns: columns to drop + :param as_fugue: whether return a Fugue :class:`~.DataFrame`, default to + False. If False, then if the input ``df`` is not a Fugue DataFrame + then it will return the underlying DataFrame object. + :return: a new dataframe removing the columns + """ + return _convert_df(df, as_fugue_df(df).drop(columns), as_fugue=as_fugue) + + +@fugue_plugin +def select_columns( + df: AnyDataFrame, columns: List[Any], as_fugue: bool = False +) -> AnyDataFrame: + """Select certain columns and return a new dataframe + + :param df: the object that can be recognized as a dataframe by Fugue + :param columns: columns to return + :param as_fugue: whether return a Fugue :class:`~.DataFrame`, default to + False. If False, then if the input ``df`` is not a Fugue DataFrame + then it will return the underlying DataFrame object. + :return: a new dataframe with the selected columns + """ + return _convert_df(df, as_fugue_df(df)[columns], as_fugue=as_fugue) + + +@fugue_plugin +def get_column_names(df: AnyDataFrame) -> List[Any]: # pragma: no cover + """A generic function to get column names of any dataframe + + :param df: the dataframe object + :return: the column names + + ..
note:: + + In order to support a new type of dataframe, an implementation must + be registered, for example + + .. code-block::python + + @get_column_names.candidate(lambda df: isinstance(df, pa.Table)) + def _get_pyarrow_dataframe_columns(df: pa.Table) -> List[Any]: + return [f.name for f in df.schema] + """ + return get_schema(df).names + + +@fugue_plugin +def rename( + df: AnyDataFrame, columns: Dict[str, Any], as_fugue: bool = False +) -> AnyDataFrame: + """A generic function to rename column names of any dataframe + + :param df: the dataframe object + :param columns: the rename operations as a dict: ``old name => new name`` + :param as_fugue: whether return a Fugue :class:`~.DataFrame`, default to + False. If False, then if the input ``df`` is not a Fugue DataFrame + then it will return the underlying DataFrame object. + :return: the renamed dataframe + + .. note:: + + In order to support a new type of dataframe, an implementation must + be registered, for example + + .. code-block::python + + @rename.candidate( + lambda df, *args, **kwargs: isinstance(df, pd.DataFrame) + ) + def _rename_pandas_dataframe( + df: pd.DataFrame, columns: Dict[str, Any] + ) -> pd.DataFrame: + if len(columns) == 0: + return df + return df.rename(columns=columns) + """ + if len(columns) == 0: + return df + return _convert_df(df, as_fugue_df(df).rename(columns), as_fugue=as_fugue) + + +def normalize_column_names(df: AnyDataFrame) -> Tuple[AnyDataFrame, Dict[str, Any]]: + """A generic function to normalize any dataframe's column names to follow + Fugue naming rules + + .. note:: + + This is a temporary solution before + :class:`~triad:triad.collections.schema.Schema` + can take arbitrary names + + .. admonition:: Examples + + * ``[0,1]`` => ``{"_0":0, "_1":1}`` + * ``["1a","2b"]`` => ``{"_1a":"1a", "_2b":"2b"}`` + * ``["*a","-a"]`` => ``{"_a":"*a", "_a_1":"-a"}`` + + :param df: a dataframe object + :return: the renamed dataframe and the rename operations as a dict that + can **undo** the change + + .. 
seealso:: + + * :func:`~.get_column_names` + * :func:`~.rename` + * :func:`~triad:triad.utils.rename.normalize_names` + """ + cols = get_column_names(df) + names = normalize_names(cols) + if len(names) == 0: + return df, {} + undo = {v: k for k, v in names.items()} + return (rename(df, names), undo) + + +def _convert_df( + input_df: AnyDataFrame, output_df: DataFrame, as_fugue: bool +) -> AnyDataFrame: + if as_fugue or isinstance(input_df, DataFrame): + return output_df + return output_df.native_as_df() diff --git a/fugue/dataframe/array_dataframe.py b/fugue/dataframe/array_dataframe.py index 5e89b63a..5fc2727b 100644 --- a/fugue/dataframe/array_dataframe.py +++ b/fugue/dataframe/array_dataframe.py @@ -4,6 +4,7 @@ DataFrame, LocalBoundedDataFrame, _get_schema_change, + as_fugue_dataset, ) from fugue.exceptions import FugueDataFrameOperationError from triad.utils.assertion import assert_or_throw @@ -51,7 +52,7 @@ def native(self) -> List[Any]: def empty(self) -> bool: return self.count() == 0 - def peek_array(self) -> Any: + def peek_array(self) -> List[Any]: self.assert_not_empty() return list(self.native[0]) @@ -120,3 +121,8 @@ def _iter_cols(self, pos: List[int]) -> Iterable[List[Any]]: else: for row in self.native: yield [row[p] for p in pos] + + +@as_fugue_dataset.candidate(lambda df, **kwargs: isinstance(df, list), priority=0.9) +def _arr_to_fugue(df: List[Any], **kwargs: Any) -> ArrayDataFrame: + return ArrayDataFrame(df, **kwargs) diff --git a/fugue/dataframe/arrow_dataframe.py b/fugue/dataframe/arrow_dataframe.py index 8ef1219b..56def554 100644 --- a/fugue/dataframe/arrow_dataframe.py +++ b/fugue/dataframe/arrow_dataframe.py @@ -2,12 +2,32 @@ import pandas as pd import pyarrow as pa -from fugue.dataframe.dataframe import DataFrame, LocalBoundedDataFrame, _input_schema -from fugue.exceptions import FugueDataFrameOperationError from triad.collections.schema import Schema from triad.exceptions import InvalidOperationError from triad.utils.assertion import assert_or_throw +from fugue.dataset.api import ( + as_fugue_dataset, + as_local, + as_local_bounded, + count, + get_num_partitions, + is_bounded, + is_empty, + is_local, +) +from fugue.exceptions import FugueDataFrameOperationError + +from .api import ( + drop_columns, + get_column_names, + get_schema, + is_df, + rename, + select_columns, +) +from .dataframe import DataFrame, LocalBoundedDataFrame, _input_schema + class ArrowDataFrame(LocalBoundedDataFrame): """DataFrame that wraps :func:`pyarrow.Table `. 
Please also read @@ -101,11 +121,14 @@ def native(self) -> pa.Table: """:func:`pyarrow.Table `""" return self._native + def native_as_df(self) -> pa.Table: + return self._native + @property def empty(self) -> bool: return self.count() == 0 - def peek_array(self) -> Any: + def peek_array(self) -> List[Any]: self.assert_not_empty() data = self.native.take([0]).to_pydict() return [v[0] for v in data.values()] @@ -218,8 +241,95 @@ def as_array_iterable( yield list(arr) +@as_local.candidate(lambda df: isinstance(df, pa.Table)) +def _pa_table_as_local(df: pa.Table) -> pa.Table: + return df + + +@as_local_bounded.candidate(lambda df: isinstance(df, pa.Table)) +def _pa_table_as_local_bounded(df: pa.Table) -> pa.Table: + return df + + +@as_fugue_dataset.candidate(lambda df, **kwargs: isinstance(df, pa.Table)) +def _pa_table_as_fugue_df(df: pa.Table, **kwargs: Any) -> "ArrowDataFrame": + return ArrowDataFrame(df, **kwargs) + + +@is_df.candidate(lambda df: isinstance(df, pa.Table)) +def _pa_table_is_df(df: pa.Table) -> bool: + return True + + +@count.candidate(lambda df: isinstance(df, pa.Table)) +def _pa_table_count(df: pa.Table) -> int: + return df.shape[0] + + +@is_bounded.candidate(lambda df: isinstance(df, pa.Table)) +def _pa_table_is_bounded(df: pa.Table) -> bool: + return True + + +@is_empty.candidate(lambda df: isinstance(df, pa.Table)) +def _pa_table_is_empty(df: pa.Table) -> bool: + return df.shape[0] == 0 + + +@is_local.candidate(lambda df: isinstance(df, pa.Table)) +def _pa_table_is_local(df: pa.Table) -> bool: + return True + + +@get_num_partitions.candidate(lambda df: isinstance(df, pa.Table)) +def _pa_table_get_num_partitions(df: pa.Table) -> int: + return 1 + + +@get_column_names.candidate(lambda df: isinstance(df, pa.Table)) +def _get_pyarrow_table_columns(df: pa.Table) -> List[Any]: + return [f.name for f in df.schema] + + +@get_schema.candidate(lambda df: isinstance(df, pa.Table)) +def _get_pyarrow_table_schema(df: pa.Table) -> Schema: + return Schema(df.schema) + + +@rename.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table)) +def _rename_pyarrow_dataframe(df: pa.Table, columns: Dict[str, Any]) -> pa.Table: + if len(columns) == 0: + return df + _assert_no_missing(df, columns.keys()) + return df.rename_columns([columns.get(f.name, f.name) for f in df.schema]) + + +@drop_columns.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table)) +def _drop_pa_columns(df: pa.Table, columns: List[str]) -> pa.Table: + cols = [x for x in df.schema.names if x not in columns] + if len(cols) == 0: + raise FugueDataFrameOperationError("cannot drop all columns") + if len(cols) + len(columns) != len(df.columns): + _assert_no_missing(df, columns) + return df.select(cols) + + +@select_columns.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table)) +def _select_pa_columns(df: pa.Table, columns: List[Any]) -> pa.Table: + if len(columns) == 0: + raise FugueDataFrameOperationError("must select at least one column") + _assert_no_missing(df, columns=columns) + return df.select(columns) + + def _build_empty_arrow(schema: Schema) -> pa.Table: # pragma: no cover if pa.__version__ < "7": arr = [pa.array([])] * len(schema) return pa.Table.from_arrays(arr, schema=schema.pa_schema) return pa.Table.from_pylist([], schema=schema.pa_schema) + + +def _assert_no_missing(df: pa.Table, columns: Iterable[Any]) -> None: + missing = [x for x in columns if x not in df.schema.names] + if len(missing) > 0: + raise FugueDataFrameOperationError("found nonexistent columns: {missing}") diff --git 
a/fugue/dataframe/dataframe.py b/fugue/dataframe/dataframe.py index 2b7b0063..a6adf1dd 100644 --- a/fugue/dataframe/dataframe.py +++ b/fugue/dataframe/dataframe.py @@ -1,6 +1,6 @@ import json from abc import abstractmethod -from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, TypeVar, Union import pandas as pd import pyarrow as pa @@ -12,9 +12,18 @@ from .._utils.display import PrettyTable from ..collections.yielded import Yielded -from ..dataset import Dataset, DatasetDisplay, get_dataset_display +from ..dataset import ( + Dataset, + DatasetDisplay, + as_local, + as_local_bounded, + get_dataset_display, + as_fugue_dataset, +) from ..exceptions import FugueDataFrameOperationError +AnyDataFrame = TypeVar("AnyDataFrame", "DataFrame", object) + class DataFrame(Dataset): """Base class of Fugue DataFrame. Please read @@ -43,7 +52,7 @@ def __init__(self, schema: Any = None): @property def schema(self) -> Schema: - """Schema of the dataframe""" + """The schema of the dataframe""" if self._schema_discovered: # we must keep it simple because it could be called on every row by a user assert isinstance(self._schema, Schema) @@ -56,13 +65,23 @@ def schema(self) -> Schema: self._schema_discovered = True return self._schema + @abstractmethod + def native_as_df(self) -> AnyDataFrame: # pragma: no cover + """The dataframe form of the native object this Dataset class wraps. + Dataframe form means the object contains schema information. For example + the native an ArrayDataFrame is a python array, it doesn't contain schema + information, and its ``native_as_df`` should be either a pandas dataframe + or an arrow dataframe. + """ + raise NotImplementedError + @abstractmethod def as_local(self) -> "LocalDataFrame": # pragma: no cover """Convert this dataframe to a :class:`.LocalDataFrame`""" raise NotImplementedError @abstractmethod - def peek_array(self) -> Any: # pragma: no cover + def peek_array(self) -> List[Any]: # pragma: no cover """Peek the first row of the dataframe as array :raises FugueDatasetEmptyError: if it is empty @@ -280,6 +299,9 @@ class LocalDataFrame(DataFrame): implementing a new :class:`~fugue.execution.execution_engine.ExecutionEngine` """ + def native_as_df(self) -> AnyDataFrame: + return self.as_pandas() + @property def is_local(self) -> bool: """Always True because it's a LocalDataFrame""" @@ -410,11 +432,40 @@ def show( print("") +def as_fugue_df(df: AnyDataFrame, **kwargs: Any) -> DataFrame: + """Wrap the object as a Fugue DataFrame. 
+ + :param df: the object to wrap + """ + ds = as_fugue_dataset(df, **kwargs) + if isinstance(ds, DataFrame): + return ds + raise TypeError(f"{type(df)} {kwargs} is not recognized as a Fugue DataFrame: {ds}") + + @get_dataset_display.candidate(lambda ds: isinstance(ds, DataFrame), priority=0.1) def _get_dataframe_display(ds: DataFrame): return DataFrameDisplay(ds) +@as_local.candidate(lambda df: isinstance(df, DataFrame) and not df.is_local) +def _df_to_local(df: DataFrame) -> DataFrame: + return df.as_local() + + +@as_local_bounded.candidate( + lambda df: isinstance(df, DataFrame) and not (df.is_local and df.is_bounded), + priority=0.9, +) +def _df_to_local_bounded(df: DataFrame) -> DataFrame: + res: DataFrame = df.as_local() + if not res.is_bounded: + res = as_fugue_df(res.as_array(), schema=df.schema) + if res is not df and df.has_metadata: + res.reset_metadata(df.metadata) + return res + + def _get_schema_change( orig_schema: Optional[Schema], schema: Any ) -> Tuple[Schema, List[int]]: diff --git a/fugue/dataframe/dataframe_iterable_dataframe.py b/fugue/dataframe/dataframe_iterable_dataframe.py index 646d7b2d..a1bde9e8 100644 --- a/fugue/dataframe/dataframe_iterable_dataframe.py +++ b/fugue/dataframe/dataframe_iterable_dataframe.py @@ -108,7 +108,7 @@ def native(self) -> EmptyAwareIterable[LocalDataFrame]: def empty(self) -> bool: return self.native.empty or self.native.peek().empty - def peek_array(self) -> Any: + def peek_array(self) -> List[Any]: self.assert_not_empty() return self.native.peek().peek_array() diff --git a/fugue/dataframe/iterable_dataframe.py b/fugue/dataframe/iterable_dataframe.py index 653689fd..49a46288 100644 --- a/fugue/dataframe/iterable_dataframe.py +++ b/fugue/dataframe/iterable_dataframe.py @@ -61,7 +61,7 @@ def native(self) -> EmptyAwareIterable[Any]: def empty(self) -> bool: return self.native.empty - def peek_array(self) -> Any: + def peek_array(self) -> List[Any]: self.assert_not_empty() return list(self.native.peek()) diff --git a/fugue/dataframe/pandas_dataframe.py b/fugue/dataframe/pandas_dataframe.py index dc52a0b2..63e17bd4 100644 --- a/fugue/dataframe/pandas_dataframe.py +++ b/fugue/dataframe/pandas_dataframe.py @@ -2,12 +2,33 @@ import pandas as pd import pyarrow as pa -from fugue.dataframe.dataframe import DataFrame, LocalBoundedDataFrame, _input_schema -from fugue.exceptions import FugueDataFrameOperationError from triad.collections.schema import Schema from triad.utils.assertion import assert_or_throw from triad.utils.pandas_like import PD_UTILS +from fugue.dataset.api import ( + as_fugue_dataset, + as_local, + as_local_bounded, + count, + get_num_partitions, + is_bounded, + is_empty, + is_local, +) +from fugue.exceptions import FugueDataFrameOperationError + +from .api import ( + drop_columns, + get_column_names, + get_schema, + head, + is_df, + rename, + select_columns, +) +from .dataframe import DataFrame, LocalBoundedDataFrame, _input_schema + class PandasDataFrame(LocalBoundedDataFrame): """DataFrame that wraps pandas DataFrame. 
Please also read @@ -72,11 +93,14 @@ def native(self) -> pd.DataFrame: """Pandas DataFrame""" return self._native + def native_as_df(self) -> pd.DataFrame: + return self._native + @property def empty(self) -> bool: return self.native.empty - def peek_array(self) -> Any: + def peek_array(self) -> List[Any]: self.assert_not_empty() return self.native.iloc[0].values.tolist() @@ -170,3 +194,112 @@ def _apply_schema( ) pdf.columns = schema.names return PD_UTILS.enforce_type(pdf, schema.pa_schema, null_safe=True), schema + + +@as_local.candidate(lambda df: isinstance(df, pd.DataFrame)) +def _pd_as_local(df: pd.DataFrame) -> pd.DataFrame: + return df + + +@as_local_bounded.candidate(lambda df: isinstance(df, pd.DataFrame)) +def _pd_as_local_bounded(df: pd.DataFrame) -> pd.DataFrame: + return df + + +@as_fugue_dataset.candidate(lambda df, **kwargs: isinstance(df, pd.DataFrame)) +def _pd_as_fugue_df(df: pd.DataFrame, **kwargs: Any) -> "PandasDataFrame": + return PandasDataFrame(df, **kwargs) + + +@is_df.candidate(lambda df: isinstance(df, pd.DataFrame)) +def _pd_is_df(df: pd.DataFrame) -> bool: + return True + + +@count.candidate(lambda df: isinstance(df, pd.DataFrame)) +def _pd_count(df: pd.DataFrame) -> int: + return df.shape[0] + + +@is_bounded.candidate(lambda df: isinstance(df, pd.DataFrame)) +def _pd_is_bounded(df: pd.DataFrame) -> bool: + return True + + +@is_empty.candidate(lambda df: isinstance(df, pd.DataFrame)) +def _pd_is_empty(df: pd.DataFrame) -> bool: + return df.shape[0] == 0 + + +@is_local.candidate(lambda df: isinstance(df, pd.DataFrame)) +def _pd_is_local(df: pd.DataFrame) -> bool: + return True + + +@get_num_partitions.candidate(lambda df: isinstance(df, pd.DataFrame)) +def _get_pandas_num_partitions(df: pd.DataFrame) -> int: + return 1 + + +@get_column_names.candidate(lambda df: isinstance(df, pd.DataFrame)) +def _get_pandas_dataframe_columns(df: pd.DataFrame) -> List[Any]: + return list(df.columns) + + +@get_schema.candidate(lambda df: isinstance(df, pd.DataFrame)) +def _get_pandas_dataframe_schema(df: pd.DataFrame) -> Schema: + return Schema(df) + + +@rename.candidate(lambda df, *args, **kwargs: isinstance(df, pd.DataFrame)) +def _rename_pandas_dataframe( + df: pd.DataFrame, columns: Dict[str, Any], as_fugue: bool = False +) -> Any: + if len(columns) == 0: + return df + _assert_no_missing(df, columns.keys()) + return _adjust_df(df.rename(columns=columns), as_fugue=as_fugue) + + +@drop_columns.candidate(lambda df, *args, **kwargs: isinstance(df, pd.DataFrame)) +def _drop_pd_columns( + df: pd.DataFrame, columns: List[str], as_fugue: bool = False +) -> Any: + cols = [x for x in df.columns if x not in columns] + if len(cols) == 0: + raise FugueDataFrameOperationError("cannot drop all columns") + if len(cols) + len(columns) != len(df.columns): + _assert_no_missing(df, columns) + return _adjust_df(df[cols], as_fugue=as_fugue) + + +@select_columns.candidate(lambda df, *args, **kwargs: isinstance(df, pd.DataFrame)) +def _select_pd_columns( + df: pd.DataFrame, columns: List[Any], as_fugue: bool = False +) -> Any: + if len(columns) == 0: + raise FugueDataFrameOperationError("must select at least one column") + _assert_no_missing(df, columns) + return _adjust_df(df[columns], as_fugue=as_fugue) + + +@head.candidate(lambda df, *args, **kwargs: isinstance(df, pd.DataFrame)) +def _pd_head( + df: pd.DataFrame, + n: int, + columns: Optional[List[str]] = None, + as_fugue: bool = False, +) -> pd.DataFrame: + if columns is not None: + df = df[columns] + return _adjust_df(df.head(n), 
as_fugue=as_fugue) + + +def _adjust_df(res: pd.DataFrame, as_fugue: bool): + return res if not as_fugue else PandasDataFrame(res) + + +def _assert_no_missing(df: pd.DataFrame, columns: Iterable[Any]) -> None: + missing = [x for x in columns if x not in df.columns] + if len(missing) > 0: + raise FugueDataFrameOperationError("found nonexistent columns: {missing}") diff --git a/fugue/dataframe/utils.py b/fugue/dataframe/utils.py index 320be8c3..227968e7 100644 --- a/fugue/dataframe/utils.py +++ b/fugue/dataframe/utils.py @@ -2,152 +2,28 @@ import json import os import pickle -from typing import Any, Dict, Iterable, List, Optional, Tuple +from typing import Any, Iterable, List, Optional, Tuple import pandas as pd import pyarrow as pa from fs import open_fs -from fugue.dataframe.array_dataframe import ArrayDataFrame -from fugue.dataframe.dataframe import DataFrame, LocalBoundedDataFrame, LocalDataFrame -from fugue.dataframe.iterable_dataframe import IterableDataFrame -from fugue.dataframe.pandas_dataframe import PandasDataFrame -from triad.collections import Schema -from triad.collections.fs import FileSystem +from triad import FileSystem, Schema from triad.collections.schema import SchemaError from triad.exceptions import InvalidOperationError from triad.utils.assertion import assert_arg_not_none from triad.utils.assertion import assert_or_throw as aot -from triad.utils.rename import normalize_names -from .._utils.registry import fugue_plugin +from .api import get_column_names, normalize_column_names, rename +from .array_dataframe import ArrayDataFrame +from .arrow_dataframe import ArrowDataFrame +from .dataframe import DataFrame, LocalBoundedDataFrame, LocalDataFrame +from .iterable_dataframe import IterableDataFrame +from .pandas_dataframe import PandasDataFrame - -@fugue_plugin -def get_dataframe_column_names(df: Any) -> List[Any]: # pragma: no cover - """A generic function to get column names of any dataframe - - :param df: the dataframe object - :return: the column names - - .. note:: - - In order to support a new type of dataframe, an implementation must - be registered, for example - - .. code-block::python - - @get_dataframe_column_names.candidate(lambda df: isinstance(df, pa.Table)) - def _get_pyarrow_dataframe_columns(df: pa.Table) -> List[Any]: - return [f.name for f in df.schema] - """ - raise NotImplementedError(f"{type(df)} is not supported") - - -@fugue_plugin -def rename_dataframe_column_names(df: Any, names: Dict[str, Any]) -> Any: - """A generic function to rename column names of any dataframe - - :param df: the dataframe object - :param names: the rename operations as a dict: ``old name => new name`` - :return: the renamed dataframe - - .. note:: - - In order to support a new type of dataframe, an implementation must - be registered, for example - - .. code-block::python - - @rename_dataframe_column_names.candidate( - lambda df, *args, **kwargs: isinstance(df, pd.DataFrame) - ) - def _rename_pandas_dataframe( - df: pd.DataFrame, names: Dict[str, Any] - ) -> pd.DataFrame: - if len(names) == 0: - return df - return df.rename(columns=names) - """ - if len(names) == 0: - return df - else: # pragma: no cover - raise NotImplementedError(f"{type(df)} is not supported") - - -def normalize_dataframe_column_names(df: Any) -> Tuple[Any, Dict[str, Any]]: - """A generic function to normalize any dataframe's column names to follow - Fugue naming rules - - .. note:: - - This is a temporary solution before - :class:`~triad:triad.collections.schema.Schema` - can take arbitrary names - - .. 
admonition:: Examples - - * ``[0,1]`` => ``{"_0":0, "_1":1}`` - * ``["1a","2b"]`` => ``{"_1a":"1a", "_2b":"2b"}`` - * ``["*a","-a"]`` => ``{"_a":"*a", "_a_1":"-a"}`` - - :param df: a dataframe object - :return: the renamed dataframe and the rename operations as a dict that - can **undo** the change - - .. seealso:: - - * :func:`~.get_dataframe_column_names` - * :func:`~.rename_dataframe_column_names` - * :func:`~triad:triad.utils.rename.normalize_names` - """ - cols = get_dataframe_column_names(df) - names = normalize_names(cols) - if len(names) == 0: - return df, {} - undo = {v: k for k, v in names.items()} - return (rename_dataframe_column_names(df, names), undo) - - -@get_dataframe_column_names.candidate(lambda df: isinstance(df, pd.DataFrame)) -def _get_pandas_dataframe_columns(df: pd.DataFrame) -> List[Any]: - return list(df.columns) - - -@rename_dataframe_column_names.candidate( - lambda df, *args, **kwargs: isinstance(df, pd.DataFrame) -) -def _rename_pandas_dataframe(df: pd.DataFrame, names: Dict[str, Any]) -> pd.DataFrame: - if len(names) == 0: - return df - return df.rename(columns=names) - - -@get_dataframe_column_names.candidate(lambda df: isinstance(df, pa.Table)) -def _get_pyarrow_dataframe_columns(df: pa.Table) -> List[Any]: - return [f.name for f in df.schema] - - -@rename_dataframe_column_names.candidate( - lambda df, *args, **kwargs: isinstance(df, pa.Table) -) -def _rename_pyarrow_dataframe(df: pa.Table, names: Dict[str, Any]) -> pa.Table: - if len(names) == 0: - return df - return df.rename_columns([names.get(f.name, f.name) for f in df.schema]) - - -@get_dataframe_column_names.candidate(lambda df: isinstance(df, DataFrame)) -def _get_fugue_dataframe_columns(df: "DataFrame") -> List[Any]: - return df.schema.names - - -@rename_dataframe_column_names.candidate( - lambda df, *args, **kwargs: isinstance(df, DataFrame) -) -def _rename_fugue_dataframe(df: "DataFrame", names: Dict[str, Any]) -> "DataFrame": - if len(names) == 0: - return df - return df.rename(columns=names) +# For backward compatibility, TODO: remove! +get_dataframe_column_names = get_column_names +normalize_dataframe_column_names = normalize_column_names +rename_dataframe_column_names = rename def _pa_type_eq(t1: pa.DataType, t2: pa.DataType) -> bool: @@ -260,6 +136,8 @@ def to_local_df(df: Any, schema: Any = None) -> LocalDataFrame: return df.as_local() if isinstance(df, pd.DataFrame): return PandasDataFrame(df, schema) + if isinstance(df, pa.Table): + return ArrowDataFrame(df, schema) if isinstance(df, List): return ArrayDataFrame(df, schema) if isinstance(df, Iterable): @@ -413,7 +291,7 @@ def deserialize_df( def get_join_schemas( - df1: DataFrame, df2: DataFrame, how: str, on: Iterable[str] + df1: DataFrame, df2: DataFrame, how: str, on: Optional[Iterable[str]] ) -> Tuple[Schema, Schema]: """Get :class:`~triad:triad.collections.schema.Schema` object after joining ``df1`` and ``df2``. 
If ``on`` is not empty, it's mainly for @@ -451,7 +329,7 @@ def get_join_schemas( ], ValueError(f"{how} is not a valid join type"), ) - on = list(on) + on = list(on) if on is not None else [] aot(len(on) == len(set(on)), f"{on} has duplication") if how != "cross" and len(on) == 0: on = list(df1.schema.intersect(df2.schema.names).names) diff --git a/fugue/dataset/__init__.py b/fugue/dataset/__init__.py new file mode 100644 index 00000000..d70704e5 --- /dev/null +++ b/fugue/dataset/__init__.py @@ -0,0 +1,3 @@ +# flake8: noqa +from .api import * +from .dataset import AnyDataset, Dataset, DatasetDisplay, get_dataset_display diff --git a/fugue/dataset/api.py b/fugue/dataset/api.py new file mode 100644 index 00000000..e3c9e5e4 --- /dev/null +++ b/fugue/dataset/api.py @@ -0,0 +1,104 @@ +from typing import Any, Optional + +from .._utils.registry import fugue_plugin +from .dataset import AnyDataset, Dataset + + +@fugue_plugin +def as_fugue_dataset(data: AnyDataset, **kwargs: Any) -> Dataset: + """Wrap the input as a :class:`~.Dataset` + + :param data: the dataset to be wrapped + """ + if isinstance(data, Dataset) and len(kwargs) == 0: + return data + raise NotImplementedError(f"no registered dataset conversion for {type(data)}") + + +def show( + data: AnyDataset, n: int = 10, with_count: bool = False, title: Optional[str] = None +) -> None: + """Display the Dataset + + :param data: the dataset that can be recognized by Fugue + :param n: number of rows to print, defaults to 10 + :param with_count: whether to show dataset count, defaults to False + :param title: title of the dataset, defaults to None + + .. note:: + + When ``with_count`` is True, it can trigger expensive calculation for + a distributed dataframe. So if you call this function directly, you may + need to :func:`fugue.execution.execution_engine.ExecutionEngine.persist` + the dataset. 
+ """ + return as_fugue_dataset(data).show(n=n, with_count=with_count, title=title) + + +@fugue_plugin +def as_local(data: AnyDataset) -> AnyDataset: + """Convert the dataset to a local dataset + + :param data: the dataset that can be recognized by Fugue + """ + if isinstance(data, Dataset) and data.is_local: + return data + return as_local_bounded(data) + + +@fugue_plugin +def as_local_bounded(data: AnyDataset) -> AnyDataset: + """Convert the dataset to a local bounded dataset + + :param data: the dataset that can be recognized by Fugue + """ + if isinstance(data, Dataset) and data.is_local and data.is_bounded: + return data + raise NotImplementedError( + f"no registered function to convert {type(data)} to a local bounded dataset" + ) + + +@fugue_plugin +def is_local(data: AnyDataset) -> bool: + """Whether the dataset is local + + :param data: the dataset that can be recognized by Fugue + """ + return as_fugue_dataset(data).is_local + + +@fugue_plugin +def is_bounded(data: AnyDataset) -> bool: + """Whether the dataset is bounded + + :param data: the dataset that can be recognized by Fugue + """ + return as_fugue_dataset(data).is_bounded + + +@fugue_plugin +def is_empty(data: AnyDataset) -> bool: + """Whether the dataset is empty + + :param data: the dataset that can be recognized by Fugue + """ + return as_fugue_dataset(data).empty + + +@fugue_plugin +def count(data: AnyDataset) -> int: + """The number of elements in the dataset + + :param data: the dataset that can be recognized by Fugue + """ + return as_fugue_dataset(data).count() + + +@fugue_plugin +def get_num_partitions(data: AnyDataset) -> int: + """Get the number of partitions of the dataset + + :param data: the dataset that can be recognized by Fugue + """ + return as_fugue_dataset(data).num_partitions diff --git a/fugue/dataset.py b/fugue/dataset/dataset.py similarity index 88% rename from fugue/dataset.py rename to fugue/dataset/dataset.py index 080337c9..24f3dcdf 100644 --- a/fugue/dataset.py +++ b/fugue/dataset/dataset.py @@ -1,11 +1,14 @@ import html from abc import ABC, abstractmethod -from typing import Any, Optional +from typing import Any, Optional, TypeVar from triad import ParamDict, SerializableRLock, assert_or_throw -from ._utils.registry import fugue_plugin -from .exceptions import FugueDatasetEmptyError +from .._utils.registry import fugue_plugin +from ..exceptions import FugueDatasetEmptyError + + +AnyDataset = TypeVar("AnyDataset", "Dataset", object) class Dataset(ABC): @@ -36,10 +39,16 @@ def reset_metadata(self, metadata: Any) -> None: """Reset metadata""" self._metadata = ParamDict(metadata) if metadata is not None else None + @property + @abstractmethod + def native(self) -> Any: # pragma: no cover + """The native object this Dataset class wraps""" + raise NotImplementedError + @property @abstractmethod def is_local(self) -> bool: # pragma: no cover - """Whether this dataframe is a :class:`.LocalDataFrame`""" + """Whether this dataframe is a local Dataset""" raise NotImplementedError @property @@ -79,7 +88,7 @@ def show( ) -> None: """Display the Dataset - :param rows: number of rows to print, defaults to 10 + :param n: number of rows to print, defaults to 10 :param with_count: whether to show dataset count, defaults to False :param title: title of the dataset, defaults to None @@ -146,4 +155,4 @@ def get_dataset_display(ds: "Dataset") -> DatasetDisplay: # pragma: no cover :param ds: the Dataset to be displayed """ - raise NotImplementedError(f"No matching DatasetDisplay registered for {type(ds)}") + raise
NotImplementedError(f"no matching DatasetDisplay registered for {type(ds)}") diff --git a/fugue/execution/__init__.py b/fugue/execution/__init__.py index 35d6f7aa..f82c646e 100644 --- a/fugue/execution/__init__.py +++ b/fugue/execution/__init__.py @@ -1,6 +1,7 @@ # flake8: noqa -from fugue.execution.execution_engine import ExecutionEngine, MapEngine, SQLEngine -from fugue.execution.factory import ( +from .api import * +from .execution_engine import AnyExecutionEngine, ExecutionEngine, MapEngine, SQLEngine +from .factory import ( infer_execution_engine, make_execution_engine, make_sql_engine, @@ -9,7 +10,7 @@ register_execution_engine, register_sql_engine, ) -from fugue.execution.native_execution_engine import ( +from .native_execution_engine import ( NativeExecutionEngine, QPDPandasEngine, SqliteEngine, diff --git a/fugue/execution/api.py b/fugue/execution/api.py new file mode 100644 index 00000000..57a6ba46 --- /dev/null +++ b/fugue/execution/api.py @@ -0,0 +1,1195 @@ +from contextlib import contextmanager +from typing import Any, Callable, Iterator, List, Optional, Union + +from triad import assert_or_throw + +from ..collections.partition import PartitionSpec +from ..dataframe.dataframe import AnyDataFrame, DataFrame +from .execution_engine import ( + _FUGUE_GLOBAL_EXECUTION_ENGINE_CONTEXT, + AnyExecutionEngine, + ExecutionEngine, +) +from .factory import make_execution_engine +from fugue.column import ColumnExpr, SelectColumns, col, lit + + +@contextmanager +def engine_context( + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + infer_by: Optional[List[Any]] = None, +) -> Iterator[ExecutionEngine]: + """Make an execution engine and set it as the context engine. This function + is thread safe and async safe. + + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param infer_by: a list of objects to infer the engine, defaults to None + + .. note:: + + For more details, please read + :func:`~.fugue.execution.factory.make_execution_engine` + + .. admonition:: Examples + + .. code-block:: python + + import fugue.api as fa + + with fa.engine_context(spark_session): + transform(df, func) # will use spark in this transformation + + """ + e = make_execution_engine(engine, engine_conf, infer_by=infer_by) + return e._as_context() + + +def set_global_engine( + engine: AnyExecutionEngine, engine_conf: Any = None +) -> ExecutionEngine: + """Make an execution engine and set it as the global execution engine + + :param engine: an engine like object, must not be None + :param engine_conf: the configs for the engine, defaults to None + + .. caution:: + + In general, it is not a good practice to set a global engine. You should + consider :func:`~.engine_context` instead. The exception + is when you iterate in a notebook and cross cells, this could simplify + the code. + + .. note:: + + For more details, please read + :func:`~.fugue.execution.factory.make_execution_engine` and + :meth:`~fugue.execution.execution_engine.ExecutionEngine.set_global` + + .. admonition:: Examples + + .. 
code-block:: python + + import fugue.api as fa + + fa.set_global_engine(spark_session) + transform(df, func) # will use spark in this transformation + fa.clear_global_engine() # remove the global setting + """ + assert_or_throw(engine is not None, ValueError("engine must be specified")) + return make_execution_engine(engine, engine_conf).set_global() + + +def clear_global_engine() -> None: + """Remove the global exeuction engine (if set)""" + _FUGUE_GLOBAL_EXECUTION_ENGINE_CONTEXT.set(None) + + +def get_current_engine() -> ExecutionEngine: + """Get the current execution engine. Regarding the order of the logic + please read :func:`~.fugue.execution.factory.make_execution_engine` + """ + return make_execution_engine() + + +def get_current_parallelism( + engine: AnyExecutionEngine = None, engine_conf: Any = None +) -> int: + """Get the current parallelism of the engine + + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + + :return: the size of the parallelism + """ + with engine_context(engine, engine_conf) as e: + return e.get_current_parallelism() + + +def run_engine_function( + func: Callable[[ExecutionEngine], Any], + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, + infer_by: Optional[List[Any]] = None, +) -> Any: + """Run a lambda function based on the engine provided + + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + :param infer_by: a list of objects to infer the engine, defaults to None + + :return: None or a Fugue :class:`~.fugue.dataframe.dataframe.DataFrame` if + ``as_fugue`` is True, otherwise if ``infer_by`` contains any + Fugue DataFrame, then return the Fugue DataFrame, otherwise + it returns the underlying dataframe using + :meth:`~.fugue.dataframe.dataframe.DataFrame.native_as_df` + + .. note:: + + This function is for deveopment use. Users should not need it. + """ + e = make_execution_engine(engine, engine_conf, infer_by=infer_by) + res = func(e) + + if isinstance(res, DataFrame): + res = e.convert_yield_dataframe(res, as_local=as_local) + if as_fugue or any(isinstance(x, DataFrame) for x in (infer_by or [])): + return res + return res.native_as_df() + return res + + +def repartition( + df: AnyDataFrame, + partition: PartitionSpec, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, +) -> AnyDataFrame: + """Partition the input dataframe using ``partition``. + + :param df: an input dataframe that can be recognized by Fugue + :param partition: how you want to partition the dataframe + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + + :return: the repartitioned dataframe + + .. caution:: + + This function is experimental, and may be removed in the future. 
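A hedged sketch of the engine helpers above (``get_current_parallelism`` and the experimental ``repartition``), assuming a fresh session with no context or global engine set, that ``PartitionSpec`` is importable from the top-level ``fugue`` package, and that these functions are exposed on ``fugue.api``:

.. code-block:: python

    import pandas as pd
    import fugue.api as fa
    from fugue import PartitionSpec

    df = pd.DataFrame({"a": [1, 2, 3, 4]})

    # with no engine given, the default NativeExecutionEngine is used,
    # whose parallelism is 1 (see native_execution_engine further below)
    fa.get_current_parallelism()

    # ask for 2 partitions; marked experimental in the docstring above
    res = fa.repartition(df, PartitionSpec(num=2))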
+ """ + return run_engine_function( + lambda e: e.repartition(e.to_df(df), partition_spec=PartitionSpec(partition)), + engine=engine, + engine_conf=engine_conf, + infer_by=[df], + as_fugue=as_fugue, + as_local=as_local, + ) + + +def broadcast( + df: AnyDataFrame, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, +) -> AnyDataFrame: + """Broadcast the dataframe to all workers for a distributed computing framework + + :param df: an input dataframe that can be recognized by Fugue + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + + :return: the broadcasted dataframe + """ + return run_engine_function( + lambda e: e.broadcast(e.to_df(df)), + engine=engine, + engine_conf=engine_conf, + infer_by=[df], + as_fugue=as_fugue, + as_local=as_local, + ) + + +def persist( + df: AnyDataFrame, + lazy: bool = False, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, + **kwargs: Any, +) -> AnyDataFrame: + """Force materializing and caching the dataframe + + :param df: an input dataframe that can be recognized by Fugue + :param lazy: ``True``: first usage of the output will trigger persisting + to happen; ``False`` (eager): persist is forced to happend immediately. + Default to ``False`` + :param kwargs: parameter to pass to the underlying persist implementation + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + + :return: the persisted dataframe + """ + return run_engine_function( + lambda e: e.persist(e.to_df(df), lazy=lazy, **kwargs), + engine=engine, + engine_conf=engine_conf, + infer_by=[df], + as_fugue=as_fugue, + as_local=as_local, + ) + + +def distinct( + df: AnyDataFrame, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, +) -> AnyDataFrame: + """Equivalent to ``SELECT DISTINCT * FROM df`` + + :param df: an input dataframe that can be recognized by Fugue + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + + :return: the result with distinct rows + """ + return run_engine_function( + lambda e: e.distinct(e.to_df(df)), + engine=engine, + engine_conf=engine_conf, + infer_by=[df], + as_fugue=as_fugue, + as_local=as_local, + ) + + +def dropna( + df: AnyDataFrame, + how: str = "any", + thresh: int = None, + subset: List[str] = None, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, +) -> AnyDataFrame: + """Drop NA recods from dataframe + + :param df: an input dataframe that can be recognized by Fugue + :param how: 'any' or 'all'. 'any' drops rows that contain any nulls. + 'all' drops rows that contain all nulls. 
+ :param thresh: int, drops rows that have less than thresh non-null values + :param subset: list of columns to operate on + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + + :return: DataFrame with NA records dropped + """ + return run_engine_function( + lambda e: e.dropna(e.to_df(df), how=how, thresh=thresh, subset=subset), + engine=engine, + engine_conf=engine_conf, + infer_by=[df], + as_fugue=as_fugue, + as_local=as_local, + ) + + +def fillna( + df: AnyDataFrame, + value: Any, + subset: List[str] = None, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, +) -> AnyDataFrame: + """ + Fill ``NULL``, ``NAN``, ``NAT`` values in a dataframe + + :param df: an input dataframe that can be recognized by Fugue + :param value: if scalar, fills all columns with same value. + if dictionary, fills NA using the keys as column names and the + values as the replacement values. + :param subset: list of columns to operate on. ignored if value is + a dictionary + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + + :return: DataFrame with NA records filled + """ + return run_engine_function( + lambda e: e.fillna(e.to_df(df), value=value, subset=subset), + engine=engine, + engine_conf=engine_conf, + infer_by=[df], + as_fugue=as_fugue, + as_local=as_local, + ) + + +def sample( + df: AnyDataFrame, + n: Optional[int] = None, + frac: Optional[float] = None, + replace: bool = False, + seed: Optional[int] = None, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, +) -> AnyDataFrame: + """ + Sample dataframe by number of rows or by fraction + + :param df: an input dataframe that can be recognized by Fugue + :param n: number of rows to sample, one and only one of ``n`` and ``frac`` + must be set + :param frac: fraction [0,1] to sample, one and only one of ``n`` and ``frac`` + must be set + :param replace: whether replacement is allowed. With replacement, + there may be duplicated rows in the result, defaults to False + :param seed: seed for randomness, defaults to None + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + + :return: the sampled dataframe + """ + return run_engine_function( + lambda e: e.sample(e.to_df(df), n=n, frac=frac, replace=replace, seed=seed), + engine=engine, + engine_conf=engine_conf, + infer_by=[df], + as_fugue=as_fugue, + as_local=as_local, + ) + + +def take( + df: AnyDataFrame, + n: int, + presort: str, + na_position: str = "last", + partition: Any = None, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, +) -> AnyDataFrame: + """ + Get the first n rows of a DataFrame per partition. If a presort is defined, + use the presort before applying take. presort overrides partition_spec.presort. 
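For reference, a sketch of the NA handling and sampling helpers described above, again assuming a pandas input and ``fugue.api`` exposure:

.. code-block:: python

    import pandas as pd
    import fugue.api as fa

    df = pd.DataFrame({"a": [1.0, None, 3.0], "b": ["x", "y", None]})

    no_na = fa.dropna(df, how="any")           # drop rows with any null
    filled = fa.fillna(df, value={"a": 0.0})   # fill only column a
    sampled = fa.sample(df, frac=0.5, seed=0)  # reproducible fractional sample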
+ The Fugue implementation of the presort follows Pandas convention of specifying + NULLs first or NULLs last. This is different from the Spark and SQL convention + of NULLs as the smallest value. + + :param df: an input dataframe that can be recognized by Fugue + :param n: number of rows to return + :param presort: presort expression similar to partition presort + :param na_position: position of null values during the presort. + can accept ``first`` or ``last`` + :param partition: PartitionSpec to apply the take operation, + defaults to None + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + + :return: n rows of DataFrame per partition + """ + + return run_engine_function( + lambda e: e.take( + e.to_df(df), + n=n, + presort=presort, + na_position=na_position, + partition_spec=None if partition is None else PartitionSpec(partition), + ), + engine=engine, + engine_conf=engine_conf, + infer_by=[df], + as_fugue=as_fugue, + as_local=as_local, + ) + + +def load( + path: Union[str, List[str]], + format_hint: Any = None, + columns: Any = None, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, + **kwargs: Any, +) -> AnyDataFrame: + """Load dataframe from persistent storage + + :param path: the path to the dataframe + :param format_hint: can accept ``parquet``, ``csv``, ``json``, + defaults to None, meaning to infer + :param columns: list of columns or a |SchemaLikeObject|, defaults to None + :param kwargs: parameters to pass to the underlying framework + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + :return: an engine compatible dataframe + + For more details and examples, read |ZipComap|. + """ + return run_engine_function( + lambda e: e.load_df( + path=path, format_hint=format_hint, columns=columns, **kwargs + ), + engine=engine, + engine_conf=engine_conf, + as_fugue=as_fugue, + as_local=as_local, + ) + + +def save( + df: AnyDataFrame, + path: str, + format_hint: Any = None, + mode: str = "overwrite", + partition: Any = None, + force_single: bool = False, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + **kwargs: Any, +) -> None: + """Save dataframe to a persistent storage + + :param df: an input dataframe that can be recognized by Fugue + :param path: output path + :param format_hint: can accept ``parquet``, ``csv``, ``json``, + defaults to None, meaning to infer + :param mode: can accept ``overwrite``, ``append``, ``error``, + defaults to "overwrite" + :param partition: how to partition the dataframe before saving, + defaults to None + :param force_single: force the output as a single file, defaults to False + :param kwargs: parameters to pass to the underlying framework + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + + For more details and examples, read |LoadSave|. 
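A sketch of ``take`` plus a ``save``/``load`` round trip based on the signatures above; the parquet path is only a placeholder and the partition dict follows the usual ``{"by": [...]}`` convention:

.. code-block:: python

    import pandas as pd
    import fugue.api as fa

    df = pd.DataFrame({"g": ["a", "a", "b"], "v": [3, 1, 2]})

    # top row per group g, ordered by v descending
    top = fa.take(df, 1, presort="v desc", partition={"by": ["g"]})

    # "/tmp/demo.parquet" is just a placeholder path
    fa.save(df, "/tmp/demo.parquet")
    back = fa.load("/tmp/demo.parquet", columns=["g", "v"])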
+ """ + run_engine_function( + lambda e: e.save_df( + e.to_df(df), + path=path, + format_hint=format_hint, + mode=mode, + partition_spec=None if partition is None else PartitionSpec(partition), + force_single=force_single, + **kwargs, + ), + engine=engine, + engine_conf=engine_conf, + infer_by=[df], + ) + + +def join( + df1: AnyDataFrame, + df2: AnyDataFrame, + *dfs: AnyDataFrame, + how: str, + on: Optional[List[str]] = None, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, +) -> AnyDataFrame: + """Join two dataframes + + :param df1: the first dataframe + :param df2: the second dataframe + :param dfs: more dataframes to join + :param how: can accept ``semi``, ``left_semi``, ``anti``, ``left_anti``, + ``inner``, ``left_outer``, ``right_outer``, ``full_outer``, ``cross`` + :param on: it can always be inferred, but if you provide, it will be + validated against the inferred keys. + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + + :return: the joined dataframe + + .. note:: + + Please read :func:`~.fugue.dataframe.utils.get_join_schemas` + """ + + def _join(e: ExecutionEngine): + edf1 = e.to_df(df1) + edf2 = e.to_df(df2) + res = e.join(edf1, edf2, how=how, on=on) + for odf in dfs: + res = e.join(res, e.to_df(odf), how=how, on=on) + return res + + return run_engine_function( + _join, + engine=engine, + engine_conf=engine_conf, + as_fugue=as_fugue, + as_local=as_local, + infer_by=[df1, df2, *dfs], + ) + + +def inner_join( + df1: AnyDataFrame, + df2: AnyDataFrame, + *dfs: AnyDataFrame, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, +) -> AnyDataFrame: + """Inner join two dataframes. + This is a wrapper of :func:`~.join` with ``how="inner"`` + + :param df1: the first dataframe + :param df2: the second dataframe + :param dfs: more dataframes to join + :param how: can accept ``semi``, ``left_semi``, ``anti``, ``left_anti``, + ``inner``, ``left_outer``, ``right_outer``, ``full_outer``, ``cross`` + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + + :return: the joined dataframe + """ + return join( + df1, + df2, + *dfs, + how="inner", + engine=engine, + engine_conf=engine_conf, + as_fugue=as_fugue, + as_local=as_local, + ) + + +def semi_join( + df1: AnyDataFrame, + df2: AnyDataFrame, + *dfs: AnyDataFrame, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, +) -> AnyDataFrame: + """Left semi-join two dataframes. 
+ This is a wrapper of :func:`~.join` with ``how="semi"`` + + :param df1: the first dataframe + :param df2: the second dataframe + :param dfs: more dataframes to join + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + + :return: the joined dataframe + """ + return join( + df1, + df2, + *dfs, + how="semi", + engine=engine, + engine_conf=engine_conf, + as_fugue=as_fugue, + as_local=as_local, + ) + + +def anti_join( + df1: AnyDataFrame, + df2: AnyDataFrame, + *dfs: AnyDataFrame, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, +) -> AnyDataFrame: + """Left anti-join two dataframes. + This is a wrapper of :func:`~.join` with ``how="anti"`` + + :param df1: the first dataframe + :param df2: the second dataframe + :param dfs: more dataframes to join + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + + :return: the joined dataframe + """ + return join( + df1, + df2, + *dfs, + how="anti", + engine=engine, + engine_conf=engine_conf, + as_fugue=as_fugue, + as_local=as_local, + ) + + +def left_outer_join( + df1: AnyDataFrame, + df2: AnyDataFrame, + *dfs: AnyDataFrame, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, +) -> AnyDataFrame: + """Left outer join two dataframes. + This is a wrapper of :func:`~.join` with ``how="left_outer"`` + + :param df1: the first dataframe + :param df2: the second dataframe + :param dfs: more dataframes to join + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + + :return: the joined dataframe + """ + return join( + df1, + df2, + *dfs, + how="left_outer", + engine=engine, + engine_conf=engine_conf, + as_fugue=as_fugue, + as_local=as_local, + ) + + +def right_outer_join( + df1: AnyDataFrame, + df2: AnyDataFrame, + *dfs: AnyDataFrame, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, +) -> AnyDataFrame: + """Right outer join two dataframes. 
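A compact illustration of the semi/anti wrappers above (result values shown in comments; row order is not guaranteed):

.. code-block:: python

    import pandas as pd
    import fugue.api as fa

    a = pd.DataFrame({"id": [1, 2, 3]})
    b = pd.DataFrame({"id": [2]})

    kept = fa.semi_join(a, b)     # rows of a with a match in b    -> 2
    removed = fa.anti_join(a, b)  # rows of a without a match in b -> 1, 3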
+ This is a wrapper of :func:`~.join` with ``how="right_outer"`` + + :param df1: the first dataframe + :param df2: the second dataframe + :param dfs: more dataframes to join + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + + :return: the joined dataframe + """ + return join( + df1, + df2, + *dfs, + how="right_outer", + engine=engine, + engine_conf=engine_conf, + as_fugue=as_fugue, + as_local=as_local, + ) + + +def full_outer_join( + df1: AnyDataFrame, + df2: AnyDataFrame, + *dfs: AnyDataFrame, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, +) -> AnyDataFrame: + """Full outer join two dataframes. + This is a wrapper of :func:`~.join` with ``how="full_outer"`` + + :param df1: the first dataframe + :param df2: the second dataframe + :param dfs: more dataframes to join + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + + :return: the joined dataframe + """ + return join( + df1, + df2, + *dfs, + how="full_outer", + engine=engine, + engine_conf=engine_conf, + as_fugue=as_fugue, + as_local=as_local, + ) + + +def cross_join( + df1: AnyDataFrame, + df2: AnyDataFrame, + *dfs: AnyDataFrame, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, +) -> AnyDataFrame: + """Cross join two dataframes. + This is a wrapper of :func:`~.join` with ``how="cross"`` + + :param df1: the first dataframe + :param df2: the second dataframe + :param dfs: more dataframes to join + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + + :return: the joined dataframe + """ + return join( + df1, + df2, + *dfs, + how="cross", + engine=engine, + engine_conf=engine_conf, + as_fugue=as_fugue, + as_local=as_local, + ) + + +def union( + df1: AnyDataFrame, + df2: AnyDataFrame, + *dfs: AnyDataFrame, + distinct: bool = True, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, +) -> AnyDataFrame: + """Join two dataframes + + :param df1: the first dataframe + :param df2: the second dataframe + :param dfs: more dataframes to union + :param distinct: ``true`` for ``UNION`` (== ``UNION DISTINCT``), + ``false`` for ``UNION ALL`` + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + + :return: the unioned dataframe + + .. note:: + + Currently, the schema of all dataframes must be identical, or + an exception will be thrown. 
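A sketch of ``union`` and ``cross_join``; ``union`` expects identical schemas, and the cross join uses a separate frame ``c`` with a distinct column name to avoid overlapping output columns:

.. code-block:: python

    import pandas as pd
    import fugue.api as fa

    a = pd.DataFrame({"v": [1, 2]})
    b = pd.DataFrame({"v": [2, 3]})
    c = pd.DataFrame({"w": [10, 20]})

    all_rows = fa.union(a, b, distinct=False)  # UNION ALL      -> 1, 2, 2, 3
    dedup = fa.union(a, b)                     # UNION DISTINCT -> 1, 2, 3
    pairs = fa.cross_join(a, c)                # 2 x 2 = 4 rows (v, w)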
+ """ + + def _union(e: ExecutionEngine): + edf1 = e.to_df(df1) + edf2 = e.to_df(df2) + res = e.union(edf1, edf2, distinct=distinct) + for odf in dfs: + res = e.union(res, e.to_df(odf), distinct=distinct) + return res + + return run_engine_function( + _union, + engine=engine, + engine_conf=engine_conf, + as_fugue=as_fugue, + as_local=as_local, + infer_by=[df1, df2, *dfs], + ) + + +def subtract( + df1: AnyDataFrame, + df2: AnyDataFrame, + *dfs: AnyDataFrame, + distinct: bool = True, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, +) -> AnyDataFrame: + """``df1 - df2`` + + :param df1: the first dataframe + :param df2: the second dataframe + :param dfs: more dataframes to subtract + :param distinct: ``true`` for ``EXCEPT`` (== ``EXCEPT DISTINCT``), + ``false`` for ``EXCEPT ALL`` + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + + :return: the unioned dataframe + + .. note:: + + Currently, the schema of all datafrmes must be identical, or + an exception will be thrown. + """ + + def _subtract(e: ExecutionEngine): + edf1 = e.to_df(df1) + edf2 = e.to_df(df2) + res = e.subtract(edf1, edf2, distinct=distinct) + for odf in dfs: + res = e.subtract(res, e.to_df(odf), distinct=distinct) + return res + + return run_engine_function( + _subtract, + engine=engine, + engine_conf=engine_conf, + as_fugue=as_fugue, + as_local=as_local, + infer_by=[df1, df2, *dfs], + ) + + +def intersect( + df1: AnyDataFrame, + df2: AnyDataFrame, + *dfs: AnyDataFrame, + distinct: bool = True, # pylint: disable-all + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, +) -> AnyDataFrame: + """Intersect ``df1`` and ``df2`` + + :param df1: the first dataframe + :param df2: the second dataframe + :param dfs: more dataframes to intersect with + :param distinct: ``true`` for ``INTERSECT`` (== ``INTERSECT DISTINCT``), + ``false`` for ``INTERSECT ALL`` + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + + :return: the unioned dataframe + + .. note:: + + Currently, the schema of ``df1`` and ``df2`` must be identical, or + an exception will be thrown. 
+ """ + + def _intersect(e: ExecutionEngine): + edf1 = e.to_df(df1) + edf2 = e.to_df(df2) + res = e.intersect(edf1, edf2, distinct=distinct) + for odf in dfs: + res = e.intersect(res, e.to_df(odf), distinct=distinct) + return res + + return run_engine_function( + _intersect, + engine=engine, + engine_conf=engine_conf, + as_fugue=as_fugue, + as_local=as_local, + infer_by=[df1, df2, *dfs], + ) + + +def select( + df: AnyDataFrame, + *columns: Union[str, ColumnExpr], + where: Optional[ColumnExpr] = None, + having: Optional[ColumnExpr] = None, + distinct: bool = False, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, +) -> AnyDataFrame: + """The functional interface for SQL select statement + + :param df: the dataframe to be operated on + :param columns: column expressions, for strings they will represent + the column names + :param where: ``WHERE`` condition expression, defaults to None + :param having: ``having`` condition expression, defaults to None. It + is used when ``cols`` contains aggregation columns, defaults to None + :param distinct: whether to return distinct result, defaults to False + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + + :return: the select result as a dataframe + + .. attention:: + + This interface is experimental, it's subjected to change in new versions. + + .. seealso:: + + Please find more expression examples in :mod:`fugue.column.sql` and + :mod:`fugue.column.functions` + + .. admonition:: Examples + + .. code-block:: python + + from fugue.column import col, lit, functions as f + import fugue.api as fa + + with fa.engine_context("duckdb"): + # select existed and new columns + fa.select(df, col("a"),col("b"),lit(1,"another")) + fa.select(df, col("a"),(col("b")+lit(1)).alias("x")) + + # aggregation + # SELECT COUNT(DISTINCT *) AS x FROM df + fa.select( + df, + f.count_distinct(col("*")).alias("x")) + + # SELECT a, MAX(b+1) AS x FROM df GROUP BY a + fa.select( + df, + col("a"),f.max(col("b")+lit(1)).alias("x")) + + # SELECT a, MAX(b+1) AS x FROM df + # WHERE b<2 AND a>1 + # GROUP BY a + # HAVING MAX(b+1)>0 + fa.select( + df, + col("a"),f.max(col("b")+lit(1)).alias("x"), + where=(col("b")<2) & (col("a")>1), + having=f.max(col("b")+lit(1))>0 + ) + """ + cols = SelectColumns( + *[col(x) if isinstance(x, str) else x for x in columns], + arg_distinct=distinct, + ) + + return run_engine_function( + lambda e: e.select(e.to_df(df), cols=cols, where=where, having=having), + engine=engine, + engine_conf=engine_conf, + infer_by=[df], + as_fugue=as_fugue, + as_local=as_local, + ) + + +def filter( + df: AnyDataFrame, + condition: ColumnExpr, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, +) -> AnyDataFrame: + """Filter rows by the given condition + + :param df: the dataframe to be filtered + :param condition: (boolean) column expression + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + + :return: the filtered dataframe + + .. 
seealso:: + + Please find more expression examples in :mod:`fugue.column.sql` and + :mod:`fugue.column.functions` + + .. admonition:: Examples + + .. code-block:: python + + from fugue.column import col, functions as f + import fugue.api as fa + + with fa.engine_context("duckdb"): + fa.filter(df, (col("a")>1) & (col("b")=="x")) + fa.filter(df, f.coalesce(col("a"),col("b"))>1) + """ + return run_engine_function( + lambda e: e.filter(e.to_df(df), condition=condition), + engine=engine, + engine_conf=engine_conf, + infer_by=[df], + as_fugue=as_fugue, + as_local=as_local, + ) + + +def assign( + df: AnyDataFrame, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, + **columns: Any, +) -> AnyDataFrame: + """Update existing columns with new values and add new columns + + :param df: the dataframe to set columns + :param columns: column expressions + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + + :return: the updated dataframe + + .. tip:: + + This can be used to cast data types, alter column values or add new + columns. But you can't use aggregation in columns. + + .. admonition:: New Since + :class: hint + + **0.6.0** + + .. seealso:: + + Please find more expression examples in :mod:`fugue.column.sql` and + :mod:`fugue.column.functions` + + .. admonition:: Examples + + .. code-block:: python + + from fugue.column import col, functions as f + import fugue.api as fa + + # assume df has schema: a:int,b:str + + with fa.engine_context("duckdb"): + # add constant column x + fa.assign(df, x=1) + + # change column b to be a constant integer + fa.assign(df, b=1) + + # add new x to be a+b + fa.assign(df, x=col("a")+col("b")) + + # cast column a data type to double + fa.assign(df, a=col("a").cast(float)) + """ + cols = [ + v.alias(k) if isinstance(v, ColumnExpr) else lit(v).alias(k) + for k, v in columns.items() + ] + return run_engine_function( + lambda e: e.assign(e.to_df(df), columns=cols), + engine=engine, + engine_conf=engine_conf, + infer_by=[df], + as_fugue=as_fugue, + as_local=as_local, + ) + + +def aggregate( + df: AnyDataFrame, + partition_by: Union[None, str, List[str]] = None, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, + **agg_kwcols: ColumnExpr, +) -> AnyDataFrame: + """Aggregate on dataframe + + :param df: the dataframe to aggregate on + :param partition_by: partition key(s), defaults to None + :param agg_kwcols: aggregation expressions + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to force return a local DataFrame, defaults to False + + :return: the aggregated result as a dataframe + + .. seealso:: + + Please find more expression examples in :mod:`fugue.column.sql` and + :mod:`fugue.column.functions` + + .. admonition:: Examples + + .. 
code-block:: python + + from fugue.column import col, functions as f + import fugue.api as fa + + with fa.engine_context("duckdb"): + # SELECT MAX(b) AS b FROM df + fa.aggregate(df, b=f.max(col("b"))) + + # SELECT a, MAX(b) AS x FROM df GROUP BY a + fa.aggregate(df, "a", x=f.max(col("b"))) + """ + cols = [ + v.alias(k) if isinstance(v, ColumnExpr) else lit(v).alias(k) + for k, v in agg_kwcols.items() + ] + return run_engine_function( + lambda e: e.aggregate( + e.to_df(df), + partition_spec=None + if partition_by is None + else PartitionSpec(by=partition_by), + agg_cols=cols, + ), + engine=engine, + engine_conf=engine_conf, + infer_by=[df], + as_fugue=as_fugue, + as_local=as_local, + ) diff --git a/fugue/execution/execution_engine.py b/fugue/execution/execution_engine.py index 94394c08..83d854e2 100644 --- a/fugue/execution/execution_engine.py +++ b/fugue/execution/execution_engine.py @@ -2,10 +2,21 @@ from abc import ABC, abstractmethod from contextlib import contextmanager from contextvars import ContextVar -from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Union +from typing import ( + Any, + Callable, + Dict, + Iterable, + Iterator, + List, + Optional, + Tuple, + TypeVar, + Union, +) from uuid import uuid4 -from triad import ParamDict, Schema, assert_or_throw +from triad import ParamDict, Schema, SerializableRLock, assert_or_throw from triad.collections.fs import FileSystem from triad.exceptions import InvalidOperationError from triad.utils.convert import to_size @@ -13,11 +24,11 @@ from fugue.bag import Bag, LocalBag from fugue.collections.partition import ( - EMPTY_PARTITION_SPEC, BagPartitionCursor, PartitionCursor, PartitionSpec, ) +from fugue._utils.sql import get_temp_tb_name from fugue.column import ColumnExpr, SelectColumns, SQLExpressionGenerator, col, is_agg from fugue.constants import _FUGUE_GLOBAL_CONF from fugue.dataframe import DataFrame, DataFrames @@ -26,11 +37,34 @@ from fugue.dataframe.utils import deserialize_df, serialize_df from fugue.exceptions import FugueBug +AnyExecutionEngine = TypeVar("AnyExecutionEngine", object, None) + _FUGUE_EXECUTION_ENGINE_CONTEXT = ContextVar( "_FUGUE_EXECUTION_ENGINE_CONTEXT", default=None ) -_DEFAULT_JOIN_KEYS: List[str] = [] +_CONTEXT_LOCK = SerializableRLock() + + +class _GlobalExecutionEngineContext: + def __init__(self): + self._engine: Optional["ExecutionEngine"] = None + + def set(self, engine: Optional["ExecutionEngine"]): + with _CONTEXT_LOCK: + if self._engine is not None: + self._engine._is_global = False + self._engine._ctx_count -= 1 + self._engine = engine + if engine is not None: + engine._is_global = True + engine._ctx_count += 1 + + def get(self) -> Optional["ExecutionEngine"]: + return self._engine + + +_FUGUE_GLOBAL_EXECUTION_ENGINE_CONTEXT = _GlobalExecutionEngineContext() class ExecutionEngineFacet: @@ -56,18 +90,43 @@ class SQLEngine(ExecutionEngineFacet, ABC): :param execution_engine: the execution engine this sql engine will run on """ + def __init__(self, execution_engine: "ExecutionEngine") -> None: + super().__init__(execution_engine) + self._uid = "_" + str(uuid4())[:5] + "_" + + def encode_name(self, name: str) -> str: + return self._uid + name + + def encode( + self, dfs: DataFrames, statement: List[Tuple[bool, str]] + ) -> Tuple[DataFrames, str]: + d = DataFrames({self.encode_name(k): v for k, v in dfs.items()}) + s = " ".join(self.encode_name(tp[1]) if tp[0] else tp[1] for tp in statement) + return d, s + @abstractmethod - def select(self, dfs: DataFrames, statement: str) -> 
DataFrame: # pragma: no cover + def select( + self, dfs: DataFrames, statement: List[Tuple[bool, str]] + ) -> DataFrame: # pragma: no cover """Execute select statement on the sql engine. :param dfs: a collection of dataframes that must have keys - :param statement: the ``SELECT`` statement using the ``dfs`` keys as tables + :param statement: the ``SELECT`` statement using the ``dfs`` keys as tables. + In each tuple, the first value indicates whether the second value is a + dataframe name reference (True), or just a part of the statement (False) :return: result of the ``SELECT`` statement .. admonition:: Examples - >>> dfs = DataFrames(a=df1, b=df2) - >>> sql_engine.select(dfs, "SELECT * FROM a UNION SELECT * FROM b") + .. code-block:: python + + dfs = DataFrames(a=df1, b=df2) + sql_engine.select( + dfs, + [(False, "SELECT * FROM "), + (True,"a"), + (False," UNION SELECT * FROM "), + (True,"b")]) .. note:: @@ -155,6 +214,8 @@ def __init__(self, conf: Any): self._compile_conf = ParamDict() self._sql_engine: Optional[SQLEngine] = None self._map_engine: Optional[MapEngine] = None + self._ctx_count = 0 + self._is_global = False @contextmanager def as_context(self) -> Iterator["ExecutionEngine"]: @@ -169,11 +230,41 @@ def as_context(self) -> Iterator["ExecutionEngine"]: transform(df, func) # will use engine in this transformation """ - token = _FUGUE_EXECUTION_ENGINE_CONTEXT.set(self) # type: ignore - try: - yield self - finally: - _FUGUE_EXECUTION_ENGINE_CONTEXT.reset(token) + return self._as_context() + + @property + def in_context(self) -> bool: + """Whether this engine is being used as a context engine""" + with _CONTEXT_LOCK: + return self._ctx_count > 0 + + def set_global(self) -> "ExecutionEngine": + """Set this execution engine to be the global execution engine. + + .. note:: + Global engine is also considered as a context engine, so + :meth:`~.ExecutionEngine.in_context` will also become true + for the global engine. + + .. admonition:: Examples + + .. code-block:: python + + engine1.set_global(): + transform(df, func) # will use engine1 in this transformation + + with engine2.as_context(): + transform(df, func) # will use engine2 + + transform(df, func) # will use engine1 + """ + _FUGUE_GLOBAL_EXECUTION_ENGINE_CONTEXT.set(self) + return self + + @property + def is_global(self) -> bool: + """Whether this engine is being used as THE global engine""" + return self._is_global def stop(self) -> None: """Stop this execution engine, do not override @@ -248,6 +339,11 @@ def create_default_sql_engine(self) -> SQLEngine: # pragma: no cover """Default SQLEngine if user doesn't specify""" raise NotImplementedError + @abstractmethod + def get_current_parallelism(self) -> int: # pragma: no cover + """Get the current number of parallelism of this engine""" + raise NotImplementedError + @abstractmethod def to_df(self, data: Any, schema: Any = None) -> DataFrame: # pragma: no cover """Convert a data structure to this engine compatible DataFrame @@ -307,7 +403,6 @@ def persist( :param lazy: ``True``: first usage of the output will trigger persisting to happen; ``False`` (eager): persist is forced to happend immediately. 
Default to ``False`` - :param args: parameter to pass to the underlying persist implementation :param kwargs: parameter to pass to the underlying persist implementation :return: the persisted dataframe @@ -327,7 +422,7 @@ def join( df1: DataFrame, df2: DataFrame, how: str, - on: List[str] = _DEFAULT_JOIN_KEYS, + on: Optional[List[str]] = None, ) -> DataFrame: # pragma: no cover """Join two dataframes @@ -341,7 +436,7 @@ def join( .. note:: - Please read :func:`this ` + Please read :func:`~.fugue.dataframe.utils.get_join_schemas` """ raise NotImplementedError @@ -497,7 +592,7 @@ def take( n: int, presort: str, na_position: str = "last", - partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC, + partition_spec: Optional[PartitionSpec] = None, ) -> DataFrame: # pragma: no cover """ Get the first n rows of a DataFrame per partition. If a presort is defined, @@ -581,9 +676,9 @@ def select( ) """ gen = SQLExpressionGenerator(enable_cast=False) - df_name = _get_temp_df_name() - sql = gen.select(cols, df_name, where=where, having=having) - res = self.sql_engine.select(DataFrames({df_name: self.to_df(df)}), sql) + df_name = get_temp_tb_name() + sql = list(gen.select(cols, df_name.key, where=where, having=having)) + res = self.sql_engine.select(DataFrames({df_name.key: self.to_df(df)}), sql) diff = gen.correct_select_schema(df.schema, cols, res.schema) return res if diff is None else res.alter_columns(diff) @@ -745,7 +840,7 @@ def zip( df1: DataFrame, df2: DataFrame, how: str = "inner", - partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC, + partition_spec: Optional[PartitionSpec] = None, temp_path: Optional[str] = None, to_file_threshold: Any = -1, df1_name: Optional[str] = None, @@ -783,6 +878,7 @@ def zip( For more details and examples, read |ZipComap|. """ + partition_spec = partition_spec or PartitionSpec() on = list(partition_spec.partition_by) how = how.lower() assert_or_throw( @@ -809,7 +905,7 @@ def update_df(df: DataFrame, name: Optional[str]) -> DataFrame: if not df.metadata.get("serialized", False): df = self._serialize_by_partition( df, - partition_spec, + partition_spec or PartitionSpec(), name, temp_path, to_file_threshold, @@ -839,7 +935,7 @@ def zip_all( self, dfs: DataFrames, how: str = "inner", - partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC, + partition_spec: Optional[PartitionSpec] = None, temp_path: Optional[str] = None, to_file_threshold: Any = -1, ) -> DataFrame: @@ -868,6 +964,7 @@ def zip_all( For more details and examples, read |ZipComap| """ + partition_spec = partition_spec or PartitionSpec() assert_or_throw(len(dfs) > 0, "can't zip 0 dataframes") pairs = list(dfs.items()) has_name = dfs.has_key @@ -980,7 +1077,7 @@ def save_df( path: str, format_hint: Any = None, mode: str = "overwrite", - partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC, + partition_spec: Optional[PartitionSpec] = None, force_single: bool = False, **kwargs: Any, ) -> None: # pragma: no cover @@ -1007,6 +1104,28 @@ def __copy__(self) -> "ExecutionEngine": def __deepcopy__(self, memo: Any) -> "ExecutionEngine": return self + def _as_context(self) -> Iterator["ExecutionEngine"]: + """Set this execution engine as the context engine. This function + is thread safe and async safe. + + .. admonition:: Examples + + .. 
code-block:: python + + with engine.as_context(): + transform(df, func) # will use engine in this transformation + + """ + with _CONTEXT_LOCK: + token = _FUGUE_EXECUTION_ENGINE_CONTEXT.set(self) # type: ignore + self._ctx_count += 1 + try: + yield self + finally: + with _CONTEXT_LOCK: + self._ctx_count -= 1 + _FUGUE_EXECUTION_ENGINE_CONTEXT.reset(token) + def _serialize_by_partition( self, df: DataFrame, @@ -1121,7 +1240,3 @@ def _generate_comap_empty_dfs(schemas: Any, named: bool) -> DataFrames: return DataFrames({k: ArrayDataFrame([], v) for k, v in schemas.items()}) else: return DataFrames([ArrayDataFrame([], v) for v in schemas.values()]) - - -def _get_temp_df_name() -> str: - return "_" + str(uuid4())[:5] diff --git a/fugue/execution/factory.py b/fugue/execution/factory.py index 22f56cbb..c2320b66 100644 --- a/fugue/execution/factory.py +++ b/fugue/execution/factory.py @@ -8,6 +8,7 @@ from ..exceptions import FuguePluginsRegistrationError from .execution_engine import ( _FUGUE_EXECUTION_ENGINE_CONTEXT, + _FUGUE_GLOBAL_EXECUTION_ENGINE_CONTEXT, ExecutionEngine, SQLEngine, ) @@ -250,6 +251,9 @@ def make_execution_engine( * If ``engine`` is None, it first try to see if there is any defined context engine to use (=> engine) + * If ``engine`` is still empty, then it will try to get the global execution + engine. See + :meth:`~fugue.execution.execution_engine.ExecutionEngine.set_global` * If ``engine`` is still empty, then if ``infer_by`` is given, it will try to infer the execution engine (=> engine) * If ``engine`` is still empty, then it will construct the default @@ -291,13 +295,20 @@ def make_execution_engine( # assume object e2_df can infer E2 engine make_execution_engine(infer_by=[e2_df]) # an E2 engine + # global + e_global = E1(conf) + e_global.set_global() + make_execution_engine() # e_global + # context with E2(conf).as_context() as ec: make_execution_engine() # ec - make_execution_engine() # the default execution engine + make_execution_engine() # e_global """ if engine is None: engine = _FUGUE_EXECUTION_ENGINE_CONTEXT.get() + if engine is None: + engine = _FUGUE_GLOBAL_EXECUTION_ENGINE_CONTEXT.get() if engine is None and infer_by is not None: engine = infer_execution_engine(infer_by) @@ -398,7 +409,7 @@ def is_pandas_or(objs: List[Any], obj_type: Any) -> bool: @fugue_plugin def infer_execution_engine(obj: List[Any]) -> Any: """Infer the correspondent ExecutionEngine based on the input objects. This is - used in interfaceless functions. + used in express functions. 
:param objs: the objects :return: if the inference succeeded, it returns an object that can be used by diff --git a/fugue/execution/native_execution_engine.py b/fugue/execution/native_execution_engine.py index c0e82fd7..b319a1cd 100644 --- a/fugue/execution/native_execution_engine.py +++ b/fugue/execution/native_execution_engine.py @@ -1,13 +1,13 @@ import inspect import logging import os -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import pandas as pd from qpd_pandas import run_sql_on_pandas from qpd_pandas.engine import PandasUtils from sqlalchemy import create_engine -from triad.collections import Schema +from triad import Schema from triad.collections.dict import IndexedOrderedDict from triad.collections.fs import FileSystem from triad.utils.assertion import assert_or_throw @@ -19,7 +19,6 @@ ) from fugue._utils.io import load_df, save_df from fugue.collections.partition import ( - EMPTY_PARTITION_SPEC, PartitionCursor, PartitionSpec, parse_presort_exp, @@ -33,12 +32,8 @@ to_local_bounded_df, ) from fugue.dataframe.utils import get_join_schemas, to_local_df -from fugue.execution.execution_engine import ( - _DEFAULT_JOIN_KEYS, - ExecutionEngine, - MapEngine, - SQLEngine, -) + +from .execution_engine import ExecutionEngine, MapEngine, SQLEngine class SqliteEngine(SQLEngine): @@ -47,11 +42,12 @@ class SqliteEngine(SQLEngine): :param execution_engine: the execution engine this sql engine will run on """ - def select(self, dfs: DataFrames, statement: str) -> DataFrame: + def select(self, dfs: DataFrames, statement: List[Tuple[bool, str]]) -> DataFrame: + _dfs, _sql = self.encode(dfs, statement) sql_engine = create_engine("sqlite:///:memory:") - for k, v in dfs.items(): + for k, v in _dfs.items(): v.as_pandas().to_sql(k, sql_engine, if_exists="replace", index=False) - df = pd.read_sql_query(statement, sql_engine) + df = pd.read_sql_query(_sql, sql_engine) return PandasDataFrame(df) @@ -61,12 +57,14 @@ class QPDPandasEngine(SQLEngine): :param execution_engine: the execution engine this sql engine will run on """ - def select(self, dfs: DataFrames, statement: str) -> DataFrame: - _dfs = { + def select(self, dfs: DataFrames, statement: List[Tuple[bool, str]]) -> DataFrame: + _dfs, _sql = self.encode(dfs, statement) + _dd = { k: self.execution_engine.to_df(v).as_pandas() # type: ignore - for k, v in dfs.items() + for k, v in _dfs.items() } - df = run_sql_on_pandas(statement, _dfs, ignore_case=True) + + df = run_sql_on_pandas(_sql, _dd, ignore_case=True) return self.execution_engine.to_df(df) @@ -155,6 +153,9 @@ def create_default_sql_engine(self) -> SQLEngine: def create_default_map_engine(self) -> MapEngine: return PandasMapEngine(self) + def get_current_parallelism(self) -> int: + return 1 + @property def pl_utils(self) -> PandasUtils: """Pandas-like dataframe utils""" @@ -188,7 +189,7 @@ def join( df1: DataFrame, df2: DataFrame, how: str, - on: List[str] = _DEFAULT_JOIN_KEYS, + on: Optional[List[str]] = None, ) -> DataFrame: key_schema, output_schema = get_join_schemas(df1, df2, how=how, on=on) d = self.pl_utils.join( @@ -309,8 +310,9 @@ def take( n: int, presort: str, na_position: str = "last", - partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC, + partition_spec: Optional[PartitionSpec] = None, ) -> DataFrame: + partition_spec = partition_spec or PartitionSpec() assert_or_throw( isinstance(n, int), ValueError("n needs to be an integer"), @@ -357,10 +359,11 @@ def save_df( path: str, format_hint: Any = 
None, mode: str = "overwrite", - partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC, + partition_spec: Optional[PartitionSpec] = None, force_single: bool = False, **kwargs: Any, ) -> None: + partition_spec = partition_spec or PartitionSpec() if not force_single and not partition_spec.empty: kwargs["partition_cols"] = partition_spec.partition_by self.fs.makedirs(os.path.dirname(path), recreate=True) diff --git a/fugue/extensions/_builtins/processors.py b/fugue/extensions/_builtins/processors.py index 724b24eb..b24da83d 100644 --- a/fugue/extensions/_builtins/processors.py +++ b/fugue/extensions/_builtins/processors.py @@ -147,7 +147,7 @@ def process(self, dfs: DataFrames) -> DataFrame: class RunSQLSelect(Processor): def process(self, dfs: DataFrames) -> DataFrame: - statement = self.params.get_or_throw("statement", str) + statement = self.params.get_or_throw("statement", object) engine = self.params.get_or_none("sql_engine", object) engine_params = self.params.get("sql_engine_params", ParamDict()) sql_engine = make_sql_engine(engine, self.execution_engine, **engine_params) diff --git a/fugue/extensions/creator/convert.py b/fugue/extensions/creator/convert.py index 86970f84..18b8ae94 100644 --- a/fugue/extensions/creator/convert.py +++ b/fugue/extensions/creator/convert.py @@ -24,7 +24,8 @@ def parse_creator(obj: Any) -> Any: .. code-block:: python - from fugue import Creator, parse_creator, FugueWorkflow + from fugue import Creator, FugueWorkflow + from fugue.plugins import parse_creator from triad import to_uuid class My(Creator): diff --git a/fugue/interfaceless.py b/fugue/interfaceless.py deleted file mode 100644 index c7bff4f0..00000000 --- a/fugue/interfaceless.py +++ /dev/null @@ -1,249 +0,0 @@ -from typing import Any, List, Optional - -from triad.utils.assertion import assert_or_throw - -from fugue.collections.yielded import Yielded -from fugue.constants import FUGUE_CONF_WORKFLOW_EXCEPTION_INJECT -from fugue.dataframe import DataFrame -from fugue.exceptions import FugueInterfacelessError, FugueWorkflowCompileError -from fugue.execution import make_execution_engine -from fugue.workflow import FugueWorkflow - - -def _check_valid_input(df: Any, save_path: Optional[str]) -> None: - # Check valid input - if isinstance(df, str): - assert_or_throw( - (".csv" not in df) and (".json" not in df), - FugueInterfacelessError( - """Fugue transform can only load parquet file paths. - Csv and json are disallowed""" - ), - ) - if save_path: - assert_or_throw( - (".csv" not in save_path) and (".json" not in save_path), - FugueInterfacelessError( - """Fugue transform can only load parquet file paths. - Csv and json are disallowed""" - ), - ) - - -def transform( # noqa: C901 - df: Any, - using: Any, - schema: Any = None, - params: Any = None, - partition: Any = None, - callback: Any = None, - ignore_errors: Optional[List[Any]] = None, - engine: Any = None, - engine_conf: Any = None, - force_output_fugue_dataframe: bool = False, - persist: bool = False, - as_local: bool = False, - save_path: Optional[str] = None, - checkpoint: bool = False, -) -> Any: - """Transform this dataframe using transformer. It's a wrapper of - :meth:`~fugue.workflow.workflow.FugueWorkflow.transform` and - :meth:`~fugue.workflow.workflow.FugueWorkflow.run`. It let you do the - basic dataframe transformation without using - :class:`~fugue.workflow.workflow.FugueWorkflow` and - :class:`~fugue.dataframe.dataframe.DataFrame`. Both input and output - can be native types only. 
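For context, a minimal sketch of the ``transform`` call documented in the docstring above, assuming the function remains importable from the top-level ``fugue`` package after this file is removed:

.. code-block:: python

    import pandas as pd
    from fugue import transform

    def add_one(df: pd.DataFrame) -> pd.DataFrame:
        return df.assign(b=df["a"] + 1)

    # the schema hint "*,b:long" declares the added column; with a pandas
    # input and no engine argument, the result is also a pandas DataFrame
    res = transform(pd.DataFrame({"a": [1, 2]}), add_one, schema="*,b:long")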
- - Please read |TransformerTutorial| - - :param df: |DataFrameLikeObject| or :class:`~fugue.workflow.yielded.Yielded` - or a path string to a parquet file - :param using: transformer-like object, can't be a string expression - :param schema: |SchemaLikeObject|, defaults to None. The transformer - will be able to access this value from - :meth:`~fugue.extensions.context.ExtensionContext.output_schema` - :param params: |ParamsLikeObject| to run the processor, defaults to None. - The transformer will be able to access this value from - :meth:`~fugue.extensions.context.ExtensionContext.params` - :param partition: |PartitionLikeObject|, defaults to None - :param callback: |RPCHandlerLikeObject|, defaults to None - :param ignore_errors: list of exception types the transformer can ignore, - defaults to None (empty list) - :param engine: it can be empty string or null (use the default execution - engine), a string (use the registered execution engine), an - :class:`~fugue.execution.execution_engine.ExecutionEngine` type, or - the :class:`~fugue.execution.execution_engine.ExecutionEngine` instance - , or a tuple of two values where the first value represents execution - engine and the second value represents the sql engine (you can use ``None`` - for either of them to use the default one), defaults to None - :param engine_conf: |ParamsLikeObject|, defaults to None - :param force_output_fugue_dataframe: If true, the function will always return - a ``FugueDataFrame``, otherwise, if ``df`` is in native dataframe types such - as pandas dataframe, then the output will also in its native format. Defaults - to False - :param persist: Whether to persist(materialize) the dataframe before returning - :param as_local: If true, the result will be converted to a ``LocalDataFrame`` - :param save_path: Whether to save the output to a file (see the note) - :param checkpoint: Whether to add a checkpoint for the output (see the note) - - :return: the transformed dataframe, if ``df`` is a native dataframe (e.g. - pd.DataFrame, spark dataframe, etc), the output will be a native dataframe, - the type is determined by the execution engine you use. But if ``df`` is - of type :class:`~fugue.dataframe.dataframe.DataFrame`, then the output will - also be a :class:`~fugue.dataframe.dataframe.DataFrame` - - .. note:: - - This function may be lazy and return the transformed dataframe. - - .. note:: - - When you use callback in this function, you must be careful that the output - dataframe must be materialized. Otherwise, if the real compute happens out of - the function call, the callback receiver is already shut down. To do that you - can either use ``persist`` or ``as_local``, both will materialize the dataframe - before the callback receiver shuts down. - - .. note:: - - * When `save_path` is None and `checkpoint` is False, then the output will - not be saved into a file. The return will be a dataframe. - * When `save_path` is None and `checkpoint` is True, then the output will be - saved into the path set by `fugue.workflow.checkpoint.path`, the name will - be randomly chosen, and it is NOT a deterministic checkpoint, so if you run - multiple times, the output will be saved into different files. The return - will be a dataframe. - * When `save_path` is not None and `checkpoint` is False, then the output will - be saved into `save_path`. The return will be the value of `save_path` - * When `save_path` is not None and `checkpoint` is True, then the output will - be saved into `save_path`. 
The return will be the dataframe from `save_path` - - This function can only take parquet file paths in `df` and `save_path`. - Csv and other file formats are disallowed. - - The checkpoint here is NOT deterministic, so re-run will generate new - checkpoints. - - If you want to read and write other file formats or if you want to use - deterministic checkpoints, please use - :class:`~fugue.workflow.workflow.FugueWorkflow`. - """ - _check_valid_input(df, save_path) - - dag = FugueWorkflow(compile_conf={FUGUE_CONF_WORKFLOW_EXCEPTION_INJECT: 0}) - try: - src = dag.create(df) - except FugueWorkflowCompileError: - if isinstance(df, str): - src = dag.load(df, fmt="parquet") - else: - raise - tdf = src.transform( - using=using, - schema=schema, - params=params, - pre_partition=partition, - callback=callback, - ignore_errors=ignore_errors or [], - ) - if persist: - tdf = tdf.persist() - if checkpoint: - if save_path is None: - - def _no_op_processor(df: DataFrame) -> DataFrame: - # this is a trick to force yielding again - # from the file to a dataframe - return df - - tdf.yield_file_as("file_result") - tdf.process(_no_op_processor).yield_dataframe_as( - "result", as_local=as_local - ) - else: - tdf.save_and_use(save_path, fmt="parquet").yield_dataframe_as( - "result", as_local=as_local - ) - else: - if save_path is None: - tdf.yield_dataframe_as("result", as_local=as_local) - else: - tdf.save(save_path, fmt="parquet") - - dag.run(make_execution_engine(engine, conf=engine_conf, infer_by=[df])) - if checkpoint: - result = dag.yields["result"].result # type:ignore - else: - if save_path is None: - result = dag.yields["result"].result # type:ignore - else: - return save_path - if force_output_fugue_dataframe or isinstance(df, (DataFrame, Yielded)): - return result - return result.as_pandas() if result.is_local else result.native # type:ignore - - -def out_transform( - df: Any, - using: Any, - params: Any = None, - partition: Any = None, - callback: Any = None, - ignore_errors: Optional[List[Any]] = None, - engine: Any = None, - engine_conf: Any = None, -) -> None: - """Transform this dataframe using transformer. It's a wrapper of - :meth:`~fugue.workflow.workflow.FugueWorkflow.out_transform` and - :meth:`~fugue.workflow.workflow.FugueWorkflow.run`. It let you do the - basic dataframe transformation without using - :class:`~fugue.workflow.workflow.FugueWorkflow` and - :class:`~fugue.dataframe.dataframe.DataFrame`. The input can be native - type only - - Please read |TransformerTutorial| - - :param df: |DataFrameLikeObject| or :class:`~fugue.workflow.yielded.Yielded` - or a path string to a parquet file - :param using: transformer-like object, can't be a string expression - :param params: |ParamsLikeObject| to run the processor, defaults to None. - The transformer will be able to access this value from - :meth:`~fugue.extensions.context.ExtensionContext.params` - :param partition: |PartitionLikeObject|, defaults to None. 
- :param callback: |RPCHandlerLikeObject|, defaults to None - :param ignore_errors: list of exception types the transformer can ignore, - defaults to None (empty list) - :param engine: it can be empty string or null (use the default execution - engine), a string (use the registered execution engine), an - :class:`~fugue.execution.execution_engine.ExecutionEngine` type, or - the :class:`~fugue.execution.execution_engine.ExecutionEngine` instance - , or a tuple of two values where the first value represents execution - engine and the second value represents the sql engine (you can use ``None`` - for either of them to use the default one), defaults to None - :param engine_conf: |ParamsLikeObject|, defaults to None - - .. note:: - - This function can only take parquet file paths in `df`. Csv and other file - formats are disallowed. - - This transformation is guaranteed to execute immediately (eager) - and return nothing - """ - dag = FugueWorkflow(compile_conf={FUGUE_CONF_WORKFLOW_EXCEPTION_INJECT: 0}) - try: - src = dag.create(df) - except FugueWorkflowCompileError: - if isinstance(df, str): - src = dag.load(df, fmt="parquet") - else: - raise - src.out_transform( - using=using, - params=params, - pre_partition=partition, - callback=callback, - ignore_errors=ignore_errors or [], - ) - - dag.run(make_execution_engine(engine, conf=engine_conf, infer_by=[df])) diff --git a/fugue/plugins.py b/fugue/plugins.py new file mode 100644 index 00000000..dee330d3 --- /dev/null +++ b/fugue/plugins.py @@ -0,0 +1,39 @@ +# flake8: noqa +# pylint: disable-all +from fugue.dataframe import ( + alter_columns, + as_array, + as_array_iterable, + as_arrow, + as_dict_iterable, + as_pandas, + drop_columns, + get_column_names, + get_schema, + head, + is_df, + peek_array, + peek_dict, + rename, + select_columns, +) +from fugue.dataset import ( + as_fugue_dataset, + as_local, + as_local_bounded, + count, + get_dataset_display, + get_num_partitions, + is_bounded, + is_empty, + is_local, +) +from fugue.execution.factory import ( + infer_execution_engine, + parse_execution_engine, + parse_sql_engine, +) +from fugue.extensions.creator import parse_creator +from fugue.extensions.outputter import parse_outputter +from fugue.extensions.processor import parse_processor +from fugue.extensions.transformer import parse_output_transformer, parse_transformer diff --git a/fugue/registry.py b/fugue/registry.py index eb6310a3..003c3625 100644 --- a/fugue/registry.py +++ b/fugue/registry.py @@ -1,7 +1,6 @@ import inspect from typing import Any, Optional -import pandas as pd import pyarrow as pa from fugue._utils.interfaceless import ( @@ -9,7 +8,6 @@ SimpleAnnotationConverter, register_annotation_converter, ) -from fugue.collections.yielded import Yielded from fugue.dataframe import ArrowDataFrame, DataFrame from fugue.execution.factory import register_execution_engine, register_sql_engine from fugue.execution.native_execution_engine import ( @@ -17,7 +15,6 @@ QPDPandasEngine, SqliteEngine, ) -from fugue.workflow import register_raw_df_type def _register() -> None: @@ -29,18 +26,10 @@ def _register() -> None: >>> import fugue """ - _register_raw_dataframes() _register_engines() _register_annotation_converters() -def _register_raw_dataframes() -> None: - register_raw_df_type(Yielded) - register_raw_df_type(pd.DataFrame) - register_raw_df_type(DataFrame) - register_raw_df_type(pa.Table) - - def _register_engines() -> None: register_execution_engine( "native", lambda conf: NativeExecutionEngine(conf), on_dup="ignore" diff --git 
a/fugue/sql/api.py b/fugue/sql/api.py new file mode 100644 index 00000000..28d900e9 --- /dev/null +++ b/fugue/sql/api.py @@ -0,0 +1,249 @@ +from typing import Any, Dict, Tuple + +from triad.utils.convert import get_caller_global_local_vars + +from fugue.dataframe import AnyDataFrame +from fugue.exceptions import FugueSQLError +from fugue.execution import AnyExecutionEngine + +from ..constants import FUGUE_CONF_SQL_IGNORE_CASE +from .workflow import FugueSQLWorkflow + + +def fugue_sql( + query: str, + *args: Any, + fsql_ignore_case: bool = False, + engine: AnyExecutionEngine = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, + **kwargs: Any, +) -> AnyDataFrame: + """Simplified Fugue SQL interface. This function can still take multiple dataframe + inputs but will always return the last generated dataframe in the SQL workflow, and + ``YIELD`` should NOT be used with this function. If you want to use Fugue SQL to + represent the full workflow, or want to see more Fugue SQL examples, + please read :func:`~.fugue_sql_flow`. + + :param query: the Fugue SQL string (can be a jinja template) + :param args: variables related to the SQL string + :param fsql_ignore_case: whether to ignore case when parsing the SQL string, + defaults to False. + :param kwargs: variables related to the SQL string + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether to return a local dataframe, defaults to False + + :return: the result dataframe + + .. note:: + + This function is different from :func:`~fugue.api.raw_sql` which directly + sends the query to the execution engine to run. This function parses the query + based on Fugue SQL syntax, creates a + :class:`~fugue.sql.workflow.FugueSQLWorkflow` which + could contain multiple raw SQLs plus other operations, and runs and returns + the last dataframe generated in the workflow. + + This function allows you to parameterize the SQL in a more elegant way. The + data tables referred to in the query can either be automatically extracted from the + local variables or be specified in the arguments. + + .. caution:: + + Currently, we have not unified the dialects of different SQL backends. So there + can be some slight syntax differences when you switch between backends. + In addition, we have not unified the UDFs across different backends, so you + should be careful when using uncommon UDFs belonging to a certain backend. + + That being said, if you keep your SQL part general and leverage Fugue extensions + (transformer, creator, processor, outputter, etc.) appropriately, it should be + easy to write backend agnostic Fugue SQL. + + We are working on unifying the dialects of different SQLs; this should be + available in future releases. Regarding unifying UDFs, the effort is still + unclear. + + ..
code-block:: python + + import pandas as pd + import fugue.api as fa + + def tr(df:pd.DataFrame) -> pd.DataFrame: + return df.assign(c=2) + + input = pd.DataFrame([[0,1],[3,4]], columns=["a","b"]) + + with fa.engine_context("duckdb"): + res = fa.fugue_sql(''' + SELECT * FROM input WHERE a<{{x}} + TRANSFORM USING tr SCHEMA *,c:int + ''', x=2) + assert fa.as_array(res) == [[0,1,2]] + """ + + dag = _build_dag(query, fsql_ignore_case=fsql_ignore_case, args=args, kwargs=kwargs) + if dag.last_df is not None: + dag.last_df.yield_dataframe_as("result", as_local=as_local) + else: # pragma: no cover + # impossible case + raise FugueSQLError(f"no dataframe to output from\n{query}") + res = dag.run(engine, engine_conf) + return res["result"] if as_fugue else res["result"].native_as_df() + + +def fugue_sql_flow( + query: str, *args: Any, fsql_ignore_case: bool = False, **kwargs: Any +) -> FugueSQLWorkflow: + """Fugue SQL full functional interface. This function allows full workflow + definition using Fugue SQL, and it allows multiple outputs using ``YIELD``. + + :param query: the Fugue SQL string (can be a jinja template) + :param args: variables related to the SQL string + :param fsql_ignore_case: whether to ignore case when parsing the SQL string, + defaults to False. + :param kwargs: variables related to the SQL string + :return: the translated Fugue workflow + + .. note:: + + This function is different from :func:`~fugue.api.raw_sql` which directly + sends the query to the execution engine to run. This function parses the query + based on Fugue SQL syntax, creates a + :class:`~fugue.sql.workflow.FugueSQLWorkflow` which + could contain multiple raw SQLs plus other operations, and returns the + workflow object for you to run explicitly. + + This function allows you to parameterize the SQL in a more elegant way. The + data tables referred to in the query can either be automatically extracted from the + local variables or be specified in the arguments. + + .. caution:: + + Currently, we have not unified the dialects of different SQL backends. So there + can be some slight syntax differences when you switch between backends. + In addition, we have not unified the UDFs across different backends, so you + should be careful when using uncommon UDFs belonging to a certain backend. + + That being said, if you keep your SQL part general and leverage Fugue extensions + (transformer, creator, processor, outputter, etc.) appropriately, it should be + easy to write backend agnostic Fugue SQL. + + We are working on unifying the dialects of different SQLs; this should be + available in future releases. Regarding unifying UDFs, the effort is still + unclear. + + ..
code-block:: python + + import pandas as pd + import fugue.api as fa + from fugue.api import fugue_sql_flow as fsql + + # Basic case + fsql(''' + CREATE [[0]] SCHEMA a:int + PRINT + ''').run() + + # With external data sources + df = pd.DataFrame([[0],[1]], columns=["a"]) + fsql(''' + SELECT * FROM df WHERE a=0 + PRINT + ''').run() + + # With external variables + df = pd.DataFrame([[0],[1]], columns=["a"]) + t = 1 + fsql(''' + SELECT * FROM df WHERE a={{t}} + PRINT + ''').run() + + # The following is the explicit way to specify variables and dataframes + # (recommended) + df = pd.DataFrame([[0],[1]], columns=["a"]) + t = 1 + fsql(''' + SELECT * FROM df WHERE a={{t}} + PRINT + ''', df=df, t=t).run() + + # Using extensions + def dummy(df:pd.DataFrame) -> pd.DataFrame: + return df + + fsql(''' + CREATE [[0]] SCHEMA a:int + TRANSFORM USING dummy SCHEMA * + PRINT + ''').run() + + # It's recommended to provide the full path of the extension inside + # Fugue SQL, so the SQL definition and execution can be more + # independent from the extension definition. + + # Run with different execution engines + sql = ''' + CREATE [[0]] SCHEMA a:int + TRANSFORM USING dummy SCHEMA * + PRINT + ''' + + fsql(sql).run(spark_session) + fsql(sql).run("dask") + + with fa.engine_context("duckdb"): + fsql(sql).run() + + # Passing dataframes between fsql calls + result = fsql(''' + CREATE [[0]] SCHEMA a:int + YIELD DATAFRAME AS x + + CREATE [[1]] SCHEMA a:int + YIELD DATAFRAME AS y + ''').run(DaskExecutionEngine) + + fsql(''' + SELECT * FROM x + UNION + SELECT * FROM y + UNION + SELECT * FROM z + + PRINT + ''', result, z=pd.DataFrame([[2]], columns=["z"])).run() + + # Get framework native dataframes + result["x"].native # Dask dataframe + result["y"].native # Dask dataframe + result["x"].as_pandas() # Pandas dataframe + + # Use lower case fugue sql + df = pd.DataFrame([[0],[1]], columns=["a"]) + t = 1 + fsql(''' + select * from df where a={{t}} + print + ''', df=df, t=t, fsql_ignore_case=True).run() + """ + dag = _build_dag(query, fsql_ignore_case=fsql_ignore_case, args=args, kwargs=kwargs) + return dag + + +def _build_dag( + query: str, + fsql_ignore_case: bool, + args: Tuple[Any, ...], + kwargs: Dict[str, Any], + level: int = -2, +) -> FugueSQLWorkflow: + global_vars, local_vars = get_caller_global_local_vars(start=level, end=level) + dag = FugueSQLWorkflow(compile_conf={FUGUE_CONF_SQL_IGNORE_CASE: fsql_ignore_case}) + try: + dag._sql(query, global_vars, local_vars, *args, **kwargs) + except SyntaxError as ex: + raise SyntaxError(str(ex)).with_traceback(None) from None + return dag diff --git a/fugue/sql/workflow.py b/fugue/sql/workflow.py index 33fcfe73..a972dcc8 100644 --- a/fugue/sql/workflow.py +++ b/fugue/sql/workflow.py @@ -7,13 +7,9 @@ from ..collections.yielded import Yielded from ..constants import FUGUE_CONF_SQL_IGNORE_CASE +from ..dataframe.api import is_df from ..dataframe.dataframe import DataFrame -from ..workflow.workflow import ( - FugueWorkflow, - WorkflowDataFrame, - WorkflowDataFrames, - is_acceptable_raw_df, -) +from ..workflow.workflow import FugueWorkflow, WorkflowDataFrame, WorkflowDataFrames from ._utils import LazyWorkflowDataFrame, fill_sql_template from ._visitors import FugueSQLHooks, _Extensions @@ -71,118 +67,8 @@ def _split_params( for k, v in params.items(): if isinstance(v, (int, str, float, bool)): p[k] = v - elif isinstance(v, (DataFrame, Yielded)) or is_acceptable_raw_df(v): + elif isinstance(v, (DataFrame, Yielded)) or is_df(v): dfs[k] = LazyWorkflowDataFrame(k, v, self) else: p[k] = v return p, dfs - - -def 
fsql( - sql: str, *args: Any, fsql_ignore_case: bool = False, **kwargs: Any -) -> FugueSQLWorkflow: - """Fugue SQL functional interface - - :param sql: the Fugue SQL string (can be a jinja template) - :param args: variables related to the SQL string - :param fsql_ignore_case: whether to ignore case when parsing the SQL string - defaults to False. - :param kwargs: variables related to the SQL string - :return: the translated Fugue workflow - - .. code-block:: python - - # Basic case - fsql(''' - CREATE [[0]] SCHEMA a:int - PRINT - ''').run() - - # With external data sources - df = pd.DataFrame([[0],[1]], columns=["a"]) - fsql(''' - SELECT * FROM df WHERE a=0 - PRINT - ''').run() - - # With external variables - df = pd.DataFrame([[0],[1]], columns=["a"]) - t = 1 - fsql(''' - SELECT * FROM df WHERE a={{t}} - PRINT - ''').run() - - # The following is the explicit way to specify variables and datafrems - # (recommended) - df = pd.DataFrame([[0],[1]], columns=["a"]) - t = 1 - fsql(''' - SELECT * FROM df WHERE a={{t}} - PRINT - ''', df=df, t=t).run() - - # Using extensions - def dummy(df:pd.DataFrame) -> pd.DataFrame: - return df - - fsql(''' - CREATE [[0]] SCHEMA a:int - TRANSFORM USING dummy SCHEMA * - PRINT - ''').run() - - # It's recommended to provide full path of the extension inside - # Fugue SQL, so the SQL definition and exeuction can be more - # independent from the extension definition. - - # Run with different execution engines - sql = ''' - CREATE [[0]] SCHEMA a:int - TRANSFORM USING dummy SCHEMA * - PRINT - ''' - - fsql(sql).run(user_defined_spark_session()) - fsql(sql).run(SparkExecutionEngine, {"spark.executor.instances":10}) - fsql(sql).run(DaskExecutionEngine) - - # Passing dataframes between fsql calls - result = fsql(''' - CREATE [[0]] SCHEMA a:int - YIELD DATAFRAME AS x - - CREATE [[1]] SCHEMA a:int - YIELD DATAFRAME AS y - ''').run(DaskExecutionEngine) - - fsql(''' - SELECT * FROM x - UNION - SELECT * FROM y - UNION - SELECT * FROM z - - PRINT - ''', result, z=pd.DataFrame([[2]], columns=["z"])).run() - - # Get framework native dataframes - result["x"].native # Dask dataframe - result["y"].native # Dask dataframe - result["x"].as_pandas() # Pandas dataframe - - # Use lower case fugue sql - df = pd.DataFrame([[0],[1]], columns=["a"]) - t = 1 - fsql(''' - select * from df where a={{t}} - print - ''', df=df, t=t, fsql_ignore_case=True).run() - """ - global_vars, local_vars = get_caller_global_local_vars() - dag = FugueSQLWorkflow(compile_conf={FUGUE_CONF_SQL_IGNORE_CASE: fsql_ignore_case}) - try: - dag._sql(sql, global_vars, local_vars, *args, **kwargs) - except SyntaxError as ex: - raise SyntaxError(str(ex)).with_traceback(None) from None - return dag diff --git a/fugue/workflow/__init__.py b/fugue/workflow/__init__.py index cb13f310..2a3af140 100644 --- a/fugue/workflow/__init__.py +++ b/fugue/workflow/__init__.py @@ -1,6 +1,7 @@ # flake8: noqa -from fugue.workflow._workflow_context import FugueWorkflowContext -from fugue.workflow.input import is_acceptable_raw_df, register_raw_df_type -from fugue.workflow.module import module -from fugue.workflow.workflow import FugueWorkflow, WorkflowDataFrame, WorkflowDataFrames +from ._workflow_context import FugueWorkflowContext +from .api import * +from .input import register_raw_df_type +from .module import module +from .workflow import FugueWorkflow, WorkflowDataFrame, WorkflowDataFrames diff --git a/fugue/workflow/api.py b/fugue/workflow/api.py new file mode 100644 index 00000000..3a376692 --- /dev/null +++ b/fugue/workflow/api.py @@ 
-0,0 +1,307 @@ +from typing import Any, Dict, List, Optional + +from triad.utils.assertion import assert_or_throw + +from ..collections.yielded import Yielded +from ..constants import FUGUE_CONF_WORKFLOW_EXCEPTION_INJECT +from ..dataframe import DataFrame, AnyDataFrame +from ..dataframe.api import get_native_as_df +from ..exceptions import FugueInterfacelessError, FugueWorkflowCompileError +from ..execution import make_execution_engine +from .workflow import FugueWorkflow + + +def _check_valid_input(df: Any, save_path: Optional[str]) -> None: + # Check valid input + if isinstance(df, str): + assert_or_throw( + (".csv" not in df) and (".json" not in df), + FugueInterfacelessError( + """Fugue transform can only load parquet file paths. + Csv and json are disallowed""" + ), + ) + if save_path: + assert_or_throw( + (".csv" not in save_path) and (".json" not in save_path), + FugueInterfacelessError( + """Fugue transform can only load parquet file paths. + Csv and json are disallowed""" + ), + ) + + +def transform( # noqa: C901 + df: Any, + using: Any, + schema: Any = None, + params: Any = None, + partition: Any = None, + callback: Any = None, + ignore_errors: Optional[List[Any]] = None, + persist: bool = False, + as_local: bool = False, + save_path: Optional[str] = None, + checkpoint: bool = False, + engine: Any = None, + engine_conf: Any = None, + as_fugue: bool = False, +) -> Any: + """Transform this dataframe using transformer. It's a wrapper of + :meth:`~fugue.workflow.workflow.FugueWorkflow.transform` and + :meth:`~fugue.workflow.workflow.FugueWorkflow.run`. It lets you do the + basic dataframe transformation without using + :class:`~fugue.workflow.workflow.FugueWorkflow` and + :class:`~fugue.dataframe.dataframe.DataFrame`. Both input and output + can be native types only. + + Please read |TransformerTutorial| + + :param df: |DataFrameLikeObject| or :class:`~fugue.workflow.yielded.Yielded` + or a path string to a parquet file + :param using: transformer-like object, can't be a string expression + :param schema: |SchemaLikeObject|, defaults to None. The transformer + will be able to access this value from + :meth:`~fugue.extensions.context.ExtensionContext.output_schema` + :param params: |ParamsLikeObject| to run the processor, defaults to None. + The transformer will be able to access this value from + :meth:`~fugue.extensions.context.ExtensionContext.params` + :param partition: |PartitionLikeObject|, defaults to None + :param callback: |RPCHandlerLikeObject|, defaults to None + :param ignore_errors: list of exception types the transformer can ignore, + defaults to None (empty list) + :param engine: it can be empty string or null (use the default execution + engine), a string (use the registered execution engine), an + :class:`~fugue.execution.execution_engine.ExecutionEngine` type, or + the :class:`~fugue.execution.execution_engine.ExecutionEngine` instance + , or a tuple of two values where the first value represents execution + engine and the second value represents the sql engine (you can use ``None`` + for either of them to use the default one), defaults to None + :param engine_conf: |ParamsLikeObject|, defaults to None + :param as_fugue: If true, the function will always return + a ``FugueDataFrame``, otherwise, if ``df`` is in native dataframe types such + as pandas dataframe, then the output will also be in its native format.
Defaults + to False + :param persist: Whether to persist(materialize) the dataframe before returning + :param as_local: If true, the result will be converted to a ``LocalDataFrame`` + :param save_path: Whether to save the output to a file (see the note) + :param checkpoint: Whether to add a checkpoint for the output (see the note) + + :return: the transformed dataframe, if ``df`` is a native dataframe (e.g. + pd.DataFrame, spark dataframe, etc), the output will be a native dataframe, + the type is determined by the execution engine you use. But if ``df`` is + of type :class:`~fugue.dataframe.dataframe.DataFrame`, then the output will + also be a :class:`~fugue.dataframe.dataframe.DataFrame` + + .. note:: + + This function may be lazy and return the transformed dataframe. + + .. note:: + + When you use callback in this function, you must be careful that the output + dataframe must be materialized. Otherwise, if the real compute happens out of + the function call, the callback receiver is already shut down. To do that you + can either use ``persist`` or ``as_local``, both will materialize the dataframe + before the callback receiver shuts down. + + .. note:: + + * When `save_path` is None and `checkpoint` is False, then the output will + not be saved into a file. The return will be a dataframe. + * When `save_path` is None and `checkpoint` is True, then the output will be + saved into the path set by `fugue.workflow.checkpoint.path`, the name will + be randomly chosen, and it is NOT a deterministic checkpoint, so if you run + multiple times, the output will be saved into different files. The return + will be a dataframe. + * When `save_path` is not None and `checkpoint` is False, then the output will + be saved into `save_path`. The return will be the value of `save_path` + * When `save_path` is not None and `checkpoint` is True, then the output will + be saved into `save_path`. The return will be the dataframe from `save_path` + + This function can only take parquet file paths in `df` and `save_path`. + Csv and other file formats are disallowed. + + The checkpoint here is NOT deterministic, so re-run will generate new + checkpoints. + + If you want to read and write other file formats or if you want to use + deterministic checkpoints, please use + :class:`~fugue.workflow.workflow.FugueWorkflow`. 
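    A minimal usage sketch (the ``add_one`` transformer and the data below are
    illustrative, and the ``dask`` engine requires the dask backend to be installed):

    .. code-block:: python

        import pandas as pd
        from fugue import transform

        # a plain, type-annotated function can act as the transformer
        def add_one(df: pd.DataFrame) -> pd.DataFrame:
            return df.assign(b=df["a"] + 1)

        df = pd.DataFrame({"a": [0, 1, 2]})

        # run on the default (local) engine; the schema describes the output
        res = transform(df, add_one, schema="*,b:long")

        # the same call can run distributedly by only switching the engine,
        # for example engine="dask"
        # res = transform(df, add_one, schema="*,b:long", engine="dask")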
+ """ + _check_valid_input(df, save_path) + + dag = FugueWorkflow(compile_conf={FUGUE_CONF_WORKFLOW_EXCEPTION_INJECT: 0}) + try: + src = dag.create(df) + except FugueWorkflowCompileError: + if isinstance(df, str): + src = dag.load(df, fmt="parquet") + else: + raise + tdf = src.transform( + using=using, + schema=schema, + params=params, + pre_partition=partition, + callback=callback, + ignore_errors=ignore_errors or [], + ) + if persist: + tdf = tdf.persist() + if checkpoint: + if save_path is None: + + def _no_op_processor(df: DataFrame) -> DataFrame: + # this is a trick to force yielding again + # from the file to a dataframe + return df + + tdf.yield_file_as("file_result") + tdf.process(_no_op_processor).yield_dataframe_as( + "result", as_local=as_local + ) + else: + tdf.save_and_use(save_path, fmt="parquet").yield_dataframe_as( + "result", as_local=as_local + ) + else: + if save_path is None: + tdf.yield_dataframe_as("result", as_local=as_local) + else: + tdf.save(save_path, fmt="parquet") + + dag.run(make_execution_engine(engine, conf=engine_conf, infer_by=[df])) + if checkpoint: + result = dag.yields["result"].result # type:ignore + else: + if save_path is None: + result = dag.yields["result"].result # type:ignore + else: + return save_path + if as_fugue or isinstance(df, (DataFrame, Yielded)): + return result + return result.as_pandas() if result.is_local else result.native # type:ignore + + +def out_transform( + df: Any, + using: Any, + params: Any = None, + partition: Any = None, + callback: Any = None, + ignore_errors: Optional[List[Any]] = None, + engine: Any = None, + engine_conf: Any = None, +) -> None: + """Transform this dataframe using transformer. It's a wrapper of + :meth:`~fugue.workflow.workflow.FugueWorkflow.out_transform` and + :meth:`~fugue.workflow.workflow.FugueWorkflow.run`. It let you do the + basic dataframe transformation without using + :class:`~fugue.workflow.workflow.FugueWorkflow` and + :class:`~fugue.dataframe.dataframe.DataFrame`. The input can be native + type only + + Please read |TransformerTutorial| + + :param df: |DataFrameLikeObject| or :class:`~fugue.workflow.yielded.Yielded` + or a path string to a parquet file + :param using: transformer-like object, can't be a string expression + :param params: |ParamsLikeObject| to run the processor, defaults to None. + The transformer will be able to access this value from + :meth:`~fugue.extensions.context.ExtensionContext.params` + :param partition: |PartitionLikeObject|, defaults to None. + :param callback: |RPCHandlerLikeObject|, defaults to None + :param ignore_errors: list of exception types the transformer can ignore, + defaults to None (empty list) + :param engine: it can be empty string or null (use the default execution + engine), a string (use the registered execution engine), an + :class:`~fugue.execution.execution_engine.ExecutionEngine` type, or + the :class:`~fugue.execution.execution_engine.ExecutionEngine` instance + , or a tuple of two values where the first value represents execution + engine and the second value represents the sql engine (you can use ``None`` + for either of them to use the default one), defaults to None + :param engine_conf: |ParamsLikeObject|, defaults to None + + .. note:: + + This function can only take parquet file paths in `df`. Csv and other file + formats are disallowed. 
+ + This transformation is guaranteed to execute immediately (eager) + and return nothing + """ + dag = FugueWorkflow(compile_conf={FUGUE_CONF_WORKFLOW_EXCEPTION_INJECT: 0}) + try: + src = dag.create(df) + except FugueWorkflowCompileError: + if isinstance(df, str): + src = dag.load(df, fmt="parquet") + else: + raise + src.out_transform( + using=using, + params=params, + pre_partition=partition, + callback=callback, + ignore_errors=ignore_errors or [], + ) + + dag.run(make_execution_engine(engine, conf=engine_conf, infer_by=[df])) + + +def raw_sql( + *statements: Any, + engine: Any = None, + engine_conf: Any = None, + as_fugue: bool = False, + as_local: bool = False, +) -> AnyDataFrame: + """Run raw SQL on the execution engine + + :param statements: a sequence of sub-statements in string + or dataframe-like objects + :param engine: an engine like object, defaults to None + :param engine_conf: the configs for the engine, defaults to None + :param as_fugue: whether to force return a Fugue DataFrame, defaults to False + :param as_local: whether return a local dataframe, defaults to False + + :return: the result dataframe + + .. caution:: + + Currently, only ``SELECT`` statements are supported + + .. admonition:: Examples + + .. code-block:: python + + import pandas as pd + import fugue.api as fa + + with fa.engine_context("duckdb"): + a = fa.as_fugue_df([[0,1]], schema="a:long,b:long") + b = pd.DataFrame([[0,10]], columns=["a","b"]) + c = fa.raw_sql("SELECT * FROM",a,"UNION SELECT * FROM",b) + fa.as_pandas(c) + """ + dag = FugueWorkflow(compile_conf={FUGUE_CONF_WORKFLOW_EXCEPTION_INJECT: 0}) + sp: List[Any] = [] + infer_by: List[Any] = [] + inputs: Dict[int, Any] = {} + for x in statements: + if isinstance(x, str): + sp.append(x) + else: + if id(x) in inputs: + sp.append(inputs[id(x)]) + else: + inputs[id(x)] = dag.create(x) + sp.append(inputs[id(x)]) + infer_by.append(x) + + engine = make_execution_engine(engine, engine_conf, infer_by=infer_by) + dag.select(*sp).yield_dataframe_as("result", as_local=as_local) + res = dag.run(engine) + + return res["result"] if as_fugue else get_native_as_df(res["result"]) diff --git a/fugue/workflow/input.py b/fugue/workflow/input.py index ce520a49..a80f271a 100644 --- a/fugue/workflow/input.py +++ b/fugue/workflow/input.py @@ -1,32 +1,10 @@ -from typing import Any, Set, Type +from typing import Type -from fugue.extensions._builtins import CreateData -from fugue.extensions.creator import parse_creator -_VALID_RAW_DF_TYPES: Set[Type] = set() +def register_raw_df_type(df_type: Type) -> None: # pragma: no cover + """TODO: This function is to be removed before 0.9.0 - -def register_raw_df_type(df_type: Type) -> None: - """Register a base type of dataframe that can be recognized by - :class:`~fugue.workflow.workflow.FugueWorkflow` and converted to - :class:`~fugue.workflow.workflow.WorkflowDataFrame` - - :param df_type: dataframe type, for example ``dask.dataframe.DataFrame`` - """ - - _VALID_RAW_DF_TYPES.add(df_type) - - @parse_creator.candidate(lambda x: isinstance(x, df_type), priority=0.5) - def _parse(x: Any) -> Any: - return CreateData(x) - - -def is_acceptable_raw_df(df: Any) -> bool: - """Whether the input ``df`` can be converted to - :class:`~fugue.workflow.workflow.WorkflowDataFrame` - :param df: input raw dataframe - :return: whether this dataframe is convertible + .. deprecated:: 0.8.0 + Register using :func:`fugue.api.is_df` instead. 
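    A sketch of the replacement pattern, mirroring how the dask backend registers
    ``dd.DataFrame`` elsewhere in this change (``MyDF`` is a placeholder for a
    custom raw dataframe type):

    .. code-block:: python

        from typing import Any

        from fugue import ArrayDataFrame
        from fugue.plugins import as_fugue_dataset, is_df

        class MyDF:  # placeholder custom dataframe type
            def __init__(self, rows, schema):
                self.rows, self.schema = rows, schema

        # let Fugue recognize MyDF as a dataframe
        @is_df.candidate(lambda df: isinstance(df, MyDF))
        def _my_is_df(df: MyDF) -> bool:
            return True

        # let Fugue convert MyDF to a Fugue DataFrame when needed
        @as_fugue_dataset.candidate(lambda df, **kwargs: isinstance(df, MyDF))
        def _my_as_fugue(df: MyDF, **kwargs: Any) -> ArrayDataFrame:
            return ArrayDataFrame(df.rows, schema=df.schema)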
""" - import fugue._utils.register # pylint: disable=W0611 # noqa: F401 - - return any(isinstance(df, t) for t in _VALID_RAW_DF_TYPES) + raise DeprecationWarning("use fugue.api.is_df to register the dataframe") diff --git a/fugue/workflow/workflow.py b/fugue/workflow/workflow.py index fca6864b..704aee13 100644 --- a/fugue/workflow/workflow.py +++ b/fugue/workflow/workflow.py @@ -1,9 +1,28 @@ import sys from collections import defaultdict -from typing import Any, Callable, Dict, Iterable, List, Optional, Set, TypeVar, Union +from typing import ( + Any, + Callable, + Dict, + Iterable, + List, + Optional, + Set, + Tuple, + TypeVar, + Union, +) from uuid import uuid4 from adagio.specs import WorkflowSpec +from triad import ( + ParamDict, + Schema, + SerializableRLock, + assert_or_throw, + extensible_class, +) + from fugue._utils.exception import modify_traceback from fugue.collections.partition import PartitionSpec from fugue.collections.yielded import Yielded @@ -19,6 +38,7 @@ FUGUE_CONF_WORKFLOW_EXCEPTION_OPTIMIZE, ) from fugue.dataframe import DataFrame, LocalBoundedDataFrame, YieldedDataFrame +from fugue.dataframe.api import is_df from fugue.dataframe.dataframes import DataFrames from fugue.exceptions import FugueWorkflowCompileError, FugueWorkflowError from fugue.execution.factory import make_execution_engine @@ -56,14 +76,6 @@ from fugue.workflow._checkpoint import FileCheckpoint, WeakCheckpoint from fugue.workflow._tasks import Create, FugueTask, Output, Process from fugue.workflow._workflow_context import FugueWorkflowContext -from fugue.workflow.input import is_acceptable_raw_df -from triad import ( - ParamDict, - Schema, - SerializableRLock, - assert_or_throw, - extensible_class, -) _DEFAULT_IGNORE_ERRORS: List[Any] = [] @@ -96,6 +108,13 @@ def spec_uuid(self) -> str: """UUID of its task spec""" return self._task.__uuid__() + @property + def native(self) -> Any: # pragma: no cover + raise NotImplementedError + + def native_as_df(self) -> Any: # pragma: no cover + raise NotImplementedError + @property def name(self) -> str: """Name of its task spec""" @@ -1312,7 +1331,7 @@ def num_partitions(self) -> int: # pragma: no cover """ raise NotImplementedError("WorkflowDataFrame does not support this method") - def peek_array(self) -> Any: # pragma: no cover + def peek_array(self) -> List[Any]: # pragma: no cover """ :raises NotImplementedError: don't call this method """ @@ -1462,6 +1481,7 @@ def __init__(self, compile_conf: Any = None): self._compile_conf = ParamDict( {**_FUGUE_GLOBAL_CONF, **ParamDict(compile_conf)} ) + self._last_df: Optional[WorkflowDataFrame] = None @property def conf(self) -> ParamDict: @@ -1541,6 +1561,10 @@ def run( def yields(self) -> Dict[str, Yielded]: return self._yields + @property + def last_df(self) -> Optional[WorkflowDataFrame]: + return self._last_df + def __enter__(self): return self @@ -1585,8 +1609,16 @@ def create( :meth:`~fugue.extensions.context.ExtensionContext.partition_spec` :return: result dataframe """ - task = Create(creator=using, schema=schema, params=params) - return self.add(task) + task = Create( + creator=CreateData(using) + if is_df(using) or isinstance(using, Yielded) + else using, + schema=schema, + params=params, + ) + res = self.add(task) + self._last_df = res + return res def process( self, @@ -1626,9 +1658,11 @@ def process( input_names=None if not _dfs.has_key else list(_dfs.keys()), ) if _dfs.has_key: - return self.add(task, **_dfs) + res = self.add(task, **_dfs) else: - return self.add(task, *_dfs.values()) + res = 
self.add(task, *_dfs.values()) + self._last_df = res + return res def output( self, *dfs: Any, using: Any, params: Any = None, pre_partition: Any = None @@ -1694,11 +1728,12 @@ def create_data( "schema must be None when data is WorkflowDataFrame" ), ) + self._last_df = data return data if ( (isinstance(data, (List, Iterable)) and not isinstance(data, str)) or isinstance(data, Yielded) - or is_acceptable_raw_df(data) + or is_df(data) ): return self.create( using=CreateData( @@ -2054,20 +2089,23 @@ def select( Please read :ref:`this ` for more examples """ - s_str: List[str] = [] + sql: List[Tuple[bool, str]] = [] dfs: Dict[str, DataFrame] = {} for s in statements: if isinstance(s, str): - s_str.append(s) - if isinstance(s, DataFrame): + sql.append((False, s)) + else: ws = self.df(s) dfs[ws.name] = ws - s_str.append(ws.name) - sql = " ".join(s_str).strip() - if not sql[:10].upper().startswith("SELECT") and not sql[ - :10 - ].upper().startswith("WITH"): - sql = "SELECT " + sql + sql.append((True, ws.name)) + if sql[0][0]: # starts with reference + sql.insert(0, (False, "SELECT")) + else: # start with string but without select + start = sql[0][1].strip() + if not start[:10].upper().startswith("SELECT") and not start[ + :10 + ].upper().startswith("WITH"): + sql[0] = (False, "SELECT " + start) return self.process( dfs, using=RunSQLSelect, diff --git a/fugue_dask/__init__.py b/fugue_dask/__init__.py index 49cbec1d..296f6e5c 100644 --- a/fugue_dask/__init__.py +++ b/fugue_dask/__init__.py @@ -3,8 +3,3 @@ from fugue_dask.dataframe import DaskDataFrame from fugue_dask.execution_engine import DaskExecutionEngine - -try: - from fugue_dask.ibis_engine import DaskIbisEngine -except Exception: # pragma: no cover - pass diff --git a/fugue_dask/_constants.py b/fugue_dask/_constants.py index e97d8a2e..ee2c43d7 100644 --- a/fugue_dask/_constants.py +++ b/fugue_dask/_constants.py @@ -1,8 +1,4 @@ from typing import Any, Dict -from dask.system import CPU_COUNT - -FUGUE_DASK_CONF_DATAFRAME_DEFAULT_PARTITIONS = "fugue.dask.dataframe.default.partitions" -FUGUE_DASK_DEFAULT_CONF: Dict[str, Any] = { - FUGUE_DASK_CONF_DATAFRAME_DEFAULT_PARTITIONS: CPU_COUNT * 2 -} +FUGUE_DASK_CONF_DEFAULT_PARTITIONS = "fugue.dask.default.partitions" +FUGUE_DASK_DEFAULT_CONF: Dict[str, Any] = {FUGUE_DASK_CONF_DEFAULT_PARTITIONS: -1} diff --git a/fugue_dask/_utils.py b/fugue_dask/_utils.py index 2cc22876..9dc1f140 100644 --- a/fugue_dask/_utils.py +++ b/fugue_dask/_utils.py @@ -8,6 +8,20 @@ from qpd_dask.engine import DaskUtils as DaskUtilsBase from triad.utils.pyarrow import to_pandas_dtype, to_single_pandas_dtype +import fugue.api as fa +from fugue.constants import FUGUE_CONF_DEFAULT_PARTITIONS + +from ._constants import FUGUE_DASK_CONF_DEFAULT_PARTITIONS + + +def get_default_partitions() -> int: + engine = fa.get_current_engine() + n = engine.conf.get( + FUGUE_DASK_CONF_DEFAULT_PARTITIONS, + engine.conf.get(FUGUE_CONF_DEFAULT_PARTITIONS, -1), + ) + return n if n > 0 else engine.get_current_parallelism() * 2 + class DaskUtils(DaskUtilsBase): def get_or_create_client(self, client: Optional[Client] = None): diff --git a/fugue_dask/dataframe.py b/fugue_dask/dataframe.py index 85068273..02dcc500 100644 --- a/fugue_dask/dataframe.py +++ b/fugue_dask/dataframe.py @@ -1,8 +1,12 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple -import dask.dataframe as pd +import dask.dataframe as dd import pandas import pyarrow as pa +from triad.collections.schema import Schema +from triad.utils.assertion import assert_arg_not_none, 
assert_or_throw +from triad.utils.pyarrow import to_pandas_dtype + from fugue.dataframe import ( ArrowDataFrame, DataFrame, @@ -11,34 +15,23 @@ PandasDataFrame, ) from fugue.dataframe.dataframe import _input_schema -from fugue.dataframe.utils import ( - get_dataframe_column_names, - rename_dataframe_column_names, -) from fugue.exceptions import FugueDataFrameOperationError -from triad.collections.schema import Schema -from triad.utils.assertion import assert_arg_not_none, assert_or_throw -from triad.utils.pyarrow import to_pandas_dtype - -from fugue_dask._constants import ( - FUGUE_DASK_CONF_DATAFRAME_DEFAULT_PARTITIONS, - FUGUE_DASK_DEFAULT_CONF, +from fugue.plugins import ( + as_local_bounded, + count, + drop_columns, + get_column_names, + get_num_partitions, + head, + is_bounded, + is_df, + is_empty, + is_local, + rename, + select_columns, ) -from fugue_dask._utils import DASK_UTILS - -@get_dataframe_column_names.candidate(lambda df: isinstance(df, pd.DataFrame)) -def _get_dask_dataframe_columns(df: pd.DataFrame) -> List[Any]: - return list(df.columns) - - -@rename_dataframe_column_names.candidate( - lambda df, *args, **kwargs: isinstance(df, pd.DataFrame) -) -def _rename_dask_dataframe(df: pd.DataFrame, names: Dict[str, Any]) -> pd.DataFrame: - if len(names) == 0: - return df - return df.rename(columns=names) +from ._utils import DASK_UTILS, get_default_partitions class DaskDataFrame(DataFrame): @@ -50,7 +43,7 @@ class DaskDataFrame(DataFrame): :param schema: |SchemaLikeObject| or :class:`spark:pyspark.sql.types.StructType`, defaults to None. :param num_partitions: initial number of partitions for the dask dataframe - defaults to 0 to get the value from `fugue.dask.dataframe.default.partitions` + defaults to 0 to get the value from `fugue.dask.default.partitions` :param type_safe: whether to cast input data to ensure type safe, defaults to True .. 
note:: @@ -66,30 +59,28 @@ def __init__( # noqa: C901 type_safe=True, ): if num_partitions <= 0: - num_partitions = FUGUE_DASK_DEFAULT_CONF[ - FUGUE_DASK_CONF_DATAFRAME_DEFAULT_PARTITIONS - ] + num_partitions = get_default_partitions() if df is None: schema = _input_schema(schema).assert_not_empty() df = [] if isinstance(df, DaskDataFrame): super().__init__(df.schema) - self._native: pd.DataFrame = df._native + self._native: dd.DataFrame = df._native return - elif isinstance(df, (pd.DataFrame, pd.Series)): - if isinstance(df, pd.Series): + elif isinstance(df, (dd.DataFrame, dd.Series)): + if isinstance(df, dd.Series): df = df.to_frame() pdf = df schema = None if schema is None else _input_schema(schema) elif isinstance(df, (pandas.DataFrame, pandas.Series)): if isinstance(df, pandas.Series): df = df.to_frame() - pdf = pd.from_pandas(df, npartitions=num_partitions, sort=False) + pdf = dd.from_pandas(df, npartitions=num_partitions, sort=False) schema = None if schema is None else _input_schema(schema) elif isinstance(df, Iterable): schema = _input_schema(schema).assert_not_empty() t = PandasDataFrame(df, schema) - pdf = pd.from_pandas(t.native, npartitions=num_partitions, sort=False) + pdf = dd.from_pandas(t.native, npartitions=num_partitions, sort=False) type_safe = False else: raise ValueError(f"{df} is incompatible with DaskDataFrame") @@ -98,11 +89,11 @@ def __init__( # noqa: C901 self._native = pdf @property - def native(self) -> pd.DataFrame: - """The wrapped Dask DataFrame + def native(self) -> dd.DataFrame: + """The wrapped Dask DataFrame""" + return self._native - :rtype: :class:`dask:dask.dataframe.DataFrame` - """ + def native_as_df(self) -> dd.DataFrame: return self._native @property @@ -110,7 +101,10 @@ def is_local(self) -> bool: return False def as_local(self) -> LocalDataFrame: - return PandasDataFrame(self.as_pandas(), self.schema) + res = PandasDataFrame(self.as_pandas(), self.schema) + if self.has_metadata: + res.reset_metadata(self.metadata) + return res @property def is_bounded(self) -> bool: @@ -122,7 +116,7 @@ def empty(self) -> bool: @property def num_partitions(self) -> int: - return self.native.npartitions + return _dd_get_num_partitions(self.native) def _drop_cols(self, cols: List[str]) -> DataFrame: cols = (self.schema - cols).names @@ -132,7 +126,7 @@ def _select_cols(self, cols: List[Any]) -> DataFrame: schema = self.schema.extract(cols) return DaskDataFrame(self.native[schema.names], schema, type_safe=False) - def peek_array(self) -> Any: + def peek_array(self) -> List[Any]: self.assert_not_empty() return self.as_pandas().iloc[0].values.tolist() @@ -142,7 +136,7 @@ def persist(self, **kwargs: Any) -> "DaskDataFrame": return self def count(self) -> int: - return self.as_pandas().shape[0] + return self.native.shape[0].compute() def as_pandas(self) -> pandas.DataFrame: return self.native.compute().reset_index(drop=True) @@ -222,8 +216,8 @@ def head( return PandasDataFrame(ddf.head(n, compute=True, npartitions=-1), schema=schema) def _apply_schema( - self, pdf: pd.DataFrame, schema: Optional[Schema], type_safe: bool = True - ) -> Tuple[pd.DataFrame, Schema]: + self, pdf: dd.DataFrame, schema: Optional[Schema], type_safe: bool = True + ) -> Tuple[dd.DataFrame, Schema]: if not type_safe: assert_arg_not_none(pdf, "pdf") assert_arg_not_none(schema, "schema") @@ -244,3 +238,96 @@ def _apply_schema( ) pdf.columns = schema.names return DASK_UTILS.enforce_type(pdf, schema.pa_schema, null_safe=True), schema + + +@is_df.candidate(lambda df: isinstance(df, dd.DataFrame)) +def 
_dd_is_df(df: dd.DataFrame) -> bool: + return True + + +@get_num_partitions.candidate(lambda df: isinstance(df, dd.DataFrame)) +def _dd_get_num_partitions(df: dd.DataFrame) -> int: + return df.npartitions + + +@count.candidate(lambda df: isinstance(df, dd.DataFrame)) +def _dd_count(df: dd.DataFrame) -> int: + return df.shape[0].compute() + + +@is_bounded.candidate(lambda df: isinstance(df, dd.DataFrame)) +def _dd_is_bounded(df: dd.DataFrame) -> bool: + return True + + +@is_empty.candidate(lambda df: isinstance(df, dd.DataFrame)) +def _dd_is_empty(df: dd.DataFrame) -> bool: + return DASK_UTILS.empty(df) + + +@is_local.candidate(lambda df: isinstance(df, dd.DataFrame)) +def _dd_is_local(df: dd.DataFrame) -> bool: + return False + + +@as_local_bounded.candidate(lambda df: isinstance(df, dd.DataFrame)) +def _dd_as_local(df: dd.DataFrame) -> pandas.DataFrame: + return df.compute() + + +@get_column_names.candidate(lambda df: isinstance(df, dd.DataFrame)) +def _get_dask_dataframe_columns(df: dd.DataFrame) -> List[Any]: + return list(df.columns) + + +@rename.candidate(lambda df, *args, **kwargs: isinstance(df, dd.DataFrame)) +def _rename_dask_dataframe(df: dd.DataFrame, columns: Dict[str, Any]) -> dd.DataFrame: + if len(columns) == 0: + return df + _assert_no_missing(df, columns.keys()) + return df.rename(columns=columns) + + +@drop_columns.candidate(lambda df, *args, **kwargs: isinstance(df, dd.DataFrame)) +def _drop_dd_columns( + df: dd.DataFrame, columns: List[str], as_fugue: bool = False +) -> Any: + cols = [x for x in df.columns if x not in columns] + if len(cols) == 0: + raise FugueDataFrameOperationError("cannot drop all columns") + if len(cols) + len(columns) != len(df.columns): + _assert_no_missing(df, columns) + return _adjust_df(df[cols], as_fugue=as_fugue) + + +@select_columns.candidate(lambda df, *args, **kwargs: isinstance(df, dd.DataFrame)) +def _select_dd_columns( + df: dd.DataFrame, columns: List[Any], as_fugue: bool = False +) -> Any: + if len(columns) == 0: + raise FugueDataFrameOperationError("must select at least one column") + _assert_no_missing(df, columns) + return _adjust_df(df[columns], as_fugue=as_fugue) + + +@head.candidate(lambda df, *args, **kwargs: isinstance(df, dd.DataFrame)) +def _dd_head( + df: dd.DataFrame, + n: int, + columns: Optional[List[str]] = None, + as_fugue: bool = False, +) -> pandas.DataFrame: + if columns is not None: + df = df[columns] + res = df.head(n, compute=True, npartitions=-1) + return PandasDataFrame(res) if as_fugue else res + + +def _assert_no_missing(df: dd.DataFrame, columns: Iterable[Any]) -> None: + missing = set(columns) - set(df.columns) + if len(missing) > 0: + raise FugueDataFrameOperationError(f"found nonexistent columns: {missing}") + + +def _adjust_df(res: dd.DataFrame, as_fugue: bool): + return res if not as_fugue else DaskDataFrame(res) diff --git a/fugue_dask/execution_engine.py b/fugue_dask/execution_engine.py index 966479e3..7fd35724 100644 --- a/fugue_dask/execution_engine.py +++ b/fugue_dask/execution_engine.py @@ -1,25 +1,9 @@ import logging import os -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import dask.dataframe as dd from distributed import Client -from fugue.collections.partition import ( - EMPTY_PARTITION_SPEC, - PartitionCursor, - PartitionSpec, - parse_presort_exp, -) -from fugue.constants import KEYWORD_CORECOUNT, KEYWORD_ROWCOUNT -from fugue.dataframe import DataFrame, DataFrames, LocalDataFrame, PandasDataFrame -from 
fugue.dataframe.utils import get_join_schemas -from fugue.execution.execution_engine import ( - _DEFAULT_JOIN_KEYS, - ExecutionEngine, - SQLEngine, - MapEngine, -) -from fugue.execution.native_execution_engine import NativeExecutionEngine from qpd_dask import run_sql_on_dask from triad.collections import Schema from triad.collections.dict import IndexedOrderedDict, ParamDict @@ -28,11 +12,17 @@ from triad.utils.hash import to_uuid from triad.utils.threading import RunOnce -from fugue_dask._constants import ( - CPU_COUNT, - FUGUE_DASK_CONF_DATAFRAME_DEFAULT_PARTITIONS, - FUGUE_DASK_DEFAULT_CONF, +from fugue.collections.partition import ( + PartitionCursor, + PartitionSpec, + parse_presort_exp, ) +from fugue.constants import KEYWORD_CORECOUNT, KEYWORD_ROWCOUNT +from fugue.dataframe import DataFrame, DataFrames, LocalDataFrame, PandasDataFrame +from fugue.dataframe.utils import get_join_schemas +from fugue.execution.execution_engine import ExecutionEngine, MapEngine, SQLEngine +from fugue.execution.native_execution_engine import NativeExecutionEngine +from fugue_dask._constants import FUGUE_DASK_DEFAULT_CONF from fugue_dask._io import load_df, save_df from fugue_dask._utils import DASK_UTILS, DaskUtils from fugue_dask.dataframe import DaskDataFrame @@ -51,12 +41,13 @@ def __init__(self, execution_engine: ExecutionEngine): ) super().__init__(execution_engine) - def select(self, dfs: DataFrames, statement: str) -> DataFrame: + def select(self, dfs: DataFrames, statement: List[Tuple[bool, str]]) -> DataFrame: + _dfs, _sql = self.encode(dfs, statement) dask_dfs = { k: self.execution_engine.to_df(v).native # type: ignore - for k, v in dfs.items() + for k, v in _dfs.items() } - df = run_sql_on_dask(statement, dask_dfs, ignore_case=True) + df = run_sql_on_dask(_sql, dask_dfs, ignore_case=True) return DaskDataFrame(df) @@ -163,6 +154,10 @@ def create_default_sql_engine(self) -> SQLEngine: def create_default_map_engine(self) -> MapEngine: return DaskMapEngine(self) + def get_current_parallelism(self) -> int: + res = dict(self.dask_client.nthreads()) + return sum(res.values()) + @property def pl_utils(self) -> DaskUtils: """Pandas-like dataframe utils""" @@ -187,9 +182,7 @@ def to_df(self, df: Any, schema: Any = None) -> DaskDataFrame: * all other methods in the engine can take arbitrary dataframes and call this method to convert before doing anything """ - default_partitions = self.conf.get_or_throw( - FUGUE_DASK_CONF_DATAFRAME_DEFAULT_PARTITIONS, int - ) + if isinstance(df, DataFrame): assert_or_throw( schema is None, @@ -198,15 +191,12 @@ def to_df(self, df: Any, schema: Any = None) -> DaskDataFrame: if isinstance(df, DaskDataFrame): return df if isinstance(df, PandasDataFrame): - return DaskDataFrame( - df.native, df.schema, num_partitions=default_partitions - ) - return DaskDataFrame( - df.as_array(type_safe=True), - df.schema, - num_partitions=default_partitions, - ) - return DaskDataFrame(df, schema, num_partitions=default_partitions) + res = DaskDataFrame(df.native, df.schema) + else: + res = DaskDataFrame(df.as_array(type_safe=True), df.schema) + res.reset_metadata(df.metadata) + return res + return DaskDataFrame(df, schema) def repartition( self, df: DataFrame, partition_spec: PartitionSpec @@ -219,7 +209,7 @@ def repartition( p = partition_spec.get_num_partitions( **{ KEYWORD_ROWCOUNT: lambda: df.persist().count(), # type: ignore - KEYWORD_CORECOUNT: lambda: CPU_COUNT, + KEYWORD_CORECOUNT: lambda: self.get_current_parallelism(), } ) if p > 0: @@ -249,7 +239,7 @@ def join( df1: DataFrame, 
df2: DataFrame, how: str, - on: List[str] = _DEFAULT_JOIN_KEYS, + on: Optional[List[str]] = None, ) -> DataFrame: key_schema, output_schema = get_join_schemas(df1, df2, how=how, on=on) d = self.pl_utils.join( @@ -376,8 +366,9 @@ def take( n: int, presort: str, na_position: str = "last", - partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC, + partition_spec: Optional[PartitionSpec] = None, ) -> DataFrame: + partition_spec = partition_spec or PartitionSpec() assert_or_throw( isinstance(n, int), ValueError("n needs to be an integer"), @@ -444,10 +435,11 @@ def save_df( path: str, format_hint: Any = None, mode: str = "overwrite", - partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC, + partition_spec: Optional[PartitionSpec] = None, force_single: bool = False, **kwargs: Any, ) -> None: + partition_spec = partition_spec or PartitionSpec() if force_single: self._native.save_df( df, diff --git a/fugue_dask/ibis_engine.py b/fugue_dask/ibis_engine.py index 52970484..cab934af 100644 --- a/fugue_dask/ibis_engine.py +++ b/fugue_dask/ibis_engine.py @@ -1,16 +1,16 @@ -from typing import Any, Callable, Optional +from typing import Any, Callable import dask.dataframe as dd import ibis -from fugue import DataFrame, DataFrames, ExecutionEngine -from fugue_ibis import IbisTable -from fugue_ibis._utils import to_ibis_schema, to_schema -from fugue_ibis.execution.ibis_engine import IbisEngine, register_ibis_engine from ibis.backends.dask import Backend from triad.utils.assertion import assert_or_throw +from fugue import DataFrame, DataFrames, ExecutionEngine from fugue_dask.dataframe import DaskDataFrame from fugue_dask.execution_engine import DaskExecutionEngine +from fugue_ibis import IbisTable +from fugue_ibis._utils import to_ibis_schema, to_schema +from fugue_ibis.execution.ibis_engine import IbisEngine, parse_ibis_engine class DaskIbisEngine(IbisEngine): @@ -42,13 +42,11 @@ def select( return DaskDataFrame(result, schema=schema) -def _to_dask_ibis_engine( - engine: ExecutionEngine, ibis_engine: Any -) -> Optional[IbisEngine]: - if isinstance(engine, DaskExecutionEngine): - if ibis_engine is None: - return DaskIbisEngine(engine) - return None # pragma: no cover +@parse_ibis_engine.candidate( + lambda obj, *args, **kwargs: isinstance(obj, DaskExecutionEngine) +) +def _to_dask_ibis_engine(obj: Any, engine: ExecutionEngine) -> IbisEngine: + return DaskIbisEngine(engine) class _BackendWrapper(Backend): @@ -62,6 +60,3 @@ def table(self, name: str, schema: Any = None): if schema is None and name in self._schemas else schema, ) - - -register_ibis_engine(0, _to_dask_ibis_engine) diff --git a/fugue_dask/registry.py b/fugue_dask/registry.py index d4dc386b..17c1c084 100644 --- a/fugue_dask/registry.py +++ b/fugue_dask/registry.py @@ -5,19 +5,14 @@ from dask.distributed import Client from triad import run_at_def -from fugue import ( - DataFrame, - infer_execution_engine, - is_pandas_or, - register_execution_engine, -) +from fugue import DataFrame, is_pandas_or, register_execution_engine from fugue._utils.interfaceless import ( DataFrameParam, ExecutionEngineParam, SimpleAnnotationConverter, register_annotation_converter, ) -from fugue.workflow import register_raw_df_type +from fugue.plugins import as_fugue_dataset, infer_execution_engine from fugue_dask._utils import DASK_UTILS from fugue_dask.dataframe import DaskDataFrame from fugue_dask.execution_engine import DaskExecutionEngine @@ -30,8 +25,9 @@ def _infer_dask_client(objs: Any) -> Any: return DASK_UTILS.get_or_create_client() -def _register_raw_dataframes() 
-> None: - register_raw_df_type(dd.DataFrame) +@as_fugue_dataset.candidate(lambda df, **kwargs: isinstance(df, dd.DataFrame)) +def _dask_as_fugue_df(df: dd.DataFrame, **kwargs: Any) -> DaskDataFrame: + return DaskDataFrame(df, **kwargs) def _register_engines() -> None: @@ -102,6 +98,5 @@ def _register() -> None: >>> import fugue_dask """ - _register_raw_dataframes() _register_engines() _register_annotation_converters() diff --git a/fugue_duckdb/_io.py b/fugue_duckdb/_io.py index c299a50f..74bf8867 100644 --- a/fugue_duckdb/_io.py +++ b/fugue_duckdb/_io.py @@ -2,13 +2,14 @@ from typing import Any, Iterable, List, Optional, Union from duckdb import DuckDBPyConnection -from fugue._utils.io import FileParser, load_df, save_df -from fugue.dataframe import ArrowDataFrame, LocalBoundedDataFrame from triad import ParamDict, Schema from triad.collections.fs import FileSystem from triad.utils.assertion import assert_or_throw -from fugue_duckdb._utils import encode_value_to_expr, get_temp_df_name, to_duck_type +from fugue._utils.io import FileParser, load_df, save_df +from fugue._utils.sql import get_temp_tb_name +from fugue.dataframe import ArrowDataFrame, LocalBoundedDataFrame +from fugue_duckdb._utils import encode_value_to_expr, to_duck_type from fugue_duckdb.dataframe import DuckDataFrame @@ -91,15 +92,15 @@ def save_df( self._format_save[p.file_format](df, p, **kwargs) def _save_csv(self, df: DuckDataFrame, p: FileParser, **kwargs: Any): - dn = get_temp_df_name() - df.native.create_view(dn) + dn = get_temp_tb_name() + df.native.create_view(dn.key) kw = ParamDict({k.lower(): v for k, v in kwargs.items()}) kw["header"] = 1 if kw.pop("header", False) else 0 params: List[str] = [] for k, v in kw.items(): params.append(f"{k.upper()} " + encode_value_to_expr(v)) pm = ", ".join(params) - query = f"COPY {dn} TO {encode_value_to_expr(p.uri)} WITH ({pm})" + query = f"COPY {dn.key} TO {encode_value_to_expr(p.uri)} WITH ({pm})" self._con.execute(query) def _load_csv( # noqa: C901 @@ -176,15 +177,15 @@ def _load_csv( # noqa: C901 return DuckDataFrame(self._con.from_query(query)) def _save_parquet(self, df: DuckDataFrame, p: FileParser, **kwargs: Any): - dn = get_temp_df_name() - df.native.create_view(dn) + dn = get_temp_tb_name() + df.native.create_view(dn.key) kw = ParamDict({k.lower(): v for k, v in kwargs.items()}) kw["format"] = "parquet" params: List[str] = [] for k, v in kw.items(): params.append(f"{k.upper()} " + encode_value_to_expr(v)) pm = ", ".join(params) - query = f"COPY {dn} TO {encode_value_to_expr(p.uri)}" + query = f"COPY {dn.key} TO {encode_value_to_expr(p.uri)}" if len(params) > 0: query += f" WITH ({pm})" self._con.execute(query) diff --git a/fugue_duckdb/_utils.py b/fugue_duckdb/_utils.py index 9502ed61..6245414d 100644 --- a/fugue_duckdb/_utils.py +++ b/fugue_duckdb/_utils.py @@ -1,11 +1,11 @@ from datetime import date, datetime from typing import Any, Dict, Iterable, Optional, Tuple -from uuid import uuid4 import numpy as np import pandas as pd import pyarrow as pa from duckdb import __version__ as _DUCKDB_VERSION # type: ignore +from triad import Schema from triad.utils.pyarrow import TRIAD_DEFAULT_TIMESTAMP _LEGACY_DUCKDB = _DUCKDB_VERSION < "0.3.3" @@ -32,6 +32,19 @@ _PA_TYPES_TO_DUCK: Dict[pa.DataType, str] = {v: k for k, v in _DUCK_TYPES_TO_PA.items()} +def encode_column_name(name: str) -> str: + return '"' + name.replace('"', '""') + '"' + + +def encode_column_names(names: Iterable[str]) -> Iterable[str]: + for name in names: + yield encode_column_name(name) + + +def 
encode_schema_names(schema: Schema) -> Iterable[str]: + return encode_column_names(schema.names) + + def encode_value_to_expr(value: Any) -> str: # noqa: C901 if isinstance(value, list): return "[" + ", ".join(encode_value_to_expr(x) for x in value) + "]" @@ -64,10 +77,6 @@ def encode_value_to_expr(value: Any) -> str: # noqa: C901 raise NotImplementedError(value) -def get_temp_df_name() -> str: - return "_" + str(uuid4())[:5] - - def to_duck_type(tp: pa.DataType) -> str: if _LEGACY_DUCKDB: # pragma: no cover return _to_duck_type_legacy(tp) diff --git a/fugue_duckdb/dask.py b/fugue_duckdb/dask.py index b968e6b7..1c0f8859 100644 --- a/fugue_duckdb/dask.py +++ b/fugue_duckdb/dask.py @@ -8,7 +8,6 @@ from triad import assert_or_throw from fugue import DataFrame, MapEngine, PartitionSpec -from fugue.collections.partition import EMPTY_PARTITION_SPEC from fugue_dask import DaskDataFrame, DaskExecutionEngine from fugue_dask.execution_engine import DaskMapEngine from fugue_duckdb.dataframe import DuckDataFrame @@ -37,6 +36,9 @@ def __init__( def create_default_map_engine(self) -> MapEngine: return DaskMapEngine(self._dask_engine) + def get_current_parallelism(self) -> int: + return self._dask_engine.get_current_parallelism() + @property def dask_client(self) -> Client: return self._dask_engine.dask_client @@ -45,11 +47,14 @@ def to_df(self, df: Any, schema: Any = None) -> DuckDataFrame: if isinstance(df, (dd.DataFrame, DaskDataFrame)): ddf = self._to_dask_df(df, schema) if all(not pa.types.is_nested(f.type) for f in ddf.schema.fields): - return DuckDataFrame(self.connection.from_df(ddf.as_pandas())) + res = DuckDataFrame(self.connection.from_df(ddf.as_pandas())) else: - return DuckDataFrame( + res = DuckDataFrame( duckdb.arrow(ddf.as_arrow(), connection=self.connection) ) + if ddf.has_metadata: + res.reset_metadata(ddf.metadata) + return res return super().to_df(df, schema) def repartition(self, df: DataFrame, partition_spec: PartitionSpec) -> DataFrame: @@ -79,10 +84,11 @@ def save_df( path: str, format_hint: Any = None, mode: str = "overwrite", - partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC, + partition_spec: Optional[PartitionSpec] = None, force_single: bool = False, **kwargs: Any, ) -> None: + partition_spec = partition_spec or PartitionSpec() if isinstance(df, DaskDataFrame) or not partition_spec.empty: return self._dask_engine.save_df( self._to_dask_df(df), @@ -123,5 +129,7 @@ def _to_auto_df( def _to_dask_df(self, df: Any, schema: Any = None) -> DaskDataFrame: if isinstance(df, DuckDataFrame): - return self._dask_engine.to_df(df.as_pandas(), df.schema) + res = self._dask_engine.to_df(df.as_pandas(), df.schema) + res.reset_metadata(df.metadata if df.has_metadata else None) + return res return self._dask_engine.to_df(df, schema) diff --git a/fugue_duckdb/dataframe.py b/fugue_duckdb/dataframe.py index 89744676..7453fe67 100644 --- a/fugue_duckdb/dataframe.py +++ b/fugue_duckdb/dataframe.py @@ -3,17 +3,25 @@ import pandas as pd import pyarrow as pa from duckdb import DuckDBPyRelation +from triad import Schema + from fugue import ( + ArrayDataFrame, ArrowDataFrame, DataFrame, LocalBoundedDataFrame, LocalDataFrame, - ArrayDataFrame, ) -from fugue.exceptions import FugueDatasetEmptyError, FugueDataFrameOperationError -from triad import Schema +from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError +from fugue.plugins import ( + as_fugue_dataset, + as_local_bounded, + get_column_names, + get_num_partitions, + is_df, +) -from fugue_duckdb._utils import to_duck_type, 
to_pa_type +from ._utils import encode_column_name, to_duck_type, to_pa_type class DuckDataFrame(LocalBoundedDataFrame): @@ -24,21 +32,33 @@ class DuckDataFrame(LocalBoundedDataFrame): def __init__(self, rel: DuckDBPyRelation): self._rel = rel - schema = Schema( - [pa.field(x, to_pa_type(y)) for x, y in zip(rel.columns, rel.types)] + super().__init__(schema=self._get_schema) + + def _get_schema(self) -> Schema: + return Schema( + [ + pa.field(x, to_pa_type(y)) + for x, y in zip(self._rel.columns, self._rel.types) + ] ) - super().__init__(schema=schema) + + @property + def alias(self) -> str: + return "_" + str(id(self._rel)) # DuckDBPyRelation.alias is not always unique @property def native(self) -> DuckDBPyRelation: """DuckDB relation object""" return self._rel + def native_as_df(self) -> DuckDBPyRelation: + return self._rel + @property def empty(self) -> bool: return self._rel.fetchone() is None - def peek_array(self) -> Any: + def peek_array(self) -> List[Any]: res = self._rel.fetchone() if res is None: raise FugueDatasetEmptyError() @@ -48,21 +68,23 @@ def count(self) -> int: return self._rel.aggregate("count(1) AS ct").fetchone()[0] def _drop_cols(self, cols: List[str]) -> DataFrame: - schema = self.schema.exclude(cols) - rel = self._rel.project(",".join(n for n in schema.names)) + cols = [col for col in self._rel.columns if col not in cols] + rel = self._rel.project(",".join(encode_column_name(n) for n in cols)) return DuckDataFrame(rel) def _select_cols(self, keys: List[Any]) -> DataFrame: - schema = self.schema.extract(keys) - rel = self._rel.project(",".join(n for n in schema.names)) + rel = self._rel.project(",".join(encode_column_name(n) for n in keys)) return DuckDataFrame(rel) def rename(self, columns: Dict[str, str]) -> DataFrame: - try: - schema = self.schema.rename(columns) - except Exception as e: - raise FugueDataFrameOperationError from e - expr = ", ".join(f"{a} AS {b}" for a, b in zip(self.schema.names, schema.names)) + _assert_no_missing(self._rel, columns.keys()) + expr = ", ".join( + f"{a} AS {b}" + for a, b in [ + (encode_column_name(name), encode_column_name(columns.get(name, name))) + for name in self._rel.columns + ] + ) return DuckDataFrame(self._rel.project(expr)) def alter_columns(self, columns: Any) -> DataFrame: @@ -75,7 +97,9 @@ def alter_columns(self, columns: Any) -> DataFrame: fields.append(f1.name) else: tp = to_duck_type(f2.type) - fields.append(f"CAST({f1.name} AS {tp}) AS {f1.name}") + fields.append( + f"CAST({encode_column_name(f1.name)} AS {tp}) AS {f1.name}" + ) return DuckDataFrame(self._rel.project(", ".join(fields))) def as_arrow(self, type_safe: bool = False) -> pa.Table: @@ -125,3 +149,34 @@ def to_list(row: Any) -> List[Any]: return res return [to_list(x) for x in rel.fetchall()] + + +@as_fugue_dataset.candidate(lambda df, **kwargs: isinstance(df, DuckDBPyRelation)) +def _duckdb_as_fugue_df(df: DuckDBPyRelation, **kwargs: Any) -> DuckDataFrame: + return DuckDataFrame(df, **kwargs) + + +@is_df.candidate(lambda df: isinstance(df, DuckDBPyRelation)) +def _duck_is_df(df: DuckDBPyRelation) -> bool: + return True + + +@get_num_partitions.candidate(lambda df: isinstance(df, DuckDBPyRelation)) +def _duckdb_num_partitions(df: DuckDBPyRelation) -> int: + return 1 + + +@as_local_bounded.candidate(lambda df: isinstance(df, DuckDBPyRelation)) +def _duck_as_local(df: DuckDBPyRelation) -> DuckDBPyRelation: + return df + + +@get_column_names.candidate(lambda df: isinstance(df, DuckDBPyRelation)) +def _get_duckdb_columns(df: DuckDBPyRelation) -> List[Any]: 
+ return list(df.columns) + + +def _assert_no_missing(df: DuckDBPyRelation, columns: Iterable[Any]) -> None: + missing = set(columns) - set(df.columns) + if len(missing) > 0: + raise FugueDataFrameOperationError("found nonexistent columns: {missing}") diff --git a/fugue_duckdb/execution_engine.py b/fugue_duckdb/execution_engine.py index fe80a22b..5704e2f5 100644 --- a/fugue_duckdb/execution_engine.py +++ b/fugue_duckdb/execution_engine.py @@ -1,9 +1,9 @@ import logging -from typing import Any, Dict, Iterable, List, Optional, Union +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union import duckdb import pyarrow as pa -from duckdb import DuckDBPyConnection +from duckdb import DuckDBPyConnection, DuckDBPyRelation from triad import SerializableRLock from triad.collections.fs import FileSystem from triad.utils.assertion import assert_or_throw @@ -16,11 +16,8 @@ PandasMapEngine, SQLEngine, ) -from fugue.collections.partition import ( - EMPTY_PARTITION_SPEC, - PartitionSpec, - parse_presort_exp, -) +from fugue._utils.sql import get_temp_tb_name, parse_sql +from fugue.collections.partition import PartitionSpec, parse_presort_exp from fugue.dataframe import ( DataFrame, DataFrames, @@ -28,10 +25,15 @@ PandasDataFrame, ) from fugue.dataframe.utils import get_join_schemas -from fugue.execution.execution_engine import _DEFAULT_JOIN_KEYS -from fugue_duckdb._io import DuckDBIO -from fugue_duckdb._utils import encode_value_to_expr, get_temp_df_name -from fugue_duckdb.dataframe import DuckDataFrame + +from ._io import DuckDBIO +from ._utils import ( + encode_column_name, + encode_column_names, + encode_schema_names, + encode_value_to_expr, +) +from .dataframe import DuckDataFrame _FUGUE_DUCKDB_PRAGMA_CONFIG_PREFIX = "fugue.duckdb.pragma." @@ -42,27 +44,24 @@ class DuckDBEngine(SQLEngine): :param execution_engine: the execution engine this sql engine will run on """ - def __init__(self, execution_engine: ExecutionEngine) -> None: - super().__init__(execution_engine) - self._cache: Dict[str, int] = {} - - def select(self, dfs: DataFrames, statement: str) -> DataFrame: + def select(self, dfs: DataFrames, statement: List[Tuple[bool, str]]) -> DataFrame: if isinstance(self.execution_engine, DuckExecutionEngine): return self._duck_select(dfs, statement) - return self._other_select(dfs, statement) + else: + _dfs, _sql = self.encode(dfs, statement) + return self._other_select(_dfs, _sql) - def _duck_select(self, dfs: DataFrames, statement: str) -> DataFrame: + def _duck_select( + self, dfs: DataFrames, statement: List[Tuple[bool, str]] + ) -> DataFrame: + name_map: Dict[str, str] = {} for k, v in dfs.items(): - tdf: Any = self.execution_engine._to_duck_df(v) # type: ignore - if k not in self._cache or self._cache[k] != id(tdf.native): - tdf.native.create_view(k, replace=True) - # TODO: remove the following hack, if it is stable - # kk = k + get_temp_df_name() - # tdf.native.query( - # kk, f"CREATE OR REPLACE TEMP VIEW {k} AS SELECT * FROM {kk}" - # ) - self._cache[k] = id(tdf.native) - result = self.execution_engine.connection.query(statement) # type: ignore + tdf: DuckDataFrame = self.execution_engine._to_duck_df( # type: ignore + v, create_view=True + ) + name_map[k] = tdf.alias + query = " ".join(name_map.get(p[1], p[1]) if p[0] else p[1] for p in statement) + result = self.execution_engine.connection.query(query) # type: ignore return DuckDataFrame(result) def _other_select(self, dfs: DataFrames, statement: str) -> DataFrame: @@ -91,6 +90,7 @@ def __init__( self._con = connection or 
duckdb.connect() self._external_con = connection is not None self._context_lock = SerializableRLock() + self._registered_dfs: Dict[str, DuckDataFrame] = {} try: for pg in list(self._get_pragmas()): # transactional @@ -135,6 +135,9 @@ def create_default_sql_engine(self) -> SQLEngine: def create_default_map_engine(self) -> MapEngine: return PandasMapEngine(self._native_engine) + def get_current_parallelism(self) -> int: + return 1 + def to_df(self, df: Any, schema: Any = None) -> DataFrame: return self._to_duck_df(df, schema=schema) @@ -167,25 +170,32 @@ def join( df1: DataFrame, df2: DataFrame, how: str, - on: List[str] = _DEFAULT_JOIN_KEYS, + on: Optional[List[str]] = None, ) -> DataFrame: key_schema, output_schema = get_join_schemas(df1, df2, how=how, on=on) t1, t2, t3 = ( - get_temp_df_name(), - get_temp_df_name(), - get_temp_df_name(), + get_temp_tb_name(), + get_temp_tb_name(), + get_temp_tb_name(), + ) + on_fields = " AND ".join( + f"{t1}.{encode_column_name(k)}={t2}.{encode_column_name(k)}" + for k in key_schema ) - on_fields = " AND ".join(f"{t1}.{k}={t2}.{k}" for k in key_schema) join_type = self._how_to_join(how) if how.lower() == "cross": select_fields = ",".join( - f"{t1}.{k}" if k in df1.schema else f"{t2}.{k}" + f"{t1}.{encode_column_name(k)}" + if k in df1.schema + else f"{t2}.{encode_column_name(k)}" for k in output_schema.names ) sql = f"SELECT {select_fields} FROM {t1} {join_type} {t2}" elif how.lower() == "right_outer": select_fields = ",".join( - f"{t2}.{k}" if k in df2.schema else f"{t1}.{k}" + f"{t2}.{encode_column_name(k)}" + if k in df2.schema + else f"{t1}.{encode_column_name(k)}" for k in output_schema.names ) sql = ( @@ -193,20 +203,29 @@ def join( ) elif how.lower() == "full_outer": select_fields = ",".join( - f"COALESCE({t1}.{k},{t2}.{k}) AS {k}" if k in key_schema else k + f"COALESCE({t1}.{encode_column_name(k)},{t2}.{encode_column_name(k)}) " + f"AS {encode_column_name(k)}" + if k in key_schema + else encode_column_name(k) for k in output_schema.names ) sql = f"SELECT {select_fields} FROM {t1} {join_type} {t2} ON {on_fields}" elif how.lower() in ["semi", "left_semi"]: - keys = ",".join(key_schema.names) - on_fields = " AND ".join(f"{t1}.{k}={t3}.{k}" for k in key_schema) + keys = ",".join(encode_schema_names(key_schema)) + on_fields = " AND ".join( + f"{t1}.{encode_column_name(k)}={t3}.{encode_column_name(k)}" + for k in key_schema + ) sql = ( f"SELECT {t1}.* FROM {t1} INNER JOIN (SELECT DISTINCT {keys} " f"FROM {t2}) AS {t3} ON {on_fields}" ) elif how.lower() in ["anti", "left_anti"]: - keys = ",".join(key_schema.names) - on_fields = " AND ".join(f"{t1}.{k}={t3}.{k}" for k in key_schema) + keys = ",".join(encode_schema_names(key_schema)) + on_fields = " AND ".join( + f"{t1}.{encode_column_name(k)}={t3}.{encode_column_name(k)}" + for k in key_schema + ) sql = ( f"SELECT {t1}.* FROM {t1} LEFT OUTER JOIN " f"(SELECT DISTINCT {keys}, 1 AS __contain__ FROM {t2}) AS {t3} " @@ -214,11 +233,13 @@ def join( ) else: select_fields = ",".join( - f"{t1}.{k}" if k in df1.schema else f"{t2}.{k}" + f"{t1}.{encode_column_name(k)}" + if k in df1.schema + else f"{t2}.{encode_column_name(k)}" for k in output_schema.names ) sql = f"SELECT {select_fields} FROM {t1} {join_type} {t2} ON {on_fields}" - return self._sql(sql, {t1: df1, t2: df2}) + return self._sql(sql, {t1.key: df1, t2.key: df2}) def _how_to_join(self, how: str): return how.upper().replace("_", " ") + " JOIN" @@ -228,9 +249,9 @@ def union(self, df1: DataFrame, df2: DataFrame, distinct: bool = True) -> DataFr df1.schema 
== df2.schema, ValueError(f"{df1.schema} != {df2.schema}") ) if distinct: - t1, t2 = get_temp_df_name(), get_temp_df_name() + t1, t2 = get_temp_tb_name(), get_temp_tb_name() sql = f"SELECT * FROM {t1} UNION SELECT * FROM {t2}" - return self._sql(sql, {t1: df1, t2: df2}) + return self._sql(sql, {t1.key: df1, t2.key: df2}) return DuckDataFrame( self._to_duck_df(df1).native.union(self._to_duck_df(df2).native) ) @@ -239,9 +260,9 @@ def subtract( self, df1: DataFrame, df2: DataFrame, distinct: bool = True ) -> DataFrame: # pragma: no cover if distinct: - t1, t2 = get_temp_df_name(), get_temp_df_name() + t1, t2 = get_temp_tb_name(), get_temp_tb_name() sql = f"SELECT * FROM {t1} EXCEPT SELECT * FROM {t2}" - return self._sql(sql, {t1: df1, t2: df2}) + return self._sql(sql, {t1.key: df1, t2.key: df2}) return DuckDataFrame( self._to_duck_df(df1).native.except_(self._to_duck_df(df2).native) ) @@ -250,9 +271,9 @@ def intersect( self, df1: DataFrame, df2: DataFrame, distinct: bool = True ) -> DataFrame: if distinct: - t1, t2 = get_temp_df_name(), get_temp_df_name() + t1, t2 = get_temp_tb_name(), get_temp_tb_name() sql = f"SELECT * FROM {t1} INTERSECT DISTINCT SELECT * FROM {t2}" - return self._sql(sql, {t1: df1, t2: df2}) + return self._sql(sql, {t1.key: df1, t2.key: df2}) raise NotImplementedError( "DuckDB doesn't have consist behavior on INTERSECT ALL," " so Fugue doesn't support it" @@ -278,7 +299,10 @@ def dropna( thr = thresh or len(schema) else: # pragma: no cover raise ValueError(f"{how} is not one of any and all") - cw = [f"CASE WHEN {f} IS NULL THEN 0 ELSE 1 END" for f in schema.names] + cw = [ + f"CASE WHEN {encode_column_name(f)} IS NULL THEN 0 ELSE 1 END" + for f in schema.names + ] expr = " + ".join(cw) + f" >= {thr}" return DuckDataFrame(self._to_duck_df(df).native.filter(expr)) @@ -302,7 +326,9 @@ def _build_value_dict(names: List[str]) -> Dict[str, str]: ValueError("fillna value can not be None or contain None"), ) cols = [ - f"COALESCE({f}, {vd[f]}) AS {f}" if f in names else f + f"COALESCE({encode_column_name(f)}, {vd[f]}) AS {encode_column_name(f)}" + if f in names + else encode_column_name(f) for f in df.schema.names ] return DuckDataFrame(self._to_duck_df(df).native.project(", ".join(cols))) @@ -322,14 +348,14 @@ def sample( f"one and only one of n and frac should be non-negative, {n}, {frac}" ), ) - tb = get_temp_df_name() + tb = get_temp_tb_name() if frac is not None: sql = f"SELECT * FROM {tb} USING SAMPLE bernoulli({frac*100} PERCENT)" else: sql = f"SELECT * FROM {tb} USING SAMPLE reservoir({n} ROWS)" if seed is not None: sql += f" REPEATABLE ({seed})" - return self._sql(sql, {tb: df}) + return self._sql(sql, {tb.key: df}) def take( self, @@ -337,8 +363,9 @@ def take( n: int, presort: str, na_position: str = "last", - partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC, + partition_spec: Optional[PartitionSpec] = None, ) -> DataFrame: + partition_spec = partition_spec or PartitionSpec() assert_or_throw( isinstance(n, int), ValueError("n needs to be an integer"), @@ -348,23 +375,23 @@ def take( _presort = parse_presort_exp(presort) else: _presort = partition_spec.presort - tb = get_temp_df_name() + tb = get_temp_tb_name() if len(_presort) == 0: if len(partition_spec.partition_by) == 0: return DuckDataFrame(self._to_duck_df(df).native.limit(n)) - cols = ", ".join(df.schema.names) - pcols = ", ".join(partition_spec.partition_by) + cols = ", ".join(encode_schema_names(df.schema)) + pcols = ", ".join(encode_column_names(partition_spec.partition_by)) sql = ( f"SELECT *, ROW_NUMBER() 
OVER (PARTITION BY {pcols}) " f"AS __fugue_take_param FROM {tb}" ) sql = f"SELECT {cols} FROM ({sql}) WHERE __fugue_take_param<={n}" - return self._sql(sql, {tb: df}) + return self._sql(sql, {tb.key: df}) sorts: List[str] = [] for k, v in _presort.items(): - s = k + s = encode_column_name(k) if not v: s += " DESC" s += " NULLS FIRST" if na_position == "first" else " NULLS LAST" @@ -373,16 +400,16 @@ def take( if len(partition_spec.partition_by) == 0: sql = f"SELECT * FROM {tb} {sort_expr} LIMIT {n}" - return self._sql(sql, {tb: df}) + return self._sql(sql, {tb.key: df}) - cols = ", ".join(df.schema.names) - pcols = ", ".join(partition_spec.partition_by) + cols = ", ".join(encode_schema_names(df.schema)) + pcols = ", ".join(encode_column_names(partition_spec.partition_by)) sql = ( f"SELECT *, ROW_NUMBER() OVER (PARTITION BY {pcols} {sort_expr}) " f"AS __fugue_take_param FROM {tb}" ) sql = f"SELECT {cols} FROM ({sql}) WHERE __fugue_take_param<={n}" - return self._sql(sql, {tb: df}) + return self._sql(sql, {tb.key: df}) def load_df( self, @@ -400,40 +427,62 @@ def save_df( path: str, format_hint: Any = None, mode: str = "overwrite", - partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC, + partition_spec: Optional[PartitionSpec] = None, force_single: bool = False, **kwargs: Any, ) -> None: + partition_spec = partition_spec or PartitionSpec() if not partition_spec.empty and not force_single: kwargs["partition_cols"] = partition_spec.partition_by dio = DuckDBIO(self.fs, self.connection) dio.save_df(self._to_duck_df(df), path, format_hint, mode, **kwargs) def convert_yield_dataframe(self, df: DataFrame, as_local: bool) -> DataFrame: - return df.as_local() if not self._external_con or as_local else df + if as_local: + return df.as_local() + return df.as_local() if not self.in_context and not self._external_con else df def _sql(self, sql: str, dfs: Dict[str, DataFrame]) -> DuckDataFrame: with self._context_lock: - df = self.sql_engine.select(DataFrames(dfs), sql) + df = self.sql_engine.select(DataFrames(dfs), list(parse_sql(sql))) return DuckDataFrame(df.native) # type: ignore - def _to_duck_df(self, df: Any, schema: Any = None) -> DuckDataFrame: - if isinstance(df, DataFrame): - assert_or_throw( - schema is None, - ValueError("schema must be None when df is a DataFrame"), - ) - if isinstance(df, DuckDataFrame): - return df - - if isinstance(df, PandasDataFrame) and all( - not pa.types.is_nested(f.type) for f in df.schema.fields - ): - rdf = DuckDataFrame(self.connection.from_df(df.as_pandas())) - else: - rdf = DuckDataFrame( - duckdb.arrow(df.as_arrow(), connection=self.connection) + def _to_duck_df( + self, df: Any, schema: Any = None, create_view: bool = False + ) -> DuckDataFrame: + def _gen_duck() -> DuckDataFrame: + if isinstance(df, DuckDBPyRelation): + assert_or_throw( + schema is None, + ValueError("schema must be None when df is a DuckDBPyRelation"), + ) + return DuckDataFrame(df) + if isinstance(df, DataFrame): + assert_or_throw( + schema is None, + ValueError("schema must be None when df is a DataFrame"), ) - return rdf - tdf = ArrowDataFrame(df, schema) - return DuckDataFrame(duckdb.arrow(tdf.native, connection=self.connection)) + if isinstance(df, DuckDataFrame): + return df + + if isinstance(df, PandasDataFrame) and all( + not pa.types.is_nested(f.type) for f in df.schema.fields + ): + rdf = DuckDataFrame(self.connection.from_df(df.as_pandas())) + else: + rdf = DuckDataFrame( + duckdb.arrow(df.as_arrow(), connection=self.connection) + ) + rdf.reset_metadata(df.metadata if 
df.has_metadata else None) + return rdf + tdf = ArrowDataFrame(df, schema) + return DuckDataFrame(duckdb.arrow(tdf.native, connection=self.connection)) + + res = _gen_duck() + if create_view: + with self._context_lock: + if res.alias not in self._registered_dfs: + res.native.create_view(res.alias, replace=True) + # must hold the reference of the df so the id will not be reused + self._registered_dfs[res.alias] = res + return res diff --git a/fugue_duckdb/ibis_engine.py b/fugue_duckdb/ibis_engine.py index 2de268b0..888e022e 100644 --- a/fugue_duckdb/ibis_engine.py +++ b/fugue_duckdb/ibis_engine.py @@ -1,13 +1,15 @@ -from typing import Any, Callable, Optional +from typing import Any, Callable, Dict, Optional, Tuple import ibis +from ibis.backends.pandas import Backend + from fugue import DataFrame, DataFrames, ExecutionEngine +from fugue._utils.sql import TempTableName, get_temp_tb_name, parse_sql from fugue_ibis import IbisTable from fugue_ibis._utils import to_ibis_schema -from fugue_ibis.execution.ibis_engine import IbisEngine, register_ibis_engine -from ibis.backends.pandas import Backend +from fugue_ibis.execution.ibis_engine import IbisEngine, parse_ibis_engine -from fugue_duckdb.execution_engine import DuckDBEngine, DuckExecutionEngine +from .execution_engine import DuckDBEngine, DuckExecutionEngine class DuckDBIbisEngine(IbisEngine): @@ -17,30 +19,38 @@ def select( be = _BackendWrapper().connect({}) be.set_schemas(dfs) expr = ibis_func(be) - sql = str( - ibis.postgres.compile(expr).compile(compile_kwargs={"literal_binds": True}) + sql = list( + parse_sql( + str( + ibis.postgres.compile(expr).compile( + compile_kwargs={"literal_binds": True} + ) + ), + prefix='" Optional[IbisEngine]: - if isinstance(ibis_engine, str) and ibis_engine in ["duck", "duckdb"]: - return DuckDBIbisEngine(engine) - if isinstance(engine, DuckExecutionEngine): - if ibis_engine is None: - return DuckDBIbisEngine(engine) - return None # pragma: no cover +@parse_ibis_engine.candidate( + lambda obj, *args, **kwargs: isinstance(obj, DuckExecutionEngine) + or (isinstance(obj, str) and obj in ["duck", "duckdb"]) +) +def _to_duck_ibis_engine(obj: Any, engine: ExecutionEngine) -> Optional[IbisEngine]: + return DuckDBIbisEngine(engine) class _BackendWrapper(Backend): def set_schemas(self, dfs: DataFrames) -> None: self._schemas = {k: to_ibis_schema(v.schema) for k, v in dfs.items()} - - def table(self, name: str, schema: Any = None): - return ibis.table(self._schemas[name], name=name) - - -register_ibis_engine(0, _to_duckdb_ibis_engine) + self._name_map: Dict[str, Tuple[TempTableName, IbisTable]] = {} + + def table(self, name: str, schema: Any = None) -> IbisTable: + if name not in self._name_map: + tn = get_temp_tb_name() + tb = ibis.table(self._schemas[name], name=(str(tn))) + self._name_map[name] = (tn, tb) + return self._name_map[name][1] diff --git a/fugue_duckdb/registry.py b/fugue_duckdb/registry.py index ccdc3b23..b6560447 100644 --- a/fugue_duckdb/registry.py +++ b/fugue_duckdb/registry.py @@ -7,7 +7,6 @@ from fugue import ( DataFrame, ExecutionEngine, - infer_execution_engine, is_pandas_or, register_execution_engine, register_sql_engine, @@ -18,7 +17,7 @@ SimpleAnnotationConverter, register_annotation_converter, ) -from fugue.workflow import register_raw_df_type +from fugue.plugins import infer_execution_engine from fugue_duckdb.dataframe import DuckDataFrame from fugue_duckdb.execution_engine import DuckDBEngine, DuckExecutionEngine @@ -30,10 +29,6 @@ def _infer_duckdb_client(objs: Any) -> Any: return "duckdb" 
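The registry change in this file drops the removed register_raw_df_type hook in favor of conditional dispatch through fugue.plugins: a backend registers a guarded candidate on as_fugue_dataset, and Fugue picks the matching converter when a raw object needs to become a Fugue DataFrame. The following is a minimal sketch of that pattern, not part of the patch: as_fugue_dataset, ArrayDataFrame, and the .candidate guard come from the code above, while MyRows and _my_rows_as_fugue are made-up names used only for illustration.

from dataclasses import dataclass
from typing import Any, List

from fugue import ArrayDataFrame, DataFrame
from fugue.plugins import as_fugue_dataset


@dataclass
class MyRows:
    # Hypothetical user-defined container, not a Fugue or DuckDB type.
    rows: List[List[Any]]
    schema: str


@as_fugue_dataset.candidate(lambda df, **kwargs: isinstance(df, MyRows))
def _my_rows_as_fugue(df: MyRows, **kwargs: Any) -> DataFrame:
    # The lambda guard decides whether this candidate applies;
    # the function body performs the actual conversion.
    return ArrayDataFrame(df.rows, df.schema)


# Calling the dispatcher directly routes to the candidate registered above.
fdf = as_fugue_dataset(MyRows([[0, "a"]], "x:int,y:str"))
assert isinstance(fdf, DataFrame)

The engine-specific registrations in this patch (for dd.DataFrame, DuckDBPyRelation, and rd.Dataset) follow the same shape: an isinstance guard in the lambda plus a one-line constructor call in the body.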
-def _register_raw_dataframes() -> None: - register_raw_df_type(DuckDBPyRelation) - - def _register_engines() -> None: register_execution_engine( "duck", @@ -131,6 +126,5 @@ def _register() -> None: >>> import fugue_duckdb """ - _register_raw_dataframes() _register_engines() _register_annotation_converters() diff --git a/fugue_ibis/__init__.py b/fugue_ibis/__init__.py index 22d8ea89..1366c0fb 100644 --- a/fugue_ibis/__init__.py +++ b/fugue_ibis/__init__.py @@ -3,12 +3,7 @@ from ._compat import IbisTable from .dataframe import IbisDataFrame -from .execution.ibis_engine import IbisEngine, register_ibis_engine -from .execution.pandas_backend import _to_pandas_ibis_engine +from .execution.ibis_engine import IbisEngine, parse_ibis_engine +from .execution.pandas_backend import PandasIbisEngine from .execution_engine import IbisExecutionEngine from .extensions import as_fugue, as_ibis, run_ibis - - -@run_at_def -def register(): - register_ibis_engine(1, _to_pandas_ibis_engine) diff --git a/fugue_ibis/dataframe.py b/fugue_ibis/dataframe.py index 80b3aad3..717f21d8 100644 --- a/fugue_ibis/dataframe.py +++ b/fugue_ibis/dataframe.py @@ -2,6 +2,8 @@ import pandas as pd import pyarrow as pa +from triad import Schema + from fugue import ( DataFrame, IterableDataFrame, @@ -11,7 +13,7 @@ ) from fugue.dataframe.dataframe import _input_schema from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError -from triad import Schema +from fugue.plugins import is_df, get_column_names, rename from ._compat import IbisTable from ._utils import _pa_to_ibis_type, to_schema @@ -39,6 +41,9 @@ def native(self) -> IbisTable: """Ibis Table object""" return self._table + def native_as_df(self) -> IbisTable: + return self._table + def _to_local_df(self, table: IbisTable, schema: Any = None) -> LocalDataFrame: raise NotImplementedError # pragma: no cover @@ -69,7 +74,7 @@ def empty(self) -> bool: def num_partitions(self) -> int: return 1 # pragma: no cover - def peek_array(self) -> Any: + def peek_array(self) -> List[Any]: res = self._to_local_df(self._table.head(1)).as_array() if len(res) == 0: raise FugueDatasetEmptyError() @@ -91,13 +96,8 @@ def rename(self, columns: Dict[str, str]) -> DataFrame: schema = self.schema.rename(columns) except Exception as e: raise FugueDataFrameOperationError from e - cols: List[Any] = [] - for a, b in zip(self.schema.names, schema.names): - if a == b: - cols.append(self._table[a]) - else: - cols.append(self._table[a].name(b)) - return self._to_new_df(self._table.projection(cols), schema=schema) + df = _rename(self._table, self.schema.names, schema.names) + return self if df is self._table else self._to_new_df(df, schema=schema) def alter_columns(self, columns: Any) -> DataFrame: new_schema = self._get_altered_schema(columns) @@ -115,7 +115,10 @@ def as_pandas(self) -> pd.DataFrame: return self.as_local().as_pandas() def as_local(self) -> LocalDataFrame: - return self._to_local_df(self._table, schema=self.schema) + res = self._to_local_df(self._table, schema=self.schema) + if res is not self and self.has_metadata: + res.reset_metadata(self.metadata) + return res def as_array( self, columns: Optional[List[str]] = None, type_safe: bool = False @@ -152,3 +155,39 @@ def _alter_table_columns( def _type_equal(self, tp1: pa.DataType, tp2: pa.DataType) -> bool: return tp1 == tp2 + + +@is_df.candidate(lambda df: isinstance(df, IbisTable)) +def _ibis_is_df(df: IbisTable) -> bool: + return True + + +@get_column_names.candidate(lambda df: isinstance(df, IbisTable)) +def 
_get_ibis_columns(df: IbisTable) -> List[Any]: + return df.columns + + +@rename.candidate(lambda df, *args, **kwargs: isinstance(df, IbisTable)) +def _rename_dask_dataframe(df: IbisTable, columns: Dict[str, Any]) -> IbisTable: + _assert_no_missing(df, columns.keys()) + old_names = df.columns + new_names = [columns.get(name, name) for name in old_names] + return _rename(df, old_names, new_names) + + +def _rename(df: IbisTable, old_names: List[str], new_names: List[str]) -> IbisTable: + cols: List[Any] = [] + has_change = False + for a, b in zip(old_names, new_names): + if a == b: + cols.append(df[a]) + else: + cols.append(df[a].name(b)) + has_change = True + return df.projection(cols) if has_change else df + + +def _assert_no_missing(df: IbisTable, columns: Iterable[Any]) -> None: + missing = set(columns) - set(df.columns) + if len(missing) > 0: + raise FugueDataFrameOperationError("found nonexistent columns: {missing}") diff --git a/fugue_ibis/execution/ibis_engine.py b/fugue_ibis/execution/ibis_engine.py index 3108001a..e0d2f90a 100644 --- a/fugue_ibis/execution/ibis_engine.py +++ b/fugue_ibis/execution/ibis_engine.py @@ -1,51 +1,30 @@ from abc import abstractmethod -from typing import Any, Callable, List, Optional, Tuple +from typing import Any, Callable import ibis -from fugue import DataFrame, DataFrames, ExecutionEngine - -from .._compat import IbisTable - -_ENGINE_FUNC: List[ - Tuple[int, int, Callable[[ExecutionEngine, Any], Optional["IbisEngine"]]] -] = [] +from fugue import DataFrame, DataFrames, ExecutionEngine, ExecutionEngineFacet +from fugue._utils.registry import fugue_plugin -def register_ibis_engine( - priority: int, func: Callable[[ExecutionEngine, Any], Optional["IbisEngine"]] -) -> None: - _ENGINE_FUNC.append((priority, len(_ENGINE_FUNC), func)) - _ENGINE_FUNC.sort() +from .._compat import IbisTable -def to_ibis_engine( - execution_engine: ExecutionEngine, ibis_engine: Any = None -) -> "IbisEngine": - if isinstance(ibis_engine, IbisEngine): - return ibis_engine - for _, _, f in _ENGINE_FUNC: - e = f(execution_engine, ibis_engine) - if e is not None: - return e +@fugue_plugin +def parse_ibis_engine(obj: Any, engine: ExecutionEngine) -> "IbisEngine": + if isinstance(obj, IbisEngine): + return obj raise NotImplementedError( - f"can't get ibis engine from {execution_engine}, {ibis_engine}" + f"Ibis execution engine can't be parsed from {obj}." + " You may need to register a parser for it." ) -class IbisEngine: +class IbisEngine(ExecutionEngineFacet): """The abstract base class for different ibis execution implementations. 
:param execution_engine: the execution engine this ibis engine will run on """ - def __init__(self, execution_engine: ExecutionEngine) -> None: - self._execution_engine = execution_engine - - @property - def execution_engine(self) -> ExecutionEngine: - """the execution engine this ibis engine will run on""" - return self._execution_engine - @abstractmethod def select( self, dfs: DataFrames, ibis_func: Callable[[ibis.BaseBackend], IbisTable] diff --git a/fugue_ibis/execution/pandas_backend.py b/fugue_ibis/execution/pandas_backend.py index e93b64a5..03787528 100644 --- a/fugue_ibis/execution/pandas_backend.py +++ b/fugue_ibis/execution/pandas_backend.py @@ -1,4 +1,4 @@ -from typing import Any, Callable, Optional +from typing import Any, Callable import ibis import pandas as pd @@ -10,7 +10,7 @@ PandasDataFrame, ) from fugue_ibis._utils import to_ibis_schema, to_schema -from fugue_ibis.execution.ibis_engine import IbisEngine +from .ibis_engine import IbisEngine, parse_ibis_engine from ibis.backends.pandas import Backend from triad.utils.assertion import assert_or_throw @@ -33,13 +33,11 @@ def select( return PandasDataFrame(result, schema=schema) -def _to_pandas_ibis_engine( - engine: ExecutionEngine, ibis_engine: Any -) -> Optional[IbisEngine]: - if isinstance(engine, NativeExecutionEngine): - if ibis_engine is None: - return PandasIbisEngine(engine) - return None # pragma: no cover +@parse_ibis_engine.candidate( + lambda obj, *args, **kwargs: isinstance(obj, NativeExecutionEngine) +) +def _pd_to_ibis_engine(obj: Any, engine: ExecutionEngine) -> IbisEngine: + return PandasIbisEngine(engine) class _BackendWrapper(Backend): diff --git a/fugue_ibis/execution_engine.py b/fugue_ibis/execution_engine.py index 54636680..a4a74cbb 100644 --- a/fugue_ibis/execution_engine.py +++ b/fugue_ibis/execution_engine.py @@ -1,24 +1,20 @@ -from typing import Any, List, Optional, Dict +import itertools +from typing import Any, Dict, List, Optional, Tuple import ibis +from ibis import BaseBackend +from triad.utils.assertion import assert_or_throw + from fugue.collections.partition import ( - EMPTY_PARTITION_SPEC, PartitionSpec, parse_presort_exp, ) from fugue.dataframe import DataFrame, DataFrames from fugue.dataframe.utils import get_join_schemas -from fugue.execution.execution_engine import ( - _DEFAULT_JOIN_KEYS, - ExecutionEngine, - SQLEngine, -) -from ibis import BaseBackend -from triad.utils.assertion import assert_or_throw +from fugue.execution.execution_engine import ExecutionEngine, SQLEngine -from .dataframe import IbisDataFrame from ._compat import IbisTable -import itertools +from .dataframe import IbisDataFrame _JOIN_RIGHT_SUFFIX = "_ibis_y__" _GEN_TABLE_NAMES = (f"_fugue_temp_table_{i:d}" for i in itertools.count()) @@ -40,9 +36,9 @@ def __init__(self, execution_engine: ExecutionEngine) -> None: super().__init__(execution_engine) self._ibis_engine: IbisExecutionEngine = execution_engine # type: ignore - def select(self, dfs: DataFrames, statement: str) -> DataFrame: + def select(self, dfs: DataFrames, statement: List[Tuple[bool, str]]) -> DataFrame: return self._ibis_engine._to_ibis_dataframe( - self._ibis_engine._raw_select(statement, dfs) + self._ibis_engine._raw_select(" ".join(x[1] for x in statement), dfs) ) @@ -56,6 +52,9 @@ class IbisExecutionEngine(ExecutionEngine): def create_default_sql_engine(self) -> SQLEngine: return IbisSQLEngine(self) + def get_current_parallelism(self) -> int: + return 1 + @property def backend(self) -> BaseBackend: # pragma: no cover raise NotImplementedError @@ 
-82,7 +81,7 @@ def join( df1: DataFrame, df2: DataFrame, how: str, - on: List[str] = _DEFAULT_JOIN_KEYS, + on: Optional[List[str]] = None, ) -> DataFrame: _df1 = self._to_ibis_dataframe(df1) _df2 = self._to_ibis_dataframe(df2) @@ -213,8 +212,9 @@ def take( n: int, presort: str, na_position: str = "last", - partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC, + partition_spec: Optional[PartitionSpec] = None, ) -> DataFrame: + partition_spec = partition_spec or PartitionSpec() assert_or_throw( isinstance(n, int), ValueError("n needs to be an integer"), diff --git a/fugue_ibis/extensions.py b/fugue_ibis/extensions.py index 6268696c..ffed50a6 100644 --- a/fugue_ibis/extensions.py +++ b/fugue_ibis/extensions.py @@ -6,8 +6,8 @@ from fugue.workflow.workflow import WorkflowDataFrames from triad import assert_or_throw, extension_method -from fugue_ibis._utils import LazyIbisObject, _materialize -from fugue_ibis.execution.ibis_engine import to_ibis_engine +from ._utils import LazyIbisObject, _materialize +from .execution.ibis_engine import parse_ibis_engine from ._compat import IbisTable @@ -196,5 +196,8 @@ class _IbisProcessor(Processor): def process(self, dfs: DataFrames) -> DataFrame: ibis_func = self.params.get_or_throw("ibis_func", Callable) ibis_engine = self.params.get_or_none("ibis_engine", object) - ie = to_ibis_engine(self.execution_engine, ibis_engine) + ie = parse_ibis_engine( + self.execution_engine if ibis_engine is None else ibis_engine, + self.execution_engine, + ) return ie.select(dfs, ibis_func) diff --git a/fugue_ray/_constants.py b/fugue_ray/_constants.py index d94eede5..2aa1738c 100644 --- a/fugue_ray/_constants.py +++ b/fugue_ray/_constants.py @@ -1,5 +1,9 @@ from typing import Dict, Any FUGUE_RAY_CONF_SHUFFLE_PARTITIONS = "fugue.ray.shuffle.partitions" +FUGUE_RAY_DEFAULT_PARTITIONS = "fugue.ray.default.partitions" -FUGUE_RAY_DEFAULT_CONF: Dict[str, Any] = {FUGUE_RAY_CONF_SHUFFLE_PARTITIONS: -1} +FUGUE_RAY_DEFAULT_CONF: Dict[str, Any] = { + FUGUE_RAY_CONF_SHUFFLE_PARTITIONS: -1, + FUGUE_RAY_DEFAULT_PARTITIONS: 0, +} diff --git a/fugue_ray/_utils/cluster.py b/fugue_ray/_utils/cluster.py new file mode 100644 index 00000000..06d35d37 --- /dev/null +++ b/fugue_ray/_utils/cluster.py @@ -0,0 +1,16 @@ +from fugue import ExecutionEngine + +from .._constants import FUGUE_RAY_CONF_SHUFFLE_PARTITIONS, FUGUE_RAY_DEFAULT_PARTITIONS +from fugue.constants import FUGUE_CONF_DEFAULT_PARTITIONS + + +def get_default_partitions(engine: ExecutionEngine) -> int: + n = engine.conf.get( + FUGUE_RAY_DEFAULT_PARTITIONS, engine.conf.get(FUGUE_CONF_DEFAULT_PARTITIONS, -1) + ) + return n if n >= 0 else engine.get_current_parallelism() * 2 + + +def get_default_shuffle_partitions(engine: ExecutionEngine) -> int: + n = engine.conf.get(FUGUE_RAY_CONF_SHUFFLE_PARTITIONS, -1) + return n if n >= 0 else get_default_partitions(engine) diff --git a/fugue_ray/_utils/io.py b/fugue_ray/_utils/io.py index cece9c8f..72dddd17 100644 --- a/fugue_ray/_utils/io.py +++ b/fugue_ray/_utils/io.py @@ -6,7 +6,7 @@ import ray.data as rd from fugue import ExecutionEngine from fugue._utils.io import FileParser, load_df, save_df -from fugue.collections.partition import EMPTY_PARTITION_SPEC, PartitionSpec +from fugue.collections.partition import PartitionSpec from fugue.dataframe import DataFrame from fugue_ray.dataframe import RayDataFrame from pyarrow import csv as pacsv @@ -59,11 +59,12 @@ def save_df( df: RayDataFrame, uri: str, format_hint: Optional[str] = None, - partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC, + 
partition_spec: Optional[PartitionSpec] = None, mode: str = "overwrite", force_single: bool = False, **kwargs: Any, ) -> None: + partition_spec = partition_spec or PartitionSpec() if self._fs.exists(uri): assert_or_throw(mode == "overwrite", FileExistsError(uri)) try: diff --git a/fugue_ray/dataframe.py b/fugue_ray/dataframe.py index cc007a8b..dd3d19c0 100644 --- a/fugue_ray/dataframe.py +++ b/fugue_ray/dataframe.py @@ -4,6 +4,8 @@ import pyarrow as pa import ray import ray.data as rd +from triad.collections.schema import Schema + from fugue.dataframe import ( ArrowDataFrame, DataFrame, @@ -11,34 +13,16 @@ LocalDataFrame, ) from fugue.dataframe.dataframe import _input_schema -from fugue.dataframe.utils import ( - get_dataframe_column_names, - rename_dataframe_column_names, -) from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError -from triad.collections.schema import Schema - -from ._utils.dataframe import _build_empty_arrow, build_empty, get_dataset_format - - -@get_dataframe_column_names.candidate(lambda df: isinstance(df, rd.Dataset)) -def _get_ray_dataframe_columns(df: rd.Dataset) -> List[Any]: - fmt = get_dataset_format(df) - if fmt == "pandas": - return list(df.schema(True).names) - elif fmt == "arrow": - return [f.name for f in df.schema(True)] - raise NotImplementedError(f"{fmt} is not supported") # pragma: no cover - - -@rename_dataframe_column_names.candidate( - lambda df, *args, **kwargs: isinstance(df, rd.Dataset) +from fugue.plugins import ( + as_local_bounded, + get_column_names, + get_num_partitions, + is_df, + rename, ) -def _rename_ray_dataframe(df: rd.Dataset, names: Dict[str, Any]) -> rd.Dataset: - if len(names) == 0: - return df - new_cols = [names.get(name, name) for name in _get_ray_dataframe_columns(df)] - return df.map_batches(lambda b: b.rename_columns(new_cols), batch_format="pyarrow") + +from ._utils.dataframe import build_empty, get_dataset_format class RayDataFrame(DataFrame): @@ -63,6 +47,7 @@ def __init__( # noqa: C901 schema: Any = None, internal_schema: bool = False, ): + metadata: Any = None if internal_schema: schema = _input_schema(schema).assert_not_empty() if df is None: @@ -93,6 +78,7 @@ def __init__( # noqa: C901 rdf = df._native if schema is None: schema = df.schema + metadata = None if not df.has_metadata else df.metadata elif isinstance(df, (pd.DataFrame, pd.Series)): if isinstance(df, pd.Series): df = df.to_frame() @@ -108,17 +94,23 @@ def __init__( # noqa: C901 rdf = rd.from_arrow(df.as_arrow(type_safe=True)) if schema is None: schema = df.schema + metadata = None if not df.has_metadata else df.metadata else: - raise ValueError(f"{df} is incompatible with DaskDataFrame") + raise ValueError(f"{df} is incompatible with RayDataFrame") rdf, schema = self._apply_schema(rdf, schema, internal_schema) super().__init__(schema) self._native = rdf + if metadata is not None: + self.reset_metadata(metadata) @property def native(self) -> rd.Dataset: """The wrapped ray Dataset""" return self._native + def native_as_df(self) -> rd.Dataset: + return self._native + @property def is_local(self) -> bool: return False @@ -126,8 +118,12 @@ def is_local(self) -> bool: def as_local(self) -> LocalDataFrame: adf = self.as_arrow() if adf.shape[0] == 0: - return ArrowDataFrame([], self.schema) - return ArrowDataFrame(adf) + res = ArrowDataFrame([], self.schema) + else: + res = ArrowDataFrame(adf) + if self.has_metadata: + res.reset_metadata(self.metadata) + return res @property def is_bounded(self) -> bool: @@ -139,7 +135,7 @@ def empty(self) -> 
bool: @property def num_partitions(self) -> int: - return self.native.num_blocks() + return _rd_num_partitions(self.native) def _drop_cols(self, cols: List[str]) -> DataFrame: cols = (self.schema - cols).names @@ -153,7 +149,7 @@ def _select_cols(self, cols: List[Any]) -> DataFrame: ) return RayDataFrame(rdf, self.schema.extract(cols), internal_schema=True) - def peek_array(self) -> Any: + def peek_array(self) -> List[Any]: data = self.native.limit(1).to_pandas().values.tolist() if len(data) == 0: raise FugueDatasetEmptyError @@ -169,17 +165,7 @@ def count(self) -> int: return self.native.count() def as_arrow(self, type_safe: bool = False) -> pa.Table: - def get_tables() -> Iterable[pa.Table]: - empty = True - for block in self.native.get_internal_block_refs(): - tb = ray.get(block) - if tb.shape[0] > 0: - yield tb - empty = False - if empty: - yield _build_empty_arrow(self.schema) - - return pa.concat_tables(get_tables()) + return pa.concat_tables(_get_arrow_tables(self.native)) def as_pandas(self) -> pd.DataFrame: return self.as_arrow().to_pandas() @@ -256,3 +242,54 @@ def _alter(table: pa.Table) -> pa.Table: # pragma: no cover def _remote_args(self) -> Dict[str, Any]: return {"num_cpus": 1} + + +@is_df.candidate(lambda df: isinstance(df, rd.Dataset)) +def _rd_is_df(df: rd.Dataset) -> bool: + return True + + +@get_num_partitions.candidate(lambda df: isinstance(df, rd.Dataset)) +def _rd_num_partitions(df: rd.Dataset) -> int: + return df.num_blocks() + + +@as_local_bounded.candidate(lambda df: isinstance(df, rd.Dataset)) +def _rd_as_local(df: rd.Dataset) -> bool: + return pa.concat_tables(_get_arrow_tables(df)) + + +@get_column_names.candidate(lambda df: isinstance(df, rd.Dataset)) +def _get_ray_dataframe_columns(df: rd.Dataset) -> List[Any]: + fmt = get_dataset_format(df) + if fmt == "pandas": + return list(df.schema(True).names) + elif fmt == "arrow": + return [f.name for f in df.schema(True)] + raise NotImplementedError(f"{fmt} is not supported") # pragma: no cover + + +@rename.candidate(lambda df, *args, **kwargs: isinstance(df, rd.Dataset)) +def _rename_ray_dataframe(df: rd.Dataset, columns: Dict[str, Any]) -> rd.Dataset: + if len(columns) == 0: + return df + cols = _get_ray_dataframe_columns(df) + missing = set(columns.keys()) - set(cols) + if len(missing) > 0: + raise FugueDataFrameOperationError("found nonexistent columns: {missing}") + new_cols = [columns.get(name, name) for name in cols] + return df.map_batches(lambda b: b.rename_columns(new_cols), batch_format="pyarrow") + + +def _get_arrow_tables(df: rd.Dataset) -> Iterable[pa.Table]: + last_empty: Any = None + empty = True + for block in df.get_internal_block_refs(): + tb = ray.get(block) + if tb.shape[0] > 0: + yield tb + empty = False + else: + last_empty = tb + if empty: + yield last_empty diff --git a/fugue_ray/execution_engine.py b/fugue_ray/execution_engine.py index 6c170973..e16a7733 100644 --- a/fugue_ray/execution_engine.py +++ b/fugue_ray/execution_engine.py @@ -1,7 +1,8 @@ from typing import Any, Callable, Dict, List, Optional, Union import pyarrow as pa -from duckdb import DuckDBPyConnection +import ray +from duckdb import DuckDBPyConnection, DuckDBPyRelation from triad import Schema, assert_or_throw, to_uuid from triad.utils.threading import RunOnce @@ -13,13 +14,12 @@ PartitionCursor, PartitionSpec, ) -from fugue.collections.partition import EMPTY_PARTITION_SPEC from fugue.constants import KEYWORD_ROWCOUNT from fugue.dataframe.arrow_dataframe import _build_empty_arrow from fugue_duckdb.dataframe import 
DuckDataFrame from fugue_duckdb.execution_engine import DuckExecutionEngine -from ._constants import FUGUE_RAY_CONF_SHUFFLE_PARTITIONS +from ._utils.cluster import get_default_partitions, get_default_shuffle_partitions from ._utils.dataframe import add_partition_key from ._utils.io import RayIO from .dataframe import RayDataFrame @@ -95,12 +95,15 @@ def _udf(adf: pa.Table) -> pa.Table: # pragma: no cover output_df = map_func(cursor, input_df) return output_df.as_arrow() - _df = self.execution_engine._to_ray_df(df) # type: ignore + _df: RayDataFrame = self.execution_engine._to_ray_df(df) # type: ignore if partition_spec.num_partitions != "0": _df = self.execution_engine.repartition(_df, partition_spec) # type: ignore else: - n = self.execution_engine.conf.get(FUGUE_RAY_CONF_SHUFFLE_PARTITIONS, -1) - if n > 1: + n = get_default_shuffle_partitions(self.execution_engine) + if n > 0 and n != _df.num_partitions: + # if n==0 or same as the current dataframe partitions + # then no repartition will be done by fugue + # otherwise, repartition the dataset _df = self.execution_engine.repartition( # type: ignore _df, PartitionSpec(num=n) ) @@ -153,6 +156,15 @@ def _udf(adf: pa.Table) -> pa.Table: # pragma: no cover rdf = self.execution_engine.repartition( # type: ignore rdf, partition_spec=partition_spec ) + elif rdf.num_partitions <= 1: + n = get_default_partitions(self.execution_engine) + if n > 0 and n != rdf.num_partitions: + # if n==0 or same as the current dataframe partitions + # then no repartition will be done by fugue + # otherwise, repartition the dataset + rdf = self.execution_engine.repartition( # type: ignore + rdf, PartitionSpec(num=n) + ) sdf = rdf.native.map_batches( _udf, batch_format="pyarrow", @@ -172,12 +184,24 @@ class RayExecutionEngine(DuckExecutionEngine): def __init__( self, conf: Any = None, connection: Optional[DuckDBPyConnection] = None ): + if not ray.is_initialized(): # pragma: no cover + ray.init() super().__init__(conf, connection) self._io = RayIO(self) + def __repr__(self) -> str: + return "RayExecutionEngine" + def create_default_map_engine(self) -> MapEngine: return RayMapEngine(self) + def get_current_parallelism(self) -> int: + res = ray.cluster_resources() + n = res.get("CPU", 0) + if n == 0: # pragma: no cover + res.get("cpu", 0) + return int(n) + def to_df(self, df: Any, schema: Any = None) -> DataFrame: return self._to_ray_df(df, schema=schema) @@ -190,17 +214,15 @@ def _persist_and_count(df: RayDataFrame) -> int: num_funcs = {KEYWORD_ROWCOUNT: lambda: _persist_and_count(rdf)} num = partition_spec.get_num_partitions(**num_funcs) + pdf = rdf.native - if partition_spec.algo in ["hash", "even"]: - pdf = rdf.native - if num > 0: + if num > 0: + if partition_spec.algo in ["hash", "even"]: pdf = pdf.repartition(num) - elif partition_spec.algo == "rand": - pdf = rdf.native - if num > 0: + elif partition_spec.algo == "rand": pdf = pdf.repartition(num, shuffle=True) - else: # pragma: no cover - raise NotImplementedError(partition_spec.algo + " is not supported") + else: # pragma: no cover + raise NotImplementedError(partition_spec.algo + " is not supported") return RayDataFrame(pdf, schema=rdf.schema, internal_schema=True) def broadcast(self, df: DataFrame) -> DataFrame: @@ -215,7 +237,7 @@ def persist( df = self._to_auto_df(df) if isinstance(df, RayDataFrame): return df.persist(**kwargs) - return df + return df # pragma: no cover def convert_yield_dataframe(self, df: DataFrame, as_local: bool) -> DataFrame: if isinstance(df, RayDataFrame): @@ -239,10 +261,11 @@ def 
save_df( path: str, format_hint: Any = None, mode: str = "overwrite", - partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC, + partition_spec: Optional[PartitionSpec] = None, force_single: bool = False, **kwargs: Any, ) -> None: + partition_spec = partition_spec or PartitionSpec() df = self._to_ray_df(df) self._io.save_df( df, @@ -269,6 +292,12 @@ def _to_auto_df(self, df: Any, schema: Any = None) -> DataFrame: ValueError("schema must be None when df is a DataFrame"), ) return df + if isinstance(df, DuckDBPyRelation): + assert_or_throw( + schema is None, + ValueError("schema must be None when df is a DuckDBPyRelation"), + ) + return DuckDataFrame(df) return RayDataFrame(df, schema) def _get_remote_args(self) -> Dict[str, Any]: diff --git a/fugue_ray/registry.py b/fugue_ray/registry.py index 4445ee77..aaed5e67 100644 --- a/fugue_ray/registry.py +++ b/fugue_ray/registry.py @@ -4,19 +4,15 @@ import ray.data as rd from triad import run_at_def -from fugue import ( - DataFrame, - infer_execution_engine, - is_pandas_or, - register_execution_engine, -) +from fugue import DataFrame, is_pandas_or, register_execution_engine from fugue._utils.interfaceless import ( DataFrameParam, ExecutionEngineParam, SimpleAnnotationConverter, register_annotation_converter, ) -from fugue.workflow import register_raw_df_type +from fugue.plugins import as_fugue_dataset, infer_execution_engine + from .dataframe import RayDataFrame from .execution_engine import RayExecutionEngine @@ -29,15 +25,14 @@ def _infer_ray_client(objs: Any) -> Any: return "ray" -def _register_raw_dataframes() -> None: - register_raw_df_type(rd.Dataset) +@as_fugue_dataset.candidate(lambda df, **kwargs: isinstance(df, rd.Dataset)) +def _ray_as_fugue_df(df: rd.Dataset, **kwargs: Any) -> RayDataFrame: + return RayDataFrame(df, **kwargs) def _register_engines() -> None: register_execution_engine( - "ray", - lambda conf, **kwargs: RayExecutionEngine(conf=conf), - on_dup="ignore", + "ray", lambda conf, **kwargs: RayExecutionEngine(conf=conf), on_dup="ignore" ) @@ -85,6 +80,5 @@ def count(self, df: DataFrame) -> int: # pragma: no cover @run_at_def def _register() -> None: """Register Ray Execution Engine""" - _register_raw_dataframes() _register_engines() _register_annotation_converters() diff --git a/fugue_spark/__init__.py b/fugue_spark/__init__.py index d3cac0c3..7d74f40d 100644 --- a/fugue_spark/__init__.py +++ b/fugue_spark/__init__.py @@ -3,8 +3,3 @@ from fugue_spark.dataframe import SparkDataFrame from fugue_spark.execution_engine import SparkExecutionEngine - -try: - from fugue_spark.ibis_engine import SparkIbisEngine -except Exception: # pragma: no cover - pass diff --git a/fugue_spark/_utils/io.py b/fugue_spark/_utils/io.py index 80c5e6b0..b925ef77 100644 --- a/fugue_spark/_utils/io.py +++ b/fugue_spark/_utils/io.py @@ -1,7 +1,7 @@ from typing import Any, Callable, Dict, List, Optional, Union import pyspark.sql as ps -from fugue.collections.partition import EMPTY_PARTITION_SPEC, PartitionSpec +from fugue.collections.partition import PartitionSpec from fugue.dataframe import DataFrame from fugue._utils.io import FileParser, save_df from fugue_spark.dataframe import SparkDataFrame @@ -48,11 +48,12 @@ def save_df( df: SparkDataFrame, uri: str, format_hint: Optional[str] = None, - partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC, + partition_spec: Optional[PartitionSpec] = None, mode: str = "overwrite", force_single: bool = False, **kwargs: Any, ) -> None: + partition_spec = partition_spec or PartitionSpec() if not force_single: p = 
FileParser(uri, format_hint) writer = self._get_writer(df.native, partition_spec) diff --git a/fugue_spark/dataframe.py b/fugue_spark/dataframe.py index c5a37d22..574c9cc0 100644 --- a/fugue_spark/dataframe.py +++ b/fugue_spark/dataframe.py @@ -3,6 +3,11 @@ import pandas as pd import pyarrow as pa import pyspark.sql as ps +from pyspark.sql.functions import col +from triad import SerializableRLock +from triad.collections.schema import SchemaError +from triad.utils.assertion import assert_or_throw + from fugue.dataframe import ( ArrayDataFrame, DataFrame, @@ -11,37 +16,22 @@ LocalDataFrame, PandasDataFrame, ) -from fugue.dataframe.utils import ( - get_dataframe_column_names, - rename_dataframe_column_names, -) from fugue.exceptions import FugueDataFrameOperationError -from pyspark.sql.functions import col -from triad import SerializableRLock -from triad.collections.schema import SchemaError -from triad.utils.assertion import assert_or_throw - -from fugue_spark._utils.convert import to_cast_expression, to_schema, to_type_safe_input - - -@get_dataframe_column_names.candidate(lambda df: isinstance(df, ps.DataFrame)) -def _get_spark_dataframe_columns(df: ps.DataFrame) -> List[Any]: - return [f.name for f in df.schema] - - -@rename_dataframe_column_names.candidate( - lambda df, *args, **kwargs: isinstance(df, ps.DataFrame) +from fugue.plugins import ( + as_local_bounded, + count, + drop_columns, + get_column_names, + get_num_partitions, + head, + is_bounded, + is_df, + is_empty, + is_local, + rename, + select_columns, ) -def _rename_spark_dataframe(df: ps.DataFrame, names: Dict[str, Any]) -> ps.DataFrame: - if len(names) == 0: - return df - cols: List[ps.Column] = [] - for f in df.schema: - c = col(f.name) - if f.name in names: - c = c.alias(names[f.name]) - cols.append(c) - return df.select(cols) +from fugue_spark._utils.convert import to_cast_expression, to_schema, to_type_safe_input class SparkDataFrame(DataFrame): @@ -77,6 +67,10 @@ def __init__(self, df: Any = None, schema: Any = None): # noqa: C901 schema = to_schema(schema).assert_not_empty() raise ValueError(f"{df} is incompatible with SparkDataFrame") + @property + def alias(self) -> str: + return "_" + str(id(self.native)) + @property def native(self) -> ps.DataFrame: """The wrapped Spark DataFrame @@ -85,6 +79,9 @@ def native(self) -> ps.DataFrame: """ return self._native + def native_as_df(self) -> ps.DataFrame: + return self._native + @property def is_local(self) -> bool: return False @@ -96,12 +93,16 @@ def is_bounded(self) -> bool: def as_local(self) -> LocalDataFrame: if any(pa.types.is_nested(t) for t in self.schema.types): data = list(to_type_safe_input(self.native.collect(), self.schema)) - return ArrayDataFrame(data, self.schema) - return PandasDataFrame(self.native.toPandas(), self.schema) + res: LocalDataFrame = ArrayDataFrame(data, self.schema) + else: + res = PandasDataFrame(self.native.toPandas(), self.schema) + if self.has_metadata: + res.reset_metadata(self.metadata) + return res @property def num_partitions(self) -> int: - return self.native.rdd.getNumPartitions() + return _spark_num_partitions(self.native) @property def empty(self) -> bool: @@ -180,3 +181,108 @@ def _select_columns(self, columns: Optional[List[str]]) -> "SparkDataFrame": if columns is None: return self return SparkDataFrame(self.native.select(*columns)) + + +@is_df.candidate(lambda df: isinstance(df, ps.DataFrame)) +def _spark_is_df(df: ps.DataFrame) -> bool: + return True + + +@get_num_partitions.candidate(lambda df: isinstance(df, ps.DataFrame)) 
+def _spark_num_partitions(df: ps.DataFrame) -> int: + return df.rdd.getNumPartitions() + + +@count.candidate(lambda df: isinstance(df, ps.DataFrame)) +def _spark_df_count(df: ps.DataFrame) -> int: + return df.count() + + +@is_bounded.candidate(lambda df: isinstance(df, ps.DataFrame)) +def _spark_df_is_bounded(df: ps.DataFrame) -> bool: + return True + + +@is_empty.candidate(lambda df: isinstance(df, ps.DataFrame)) +def _spark_df_is_empty(df: ps.DataFrame) -> bool: + return df.first() is None + + +@is_local.candidate(lambda df: isinstance(df, ps.DataFrame)) +def _spark_df_is_local(df: ps.DataFrame) -> bool: + return False + + +@as_local_bounded.candidate(lambda df: isinstance(df, ps.DataFrame)) +def _spark_df_as_local(df: ps.DataFrame) -> pd.DataFrame: + return df.toPandas() + + +@get_column_names.candidate(lambda df: isinstance(df, ps.DataFrame)) +def _get_spark_df_columns(df: ps.DataFrame) -> List[Any]: + return df.columns + + +@rename.candidate(lambda df, *args, **kwargs: isinstance(df, ps.DataFrame)) +def _rename_spark_df( + df: ps.DataFrame, columns: Dict[str, Any], as_fugue: bool = False +) -> ps.DataFrame: + if len(columns) == 0: + return df + _assert_no_missing(df, columns.keys()) + return _adjust_df(_rename_spark_dataframe(df, columns), as_fugue=as_fugue) + + +@drop_columns.candidate(lambda df, *args, **kwargs: isinstance(df, ps.DataFrame)) +def _drop_spark_df_columns( + df: ps.DataFrame, columns: List[str], as_fugue: bool = False +) -> Any: + cols = [x for x in df.columns if x not in columns] + if len(cols) == 0: + raise FugueDataFrameOperationError("cannot drop all columns") + if len(cols) + len(columns) != len(df.columns): + _assert_no_missing(df, columns) + return _adjust_df(df[cols], as_fugue=as_fugue) + + +@select_columns.candidate(lambda df, *args, **kwargs: isinstance(df, ps.DataFrame)) +def _select_spark_df_columns( + df: ps.DataFrame, columns: List[Any], as_fugue: bool = False +) -> Any: + if len(columns) == 0: + raise FugueDataFrameOperationError("must select at least one column") + _assert_no_missing(df, columns) + return _adjust_df(df[columns], as_fugue=as_fugue) + + +@head.candidate(lambda df, *args, **kwargs: isinstance(df, ps.DataFrame)) +def _spark_df_head( + df: ps.DataFrame, + n: int, + columns: Optional[List[str]] = None, + as_fugue: bool = False, +) -> pd.DataFrame: + if columns is not None: + df = df[columns] + res = df.limit(n) + return SparkDataFrame(res).as_local() if as_fugue else res.toPandas() + + +def _rename_spark_dataframe(df: ps.DataFrame, names: Dict[str, Any]) -> ps.DataFrame: + cols: List[ps.Column] = [] + for f in df.schema: + c = col(f.name) + if f.name in names: + c = c.alias(names[f.name]) + cols.append(c) + return df.select(cols) + + +def _assert_no_missing(df: ps.DataFrame, columns: Iterable[Any]) -> None: + missing = set(columns) - set(df.columns) + if len(missing) > 0: + raise FugueDataFrameOperationError("found nonexistent columns: {missing}") + + +def _adjust_df(res: ps.DataFrame, as_fugue: bool): + return res if not as_fugue else SparkDataFrame(res) diff --git a/fugue_spark/execution_engine.py b/fugue_spark/execution_engine.py index ae63f9e3..2c0d45d9 100644 --- a/fugue_spark/execution_engine.py +++ b/fugue_spark/execution_engine.py @@ -4,8 +4,8 @@ import pandas as pd import pyarrow as pa -import pyspark.sql as ps import pyspark +import pyspark.sql as ps from pyspark import StorageLevel from pyspark.rdd import RDD from pyspark.sql import SparkSession @@ -17,9 +17,9 @@ from triad.utils.iter import EmptyAwareIterable from 
triad.utils.pandas_like import PD_UTILS from triad.utils.threading import RunOnce +from triad import SerializableRLock from fugue.collections.partition import ( - EMPTY_PARTITION_SPEC, PartitionCursor, PartitionSpec, parse_presort_exp, @@ -37,12 +37,7 @@ ) from fugue.dataframe.utils import get_join_schemas from fugue.exceptions import FugueDataFrameInitError -from fugue.execution.execution_engine import ( - _DEFAULT_JOIN_KEYS, - ExecutionEngine, - MapEngine, - SQLEngine, -) +from fugue.execution.execution_engine import ExecutionEngine, MapEngine, SQLEngine from fugue_spark._constants import ( FUGUE_SPARK_CONF_USE_PANDAS_UDF, FUGUE_SPARK_DEFAULT_CONF, @@ -83,11 +78,14 @@ def __init__(self, execution_engine: ExecutionEngine): ) super().__init__(execution_engine) - def select(self, dfs: DataFrames, statement: str) -> DataFrame: + def select(self, dfs: DataFrames, statement: List[Tuple[bool, str]]) -> DataFrame: + _map: Dict[str, str] = {} for k, v in dfs.items(): - self.execution_engine.register(v, k) # type: ignore + df = self.execution_engine._to_spark_df(v, create_view=True) # type: ignore + _map[k] = df.alias + _sql = " ".join(_map.get(p[1], p[1]) if p[0] else p[1] for p in statement) return SparkDataFrame( - self.execution_engine.spark_session.sql(statement) # type: ignore + self.execution_engine.spark_session.sql(_sql) # type: ignore ) @@ -264,16 +262,15 @@ def __init__(self, spark_session: Optional[SparkSession] = None, conf: Any = Non cf.update({x[0]: x[1] for x in spark_session.sparkContext.getConf().getAll()}) cf.update(ParamDict(conf)) super().__init__(cf) + self._lock = SerializableRLock() self._fs = FileSystem() self._log = logging.getLogger() self._broadcast_func = RunOnce( self._broadcast, lambda *args, **kwargs: id(args[0]) ) self._persist_func = RunOnce(self._persist, lambda *args, **kwargs: id(args[0])) - self._register_func = RunOnce( - self._register, lambda *args, **kwargs: (id(args[0]), id(args[1])) - ) self._io = SparkIO(self.spark_session, self.fs) + self._registered_dfs: Dict[str, SparkDataFrame] = {} def __repr__(self) -> str: return "SparkExecutionEngine" @@ -303,6 +300,15 @@ def create_default_sql_engine(self) -> SQLEngine: def create_default_map_engine(self) -> MapEngine: return SparkMapEngine(self) + def get_current_parallelism(self) -> int: + spark = self.spark_session + e_cores = int(spark.conf.get("spark.executor.cores", "1")) + tc = int(spark.conf.get("spark.task.cpus", "1")) + sc = spark._jsc.sc() + nodes = len(list(sc.statusTracker().getExecutorInfos())) + workers = 1 if nodes <= 1 else nodes - 1 + return max(workers * (e_cores // tc), 1) + def to_df(self, df: Any, schema: Any = None) -> SparkDataFrame: # noqa: C901 """Convert a data structure to :class:`~fugue_spark.dataframe.SparkDataFrame` @@ -324,76 +330,14 @@ def to_df(self, df: Any, schema: Any = None) -> SparkDataFrame: # noqa: C901 * all other methods in the engine can take arbitrary dataframes and call this method to convert before doing anything """ - if isinstance(df, DataFrame): - assert_or_throw( - schema is None, - ValueError("schema must be None when df is a DataFrame"), - ) - if isinstance(df, SparkDataFrame): - return df - if isinstance(df, ArrowDataFrame): - sdf = self.spark_session.createDataFrame( - df.as_array(), to_spark_schema(df.schema) - ) - return SparkDataFrame(sdf, df.schema) - if isinstance(df, (ArrayDataFrame, IterableDataFrame)): - adf = ArrowDataFrame(df.as_array(type_safe=False), df.schema) - sdf = self.spark_session.createDataFrame( - adf.as_array(), 
to_spark_schema(df.schema) - ) - return SparkDataFrame(sdf, df.schema) - if any(pa.types.is_struct(t) for t in df.schema.types): - sdf = self.spark_session.createDataFrame( - df.as_array(type_safe=True), to_spark_schema(df.schema) - ) - else: - sdf = self.spark_session.createDataFrame( - df.as_pandas(), to_spark_schema(df.schema) - ) - return SparkDataFrame(sdf, df.schema) - if isinstance(df, ps.DataFrame): - return SparkDataFrame(df, None if schema is None else to_schema(schema)) - if isinstance(df, RDD): - assert_arg_not_none(schema, "schema") - sdf = self.spark_session.createDataFrame(df, to_spark_schema(schema)) - return SparkDataFrame(sdf, to_schema(schema)) - if isinstance(df, pd.DataFrame): - if PD_UTILS.empty(df): - temp_schema = to_spark_schema(PD_UTILS.to_schema(df)) - sdf = self.spark_session.createDataFrame([], temp_schema) - else: - sdf = self.spark_session.createDataFrame(df) - return SparkDataFrame(sdf, schema) - - # use arrow dataframe here to handle nulls in int cols - assert_or_throw( - schema is not None, FugueDataFrameInitError("schema can't be None") - ) - adf = ArrowDataFrame(df, to_schema(schema)) - map_pos = [i for i, t in enumerate(adf.schema.types) if pa.types.is_map(t)] - if len(map_pos) == 0: - sdf = self.spark_session.createDataFrame( - adf.as_array(), to_spark_schema(adf.schema) - ) - else: - - def to_dict(rows: Iterable[List[Any]]) -> Iterable[List[Any]]: - for row in rows: - for p in map_pos: - row[p] = dict(row[p]) - yield row - - sdf = self.spark_session.createDataFrame( - to_dict(adf.as_array_iterable()), to_spark_schema(adf.schema) - ) - return SparkDataFrame(sdf, adf.schema) + return self._to_spark_df(df, schema=schema) def repartition(self, df: DataFrame, partition_spec: PartitionSpec) -> DataFrame: def _persist_and_count(df: DataFrame) -> int: df = self.persist(df) return df.count() - df = self.to_df(df) + df = self._to_spark_df(df) num_funcs = {KEYWORD_ROWCOUNT: lambda: _persist_and_count(df)} num = partition_spec.get_num_partitions(**num_funcs) @@ -417,10 +361,10 @@ def _persist_and_count(df: DataFrame) -> int: sdf = sdf.sortWithinPartitions( *sorts.keys(), ascending=list(sorts.values()) ) - return self.to_df(sdf, df.schema) + return self._to_spark_df(sdf, df.schema) def broadcast(self, df: DataFrame) -> SparkDataFrame: - res = self._broadcast_func(self.to_df(df)) + res = self._broadcast_func(self._to_spark_df(df)) res.reset_metadata(df.metadata) return res @@ -431,20 +375,22 @@ def persist( **kwargs: Any, ) -> SparkDataFrame: res = self._persist_func( - self.to_df(df), lazy=lazy, level=kwargs.get("level", None) + self._to_spark_df(df), lazy=lazy, level=kwargs.get("level", None) ) res.reset_metadata(df.metadata) return res def register(self, df: DataFrame, name: str) -> SparkDataFrame: - return self._register_func(self.to_df(df), name) + sdf = self._to_spark_df(df) + sdf.native.createOrReplaceTempView(name) + return sdf def join( self, df1: DataFrame, df2: DataFrame, how: str, - on: List[str] = _DEFAULT_JOIN_KEYS, + on: Optional[List[str]] = None, ) -> DataFrame: key_schema, output_schema = get_join_schemas(df1, df2, how=how, on=on) how = how.lower().replace("_", "").replace(" ", "") @@ -453,14 +399,14 @@ def join( ValueError(f"{how} is not supported as a join type"), ) how = _TO_SPARK_JOIN_MAP[how] - d1 = self.to_df(df1).native - d2 = self.to_df(df2).native + d1 = self._to_spark_df(df1).native + d2 = self._to_spark_df(df2).native cols = [col(n) for n in output_schema.names] if how == "cross": res = d1.crossJoin(d2).select(*cols) else: res = 
d1.join(d2, on=key_schema.names, how=how).select(*cols) - return self.to_df(res, output_schema) + return self._to_spark_df(res, output_schema) def union( self, @@ -472,12 +418,12 @@ def union( df1.schema == df2.schema, lambda: ValueError(f"{df1.schema} != {df2.schema}"), ) - d1 = self.to_df(df1).native - d2 = self.to_df(df2).native + d1 = self._to_spark_df(df1).native + d2 = self._to_spark_df(df2).native d = d1.union(d2) if distinct: d = d.distinct() - return self.to_df(d, df1.schema) + return self._to_spark_df(d, df1.schema) def subtract( self, df1: DataFrame, df2: DataFrame, distinct: bool = True @@ -486,13 +432,13 @@ def subtract( df1.schema == df2.schema, lambda: ValueError(f"{df1.schema} != {df2.schema}"), ) - d1 = self.to_df(df1).native - d2 = self.to_df(df2).native + d1 = self._to_spark_df(df1).native + d2 = self._to_spark_df(df2).native if distinct: d: Any = d1.subtract(d2) else: # pragma: no cover d = d1.exceptAll(d2) - return self.to_df(d, df1.schema) + return self._to_spark_df(d, df1.schema) def intersect( self, df1: DataFrame, df2: DataFrame, distinct: bool = True @@ -501,17 +447,17 @@ def intersect( df1.schema == df2.schema, lambda: ValueError(f"{df1.schema} != {df2.schema}"), ) - d1 = self.to_df(df1).native - d2 = self.to_df(df2).native + d1 = self._to_spark_df(df1).native + d2 = self._to_spark_df(df2).native if distinct: d: Any = d1.intersect(d2) else: # pragma: no cover d = d1.intersectAll(d2) - return self.to_df(d, df1.schema) + return self._to_spark_df(d, df1.schema) def distinct(self, df: DataFrame) -> DataFrame: - d = self.to_df(df).native.distinct() - return self.to_df(d, df.schema) + d = self._to_spark_df(df).native.distinct() + return self._to_spark_df(d, df.schema) def dropna( self, @@ -520,8 +466,8 @@ def dropna( thresh: int = None, subset: List[str] = None, ) -> DataFrame: - d = self.to_df(df).native.dropna(how=how, thresh=thresh, subset=subset) - return self.to_df(d, df.schema) + d = self._to_spark_df(df).native.dropna(how=how, thresh=thresh, subset=subset) + return self._to_spark_df(d, df.schema) def fillna(self, df: DataFrame, value: Any, subset: List[str] = None) -> DataFrame: assert_or_throw( @@ -540,8 +486,8 @@ def fillna(self, df: DataFrame, value: Any, subset: List[str] = None) -> DataFra # If subset is none, apply to all columns subset = subset or df.schema.names mapping = {col: value for col in subset} - d = self.to_df(df).native.fillna(mapping) - return self.to_df(d, df.schema) + d = self._to_spark_df(df).native.fillna(mapping) + return self._to_spark_df(d, df.schema) def sample( self, @@ -556,10 +502,10 @@ def sample( ValueError("one and only one of n and frac should be set"), ) if frac is not None: - d = self.to_df(df).native.sample( + d = self._to_spark_df(df).native.sample( fraction=frac, withReplacement=replace, seed=seed ) - return self.to_df(d, df.schema) + return self._to_spark_df(d, df.schema) else: assert_or_throw( seed is None, @@ -572,11 +518,11 @@ def sample( ), ) temp_name = "__temp_" + str(uuid4()).split("-")[-1] - self.to_df(df).native.createOrReplaceTempView(temp_name) + self._to_spark_df(df).native.createOrReplaceTempView(temp_name) d = self.spark_session.sql( f"SELECT * FROM {temp_name} TABLESAMPLE({n} ROWS)" ) - return self.to_df(d, df.schema) + return self._to_spark_df(d, df.schema) def take( self, @@ -584,13 +530,14 @@ def take( n: int, presort: str, na_position: str = "last", - partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC, + partition_spec: Optional[PartitionSpec] = None, ) -> DataFrame: + partition_spec = partition_spec 
or PartitionSpec() assert_or_throw( isinstance(n, int), ValueError("n needs to be an integer"), ) - d = self.to_df(df).native + d = self._to_spark_df(df).native nulls_last = bool(na_position == "last") if presort: @@ -636,7 +583,7 @@ def _presort_to_col(_col: str, _asc: bool) -> Any: .drop("__row_number__") ) - return self.to_df(d, df.schema) + return self._to_spark_df(d, df.schema) def load_df( self, @@ -655,11 +602,12 @@ def save_df( path: str, format_hint: Any = None, mode: str = "overwrite", - partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC, + partition_spec: Optional[PartitionSpec] = None, force_single: bool = False, **kwargs: Any, ) -> None: - df = self.to_df(df) + partition_spec = partition_spec or PartitionSpec() + df = self._to_spark_df(df) self._io.save_df( df, uri=path, @@ -687,9 +635,85 @@ def _persist(self, df: SparkDataFrame, lazy: bool, level: Any) -> SparkDataFrame return df raise ValueError(f"{level} is not supported persist type") # pragma: no cover - def _register(self, df: SparkDataFrame, name: str) -> SparkDataFrame: - df.native.createOrReplaceTempView(name) - return df + def _to_spark_df( # noqa: C901 + self, df: Any, schema: Any = None, create_view: bool = False + ) -> SparkDataFrame: + def _to_df() -> SparkDataFrame: + if isinstance(df, DataFrame): + assert_or_throw( + schema is None, + ValueError("schema must be None when df is a DataFrame"), + ) + if isinstance(df, SparkDataFrame): + return df + if isinstance(df, ArrowDataFrame): + sdf = self.spark_session.createDataFrame( + df.as_array(), to_spark_schema(df.schema) + ) + return SparkDataFrame(sdf, df.schema) + if isinstance(df, (ArrayDataFrame, IterableDataFrame)): + adf = ArrowDataFrame(df.as_array(type_safe=False), df.schema) + sdf = self.spark_session.createDataFrame( + adf.as_array(), to_spark_schema(df.schema) + ) + return SparkDataFrame(sdf, df.schema) + if any(pa.types.is_struct(t) for t in df.schema.types): + sdf = self.spark_session.createDataFrame( + df.as_array(type_safe=True), to_spark_schema(df.schema) + ) + else: + sdf = self.spark_session.createDataFrame( + df.as_pandas(), to_spark_schema(df.schema) + ) + return SparkDataFrame(sdf, df.schema) + if isinstance(df, ps.DataFrame): + return SparkDataFrame(df, None if schema is None else to_schema(schema)) + if isinstance(df, RDD): + assert_arg_not_none(schema, "schema") + sdf = self.spark_session.createDataFrame(df, to_spark_schema(schema)) + return SparkDataFrame(sdf, to_schema(schema)) + if isinstance(df, pd.DataFrame): + if PD_UTILS.empty(df): + temp_schema = to_spark_schema(PD_UTILS.to_schema(df)) + sdf = self.spark_session.createDataFrame([], temp_schema) + else: + sdf = self.spark_session.createDataFrame(df) + return SparkDataFrame(sdf, schema) + + # use arrow dataframe here to handle nulls in int cols + assert_or_throw( + schema is not None, FugueDataFrameInitError("schema can't be None") + ) + adf = ArrowDataFrame(df, to_schema(schema)) + map_pos = [i for i, t in enumerate(adf.schema.types) if pa.types.is_map(t)] + if len(map_pos) == 0: + sdf = self.spark_session.createDataFrame( + adf.as_array(), to_spark_schema(adf.schema) + ) + else: + + def to_dict(rows: Iterable[List[Any]]) -> Iterable[List[Any]]: + for row in rows: + for p in map_pos: + row[p] = dict(row[p]) + yield row + + sdf = self.spark_session.createDataFrame( + to_dict(adf.as_array_iterable()), to_spark_schema(adf.schema) + ) + return SparkDataFrame(sdf, adf.schema) + + res = _to_df() + if res is not df and isinstance(df, DataFrame) and df.has_metadata: + 
res.reset_metadata(df.metadata) + + if create_view: + with self._lock: + if res.alias not in self._registered_dfs: + res.native.createOrReplaceTempView(res.alias) + self._registered_dfs[res.alias] = res + + return res class _Mapper(object): # pragma: no cover diff --git a/fugue_spark/ibis_engine.py b/fugue_spark/ibis_engine.py index 166a8dd8..26b3adda 100644 --- a/fugue_spark/ibis_engine.py +++ b/fugue_spark/ibis_engine.py @@ -1,13 +1,13 @@ -from typing import Any, Callable, Optional +from typing import Any, Callable import ibis -from fugue import DataFrame, DataFrames, ExecutionEngine -from fugue_ibis import IbisTable -from fugue_ibis._utils import to_schema -from fugue_ibis.execution.ibis_engine import IbisEngine, register_ibis_engine from pyspark.sql import DataFrame as PySparkDataFrame from triad.utils.assertion import assert_or_throw +from fugue import DataFrame, DataFrames, ExecutionEngine +from fugue_ibis import IbisTable +from fugue_ibis._utils import to_schema +from fugue_ibis.execution.ibis_engine import IbisEngine, parse_ibis_engine from fugue_spark.dataframe import SparkDataFrame from fugue_spark.execution_engine import SparkExecutionEngine @@ -38,13 +38,8 @@ def select( return SparkDataFrame(result, schema=schema) -def _to_spark_ibis_engine( - engine: ExecutionEngine, ibis_engine: Any -) -> Optional[IbisEngine]: - if isinstance(engine, SparkExecutionEngine): - if ibis_engine is None: - return SparkIbisEngine(engine) - return None # pragma: no cover - - -register_ibis_engine(0, _to_spark_ibis_engine) +@parse_ibis_engine.candidate( + lambda obj, *args, **kwargs: isinstance(obj, SparkExecutionEngine) +) +def _spark_to_ibis_engine(obj: Any, engine: ExecutionEngine) -> IbisEngine: + return SparkIbisEngine(engine) diff --git a/fugue_spark/registry.py b/fugue_spark/registry.py index 27c5b55e..85d062df 100644 --- a/fugue_spark/registry.py +++ b/fugue_spark/registry.py @@ -7,21 +7,15 @@ from pyspark.sql import SparkSession from triad import run_at_def -from fugue import ( - DataFrame, - ExecutionEngine, - infer_execution_engine, - is_pandas_or, - parse_creator, - register_execution_engine, -) +from fugue import DataFrame, ExecutionEngine, is_pandas_or, register_execution_engine from fugue._utils.interfaceless import ( DataFrameParam, ExecutionEngineParam, SimpleAnnotationConverter, register_annotation_converter, ) -from fugue.workflow import register_raw_df_type +from fugue.plugins import as_fugue_dataset, infer_execution_engine, parse_creator + from fugue_spark.dataframe import SparkDataFrame from fugue_spark.execution_engine import SparkExecutionEngine @@ -41,6 +35,11 @@ def _infer_spark_client(obj: Any) -> Any: return SparkSession.builder.getOrCreate() +@as_fugue_dataset.candidate(lambda df, **kwargs: isinstance(df, ps.DataFrame)) +def _spark_as_fugue_df(df: ps.DataFrame, **kwargs: Any) -> SparkDataFrame: + return SparkDataFrame(df, **kwargs) + + @parse_creator.candidate(lambda obj: _is_sparksql(obj)) def _parse_sparksql_creator(sql): def _run_sql(spark: SparkSession) -> ps.DataFrame: @@ -49,10 +48,6 @@ def _run_sql(spark: SparkSession) -> ps.DataFrame: return _run_sql -def _register_raw_dataframes() -> None: - register_raw_df_type(ps.DataFrame) - - def _register_engines() -> None: register_execution_engine( "spark", @@ -187,6 +182,5 @@ def _register() -> None: >>> import fugue_spark """ - _register_raw_dataframes() _register_engines() _register_annotation_converters() diff --git a/fugue_test/builtin_suite.py b/fugue_test/builtin_suite.py index 1c4fcd43..849adebb 100644 --- 
a/fugue_test/builtin_suite.py +++ b/fugue_test/builtin_suite.py @@ -53,6 +53,7 @@ FugueWorkflowRuntimeValidationError, ) from pytest import raises +import fugue.api as fa from triad import SerializableRLock @@ -843,6 +844,7 @@ def select(self, dfs, statement): a = dag.df([[1, 10], [2, 20], [3, 30]], "x:long,y:long") b = dag.df([[2, 20, 40], [3, 30, 90]], "x:long,y:long,z:long") dag.select("* FROM", a).assert_eq(a) + dag.select(a, ".* FROM", a).assert_eq(a) dag.select("SELECT *,x*y AS z FROM", a, "WHERE x>=2").assert_eq(b) c = dag.df([[2, 20, 40], [3, 30, 90]], "x:long,y:long,zb:long") @@ -1571,6 +1573,51 @@ def t5(df: pd.DataFrame, c: Callable) -> List[List[Any]]: assert 4 == cb3.n + def test_sql_api(self): + def tr(df: pd.DataFrame, n=1) -> pd.DataFrame: + return df + n + + with fa.engine_context(self.engine): + df1 = fa.as_fugue_df([[0, 1], [2, 3], [4, 5]], schema="a:long,b:int") + df2 = pd.DataFrame([[0, 10], [1, 100]], columns=["a", "c"]) + sdf1 = fa.raw_sql( # noqa + "SELECT ", df1, ".a, b FROM ", df1, " WHERE a<4" + ) + sdf2 = fa.raw_sql("SELECT * FROM ", df2, " WHERE a<1") # noqa + + sdf3 = fa.fugue_sql( + """ + SELECT sdf1.a,sdf1.b,c FROM sdf1 INNER JOIN sdf2 ON sdf1.a=sdf2.a + TRANSFORM USING tr SCHEMA * + """ + ) + res = fa.fugue_sql_flow( + """ + TRANSFORM x USING tr(n=2) SCHEMA * + YIELD LOCAL DATAFRAME AS res + PRINT sdf1 + """, + x=sdf3, + ).run() + df_eq( + res["res"], + [[3, 4, 13]], + schema="a:long,b:int,c:long", + check_schema=False, + throw=True, + ) + + sdf4 = fa.fugue_sql( + """ + SELECT sdf1.a,b,c FROM sdf1 INNER JOIN sdf2 ON sdf1.a=sdf2.a + TRANSFORM USING tr SCHEMA * + """, + as_fugue=False, + as_local=True, + ) + assert not isinstance(sdf4, DataFrame) + assert fa.is_local(sdf4) + def mock_creator(p: int) -> DataFrame: return ArrayDataFrame([[p]], "a:int") diff --git a/fugue_test/dataframe_suite.py b/fugue_test/dataframe_suite.py index c956c9dc..c371778e 100644 --- a/fugue_test/dataframe_suite.py +++ b/fugue_test/dataframe_suite.py @@ -6,11 +6,12 @@ import numpy as np import pandas as pd +from pytest import raises + +import fugue.api as fi from fugue.dataframe import ArrowDataFrame, DataFrame from fugue.dataframe.utils import _df_eq as df_eq from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError -from pytest import raises -from triad.collections.schema import Schema class DataFrameTests(object): @@ -27,100 +28,140 @@ def setUpClass(cls): def tearDownClass(cls): pass - def df( - self, data: Any = None, schema: Any = None - ) -> DataFrame: # pragma: no cover + def df(self, data: Any = None, schema: Any = None) -> Any: # pragma: no cover raise NotImplementedError - def test_init_basic(self): - raises(Exception, lambda: self.df()) - raises(Exception, lambda: self.df([])) - raises(Exception, lambda: self.df([[]], Schema())) - raises(Exception, lambda: self.df([[1]], Schema())) - # raises(SchemaError, lambda: self.df([[1]])) # schema can be inferred - - df = self.df([], "a:str,b:int") - assert df.empty - - def test_datetime(self): - df = self.df([["2020-01-01"], [None]], "a:datetime") - assert [[datetime(2020, 1, 1)], [None]] == df.as_array(type_safe=True) + def test_native(self): + df = self.df([1], "a:int") + assert fi.is_df(df) + fdf = fi.as_fugue_df(df) + assert isinstance(fdf, DataFrame) + assert fi.is_df(fdf) + ndf = fi.get_native_as_df(fdf) + assert fi.is_df(ndf) + assert not isinstance(ndf, DataFrame) + ndf2 = fi.get_native_as_df(ndf) + assert ndf2 is ndf def test_peek(self): df = self.df([], "x:str,y:double") - 
raises(FugueDatasetEmptyError, lambda: df.peek_array()) - raises(FugueDatasetEmptyError, lambda: df.peek_dict()) + raises(FugueDatasetEmptyError, lambda: fi.peek_array(df)) + raises(FugueDatasetEmptyError, lambda: fi.peek_dict(df)) df = self.df([["a", 1.0], ["b", 2.0]], "x:str,y:double") - assert not df.is_bounded or 2 == df.count() - assert not df.empty - assert ["a", 1.0] == df.peek_array() - assert dict(x="a", y=1.0) == df.peek_dict() + assert not fi.is_bounded(df) or 2 == fi.count(df) + assert not fi.is_empty(df) + assert ["a", 1.0] == fi.peek_array(df) + assert dict(x="a", y=1.0) == fi.peek_dict(df) def test_as_pandas(self): df = self.df([["a", 1.0], ["b", 2.0]], "x:str,y:double") - pdf = df.as_pandas() + pdf = fi.as_pandas(df) assert [["a", 1.0], ["b", 2.0]] == pdf.values.tolist() df = self.df([], "x:str,y:double") - pdf = df.as_pandas() + pdf = fi.as_pandas(df) assert [] == pdf.values.tolist() + assert fi.is_local(pdf) + + def test_as_local(self): + with raises(NotImplementedError): + fi.as_local(10) + with raises(NotImplementedError): + fi.as_local_bounded(10) - def test_drop(self): - df = self.df([], "a:str,b:int").drop(["a"]) - assert df.schema == "b:int" + df = self.df([["a", 1.0], ["b", 2.0]], "x:str,y:double") + ldf = fi.as_local(df) + assert fi.is_local(ldf) + lbdf = fi.as_local_bounded(df) + assert fi.is_local(lbdf) and fi.is_bounded(lbdf) + + fdf = fi.as_fugue_df(df) + fdf.reset_metadata({"a": 1}) + ldf = fi.as_local(fdf) + assert ldf.metadata == {"a": 1} + lbdf = fi.as_local_bounded(fdf) + assert fi.is_local(lbdf) and fi.is_bounded(lbdf) + assert ldf.metadata == {"a": 1} + + def test_drop_columns(self): + df = fi.drop_columns(self.df([], "a:str,b:int"), ["a"]) + assert fi.get_schema(df) == "b:int" raises( - FugueDataFrameOperationError, lambda: df.drop(["b"]) + FugueDataFrameOperationError, lambda: fi.drop_columns(df, ["b"]) ) # can't be empty raises( - FugueDataFrameOperationError, lambda: df.drop(["x"]) + FugueDataFrameOperationError, lambda: fi.drop_columns(df, ["x"]) ) # cols must exist - df = self.df([["a", 1]], "a:str,b:int").drop(["a"]) - assert df.schema == "b:int" + df = fi.drop_columns(self.df([["a", 1]], "a:str,b:int"), ["a"]) + assert fi.get_schema(df) == "b:int" raises( - FugueDataFrameOperationError, lambda: df.drop(["b"]) + FugueDataFrameOperationError, lambda: fi.drop_columns(df, ["b"]) ) # can't be empty raises( - FugueDataFrameOperationError, lambda: df.drop(["x"]) + FugueDataFrameOperationError, lambda: fi.drop_columns(df, ["x"]) ) # cols must exist - assert [[1]] == df.as_array(type_safe=True) + assert [[1]] == fi.as_array(df, type_safe=True) def test_select(self): - df = self.df([], "a:str,b:int")[["b"]] - assert df.schema == "b:int" - raises(FugueDataFrameOperationError, lambda: df[["a"]]) # not existed - raises(FugueDataFrameOperationError, lambda: df[[]]) # empty + df = fi.select_columns(self.df([], "a:str,b:int"), ["b"]) + assert fi.get_schema(df) == "b:int" + assert fi.get_column_names(df) == ["b"] + raises( + FugueDataFrameOperationError, lambda: fi.select_columns(df, []) + ) # select empty + raises( + FugueDataFrameOperationError, lambda: fi.select_columns(df, ["a"]) + ) # not existed + raises( + FugueDataFrameOperationError, lambda: fi.select_columns(df, ["a"]) + ) # empty - df = self.df([["a", 1]], "a:str,b:int")[["b"]] - assert df.schema == "b:int" - raises(FugueDataFrameOperationError, lambda: df[["a"]]) # not existed - raises(FugueDataFrameOperationError, lambda: df[[]]) # empty - assert [[1]] == df.as_array(type_safe=True) + df = 
fi.select_columns(self.df([["a", 1]], "a:str,b:int"), ["b"]) + assert fi.get_schema(df) == "b:int" + raises( + FugueDataFrameOperationError, lambda: fi.select_columns(df, ["a"]) + ) # not existed + raises( + FugueDataFrameOperationError, lambda: fi.select_columns(df, ["a"]) + ) # empty + assert [[1]] == fi.as_array(df, type_safe=True) df = self.df([["a", 1, 2]], "a:str,b:int,c:int") - df_eq(df[["c", "a"]], [[2, "a"]], "a:str,c:int") + df_eq( + fi.as_fugue_df(fi.select_columns(df, ["c", "a"])), + [[2, "a"]], + "a:str,c:int", + ) def test_rename(self): for data in [[["a", 1]], []]: df = self.df(data, "a:str,b:int") - df2 = df.rename(columns=dict(a="aa")) - assert df.schema == "a:str,b:int" - df_eq(df2, data, "aa:str,b:int", throw=True) + df2 = fi.rename(df, columns=dict(a="aa")) + assert fi.get_schema(df) == "a:str,b:int" + df_eq(fi.as_fugue_df(df2), data, "aa:str,b:int", throw=True) + + for data in [[["a", 1]], []]: + df = self.df(data, "a:str,b:int") + df3 = fi.rename(df, columns={}) + assert fi.get_schema(df3) == "a:str,b:int" + df_eq(fi.as_fugue_df(df3), data, "a:str,b:int", throw=True) def test_rename_invalid(self): df = self.df([["a", 1]], "a:str,b:int") raises( - FugueDataFrameOperationError, lambda: df.rename(columns=dict(aa="ab")) + FugueDataFrameOperationError, + lambda: fi.rename(df, columns=dict(aa="ab")), ) def test_as_array(self): for func in [ - lambda df, *args, **kwargs: df.as_array( - *args, **kwargs, type_safe=True + lambda df, *args, **kwargs: fi.as_array( + df, *args, **kwargs, type_safe=True ), lambda df, *args, **kwargs: list( - df.as_array_iterable(*args, **kwargs, type_safe=True) + fi.as_array_iterable(df, *args, **kwargs, type_safe=True) ), ]: df = self.df([], "a:str,b:int") @@ -142,11 +183,11 @@ def test_as_array(self): def test_as_array_special_values(self): for func in [ - lambda df, *args, **kwargs: df.as_array( - *args, **kwargs, type_safe=True + lambda df, *args, **kwargs: fi.as_array( + df, *args, **kwargs, type_safe=True ), lambda df, *args, **kwargs: list( - df.as_array_iterable(*args, **kwargs, type_safe=True) + fi.as_array_iterable(df, *args, **kwargs, type_safe=True) ), ]: df = self.df([[pd.Timestamp("2020-01-01"), 1]], "a:datetime,b:int") @@ -166,92 +207,93 @@ def test_as_array_special_values(self): def test_as_dict_iterable(self): df = self.df([[pd.NaT, 1]], "a:datetime,b:int") - assert [dict(a=None, b=1)] == list(df.as_dict_iterable()) + assert [dict(a=None, b=1)] == list(fi.as_dict_iterable(df)) df = self.df([[pd.Timestamp("2020-01-01"), 1]], "a:datetime,b:int") - assert [dict(a=datetime(2020, 1, 1), b=1)] == list(df.as_dict_iterable()) + assert [dict(a=datetime(2020, 1, 1), b=1)] == list(fi.as_dict_iterable(df)) def test_list_type(self): data = [[[30, 40]]] df = self.df(data, "a:[int]") - a = df.as_array(type_safe=True) + a = fi.as_array(df, type_safe=True) assert data == a def test_struct_type(self): data = [[{"a": 1}], [{"a": 2}]] df = self.df(data, "x:{a:int}") - a = df.as_array(type_safe=True) + a = fi.as_array(df, type_safe=True) assert data == a def test_map_type(self): data = [[[("a", 1), ("b", 3)]], [[("b", 2)]]] df = self.df(data, "x:") - a = df.as_array(type_safe=True) + a = fi.as_array(df, type_safe=True) assert data == a def test_deep_nested_types(self): data = [[dict(a="1", b=[3, 4], d=1.0)], [dict(b=[30, 40])]] df = self.df(data, "a:{a:str,b:[int]}") - a = df.as_array(type_safe=True) + a = fi.as_array(df, type_safe=True) assert [[dict(a="1", b=[3, 4])], [dict(a=None, b=[30, 40])]] == a data = [[[dict(b=[30, 40])]]] df = self.df(data, 
"a:[{a:str,b:[int]}]") - a = df.as_array(type_safe=True) + a = fi.as_array(df, type_safe=True) assert [[[dict(a=None, b=[30, 40])]]] == a def test_binary_type(self): data = [[b"\x01\x05"]] df = self.df(data, "a:bytes") - a = df.as_array(type_safe=True) + a = fi.as_array(df, type_safe=True) assert data == a def test_as_arrow(self): # empty df = self.df([], "a:int,b:int") - assert [] == list(ArrowDataFrame(df.as_arrow()).as_dict_iterable()) + assert [] == list(ArrowDataFrame(fi.as_arrow(df)).as_dict_iterable()) + assert fi.is_local(fi.as_arrow(df)) # pd.Nat df = self.df([[pd.NaT, 1]], "a:datetime,b:int") assert [dict(a=None, b=1)] == list( - ArrowDataFrame(df.as_arrow()).as_dict_iterable() + ArrowDataFrame(fi.as_arrow(df)).as_dict_iterable() ) # pandas timestamps df = self.df([[pd.Timestamp("2020-01-01"), 1]], "a:datetime,b:int") assert [dict(a=datetime(2020, 1, 1), b=1)] == list( - ArrowDataFrame(df.as_arrow()).as_dict_iterable() + ArrowDataFrame(fi.as_arrow(df)).as_dict_iterable() ) # float nan, list data = [[[float("nan"), 2.0]]] df = self.df(data, "a:[float]") - assert [[[None, 2.0]]] == ArrowDataFrame(df.as_arrow()).as_array() + assert [[[None, 2.0]]] == ArrowDataFrame(fi.as_arrow(df)).as_array() # dict data = [[dict(b="x")]] df = self.df(data, "a:{b:str}") - assert data == ArrowDataFrame(df.as_arrow()).as_array() + assert data == ArrowDataFrame(fi.as_arrow(df)).as_array() # list[dict] data = [[[dict(b=[30, 40])]]] df = self.df(data, "a:[{b:[int]}]") - assert data == ArrowDataFrame(df.as_arrow()).as_array() + assert data == ArrowDataFrame(fi.as_arrow(df)).as_array() def test_head(self): df = self.df([], "a:str,b:int") - assert [] == df.head(1).as_array() - assert [] == df.head(1, ["b"]).as_array() + assert [] == fi.as_array(fi.head(df, 1)) + assert [] == fi.as_array(fi.head(df, 1, ["b"])) df = self.df([["a", 1]], "a:str,b:int") - if df.is_bounded: - assert [["a", 1]] == df.head(1).as_array() - assert [[1, "a"]] == df.head(1, ["b", "a"]).as_array() - assert [] == df.head(0).as_array() + if fi.is_bounded(df): + assert [["a", 1]] == fi.as_array(fi.head(df, 1)) + assert [[1, "a"]] == fi.as_array(fi.head(df, 1, ["b", "a"])) + assert [] == fi.as_array(fi.head(df, 0)) df = self.df([[0, 1], [0, 2], [1, 1], [1, 3]], "a:int,b:int") - assert 2 == df.head(2).count() + assert 2 == fi.count(fi.head(df, 2)) df = self.df([[0, 1], [0, 2], [1, 1], [1, 3]], "a:int,b:int") - assert 4 == df.head(10).count() - h = df.head(10) - assert h.is_local and h.is_bounded + assert 4 == fi.count(fi.head(df, 10)) + h = fi.head(df, 10) + assert fi.is_local(h) and fi.is_bounded(h) def test_show(self): df = self.df([["a", 1]], "a:str,b:int") - df.show() + fi.show(df) def test_get_altered_schema(self): df = self.df([["a", 1]], "a:str,b:int") @@ -270,47 +312,55 @@ def test_get_altered_schema(self): def test_alter_columns(self): # empty df = self.df([], "a:str,b:int") - ndf = df.alter_columns("a:str,b:str") - assert [] == ndf.as_array(type_safe=True) - assert ndf.schema == "a:str,b:str" + ndf = fi.alter_columns(df, "a:str,b:str") + assert [] == fi.as_array(ndf, type_safe=True) + assert fi.get_schema(ndf) == "a:str,b:str" # no change df = self.df([["a", 1], ["c", None]], "a:str,b:int") - ndf = df.alter_columns("b:int,a:str") - assert [["a", 1], ["c", None]] == ndf.as_array(type_safe=True) - assert ndf.schema == df.schema + ndf = fi.alter_columns(df, "b:int,a:str", as_fugue=True) + assert [["a", 1], ["c", None]] == fi.as_array(ndf, type_safe=True) + assert fi.get_schema(ndf) == "a:str,b:int" # bool -> str df = self.df([["a", 
True], ["b", False], ["c", None]], "a:str,b:bool") - ndf = df.alter_columns("b:str") - actual = ndf.as_array(type_safe=True) + ndf = fi.alter_columns(df, "b:str", as_fugue=True) + actual = fi.as_array(ndf, type_safe=True) # Capitalization doesn't matter # and dataframes don't need to be consistent on capitalization expected1 = [["a", "True"], ["b", "False"], ["c", None]] expected2 = [["a", "true"], ["b", "false"], ["c", None]] assert expected1 == actual or expected2 == actual - assert ndf.schema == "a:str,b:str" + assert fi.get_schema(ndf) == "a:str,b:str" # int -> str df = self.df([["a", 1], ["c", None]], "a:str,b:int") - ndf = df.alter_columns("b:str") - assert [["a", "1"], ["c", None]] == ndf.as_array(type_safe=True) - assert ndf.schema == "a:str,b:str" + ndf = fi.alter_columns(df, "b:str", as_fugue=True) + arr = fi.as_array(ndf, type_safe=True) + assert [["a", "1"], ["c", None]] == arr or [ + ["a", "1.0"], + ["c", None], + ] == arr # in pandas case, it can't treat [1, None] as an int col + assert fi.get_schema(ndf) == "a:str,b:str" # int -> double df = self.df([["a", 1], ["c", None]], "a:str,b:int") - ndf = df.alter_columns("b:double") - assert [["a", 1], ["c", None]] == ndf.as_array(type_safe=True) - assert ndf.schema == "a:str,b:double" + ndf = fi.alter_columns(df, "b:double", as_fugue=True) + assert [["a", 1], ["c", None]] == fi.as_array(ndf, type_safe=True) + assert fi.get_schema(ndf) == "a:str,b:double" # double -> str df = self.df([["a", 1.1], ["b", None]], "a:str,b:double") - data = df.alter_columns("b:str").as_array(type_safe=True) + data = fi.as_array( + fi.alter_columns(df, "b:str", as_fugue=True), type_safe=True + ) assert [["a", "1.1"], ["b", None]] == data # double -> int df = self.df([["a", 1.0], ["b", None]], "a:str,b:double") - data = df.alter_columns("b:int").as_array(type_safe=True) + data = fi.as_array( + fi.alter_columns(df, "b:int", as_fugue=True), type_safe=True + ) assert [["a", 1], ["b", None]] == data # date -> str @@ -318,7 +368,9 @@ def test_alter_columns(self): [["a", date(2020, 1, 1)], ["b", date(2020, 1, 2)], ["c", None]], "a:str,b:date", ) - data = df.alter_columns("b:str").as_array(type_safe=True) + data = fi.as_array( + fi.alter_columns(df, "b:str", as_fugue=True), type_safe=True + ) assert [["a", "2020-01-01"], ["b", "2020-01-02"], ["c", None]] == data # datetime -> str @@ -330,7 +382,9 @@ def test_alter_columns(self): ], "a:str,b:datetime", ) - data = df.alter_columns("b:str").as_array(type_safe=True) + data = fi.as_array( + fi.alter_columns(df, "b:str", as_fugue=True), type_safe=True + ) assert [ ["a", "2020-01-01 03:04:05"], ["b", "2020-01-02 16:07:08"], @@ -339,49 +393,51 @@ def test_alter_columns(self): # str -> bool df = self.df([["a", "trUe"], ["b", "False"], ["c", None]], "a:str,b:str") - ndf = df.alter_columns("b:bool,a:str") - assert [["a", True], ["b", False], ["c", None]] == ndf.as_array( - type_safe=True + ndf = fi.alter_columns(df, "b:bool,a:str", as_fugue=True) + assert [["a", True], ["b", False], ["c", None]] == fi.as_array( + ndf, type_safe=True ) - assert ndf.schema == "a:str,b:bool" + assert fi.get_schema(ndf) == "a:str,b:bool" # str -> int df = self.df([["a", "1"]], "a:str,b:str") - ndf = df.alter_columns("b:int,a:str") - assert [["a", 1]] == ndf.as_array(type_safe=True) - assert ndf.schema == "a:str,b:int" + ndf = fi.alter_columns(df, "b:int,a:str") + assert [["a", 1]] == fi.as_array(ndf, type_safe=True) + assert fi.get_schema(ndf) == "a:str,b:int" # str -> double df = self.df([["a", "1.1"], ["b", "2"], ["c", None]], "a:str,b:str") 
- ndf = df.alter_columns("b:double") - assert [["a", 1.1], ["b", 2.0], ["c", None]] == ndf.as_array(type_safe=True) - assert ndf.schema == "a:str,b:double" + ndf = fi.alter_columns(df, "b:double", as_fugue=True) + assert [["a", 1.1], ["b", 2.0], ["c", None]] == fi.as_array( + ndf, type_safe=True + ) + assert fi.get_schema(ndf) == "a:str,b:double" # str -> date df = self.df( [["1", "2020-01-01"], ["2", "2020-01-02 01:02:03"], ["3", None]], "a:str,b:str", ) - ndf = df.alter_columns("b:date,a:int") + ndf = fi.alter_columns(df, "b:date,a:int", as_fugue=True) assert [ [1, date(2020, 1, 1)], [2, date(2020, 1, 2)], [3, None], - ] == ndf.as_array(type_safe=True) - assert ndf.schema == "a:int,b:date" + ] == fi.as_array(ndf, type_safe=True) + assert fi.get_schema(ndf) == "a:int,b:date" # str -> datetime df = self.df( [["1", "2020-01-01"], ["2", "2020-01-02 01:02:03"], ["3", None]], "a:str,b:str", ) - ndf = df.alter_columns("b:datetime,a:int") + ndf = fi.alter_columns(df, "b:datetime,a:int", as_fugue=True) assert [ [1, datetime(2020, 1, 1)], [2, datetime(2020, 1, 2, 1, 2, 3)], [3, None], - ] == ndf.as_array(type_safe=True) - assert ndf.schema == "a:int,b:datetime" + ] == fi.as_array(ndf, type_safe=True) + assert fi.get_schema(ndf) == "a:int,b:datetime" def test_alter_columns_invalid(self): # invalid conversion @@ -390,5 +446,25 @@ def test_alter_columns_invalid(self): [["1", "x"], ["2", "y"], ["3", None]], "a:str,b:str", ) - ndf = df.alter_columns("b:int") - ndf.show() # lazy dataframes will force to materialize + ndf = fi.alter_columns(df, "b:int") + fi.show(ndf) # lazy dataframes will force to materialize + + class NativeTests(Tests): + def to_native_df(self, pdf: pd.DataFrame) -> Any: # pragma: no cover + raise NotImplementedError + + def test_get_altered_schema(self): + pass + + def test_get_column_names(self): + df = self.to_native_df(pd.DataFrame([[0, 1, 2]], columns=["0", "1", "2"])) + assert fi.get_column_names(df) == ["0", "1", "2"] + + def test_rename_any_names(self): + pdf = self.to_native_df(pd.DataFrame([[0, 1, 2]], columns=["a", "b", "c"])) + df = fi.rename(pdf, {}) + assert fi.get_column_names(df) == ["a", "b", "c"] + + pdf = self.to_native_df(pd.DataFrame([[0, 1, 2]], columns=["0", "1", "2"])) + df = fi.rename(pdf, {"0": "_0", "1": "_1", "2": "_2"}) + assert fi.get_column_names(df) == ["_0", "_1", "_2"] diff --git a/fugue_test/execution_suite.py b/fugue_test/execution_suite.py index 002ba7ab..8d087296 100644 --- a/fugue_test/execution_suite.py +++ b/fugue_test/execution_suite.py @@ -6,9 +6,14 @@ from datetime import datetime from unittest import TestCase -import fugue.column.functions as ff import pandas as pd import pytest +from pytest import raises +from triad.collections.fs import FileSystem +from triad.exceptions import InvalidOperationError + +import fugue.api as fa +import fugue.column.functions as ff from fugue import ( ArrayDataFrame, DataFrames, @@ -16,14 +21,11 @@ PandasDataFrame, PartitionSpec, register_default_sql_engine, + DataFrame, ) -from fugue.column import SelectColumns, col, lit +from fugue.column import col, lit from fugue.dataframe.utils import _df_eq as df_eq from fugue.execution.native_execution_engine import NativeExecutionEngine -from pytest import raises -from triad.collections.fs import FileSystem -from triad.exceptions import InvalidOperationError - from fugue_test._utils import skip_spark2 @@ -38,6 +40,7 @@ class Tests(TestCase): def setUpClass(cls): register_default_sql_engine(lambda engine: engine.sql_engine) cls._engine = cls.make_engine(cls) + 
fa.set_global_engine(cls._engine) @property def engine(self) -> ExecutionEngine: @@ -45,6 +48,7 @@ def engine(self) -> ExecutionEngine: @classmethod def tearDownClass(cls): + fa.clear_global_engine() cls._engine.stop() def make_engine(self) -> ExecutionEngine: # pragma: no cover @@ -57,6 +61,9 @@ def test_init(self): assert copy.copy(self.engine) is self.engine assert copy.deepcopy(self.engine) is self.engine + def test_get_parallelism(self): + assert fa.get_current_parallelism(self.engine) == 1 + def test_to_df_general(self): e = self.engine o = ArrayDataFrame( @@ -91,30 +98,24 @@ def test_to_df_general(self): df_eq(o, e.to_df(pdf), throw=True) def test_filter(self): - e = self.engine - o = ArrayDataFrame( + a = ArrayDataFrame( [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]], "a:double,b:int", ) - a = e.to_df(o) - b = e.filter(a, col("a").not_null()) + b = fa.filter(a, col("a").not_null()) df_eq(b, [[1, 2], [3, 4]], "a:double,b:int", throw=True) - c = e.filter(a, col("a").not_null() & (col("b") < 3)) + c = fa.filter(a, col("a").not_null() & (col("b") < 3)) df_eq(c, [[1, 2]], "a:double,b:int", throw=True) - c = e.filter(a, col("a") + col("b") == 3) + c = fa.filter(a, col("a") + col("b") == 3) df_eq(c, [[1, 2]], "a:double,b:int", throw=True) def test_select(self): - e = self.engine - o = ArrayDataFrame( + a = ArrayDataFrame( [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]], "a:double,b:int" ) - a = e.to_df(o) # simple - b = e.select( - a, SelectColumns(col("b"), (col("b") + 1).alias("c").cast(str)) - ) + b = fa.select(a, col("b"), (col("b") + 1).alias("c").cast(str)) df_eq( b, [[2, "3"], [2, "3"], [1, "2"], [4, "5"], [4, "5"]], @@ -123,11 +124,8 @@ def test_select(self): ) # with distinct - b = e.select( - a, - SelectColumns( - col("b"), (col("b") + 1).alias("c").cast(str), arg_distinct=True - ), + b = fa.select( + a, col("b"), (col("b") + 1).alias("c").cast(str), distinct=True ) df_eq( b, @@ -137,21 +135,20 @@ def test_select(self): ) # wildcard - b = e.select(a, SelectColumns(col("*")), where=col("a") + col("b") == 3) + b = fa.select(a, col("*"), where=col("a") + col("b") == 3) df_eq(b, [[1, 2]], "a:double,b:int", throw=True) # aggregation - b = e.select( - a, SelectColumns(col("a"), ff.sum(col("b")).cast(float).alias("b")) - ) + b = fa.select(a, col("a"), ff.sum(col("b")).cast(float).alias("b")) df_eq(b, [[1, 2], [3, 4], [None, 7]], "a:double,b:double", throw=True) # having # https://github.com/fugue-project/fugue/issues/222 col_b = ff.sum(col("b")) - b = e.select( + b = fa.select( a, - SelectColumns(col("a"), col_b.cast(float).alias("c")), + col("a"), + col_b.cast(float).alias("c"), having=(col_b >= 7) | (col("a") == 1), ) df_eq(b, [[1, 2], [None, 7]], "a:double,c:double", throw=True) @@ -159,11 +156,11 @@ def test_select(self): # literal + alias inference # https://github.com/fugue-project/fugue/issues/222 col_b = ff.sum(col("b")) - b = e.select( + b = fa.select( a, - SelectColumns( - col("a"), lit(1, "o").cast(str), col_b.cast(float).alias("c") - ), + col("a"), + lit(1, "o").cast(str), + col_b.cast(float).alias("c"), having=(col_b >= 7) | (col("a") == 1), ) df_eq( @@ -171,16 +168,11 @@ def test_select(self): ) def test_assign(self): - e = self.engine - o = ArrayDataFrame( + a = ArrayDataFrame( [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]], "a:double,b:int" ) - a = e.to_df(o) - b = e.assign( - a, - [lit(1, "x"), col("b").cast(str), (col("b") + 1).alias("c").cast(int)], - ) + b = fa.assign(a, x=1, b=col("b").cast(str), c=(col("b") + 1).cast(int)) df_eq( b, [ @@ -195,29 +187,22 @@ 
def test_assign(self): ) def test_aggregate(self): - e = self.engine - o = ArrayDataFrame( + a = ArrayDataFrame( [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]], "a:double,b:int" ) - a = e.to_df(o) - b = e.aggregate( + b = fa.aggregate( df=a, - partition_spec=None, - agg_cols=[ - ff.max(col("b")), - (ff.max(col("b")) * 2).cast("int32").alias("c"), - ], + b=ff.max(col("b")), + c=(ff.max(col("b")) * 2).cast("int32").alias("c"), ) df_eq(b, [[4, 8]], "b:int,c:int", throw=True) - b = e.aggregate( - df=a, - partition_spec=PartitionSpec(by=["a"]), - agg_cols=[ - ff.max(col("b")), - (ff.max(col("b")) * 2).cast("int32").alias("c"), - ], + b = fa.aggregate( + a, + "a", + b=ff.max(col("b")), + c=(ff.max(col("b")) * 2).cast("int32").alias("c"), ) df_eq( b, @@ -227,18 +212,10 @@ def test_aggregate(self): ) with raises(ValueError): - e.aggregate( - df=a, - partition_spec=PartitionSpec(by=["a"]), - agg_cols=[ff.max(col("b")), lit(1)], - ) + fa.aggregate(a, "a", b=ff.max(col("b")), x=1) with raises(ValueError): - e.aggregate( - df=a, - partition_spec=PartitionSpec(by=["a"]), - agg_cols=[], - ) + fa.aggregate(a, "a") def test_map(self): def noop(cursor, data): @@ -374,38 +351,52 @@ def test_map_with_binary(self): ) df_eq(expected, c, no_pandas=True, check_order=True, throw=True) + def test_join_multiple(self): + e = self.engine + a = e.to_df([[1, 2], [3, 4]], "a:int,b:int") + b = e.to_df([[1, 20], [3, 40]], "a:int,c:int") + c = e.to_df([[1, 200], [3, 400]], "a:int,d:int") + d = fa.inner_join(a, b, c) + df_eq( + d, + [[1, 2, 20, 200], [3, 4, 40, 400]], + "a:int,b:int,c:int,d:int", + throw=True, + ) + def test__join_cross(self): e = self.engine a = e.to_df([[1, 2], [3, 4]], "a:int,b:int") b = e.to_df([[6], [7]], "c:int") - c = e.join(a, b, how="Cross") + c = fa.join(a, b, how="Cross") df_eq( c, [[1, 2, 6], [1, 2, 7], [3, 4, 6], [3, 4, 7]], "a:int,b:int,c:int", + throw=True, ) b = e.to_df([], "c:int") - c = e.join(a, b, how="Cross") + c = fa.cross_join(a, b) df_eq(c, [], "a:int,b:int,c:int", throw=True) a = e.to_df([], "a:int,b:int") b = e.to_df([], "c:int") - c = e.join(a, b, how="Cross") + c = fa.join(a, b, how="Cross") df_eq(c, [], "a:int,b:int,c:int", throw=True) def test__join_inner(self): e = self.engine a = e.to_df([[1, 2], [3, 4]], "a:int,b:int") b = e.to_df([[6, 1], [2, 7]], "c:int,a:int") - c = e.join(a, b, how="INNER", on=["a"]) + c = fa.join(a, b, how="INNER", on=["a"]) df_eq(c, [[1, 2, 6]], "a:int,b:int,c:int", throw=True) - c = e.join(b, a, how="INNER", on=["a"]) + c = fa.inner_join(b, a) df_eq(c, [[6, 1, 2]], "c:int,a:int,b:int", throw=True) a = e.to_df([], "a:int,b:int") b = e.to_df([], "c:int,a:int") - c = e.join(a, b, how="INNER", on=["a"]) + c = fa.join(a, b, how="INNER", on=["a"]) df_eq(c, [], "a:int,b:int,c:int", throw=True) def test__join_outer(self): @@ -413,33 +404,33 @@ def test__join_outer(self): a = e.to_df([], "a:int,b:int") b = e.to_df([], "c:str,a:int") - c = e.join(a, b, how="left_outer", on=["a"]) + c = fa.left_outer_join(a, b) df_eq(c, [], "a:int,b:int,c:str", throw=True) a = e.to_df([], "a:int,b:str") b = e.to_df([], "c:int,a:int") - c = e.join(a, b, how="right_outer", on=["a"]) + c = fa.right_outer_join(a, b) df_eq(c, [], "a:int,b:str,c:int", throw=True) a = e.to_df([], "a:int,b:str") b = e.to_df([], "c:str,a:int") - c = e.join(a, b, how="full_outer", on=["a"]) + c = fa.full_outer_join(a, b) df_eq(c, [], "a:int,b:str,c:str", throw=True) a = e.to_df([[1, "2"], [3, "4"]], "a:int,b:str") b = e.to_df([["6", 1], ["2", 7]], "c:str,a:int") - c = e.join(a, b, how="left_OUTER", 
on=["a"]) + c = fa.join(a, b, how="left_OUTER", on=["a"]) df_eq(c, [[1, "2", "6"], [3, "4", None]], "a:int,b:str,c:str", throw=True) - c = e.join(b, a, how="left_outer", on=["a"]) + c = fa.join(b, a, how="left_outer", on=["a"]) df_eq(c, [["6", 1, "2"], ["2", 7, None]], "c:str,a:int,b:str", throw=True) a = e.to_df([[1, "2"], [3, "4"]], "a:int,b:str") b = e.to_df([[6, 1], [2, 7]], "c:double,a:int") - c = e.join(a, b, how="left_OUTER", on=["a"]) + c = fa.join(a, b, how="left_OUTER", on=["a"]) df_eq( c, [[1, "2", 6.0], [3, "4", None]], "a:int,b:str,c:double", throw=True ) - c = e.join(b, a, how="left_outer", on=["a"]) + c = fa.join(b, a, how="left_outer", on=["a"]) # assert c.as_pandas().values.tolist()[1][2] is None df_eq( c, [[6.0, 1, "2"], [2.0, 7, None]], "c:double,a:int,b:str", throw=True @@ -447,11 +438,11 @@ def test__join_outer(self): a = e.to_df([[1, "2"], [3, "4"]], "a:int,b:str") b = e.to_df([["6", 1], ["2", 7]], "c:str,a:int") - c = e.join(a, b, how="right_outer", on=["a"]) + c = fa.join(a, b, how="right_outer", on=["a"]) # assert c.as_pandas().values.tolist()[1][1] is None df_eq(c, [[1, "2", "6"], [7, None, "2"]], "a:int,b:str,c:str", throw=True) - c = e.join(a, b, how="full_outer", on=["a"]) + c = fa.join(a, b, how="full_outer", on=["a"]) df_eq( c, [[1, "2", "6"], [3, "4", None], [7, None, "2"]], @@ -464,21 +455,21 @@ def test__join_outer_pandas_incompatible(self): a = e.to_df([[1, "2"], [3, "4"]], "a:int,b:str") b = e.to_df([[6, 1], [2, 7]], "c:int,a:int") - c = e.join(a, b, how="left_OUTER", on=["a"]) + c = fa.join(a, b, how="left_OUTER", on=["a"]) df_eq( c, [[1, "2", 6], [3, "4", None]], "a:int,b:str,c:int", throw=True, ) - c = e.join(b, a, how="left_outer", on=["a"]) + c = fa.join(b, a, how="left_outer", on=["a"]) df_eq(c, [[6, 1, "2"], [2, 7, None]], "c:int,a:int,b:str", throw=True) a = e.to_df([[1, "2"], [3, "4"]], "a:int,b:str") b = e.to_df([[True, 1], [False, 7]], "c:bool,a:int") - c = e.join(a, b, how="left_OUTER", on=["a"]) + c = fa.join(a, b, how="left_OUTER", on=["a"]) df_eq(c, [[1, "2", True], [3, "4", None]], "a:int,b:str,c:bool", throw=True) - c = e.join(b, a, how="left_outer", on=["a"]) + c = fa.join(b, a, how="left_outer", on=["a"]) df_eq( c, [[True, 1, "2"], [False, 7, None]], "c:bool,a:int,b:str", throw=True ) @@ -487,36 +478,36 @@ def test__join_semi(self): e = self.engine a = e.to_df([[1, 2], [3, 4]], "a:int,b:int") b = e.to_df([[6, 1], [2, 7]], "c:int,a:int") - c = e.join(a, b, how="semi", on=["a"]) + c = fa.join(a, b, how="semi", on=["a"]) df_eq(c, [[1, 2]], "a:int,b:int", throw=True) - c = e.join(b, a, how="semi", on=["a"]) + c = fa.semi_join(b, a) df_eq(c, [[6, 1]], "c:int,a:int", throw=True) b = e.to_df([], "c:int,a:int") - c = e.join(a, b, how="semi", on=["a"]) + c = fa.join(a, b, how="semi", on=["a"]) df_eq(c, [], "a:int,b:int", throw=True) a = e.to_df([], "a:int,b:int") b = e.to_df([], "c:int,a:int") - c = e.join(a, b, how="semi", on=["a"]) + c = fa.join(a, b, how="semi", on=["a"]) df_eq(c, [], "a:int,b:int", throw=True) def test__join_anti(self): e = self.engine a = e.to_df([[1, 2], [3, 4]], "a:int,b:int") b = e.to_df([[6, 1], [2, 7]], "c:int,a:int") - c = e.join(a, b, how="anti", on=["a"]) + c = fa.join(a, b, how="anti", on=["a"]) df_eq(c, [[3, 4]], "a:int,b:int", throw=True) - c = e.join(b, a, how="anti", on=["a"]) + c = fa.anti_join(b, a) df_eq(c, [[2, 7]], "c:int,a:int", throw=True) b = e.to_df([], "c:int,a:int") - c = e.join(a, b, how="anti", on=["a"]) + c = fa.join(a, b, how="anti", on=["a"]) df_eq(c, [[1, 2], [3, 4]], "a:int,b:int", 
throw=True) a = e.to_df([], "a:int,b:int") b = e.to_df([], "c:int,a:int") - c = e.join(a, b, how="anti", on=["a"]) + c = fa.join(a, b, how="anti", on=["a"]) df_eq(c, [], "a:int,b:int", throw=True) def test__join_with_null_keys(self): @@ -524,41 +515,66 @@ def test__join_with_null_keys(self): e = self.engine a = e.to_df([[1, 2, 3], [4, None, 6]], "a:double,b:double,c:int") b = e.to_df([[1, 2, 33], [4, None, 63]], "a:double,b:double,d:int") - c = e.join(a, b, how="INNER") + c = fa.join(a, b, how="INNER") df_eq(c, [[1, 2, 3, 33]], "a:double,b:double,c:int,d:int", throw=True) def test_union(self): e = self.engine a = e.to_df([[1, 2, 3], [4, None, 6]], "a:double,b:double,c:int") b = e.to_df([[1, 2, 33], [4, None, 6]], "a:double,b:double,c:int") - c = e.union(a, b) + c = fa.union(a, b) df_eq( c, [[1, 2, 3], [4, None, 6], [1, 2, 33]], "a:double,b:double,c:int", throw=True, ) - c = e.union(a, b, distinct=False) + c = fa.union(a, b, distinct=False) df_eq( c, [[1, 2, 3], [4, None, 6], [1, 2, 33], [4, None, 6]], "a:double,b:double,c:int", throw=True, ) + d = fa.union(a, b, c, distinct=False) + df_eq( + d, + [ + [1, 2, 3], + [4, None, 6], + [1, 2, 33], + [4, None, 6], + [1, 2, 3], + [4, None, 6], + [1, 2, 33], + [4, None, 6], + ], + "a:double,b:double,c:int", + throw=True, + ) def test_subtract(self): e = self.engine a = e.to_df([[1, 2, 3], [1, 2, 3], [4, None, 6]], "a:double,b:double,c:int") b = e.to_df([[1, 2, 33], [4, None, 6]], "a:double,b:double,c:int") - c = e.subtract(a, b) + c = fa.subtract(a, b) df_eq( c, [[1, 2, 3]], "a:double,b:double,c:int", throw=True, ) + x = e.to_df([[1, 2, 33]], "a:double,b:double,c:int") + y = e.to_df([[4, None, 6]], "a:double,b:double,c:int") + z = fa.subtract(a, x, y) + df_eq( + z, + [[1, 2, 3]], + "a:double,b:double,c:int", + throw=True, + ) # TODO: EXCEPT ALL is not implemented (QPD issue) - # c = e.subtract(a, b, distinct=False) + # c = fa.subtract(a, b, distinct=False) # df_eq( # c, # [[1, 2, 3], [1, 2, 3]], @@ -575,15 +591,30 @@ def test_intersect(self): [[1, 2, 33], [4, None, 6], [4, None, 6], [4, None, 6]], "a:double,b:double,c:int", ) - c = e.intersect(a, b) + c = fa.intersect(a, b) df_eq( c, [[4, None, 6]], "a:double,b:double,c:int", throw=True, ) + x = e.to_df( + [[1, 2, 33]], + "a:double,b:double,c:int", + ) + y = e.to_df( + [[4, None, 6], [4, None, 6], [4, None, 6]], + "a:double,b:double,c:int", + ) + z = fa.intersect(a, x, y) + df_eq( + z, + [], + "a:double,b:double,c:int", + throw=True, + ) # TODO: INTERSECT ALL is not implemented (QPD issue) - # c = e.intersect(a, b, distinct=False) + # c = fa.intersect(a, b, distinct=False) # df_eq( # c, # [[4, None, 6], [4, None, 6]], @@ -596,7 +627,7 @@ def test_distinct(self): a = e.to_df( [[4, None, 6], [1, 2, 3], [4, None, 6]], "a:double,b:double,c:int" ) - c = e.distinct(a) + c = fa.distinct(a) df_eq( c, [[4, None, 6], [1, 2, 3]], @@ -609,11 +640,11 @@ def test_dropna(self): a = e.to_df( [[4, None, 6], [1, 2, 3], [4, None, None]], "a:double,b:double,c:double" ) - c = e.dropna(a) # default - d = e.dropna(a, how="all") - f = e.dropna(a, how="any", thresh=2) - g = e.dropna(a, how="any", subset=["a", "c"]) - h = e.dropna(a, how="any", thresh=1, subset=["a", "c"]) + c = fa.dropna(a) # default + d = fa.dropna(a, how="all") + f = fa.dropna(a, how="any", thresh=2) + g = fa.dropna(a, how="any", subset=["a", "c"]) + h = fa.dropna(a, how="any", thresh=1, subset=["a", "c"]) df_eq( c, [[1, 2, 3]], @@ -644,10 +675,10 @@ def test_fillna(self): a = e.to_df( [[4, None, 6], [1, 2, 3], [4, None, None]], 
"a:double,b:double,c:double" ) - c = e.fillna(a, value=1) - d = e.fillna(a, {"b": 99, "c": -99}) - f = e.fillna(a, value=-99, subset=["c"]) - g = e.fillna(a, {"b": 99, "c": -99}, subset=["c"]) # subset ignored + c = fa.fillna(a, value=1) + d = fa.fillna(a, {"b": 99, "c": -99}) + f = fa.fillna(a, value=-99, subset=["c"]) + g = fa.fillna(a, {"b": 99, "c": -99}, subset=["c"]) # subset ignored df_eq( c, [[4, 1, 6], [1, 2, 3], [4, 1, 1]], @@ -667,24 +698,24 @@ def test_fillna(self): throw=True, ) df_eq(g, d, throw=True) - raises(ValueError, lambda: e.fillna(a, {"b": None, c: "99"})) - raises(ValueError, lambda: e.fillna(a, None)) - # raises(ValueError, lambda: e.fillna(a, ["b"])) + raises(ValueError, lambda: fa.fillna(a, {"b": None, c: "99"})) + raises(ValueError, lambda: fa.fillna(a, None)) + # raises(ValueError, lambda: fa.fillna(a, ["b"])) def test_sample(self): engine = self.engine a = engine.to_df([[x] for x in range(100)], "a:int") with raises(ValueError): - engine.sample(a) # must set one + fa.sample(a) # must set one with raises(ValueError): - engine.sample(a, n=90, frac=0.9) # can't set both + fa.sample(a, n=90, frac=0.9) # can't set both - f = engine.sample(a, frac=0.8, replace=False) - g = engine.sample(a, frac=0.8, replace=True) - h = engine.sample(a, frac=0.8, seed=1) - h2 = engine.sample(a, frac=0.8, seed=1) - i = engine.sample(a, frac=0.8, seed=2) + f = fa.sample(a, frac=0.8, replace=False) + g = fa.sample(a, frac=0.8, replace=True) + h = fa.sample(a, frac=0.8, seed=1) + h2 = fa.sample(a, frac=0.8, seed=1) + i = fa.sample(a, frac=0.8, seed=2) assert not df_eq(f, g, throw=False) df_eq(h, h2, throw=True) assert not df_eq(h, i, throw=False) @@ -692,8 +723,8 @@ def test_sample(self): def test_take(self): e = self.engine - ps = PartitionSpec(by=["a"], presort="b DESC,c DESC") - ps2 = PartitionSpec(by=["c"], presort="b ASC") + ps = dict(by=["a"], presort="b DESC,c DESC") + ps2 = dict(by=["c"], presort="b ASC") a = e.to_df( [ ["a", 2, 3], @@ -705,12 +736,12 @@ def test_take(self): ], "a:str,b:int,c:long", ) - b = e.take(a, n=1, presort="b desc") - c = e.take(a, n=2, presort="a desc", na_position="first") - d = e.take(a, n=1, presort="a asc, b desc", partition_spec=ps) - f = e.take(a, n=1, presort=None, partition_spec=ps2) - g = e.take(a, n=2, presort="a desc", na_position="last") - h = e.take(a, n=2, presort="a", na_position="first") + b = fa.take(a, n=1, presort="b desc") + c = fa.take(a, n=2, presort="a desc", na_position="first") + d = fa.take(a, n=1, presort="a asc, b desc", partition=ps) + f = fa.take(a, n=1, presort=None, partition=ps2) + g = fa.take(a, n=2, presort="a desc", na_position="last") + h = fa.take(a, n=2, presort="a", na_position="first") df_eq( b, [[None, 4, 2]], @@ -750,17 +781,17 @@ def test_take(self): "a:str,b:int,c:long", throw=True, ) - raises(ValueError, lambda: e.take(a, n=0.5, presort=None)) + raises(ValueError, lambda: fa.take(a, n=0.5, presort=None)) def test_sample_n(self): engine = self.engine a = engine.to_df([[x] for x in range(100)], "a:int") - b = engine.sample(a, n=90, replace=False) - c = engine.sample(a, n=90, replace=True) - d = engine.sample(a, n=90, seed=1) - d2 = engine.sample(a, n=90, seed=1) - e = engine.sample(a, n=90, seed=2) + b = fa.sample(a, n=90, replace=False) + c = fa.sample(a, n=90, replace=True) + d = fa.sample(a, n=90, seed=1) + d2 = fa.sample(a, n=90, seed=1) + e = fa.sample(a, n=90, seed=2) assert not df_eq(b, c, throw=False) df_eq(d, d2, throw=True) assert not df_eq(d, e, throw=False) @@ -773,9 +804,9 @@ def 
test__serialize_by_partition(self): a, PartitionSpec(by=["a"], presort="b"), df_name="_0" ) assert s.count() == 2 - s = e.persist(e._serialize_by_partition(a, PartitionSpec(), df_name="_0")) + s = fa.persist(e._serialize_by_partition(a, PartitionSpec(), df_name="_0")) assert s.count() == 1 - s = e.persist( + s = fa.persist( e._serialize_by_partition(a, PartitionSpec(by=["x"]), df_name="_0") ) assert s.count() == 1 @@ -788,10 +819,10 @@ def test_zip(self): sa = e._serialize_by_partition(a, ps, df_name="_0") sb = e._serialize_by_partition(b, ps, df_name="_1") # test zip with serialized dfs - z1 = e.persist(e.zip(sa, sb, how="inner", partition_spec=ps)) + z1 = fa.persist(e.zip(sa, sb, how="inner", partition_spec=ps)) assert 1 == z1.count() assert not z1.metadata.get("serialized_has_name", False) - z2 = e.persist(e.zip(sa, sb, how="left_outer", partition_spec=ps)) + z2 = fa.persist(e.zip(sa, sb, how="left_outer", partition_spec=ps)) assert 2 == z2.count() # can't have duplicated keys @@ -816,24 +847,24 @@ def test_zip(self): ) # test zip with unserialized dfs - z3 = e.persist(e.zip(a, b, partition_spec=ps)) + z3 = fa.persist(e.zip(a, b, partition_spec=ps)) df_eq(z1, z3, throw=True) - z3 = e.persist(e.zip(a, sb, partition_spec=ps)) + z3 = fa.persist(e.zip(a, sb, partition_spec=ps)) df_eq(z1, z3, throw=True) - z3 = e.persist(e.zip(sa, b, partition_spec=ps)) + z3 = fa.persist(e.zip(sa, b, partition_spec=ps)) df_eq(z1, z3, throw=True) - z4 = e.persist(e.zip(a, b, how="left_outer", partition_spec=ps)) + z4 = fa.persist(e.zip(a, b, how="left_outer", partition_spec=ps)) df_eq(z2, z4, throw=True) - z4 = e.persist(e.zip(a, sb, how="left_outer", partition_spec=ps)) + z4 = fa.persist(e.zip(a, sb, how="left_outer", partition_spec=ps)) df_eq(z2, z4, throw=True) - z4 = e.persist(e.zip(sa, b, how="left_outer", partition_spec=ps)) + z4 = fa.persist(e.zip(sa, b, how="left_outer", partition_spec=ps)) df_eq(z2, z4, throw=True) - z5 = e.persist(e.zip(a, b, how="cross")) + z5 = fa.persist(e.zip(a, b, how="cross")) assert z5.count() == 1 assert len(z5.schema) == 2 - z6 = e.persist(e.zip(sa, b, how="cross")) + z6 = fa.persist(e.zip(sa, b, how="cross")) assert z6.count() == 2 assert len(z6.schema) == 3 @@ -844,15 +875,15 @@ def test_zip(self): def test_zip_all(self): e = self.engine a = e.to_df([[1, 2], [3, 4], [1, 5]], "a:int,b:int") - z = e.persist(e.zip_all(DataFrames(a))) + z = fa.persist(e.zip_all(DataFrames(a))) assert 1 == z.count() assert z.metadata.get("serialized", False) assert not z.metadata.get("serialized_has_name", False) - z = e.persist(e.zip_all(DataFrames(x=a))) + z = fa.persist(e.zip_all(DataFrames(x=a))) assert 1 == z.count() assert z.metadata.get("serialized", False) assert z.metadata.get("serialized_has_name", False) - z = e.persist( + z = fa.persist( e.zip_all(DataFrames(x=a), partition_spec=PartitionSpec(by=["a"])) ) assert 2 == z.count() @@ -861,23 +892,23 @@ def test_zip_all(self): b = e.to_df([[6, 1], [2, 7]], "c:int,a:int") c = e.to_df([[6, 1], [2, 7]], "d:int,a:int") - z = e.persist(e.zip_all(DataFrames(a, b, c))) + z = fa.persist(e.zip_all(DataFrames(a, b, c))) assert 1 == z.count() assert not z.metadata.get("serialized_has_name", False) - z = e.persist(e.zip_all(DataFrames(x=a, y=b, z=c))) + z = fa.persist(e.zip_all(DataFrames(x=a, y=b, z=c))) assert 1 == z.count() assert z.metadata.get("serialized_has_name", False) - z = e.persist(e.zip_all(DataFrames(b, b))) + z = fa.persist(e.zip_all(DataFrames(b, b))) assert 2 == z.count() assert not z.metadata.get("serialized_has_name", False) 
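# --- illustrative sketch, not from the patch itself -------------------------
# This suite installs a global engine in setUpClass via
# ``fa.set_global_engine(cls._engine)``, which is why the ``fa.*`` calls above
# never pass an engine explicitly. A standalone version of the same pattern on
# the native (pandas-backed) engine, using the variadic join/union helpers
# exercised in test_join_multiple and test_union:
import pandas as pd

import fugue.api as fa
from fugue import NativeExecutionEngine

fa.set_global_engine(NativeExecutionEngine())
try:
    a = pd.DataFrame([[1, 2], [3, 4]], columns=["a", "b"])
    b = pd.DataFrame([[1, 20], [3, 40]], columns=["a", "c"])
    c = pd.DataFrame([[1, 200], [3, 400]], columns=["a", "d"])
    joined = fa.inner_join(a, b, c)           # joins on the common key "a"
    unioned = fa.union(a, a, distinct=False)  # variadic union of same-schema frames
    print(fa.as_array(joined))                # rows for a=1 and a=3 with b, c, d
finally:
    fa.clear_global_engine()
# --- end of sketch; the diff continues below --------------------------------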
assert ["a", "c"] in z.schema - z = e.persist(e.zip_all(DataFrames(x=b, y=b))) + z = fa.persist(e.zip_all(DataFrames(x=b, y=b))) assert 2 == z.count() assert z.metadata.get("serialized_has_name", False) assert ["a", "c"] in z.schema - z = e.persist( + z = fa.persist( e.zip_all(DataFrames(b, b), partition_spec=PartitionSpec(by=["a"])) ) assert 2 == z.count() @@ -889,12 +920,12 @@ def test_comap(self): e = self.engine a = e.to_df([[1, 2], [3, 4], [1, 5]], "a:int,b:int") b = e.to_df([[6, 1], [2, 7]], "c:int,a:int") - z1 = e.persist(e.zip(a, b)) - z2 = e.persist(e.zip(a, b, partition_spec=ps, how="left_outer")) - z3 = e.persist( + z1 = fa.persist(e.zip(a, b)) + z2 = fa.persist(e.zip(a, b, partition_spec=ps, how="left_outer")) + z3 = fa.persist( e._serialize_by_partition(a, partition_spec=ps, df_name="_x") ) - z4 = e.persist(e.zip(a, b, partition_spec=ps, how="cross")) + z4 = fa.persist(e.zip(a, b, partition_spec=ps, how="cross")) def comap(cursor, dfs): assert not dfs.has_key @@ -938,9 +969,9 @@ def test_comap_with_key(self): a = e.to_df([[1, 2], [3, 4], [1, 5]], "a:int,b:int") b = e.to_df([[6, 1], [2, 7]], "c:int,a:int") c = e.to_df([[6, 1]], "c:int,a:int") - z1 = e.persist(e.zip(a, b, df1_name="x", df2_name="y")) - z2 = e.persist(e.zip_all(DataFrames(x=a, y=b, z=b))) - z3 = e.persist( + z1 = fa.persist(e.zip(a, b, df1_name="x", df2_name="y")) + z2 = fa.persist(e.zip_all(DataFrames(x=a, y=b, z=b))) + z3 = fa.persist( e.zip_all(DataFrames(z=c), partition_spec=PartitionSpec(by=["a"])) ) @@ -994,48 +1025,47 @@ def test_save_single_and_load_parquet(self): path = os.path.join(self.tmpdir, "a", "b") e.fs.makedirs(path, recreate=True) # over write folder with single file - e.save_df(b, path, format_hint="parquet", force_single=True) + fa.save(b, path, format_hint="parquet", force_single=True) assert e.fs.isfile(path) - c = e.load_df(path, format_hint="parquet", columns=["a", "c"]) + c = fa.load(path, format_hint="parquet", columns=["a", "c"], as_fugue=True) df_eq(c, [[1, 6], [7, 2]], "a:long,c:int", throw=True) # overwirte single with folder (if applicable) b = ArrayDataFrame([[60, 1], [20, 7]], "c:int,a:long") - e.save_df(b, path, format_hint="parquet", mode="overwrite") - c = e.load_df(path, format_hint="parquet", columns=["a", "c"]) + fa.save(b, path, format_hint="parquet", mode="overwrite") + c = fa.load(path, format_hint="parquet", columns=["a", "c"], as_fugue=True) df_eq(c, [[1, 60], [7, 20]], "a:long,c:int", throw=True) def test_save_and_load_parquet(self): - e = self.engine b = ArrayDataFrame([[6, 1], [2, 7]], "c:int,a:long") path = os.path.join(self.tmpdir, "a", "b") - e.save_df(b, path, format_hint="parquet") - c = e.load_df(path, format_hint="parquet", columns=["a", "c"]) + fa.save(b, path, format_hint="parquet") + c = fa.load(path, format_hint="parquet", columns=["a", "c"], as_fugue=True) df_eq(c, [[1, 6], [7, 2]], "a:long,c:int", throw=True) def test_load_parquet_folder(self): - e = self.engine native = NativeExecutionEngine() a = ArrayDataFrame([[6, 1]], "c:int,a:long") b = ArrayDataFrame([[2, 7], [4, 8]], "c:int,a:long") path = os.path.join(self.tmpdir, "a", "b") - native.save_df(a, os.path.join(path, "a.parquet")) - native.save_df(b, os.path.join(path, "b.parquet")) + fa.save(a, os.path.join(path, "a.parquet"), engine=native) + fa.save(b, os.path.join(path, "b.parquet"), engine=native) FileSystem().touch(os.path.join(path, "_SUCCESS")) - c = e.load_df(path, format_hint="parquet", columns=["a", "c"]) + c = fa.load(path, format_hint="parquet", columns=["a", "c"], as_fugue=True) 
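# --- Hedged example (illustrative sketch, not lines from the patch) ----------
# The save/load hunks in this file replace e.save_df / e.load_df with fa.save /
# fa.load; as_fugue=True asks for a Fugue DataFrame back, and engine=... selects
# a specific ExecutionEngine (otherwise it is inferred from the input). A minimal
# sketch, assuming fugue>=0.8.0; the temp path and column names are hypothetical:
import os
import tempfile

import pandas as pd
import fugue.api as fa

path = os.path.join(tempfile.gettempdir(), "fa_save_load_example.parquet")
pdf = pd.DataFrame({"c": [6, 2], "a": [1, 7]})
fa.save(pdf, path, format_hint="parquet")                      # engine inferred from pdf
native_back = fa.load(path, columns=["a", "c"])                # native (pandas) output
fugue_back = fa.load(path, columns=["a", "c"], as_fugue=True)  # Fugue DataFrame output
# ------------------------------------------------------------------------------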
df_eq(c, [[1, 6], [7, 2], [8, 4]], "a:long,c:int", throw=True) def test_load_parquet_files(self): - e = self.engine native = NativeExecutionEngine() a = ArrayDataFrame([[6, 1]], "c:int,a:long") b = ArrayDataFrame([[2, 7], [4, 8]], "c:int,a:long") path = os.path.join(self.tmpdir, "a", "b") f1 = os.path.join(path, "a.parquet") f2 = os.path.join(path, "b.parquet") - native.save_df(a, f1) - native.save_df(b, f2) - c = e.load_df([f1, f2], format_hint="parquet", columns=["a", "c"]) + fa.save(a, f1, engine=native) + fa.save(b, f2, engine=native) + c = fa.load( + [f1, f2], format_hint="parquet", columns=["a", "c"], as_fugue=True + ) df_eq(c, [[1, 6], [7, 2], [8, 4]], "a:long,c:int", throw=True) @skip_spark2 @@ -1046,39 +1076,37 @@ def test_save_single_and_load_avro(self): path = os.path.join(self.tmpdir, "a", "b") e.fs.makedirs(path, recreate=True) # over write folder with single file - e.save_df(b, path, format_hint="avro", force_single=True) + fa.save(b, path, format_hint="avro", force_single=True) assert e.fs.isfile(path) - c = e.load_df(path, format_hint="avro", columns=["a", "c"]) + c = fa.load(path, format_hint="avro", columns=["a", "c"], as_fugue=True) df_eq(c, [[1, 6], [7, 2]], "a:long,c:long", throw=True) # overwirte single with folder (if applicable) b = ArrayDataFrame([[60, 1], [20, 7]], "c:long,a:long") - e.save_df(b, path, format_hint="avro", mode="overwrite") - c = e.load_df(path, format_hint="avro", columns=["a", "c"]) + fa.save(b, path, format_hint="avro", mode="overwrite") + c = fa.load(path, format_hint="avro", columns=["a", "c"], as_fugue=True) df_eq(c, [[1, 60], [7, 20]], "a:long,c:long", throw=True) @skip_spark2 def test_save_and_load_avro(self): # TODO: switch to c:int,a:long when we can preserve schema to avro - e = self.engine b = ArrayDataFrame([[6, 1], [2, 7]], "c:long,a:long") path = os.path.join(self.tmpdir, "a", "b") - e.save_df(b, path, format_hint="avro") - c = e.load_df(path, format_hint="avro", columns=["a", "c"]) + fa.save(b, path, format_hint="avro") + c = fa.load(path, format_hint="avro", columns=["a", "c"], as_fugue=True) df_eq(c, [[1, 6], [7, 2]], "a:long,c:long", throw=True) @skip_spark2 def test_load_avro_folder(self): # TODO: switch to c:int,a:long when we can preserve schema to avro - e = self.engine native = NativeExecutionEngine() a = ArrayDataFrame([[6, 1]], "c:long,a:long") b = ArrayDataFrame([[2, 7], [4, 8]], "c:long,a:long") path = os.path.join(self.tmpdir, "a", "b") - native.save_df(a, os.path.join(path, "a.avro")) - native.save_df(b, os.path.join(path, "b.avro")) + fa.save(a, os.path.join(path, "a.avro"), engine=native) + fa.save(b, os.path.join(path, "b.avro"), engine=native) FileSystem().touch(os.path.join(path, "_SUCCESS")) - c = e.load_df(path, format_hint="avro", columns=["a", "c"]) + c = fa.load(path, format_hint="avro", columns=["a", "c"], as_fugue=True) df_eq(c, [[1, 6], [7, 2], [8, 4]], "a:long,c:long", throw=True) def test_save_single_and_load_csv(self): @@ -1087,60 +1115,58 @@ def test_save_single_and_load_csv(self): path = os.path.join(self.tmpdir, "a", "b") e.fs.makedirs(path, recreate=True) # over write folder with single file - e.save_df(b, path, format_hint="csv", header=True, force_single=True) + fa.save(b, path, format_hint="csv", header=True, force_single=True) assert e.fs.isfile(path) - c = e.load_df( - path, - format_hint="csv", - header=True, - infer_schema=False, + c = fa.load( + path, format_hint="csv", header=True, infer_schema=False, as_fugue=True ) df_eq(c, [["6.1", "1.1"], ["2.1", "7.1"]], "c:str,a:str", throw=True) - c 
= e.load_df( - path, - format_hint="csv", - header=True, - infer_schema=True, + c = fa.load( + path, format_hint="csv", header=True, infer_schema=True, as_fugue=True ) df_eq(c, [[6.1, 1.1], [2.1, 7.1]], "c:double,a:double", throw=True) with raises(ValueError): - c = e.load_df( + c = fa.load( path, format_hint="csv", header=True, infer_schema=True, columns="c:str,a:str", # invalid to set schema when infer schema + as_fugue=True, ) - c = e.load_df( + c = fa.load( path, format_hint="csv", header=True, infer_schema=False, columns=["a", "c"], + as_fugue=True, ) df_eq(c, [["1.1", "6.1"], ["7.1", "2.1"]], "a:str,c:str", throw=True) - c = e.load_df( + c = fa.load( path, format_hint="csv", header=True, infer_schema=False, columns="a:double,c:double", + as_fugue=True, ) df_eq(c, [[1.1, 6.1], [7.1, 2.1]], "a:double,c:double", throw=True) # overwirte single with folder (if applicable) b = ArrayDataFrame([[60.1, 1.1], [20.1, 7.1]], "c:double,a:double") - e.save_df(b, path, format_hint="csv", header=True, mode="overwrite") - c = e.load_df( + fa.save(b, path, format_hint="csv", header=True, mode="overwrite") + c = fa.load( path, format_hint="csv", header=True, infer_schema=False, columns=["a", "c"], + as_fugue=True, ) df_eq(c, [["1.1", "60.1"], ["7.1", "20.1"]], "a:str,c:str", throw=True) @@ -1150,87 +1176,100 @@ def test_save_single_and_load_csv_no_header(self): path = os.path.join(self.tmpdir, "a", "b") e.fs.makedirs(path, recreate=True) # over write folder with single file - e.save_df(b, path, format_hint="csv", header=False, force_single=True) + fa.save(b, path, format_hint="csv", header=False, force_single=True) assert e.fs.isfile(path) with raises(ValueError): - c = e.load_df( + c = fa.load( path, format_hint="csv", header=False, infer_schema=False, + as_fugue=True # when header is False, must set columns ) - c = e.load_df( + c = fa.load( path, format_hint="csv", header=False, infer_schema=False, columns=["c", "a"], + as_fugue=True, ) df_eq(c, [["6.1", "1.1"], ["2.1", "7.1"]], "c:str,a:str", throw=True) - c = e.load_df( + c = fa.load( path, format_hint="csv", header=False, infer_schema=True, columns=["c", "a"], + as_fugue=True, ) df_eq(c, [[6.1, 1.1], [2.1, 7.1]], "c:double,a:double", throw=True) with raises(ValueError): - c = e.load_df( + c = fa.load( path, format_hint="csv", header=False, infer_schema=True, columns="c:double,a:double", + as_fugue=True, ) - c = e.load_df( + c = fa.load( path, format_hint="csv", header=False, infer_schema=False, columns="c:double,a:str", + as_fugue=True, ) df_eq(c, [[6.1, "1.1"], [2.1, "7.1"]], "c:double,a:str", throw=True) def test_save_and_load_csv(self): - e = self.engine b = ArrayDataFrame([[6.1, 1.1], [2.1, 7.1]], "c:double,a:double") path = os.path.join(self.tmpdir, "a", "b") - e.save_df(b, path, format_hint="csv", header=True) - c = e.load_df( + fa.save(b, path, format_hint="csv", header=True) + c = fa.load( path, format_hint="csv", header=True, infer_schema=True, columns=["a", "c"], + as_fugue=True, ) df_eq(c, [[1.1, 6.1], [7.1, 2.1]], "a:double,c:double", throw=True) def test_load_csv_folder(self): - e = self.engine native = NativeExecutionEngine() a = ArrayDataFrame([[6.1, 1.1]], "c:double,a:double") b = ArrayDataFrame([[2.1, 7.1], [4.1, 8.1]], "c:double,a:double") path = os.path.join(self.tmpdir, "a", "b") - native.save_df( - a, os.path.join(path, "a.csv"), format_hint="csv", header=True + fa.save( + a, + os.path.join(path, "a.csv"), + format_hint="csv", + header=True, + engine=native, ) - native.save_df( - b, os.path.join(path, "b.csv"), 
format_hint="csv", header=True + fa.save( + b, + os.path.join(path, "b.csv"), + format_hint="csv", + header=True, + engine=native, ) FileSystem().touch(os.path.join(path, "_SUCCESS")) - c = e.load_df( + c = fa.load( path, format_hint="csv", header=True, infer_schema=True, columns=["a", "c"], + as_fugue=True, ) df_eq( c, [[1.1, 6.1], [7.1, 2.1], [8.1, 4.1]], "a:double,c:double", throw=True @@ -1242,55 +1281,55 @@ def test_save_single_and_load_json(self): path = os.path.join(self.tmpdir, "a", "b") e.fs.makedirs(path, recreate=True) # over write folder with single file - e.save_df(b, path, format_hint="json", force_single=True) + fa.save(b, path, format_hint="json", force_single=True) assert e.fs.isfile(path) - c = e.load_df( - path, - format_hint="json", - columns=["a", "c"], - ) + c = fa.load(path, format_hint="json", columns=["a", "c"], as_fugue=True) df_eq(c, [[1, 6], [7, 2]], "a:long,c:long", throw=True) # overwirte single with folder (if applicable) b = ArrayDataFrame([[60, 1], [20, 7]], "c:long,a:long") - e.save_df(b, path, format_hint="json", mode="overwrite") - c = e.load_df(path, format_hint="json", columns=["a", "c"]) + fa.save(b, path, format_hint="json", mode="overwrite") + c = fa.load(path, format_hint="json", columns=["a", "c"], as_fugue=True) df_eq(c, [[1, 60], [7, 20]], "a:long,c:long", throw=True) def test_save_and_load_json(self): e = self.engine b = ArrayDataFrame([[6, 1], [3, 4], [2, 7], [4, 8], [6, 7]], "c:int,a:long") path = os.path.join(self.tmpdir, "a", "b") - e.save_df( + fa.save( e.repartition(e.to_df(b), PartitionSpec(num=2)), path, format_hint="json", ) - c = e.load_df( - path, - format_hint="json", - columns=["a", "c"], - ) + c = fa.load(path, format_hint="json", columns=["a", "c"], as_fugue=True) df_eq( c, [[1, 6], [7, 2], [4, 3], [8, 4], [7, 6]], "a:long,c:long", throw=True ) def test_load_json_folder(self): - e = self.engine native = NativeExecutionEngine() a = ArrayDataFrame([[6, 1], [3, 4]], "c:int,a:long") b = ArrayDataFrame([[2, 7], [4, 8]], "c:int,a:long") path = os.path.join(self.tmpdir, "a", "b") - native.save_df(a, os.path.join(path, "a.json"), format_hint="json") - native.save_df(b, os.path.join(path, "b.json"), format_hint="json") + fa.save(a, os.path.join(path, "a.json"), format_hint="json", engine=native) + fa.save(b, os.path.join(path, "b.json"), format_hint="json", engine=native) FileSystem().touch(os.path.join(path, "_SUCCESS")) - c = e.load_df( - path, - format_hint="json", - columns=["a", "c"], - ) + c = fa.load(path, format_hint="json", columns=["a", "c"], as_fugue=True) df_eq(c, [[1, 6], [7, 2], [8, 4], [4, 3]], "a:long,c:long", throw=True) + def test_engine_api(self): + # complimentary tests not covered by the other tests + with fa.engine_context(self.engine): + df1 = fa.as_fugue_df([[0, 1], [2, 3]], schema="a:long,b:long") + df1 = fa.repartition(df1, {"num": 2}) + df1 = fa.get_native_as_df(fa.broadcast(df1)) + df2 = pd.DataFrame([[0, 1], [2, 3]], columns=["a", "b"]) + df3 = fa.union(df1, df2, as_fugue=False) + assert fa.is_df(df3) and not isinstance(df3, DataFrame) + df4 = fa.union(df1, df2, as_fugue=True) + assert isinstance(df4, DataFrame) + df_eq(df4, fa.as_pandas(df3), throw=True) + def select_top(cursor, data): return ArrayDataFrame([cursor.row], cursor.row_schema) diff --git a/setup.py b/setup.py index 80849629..1b9e1820 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ def get_version() -> str: install_requires=[ "triad>=0.7.0", "adagio>=0.2.4", - "qpd>=0.3.1", + "qpd>=0.3.4", "fugue-sql-antlr>=0.1.1", "sqlalchemy", 
"pyarrow>=0.15.1", @@ -43,7 +43,7 @@ def get_version() -> str: extras_require={ "cpp_sql_parser": ["fugue-sql-antlr[cpp]>=0.1.1"], "spark": ["pyspark"], - "dask": ["dask[distributed,dataframe]", "qpd[dask]>=0.3.1"], + "dask": ["dask[distributed,dataframe]", "qpd[dask]>=0.3.4"], "ray": ["ray[data]>=2.0.0", "duckdb>=0.5.0", "pyarrow>=6.0.1"], "duckdb": [ "duckdb>=0.5.0", @@ -60,7 +60,7 @@ def get_version() -> str: "pyspark", "dask[distributed,dataframe]", "ray[data]>=2.0.0", - "qpd[dask]>=0.3.1", + "qpd[dask]>=0.3.4", "notebook", "jupyterlab", "ipython>=7.10.0", @@ -89,8 +89,11 @@ def get_version() -> str: "fugue.plugins": [ "ibis = fugue_ibis[ibis]", "duckdb = fugue_duckdb.registry[duckdb]", + "duckdb_ibis = fugue_duckdb.ibis_engine[duckdb,ibis]", "spark = fugue_spark.registry[spark]", + "spark_ibis = fugue_spark.ibis_engine[spark,ibis]", "dask = fugue_dask.registry[dask]", + "dask_ibis = fugue_dask.ibis_engine[dask,ibis]", "ray = fugue_ray.registry[ray]", ] }, diff --git a/tests/fugue/collections/test_partition.py b/tests/fugue/collections/test_partition.py index 0665032f..a88a8817 100644 --- a/tests/fugue/collections/test_partition.py +++ b/tests/fugue/collections/test_partition.py @@ -7,31 +7,45 @@ from triad.utils.hash import to_uuid from triad.collections.dict import IndexedOrderedDict + def test_parse_presort_exp(): assert parse_presort_exp(None) == IndexedOrderedDict() - assert parse_presort_exp(IndexedOrderedDict([('c', True)])) == IndexedOrderedDict([('c', True)]) - assert parse_presort_exp("c") == IndexedOrderedDict([('c', True)]) - assert parse_presort_exp(" c") == IndexedOrderedDict([('c', True)]) - assert parse_presort_exp("c desc") == IndexedOrderedDict([('c', False)]) - assert parse_presort_exp("b desc, c asc") == IndexedOrderedDict([('b', False), ('c', True)]) - assert parse_presort_exp("DESC DESC, ASC ASC") == IndexedOrderedDict([('DESC', False), ('ASC', True)]) - assert parse_presort_exp([("b", False),("c", True)]) == IndexedOrderedDict([('b', False), ('c', True)]) - assert parse_presort_exp("B DESC, C ASC") == IndexedOrderedDict([('B', False), ('C', True)]) - assert parse_presort_exp("b desc, c asc") == IndexedOrderedDict([('b', False), ('c', True)]) - + assert parse_presort_exp(IndexedOrderedDict([("c", True)])) == IndexedOrderedDict( + [("c", True)] + ) + assert parse_presort_exp("c") == IndexedOrderedDict([("c", True)]) + assert parse_presort_exp(" c") == IndexedOrderedDict([("c", True)]) + assert parse_presort_exp("c desc") == IndexedOrderedDict([("c", False)]) + assert parse_presort_exp("b desc, c asc") == IndexedOrderedDict( + [("b", False), ("c", True)] + ) + assert parse_presort_exp("DESC DESC, ASC ASC") == IndexedOrderedDict( + [("DESC", False), ("ASC", True)] + ) + assert parse_presort_exp([("b", False), ("c", True)]) == IndexedOrderedDict( + [("b", False), ("c", True)] + ) + assert parse_presort_exp("B DESC, C ASC") == IndexedOrderedDict( + [("B", False), ("C", True)] + ) + assert parse_presort_exp("b desc, c asc") == IndexedOrderedDict( + [("b", False), ("c", True)] + ) with raises(SyntaxError): - parse_presort_exp("b dsc, c asc") # mispelling of desc + parse_presort_exp("b dsc, c asc") # mispelling of desc with raises(SyntaxError): - parse_presort_exp("c true") # string format needs desc/asc + parse_presort_exp("c true") # string format needs desc/asc with raises(SyntaxError): - parse_presort_exp("c true, c true") # cannot contain duplicates + parse_presort_exp("c true, c true") # cannot contain duplicates with raises(SyntaxError): - parse_presort_exp([("b", 
"desc"),("c", "asc")]) # instead of desc and asc, needs to be bool + parse_presort_exp( + [("b", "desc"), ("c", "asc")] + ) # instead of desc and asc, needs to be bool def test_partition_spec(): @@ -86,6 +100,12 @@ def test_partition_spec(): assert PartitionSpec("per_row") == PartitionSpec(num="ROWCOUNT", algo="even") assert PartitionSpec(by="abc") == PartitionSpec(by=["abc"]) + assert PartitionSpec("abc") == PartitionSpec(by=["abc"]) + assert PartitionSpec(["abc"]) == PartitionSpec(by=["abc"]) + assert PartitionSpec(["abc", "def"]) == PartitionSpec(by=["abc", "def"]) + assert PartitionSpec(("abc", "def")) == PartitionSpec(by=["abc", "def"]) + + assert PartitionSpec(4) == PartitionSpec(num=4) # partition by overlaps with presort raises( @@ -105,7 +125,7 @@ def test_partition_spec(): raises(SyntaxError, lambda: PartitionSpec(partition_by=123)) # bad input - raises(TypeError, lambda: PartitionSpec(1)) + raises(TypeError, lambda: PartitionSpec(1.1)) # bad presort raises(SyntaxError, lambda: PartitionSpec(presort="a xsc,e desc")) diff --git a/tests/fugue/column/test_sql.py b/tests/fugue/column/test_sql.py index 8e9b37c1..b8c9c10b 100644 --- a/tests/fugue/column/test_sql.py +++ b/tests/fugue/column/test_sql.py @@ -140,11 +140,13 @@ def dummy(expr): def test_where(): gen = SQLExpressionGenerator() - assert "SELECT * FROM x WHERE (a<5) AND b IS NULL" == gen.where( - (col("a") < 5) & col("b").is_null(), "x" + assert "SELECT * FROM !x! WHERE (a<5) AND b IS NULL" == _to_sql( + gen.where((col("a") < 5) & col("b").is_null(), "x") ) - assert "SELECT * FROM x WHERE a<5" == gen.where((col("a") < 5).alias("x"), "x") - raises(ValueError, lambda: gen.where(f.max(col("a")), "x")) + assert "SELECT * FROM !x! WHERE a<5" == _to_sql( + gen.where((col("a") < 5).alias("x"), "x") + ) + raises(ValueError, lambda: list(gen.where(f.max(col("a")), "x"))) def test_select(): @@ -152,33 +154,33 @@ def test_select(): # no aggregation cols = SelectColumns(col("*")) - assert "SELECT * FROM x" == gen.select(cols, "x") + assert "SELECT * FROM !x!" == _to_sql(gen.select(cols, "x")) cols = SelectColumns(col("a"), lit(1).alias("b"), (col("b") + col("c")).alias("x")) where = (col("a") > 5).alias("aa") - assert "SELECT a, 1 AS b, b+c AS x FROM t WHERE a>5" == gen.select( - cols, "t", where=where + assert "SELECT a, 1 AS b, b+c AS x FROM !t! WHERE a>5" == _to_sql( + gen.select(cols, "t", where=where) ) # aggregation without literals cols = SelectColumns(f.max(col("c")).alias("c"), col("a", "aa"), col("b")) - assert "SELECT MAX(c) AS c, a AS aa, b FROM t GROUP BY a, b" == gen.select( - cols, "t" + assert "SELECT MAX(c) AS c, a AS aa, b FROM !t! GROUP BY a, b" == _to_sql( + gen.select(cols, "t") ) where = col("a") < 10 having = (f.max(col("a")) > 5).alias("aaa") assert ( - "SELECT MAX(c) AS c, a AS aa, b FROM t WHERE a<10 GROUP BY a, b HAVING MAX(a)>5" - == gen.select(cols, "t", where=where, having=having) + "SELECT MAX(c) AS c, a AS aa, b FROM !t! WHERE a<10 GROUP BY a, b HAVING MAX(a)>5" + == _to_sql(gen.select(cols, "t", where=where, having=having)) ) cols = SelectColumns( f.min(col("c") + 1).alias("c"), f.avg(col("d") + col("e")).cast(int).alias("d"), ) - assert "SELECT MIN(c+1) AS c, CAST(AVG(d+e) AS long) AS d FROM t" == gen.select( - cols, "t" + assert "SELECT MIN(c+1) AS c, CAST(AVG(d+e) AS long) AS d FROM !t!" 
== _to_sql( + gen.select(cols, "t") ) # aggregation with literals @@ -186,19 +188,19 @@ def test_select(): lit(1, "k"), f.max(col("c")).alias("c"), lit(2, "j"), col("a", "aa"), col("b") ) assert ( - "SELECT 1 AS k, c, 2 AS j, aa, b FROM (SELECT MAX(c) AS c, a AS aa, b FROM t GROUP BY a, b)" - == gen.select(cols, "t") + "SELECT 1 AS k, c, 2 AS j, aa, b FROM ( SELECT MAX(c) AS c, a AS aa, b FROM !t! GROUP BY a, b )" + == _to_sql(gen.select(cols, "t")) ) cols = SelectColumns(lit(1, "k"), f.max(col("c")).alias("c"), lit(2, "j")) - assert "SELECT 1 AS k, c, 2 AS j FROM (SELECT MAX(c) AS c FROM t)" == gen.select( - cols, "t" + assert "SELECT 1 AS k, c, 2 AS j FROM ( SELECT MAX(c) AS c FROM !t! )" == _to_sql( + gen.select(cols, "t") ) cols = SelectColumns(lit(1, "k"), col("a"), f.max(col("c")).alias("c"), lit(2, "j")) assert ( - "SELECT 1 AS k, a, c, 2 AS j FROM (SELECT a, MAX(c) AS c FROM t GROUP BY a)" - == gen.select(cols, "t") + "SELECT 1 AS k, a, c, 2 AS j FROM ( SELECT a, MAX(c) AS c FROM !t! GROUP BY a )" + == _to_sql(gen.select(cols, "t")) ) # cast @@ -207,8 +209,8 @@ def test_select(): f.avg(col("d") + col("e")).cast(int).alias("d"), ) assert ( - "SELECT CAST(c AS double) AS c, CAST(AVG(d+e) AS long) AS d FROM t GROUP BY c" - == gen.select(cols, "t") + "SELECT CAST(c AS double) AS c, CAST(AVG(d+e) AS long) AS d FROM !t! GROUP BY c" + == _to_sql(gen.select(cols, "t")) ) # infer alias @@ -219,7 +221,8 @@ def test_select(): ) assert ( "SELECT CAST(-c AS double) AS c, CAST(MAX(e) AS long) AS e, " - "CAST(AVG(d+e) AS long) AS d FROM t GROUP BY -c" == gen.select(cols, "t") + "CAST(AVG(d+e) AS long) AS d FROM !t! GROUP BY -c" + == _to_sql(gen.select(cols, "t")) ) @@ -252,6 +255,16 @@ def test_no_cast(): cols = SelectColumns( f.max(col("c")).cast("long").alias("c"), col("a", "aa"), col("b") ) - assert "SELECT MAX(c) AS c, a AS aa, b FROM t GROUP BY a, b" == gen.select( - cols, "t" + assert "SELECT MAX(c) AS c, a AS aa, b FROM !t! GROUP BY a, b" == _to_sql( + gen.select(cols, "t") ) + + +def _to_sql(parts): + return ( + " ".join( + "!" + x[1].strip() + "!" 
if x[0] else x[1].strip() + for x in parts + if x[1].strip() != "" + ) + ).strip() diff --git a/tests/fugue/dataframe/test_arrow_dataframe.py b/tests/fugue/dataframe/test_arrow_dataframe.py index 551b7ecb..8cdfac34 100644 --- a/tests/fugue/dataframe/test_arrow_dataframe.py +++ b/tests/fugue/dataframe/test_arrow_dataframe.py @@ -1,15 +1,11 @@ -import json -from datetime import datetime from typing import Any -import numpy as np import pandas as pd -from fugue.dataframe import ArrowDataFrame, PandasDataFrame -from fugue.dataframe.utils import _df_eq as df_eq +import pyarrow as pa +from fugue.dataframe import ArrowDataFrame from fugue_test.dataframe_suite import DataFrameTests from pytest import raises -from triad.collections.schema import Schema, SchemaError -from triad.exceptions import InvalidOperationError +import fugue.api as fa class ArrowDataFrameTests(DataFrameTests.Tests): @@ -17,6 +13,17 @@ def df(self, data: Any = None, schema: Any = None) -> ArrowDataFrame: return ArrowDataFrame(data, schema) +class NativeArrowDataFrameTests(DataFrameTests.NativeTests): + def df(self, data: Any = None, schema: Any = None) -> pd.DataFrame: + return ArrowDataFrame(data, schema).as_arrow() + + def to_native_df(self, pdf: pd.DataFrame) -> Any: # pragma: no cover + return pa.Table.from_pandas(pdf) + + def test_num_partitions(self): + assert fa.get_num_partitions(self.df([[0, 1]], "a:int,b:int")) == 1 + + def test_init(): df = ArrowDataFrame(schema="a:str,b:int") assert df.empty diff --git a/tests/fugue/dataframe/test_dataframe.py b/tests/fugue/dataframe/test_dataframe.py index 506e6de8..0d7cb7b6 100644 --- a/tests/fugue/dataframe/test_dataframe.py +++ b/tests/fugue/dataframe/test_dataframe.py @@ -1,7 +1,28 @@ -from fugue.dataframe import ArrayDataFrame, DataFrame -from triad.collections.schema import Schema import copy +import pandas as pd +from pytest import raises +from triad.collections.schema import Schema + +from fugue.dataframe import ArrayDataFrame, DataFrame +from fugue.api import as_fugue_df, get_native_as_df +from fugue.bag.array_bag import ArrayBag + + +def test_as_fugue_df(): + with raises(NotImplementedError): + as_fugue_df(10) + with raises(TypeError): + as_fugue_df(ArrayBag([1, 2])) + df = pd.DataFrame([[0]], columns=["a"]) + assert isinstance(as_fugue_df(df), DataFrame) + + +def test_get_native_as_df(): + with raises(NotImplementedError): + get_native_as_df(10) + # other tests are in the suites + def test_show(): df = ArrayDataFrame(schema="a:str,b:str") @@ -56,5 +77,5 @@ def test_copy(): class MockDF(ArrayDataFrame): def __init__(self, df=None, schema=None): - super(). 
__init__(df=df, schema=schema) + super().__init__(df=df, schema=schema) DataFrame.__init__(self, lambda: Schema(schema)) diff --git a/tests/fugue/dataframe/test_pandas_dataframe.py b/tests/fugue/dataframe/test_pandas_dataframe.py index c0bc5e7a..38a45399 100644 --- a/tests/fugue/dataframe/test_pandas_dataframe.py +++ b/tests/fugue/dataframe/test_pandas_dataframe.py @@ -5,19 +5,40 @@ import numpy as np import pandas as pd -from fugue.dataframe import PandasDataFrame +from pytest import raises +from triad.collections.schema import Schema + +import fugue.api as fa +from fugue.dataframe import ArrowDataFrame, PandasDataFrame from fugue.dataframe.array_dataframe import ArrayDataFrame from fugue.dataframe.utils import _df_eq as df_eq from fugue_test.dataframe_suite import DataFrameTests -from pytest import raises -from triad.collections.schema import Schema, SchemaError -from triad.exceptions import InvalidOperationError class PandasDataFrameTests(DataFrameTests.Tests): def df(self, data: Any = None, schema: Any = None) -> PandasDataFrame: return PandasDataFrame(data, schema) + def test_num_partitions(self): + assert fa.get_num_partitions(self.df([[0, 1]], "a:int,b:int")) == 1 + + def test_api_as_local(self): + assert fa.is_local(self.df([[0, 1]], "a:int,b:int")) + + +class NativePandasDataFrameTests(DataFrameTests.NativeTests): + def df(self, data: Any = None, schema: Any = None) -> pd.DataFrame: + return ArrowDataFrame(data, schema).as_pandas() + + def to_native_df(self, pdf: pd.DataFrame) -> Any: # pragma: no cover + return pdf + + def test_num_partitions(self): + assert fa.get_num_partitions(self.df([[0, 1]], "a:int,b:int")) == 1 + + def test_map_type(self): + pass + def test_init(): df = PandasDataFrame(schema="a:str,b:int") @@ -76,10 +97,10 @@ def test_simple_methods(): def test_nested(): - #data = [[dict(a=1, b=[3, 4], d=1.0)], [json.dumps(dict(b=[30, "40"]))]] - #df = PandasDataFrame(data, "a:{a:str,b:[int]}") - #a = df.as_array(type_safe=True) - #assert [[dict(a="1", b=[3, 4])], [dict(a=None, b=[30, 40])]] == a + # data = [[dict(a=1, b=[3, 4], d=1.0)], [json.dumps(dict(b=[30, "40"]))]] + # df = PandasDataFrame(data, "a:{a:str,b:[int]}") + # a = df.as_array(type_safe=True) + # assert [[dict(a="1", b=[3, 4])], [dict(a=None, b=[30, 40])]] == a data = [[[json.dumps(dict(b=[30, "40"]))]]] df = PandasDataFrame(data, "a:[{a:str,b:[int]}]") diff --git a/tests/fugue/dataframe/test_utils.py b/tests/fugue/dataframe/test_utils.py index 4bd7add3..733b620c 100644 --- a/tests/fugue/dataframe/test_utils.py +++ b/tests/fugue/dataframe/test_utils.py @@ -3,26 +3,25 @@ import numpy as np import pandas as pd import pyarrow as pa +from pytest import raises +from triad import FileSystem, Schema +from triad.collections.schema import SchemaError +from triad.exceptions import InvalidOperationError, NoneArgumentError + +from fugue import ArrayDataFrame, ArrowDataFrame, IterableDataFrame, PandasDataFrame from fugue.dataframe import to_local_bounded_df, to_local_df -from fugue.dataframe.array_dataframe import ArrayDataFrame -from fugue.dataframe.iterable_dataframe import IterableDataFrame -from fugue.dataframe.pandas_dataframe import PandasDataFrame from fugue.dataframe.utils import _df_eq as df_eq from fugue.dataframe.utils import ( _schema_eq, deserialize_df, - get_dataframe_column_names, + get_column_names, get_join_schemas, normalize_dataframe_column_names, pickle_df, - rename_dataframe_column_names, + rename, serialize_df, unpickle_df, ) -from pytest import raises -from triad import FileSystem, Schema -from 
triad.collections.schema import SchemaError -from triad.exceptions import InvalidOperationError, NoneArgumentError def test_to_local_df(): @@ -44,11 +43,16 @@ def test_to_local_df(): def test_to_local_bounded_df(): df = ArrayDataFrame([[0, 1]], "a:int,b:int") idf = IterableDataFrame([[0, 1]], "a:int,b:int") + adf = ArrowDataFrame(df.as_array(), "a:int,b:int") assert to_local_bounded_df(df) is df r = to_local_bounded_df(idf) assert r is not idf assert r.as_array() == [[0, 1]] assert r.schema == "a:int,b:int" + r = to_local_bounded_df(adf.native) + assert isinstance(r, ArrowDataFrame) + assert r.as_array() == [[0, 1]] + assert r.schema == "a:int,b:int" def test_schema_eq(): @@ -200,50 +204,50 @@ def assert_eq(df, df_expected=None, raw=False): raises(ValueError, lambda: deserialize_df('{"x":1}')) -def test_get_dataframe_column_names(): +def _test_get_column_names(): df = pd.DataFrame([[0, 1, 2]]) - assert get_dataframe_column_names(df) == [0, 1, 2] + assert get_column_names(df) == [0, 1, 2] adf = pa.Table.from_pandas(df) - assert get_dataframe_column_names(adf) == ["0", "1", "2"] + assert get_column_names(adf) == ["0", "1", "2"] pdf = PandasDataFrame(pd.DataFrame([[0, 1]], columns=["a", "b"])) - assert get_dataframe_column_names(pdf) == ["a", "b"] + assert get_column_names(pdf) == ["a", "b"] -def test_rename_dataframe_column_names(): - assert rename_dataframe_column_names("dummy", {}) == "dummy" +def _test_rename(): + assert rename("dummy", {}) == "dummy" pdf = pd.DataFrame([[0, 1, 2]], columns=["a", "b", "c"]) - df = rename_dataframe_column_names(pdf, {}) - assert get_dataframe_column_names(df) == ["a", "b", "c"] - df = rename_dataframe_column_names(pdf, {"b": "bb"}) - assert get_dataframe_column_names(df) == ["a", "bb", "c"] + df = rename(pdf, {}) + assert get_column_names(df) == ["a", "b", "c"] + df = rename(pdf, {"b": "bb"}) + assert get_column_names(df) == ["a", "bb", "c"] adf = pa.Table.from_pandas(pdf) - adf = rename_dataframe_column_names(adf, {}) - assert get_dataframe_column_names(adf) == ["a", "b", "c"] - adf = rename_dataframe_column_names(adf, {"b": "bb"}) - assert get_dataframe_column_names(adf) == ["a", "bb", "c"] + adf = rename(adf, {}) + assert get_column_names(adf) == ["a", "b", "c"] + adf = rename(adf, {"b": "bb"}) + assert get_column_names(adf) == ["a", "bb", "c"] fdf = PandasDataFrame(pdf) - fdf = rename_dataframe_column_names(fdf, {}) - assert get_dataframe_column_names(fdf) == ["a", "b", "c"] - fdf = rename_dataframe_column_names(fdf, {"b": "bb"}) - assert get_dataframe_column_names(fdf) == ["a", "bb", "c"] + fdf = rename(fdf, {}) + assert get_column_names(fdf) == ["a", "b", "c"] + fdf = rename(fdf, {"b": "bb"}) + assert get_column_names(fdf) == ["a", "bb", "c"] def test_normalize_dataframe_column_names(): df = pd.DataFrame([[0, 1, 2]], columns=["a", "b", "c"]) df, names = normalize_dataframe_column_names(df) - assert get_dataframe_column_names(df) == ["a", "b", "c"] + assert get_column_names(df) == ["a", "b", "c"] assert names == {} df = pd.DataFrame([[0, 1, 2]]) df, names = normalize_dataframe_column_names(df) - assert get_dataframe_column_names(df) == ["_0", "_1", "_2"] + assert get_column_names(df) == ["_0", "_1", "_2"] assert names == {"_0": 0, "_1": 1, "_2": 2} df = pd.DataFrame([[0, 1, 2, 3]], columns=["1", "2", "_2", "大"]) df, names = normalize_dataframe_column_names(df) - assert get_dataframe_column_names(df) == ["_1", "_2_1", "_2", "_1_1"] + assert get_column_names(df) == ["_1", "_2_1", "_2", "_1_1"] assert names == {"_1": "1", "_2_1": "2", "_1_1": "大"} diff 
--git a/tests/fugue/execution/test_api.py b/tests/fugue/execution/test_api.py new file mode 100644 index 00000000..15890d17 --- /dev/null +++ b/tests/fugue/execution/test_api.py @@ -0,0 +1,25 @@ +import fugue.api as fa +from fugue import NativeExecutionEngine + + +def test_engine_operations(): + e = fa.set_global_engine("native") + assert isinstance(e, NativeExecutionEngine) + assert e.in_context and e.is_global + assert fa.get_current_engine() is e + with fa.engine_context("duckdb") as e2: + assert fa.get_current_engine() is e2 + assert not e2.is_global and e2.in_context + with e.as_context(): + assert not e2.is_global and e2.in_context + assert e.in_context and e.is_global + assert fa.get_current_engine() is e + assert e.in_context and e.is_global + assert fa.get_current_engine() is e2 + assert not e2.is_global and not e2.in_context + assert e.in_context and e.is_global + e3 = fa.set_global_engine("duckdb") + assert not e.in_context and not e.is_global + assert e3.in_context and e3.is_global + fa.clear_global_engine() + assert not e3.in_context and not e3.is_global diff --git a/tests/fugue/execution/test_factory.py b/tests/fugue/execution/test_factory.py index 58352182..612d876c 100644 --- a/tests/fugue/execution/test_factory.py +++ b/tests/fugue/execution/test_factory.py @@ -255,8 +255,11 @@ def test_make_execution_engine(): def test_context_and_infer_execution_engine(): e1 = _MockExecutionEngine({}) e2 = _MockExecutionEngine2(Dummy2(), {}) + assert not e1.in_context and not e2.in_context with e2.as_context(): + assert not e1.in_context and e2.in_context with e1.as_context() as ex: + assert e1.in_context and e2.in_context assert ex is e1 e = make_execution_engine( None, conf={"x": False}, infer_by=[pd.DataFrame(), Dummy2()] @@ -264,10 +267,13 @@ def test_context_and_infer_execution_engine(): assert isinstance(e, _MockExecutionEngine) assert not isinstance(e, _MockExecutionEngine2) assert not e.conf["x"] + assert not e1.in_context and e2.in_context e = make_execution_engine(None, conf={"x": True}) assert isinstance(e, _MockExecutionEngine2) + assert not e1.in_context and not e2.in_context + e = make_execution_engine(None) assert isinstance(e, NativeExecutionEngine) assert not isinstance(e, _MockExecutionEngine) diff --git a/tests/fugue/test_interfaceless.py b/tests/fugue/test_interfaceless.py index e78608ce..530b3f7b 100644 --- a/tests/fugue/test_interfaceless.py +++ b/tests/fugue/test_interfaceless.py @@ -42,7 +42,7 @@ def f2(df: pd.DataFrame) -> pd.DataFrame: assert isinstance(result, pd.DataFrame) assert sorted(result.values.tolist(), key=lambda x: x[0]) == [[0, 0], [1, 1]] result = transform( - pdf, f2, partition=dict(by=["a"]), force_output_fugue_dataframe=True + pdf, f2, partition=dict(by=["a"]), as_fugue=True ) assert isinstance(result, DataFrame) @@ -93,7 +93,7 @@ def test_transform_from_file(tmpdir): def f(df: pd.DataFrame) -> pd.DataFrame: return df.assign(x=1) - result = transform(fp, f, force_output_fugue_dataframe=True) + result = transform(fp, f, as_fugue=True) assert result.as_array(type_safe=True) == [[2, 1]] with raises(FugueInterfacelessError): @@ -116,7 +116,7 @@ def f(df: pd.DataFrame) -> pd.DataFrame: # checkpoint is True, save_path is None result = transform( - tdf, f, force_output_fugue_dataframe=True, checkpoint=True, engine=engine + tdf, f, as_fugue=True, checkpoint=True, engine=engine ) assert result.as_array() == [[2, 1]] @@ -124,7 +124,7 @@ def f(df: pd.DataFrame) -> pd.DataFrame: result = transform( tdf, f, - force_output_fugue_dataframe=True, + 
as_fugue=True, checkpoint=True, save_path=fp, engine=engine, @@ -136,7 +136,7 @@ def f(df: pd.DataFrame) -> pd.DataFrame: result = transform( tdf, f, - force_output_fugue_dataframe=True, + as_fugue=True, save_path=fp, engine=engine, ) @@ -163,7 +163,7 @@ def f(df: pd.DataFrame) -> pd.DataFrame: transform( tdf, f, - force_output_fugue_dataframe=True, + as_fugue=True, save_path="f.csv", engine=engine, ) @@ -171,7 +171,7 @@ def f(df: pd.DataFrame) -> pd.DataFrame: transform( tdf, f, - force_output_fugue_dataframe=True, + as_fugue=True, save_path="f.json", engine=engine, ) diff --git a/tests/fugue/utils/test_sql.py b/tests/fugue/utils/test_sql.py new file mode 100644 index 00000000..4d11e076 --- /dev/null +++ b/tests/fugue/utils/test_sql.py @@ -0,0 +1,23 @@ +from fugue._utils.sql import get_temp_tb_name, parse_sql + + +def test_parse_sql(): + def parse(sql): + parts = parse_sql(sql) + return "".join([p[1] if not p[0] else "!" + p[1] + "!" for p in parts]) + + t1 = get_temp_tb_name() + t2 = get_temp_tb_name() + assert parse("") == "" + assert parse(f"{t1}") == f"!{t1.key}!" + assert parse(f" {t1} ") == f" !{t1.key}! " + assert parse(f"SELECT * FROM {t1}") == f"SELECT * FROM !{t1.key}!" + assert ( + parse(f"SELECT * FROM {t1} NATURAL JOIN {t2}") + == f"SELECT * FROM !{t1.key}! NATURAL JOIN !{t2.key}!" + ) + assert ( + parse(f"SELECT {t1}.* FROM {t1} NATURAL JOIN {t2} WHERE {t2}.x<1") + == f"SELECT !{t1.key}!.* FROM !{t1.key}! " + f"NATURAL JOIN !{t2.key}! WHERE !{t2.key}!.x<1" + ) diff --git a/tests/fugue/workflow/test_workflow.py b/tests/fugue/workflow/test_workflow.py index 7429f77c..bd9a721e 100644 --- a/tests/fugue/workflow/test_workflow.py +++ b/tests/fugue/workflow/test_workflow.py @@ -67,6 +67,7 @@ def test_workflow(): builder = FugueWorkflow() a = builder.create_data([[0], [0], [1]], "a:int") + assert builder.last_df is a raises(InvalidOperationError, lambda: a._task.copy()) raises(InvalidOperationError, lambda: copy.copy(a._task)) raises(InvalidOperationError, lambda: copy.deepcopy(a._task)) @@ -77,11 +78,16 @@ def test_workflow(): b = a.transform(mock_tf1, "*,b:int", pre_partition=dict(by=["a"])) b.show() + assert builder.last_df is b builder.create_data([[0], [1]], "b:int").show() + assert builder.last_df is not b c = ArrayDataFrame([[100]], "a:int") builder.show(a, b, c) b = a.partition(by=["a"]).transform(mock_tf2).persist().broadcast() b.show() + assert builder.last_df is b + c = builder.df(a) + assert builder.last_df is a builder.run() df_eq(a.result, [[0], [0], [1]], "a:int") diff --git a/tests/fugue_dask/test_dataframe.py b/tests/fugue_dask/test_dataframe.py index f8b41064..03aa5f40 100644 --- a/tests/fugue_dask/test_dataframe.py +++ b/tests/fugue_dask/test_dataframe.py @@ -6,6 +6,7 @@ import dask.dataframe as pd import numpy as np import pandas +import fugue.api as fi from fugue.dataframe.array_dataframe import ArrayDataFrame from fugue.dataframe.pandas_dataframe import PandasDataFrame from fugue.dataframe.utils import _df_eq as df_eq @@ -14,18 +15,48 @@ from pytest import raises from triad.collections.schema import Schema from fugue.dataframe.utils import ( - get_dataframe_column_names, - rename_dataframe_column_names, + get_column_names, + rename, ) class DaskDataFrameTests(DataFrameTests.Tests): - def df( - self, data: Any = None, schema: Any = None - ) -> DaskDataFrame: + def df(self, data: Any = None, schema: Any = None) -> DaskDataFrame: return DaskDataFrame(data, schema) +class NativeDaskDataFrameTests(DataFrameTests.NativeTests): + def df(self, data: Any = None, schema: 
Any = None): + return DaskDataFrame(data, schema).native + + def to_native_df(self, pdf: pandas.DataFrame) -> Any: + return pd.from_pandas(pdf, npartitions=2) + + def test_not_local(self): + assert not fi.is_local(self.df([], "a:int,b:str")) + + def test_alter_columns(self): + pass + + def test_as_arrow(self): + pass + + def test_binary_type(self): + pass + + def test_deep_nested_types(self): + pass + + def test_list_type(self): + pass + + def test_map_type(self): + pass + + def test_struct_type(self): + pass + + def test_init(): df = DaskDataFrame(schema="a:str,b:int") assert df.is_bounded @@ -201,20 +232,20 @@ def _test_as_array_perf(): print(nts, ts) -def test_get_dataframe_column_names(): +def _test_get_column_names(): df = pd.from_pandas(pandas.DataFrame([[0, 1, 2]]), npartitions=1) - assert get_dataframe_column_names(df) == [0, 1, 2] + assert get_column_names(df) == [0, 1, 2] -def test_rename_dataframe_column_names(): +def _test_rename(): pdf = pd.from_pandas( pandas.DataFrame([[0, 1, 2]], columns=["a", "b", "c"]), npartitions=1 ) - df = rename_dataframe_column_names(pdf, {}) + df = rename(pdf, {}) assert isinstance(df, pd.DataFrame) - assert get_dataframe_column_names(df) == ["a", "b", "c"] + assert get_column_names(df) == ["a", "b", "c"] pdf = pd.from_pandas(pandas.DataFrame([[0, 1, 2]]), npartitions=1) - df = rename_dataframe_column_names(pdf, {0: "_0", 1: "_1", 2: "_2"}) + df = rename(pdf, {0: "_0", 1: "_1", 2: "_2"}) assert isinstance(df, pd.DataFrame) - assert get_dataframe_column_names(df) == ["_0", "_1", "_2"] + assert get_column_names(df) == ["_0", "_1", "_2"] diff --git a/tests/fugue_dask/test_execution_engine.py b/tests/fugue_dask/test_execution_engine.py index b96f95be..abcfdfb9 100644 --- a/tests/fugue_dask/test_execution_engine.py +++ b/tests/fugue_dask/test_execution_engine.py @@ -5,16 +5,18 @@ import dask.dataframe as dd import pandas as pd from dask.distributed import Client -from fugue import infer_execution_engine, transform + +import fugue.api as fa +from fugue import transform from fugue.collections.partition import PartitionSpec from fugue.dataframe.pandas_dataframe import PandasDataFrame from fugue.dataframe.utils import _df_eq as df_eq +from fugue.plugins import infer_execution_engine from fugue.workflow.workflow import FugueWorkflow -from fugue_test.builtin_suite import BuiltInTests -from fugue_test.execution_suite import ExecutionEngineTests - from fugue_dask.dataframe import DaskDataFrame from fugue_dask.execution_engine import DaskExecutionEngine +from fugue_test.builtin_suite import BuiltInTests +from fugue_test.execution_suite import ExecutionEngineTests _CONF = { "fugue.rpc.server": "fugue.rpc.flask.FlaskRPCServer", @@ -28,15 +30,21 @@ class DaskExecutionEngineTests(ExecutionEngineTests.Tests): @classmethod def setUpClass(cls): cls._engine = cls.make_engine(cls) + fa.set_global_engine(cls._engine) @classmethod def tearDownClass(cls): + fa.clear_global_engine() cls._engine.dask_client.close() def make_engine(self): - e = DaskExecutionEngine(conf=dict(test=True, **_CONF)) + client = Client(processes=True, n_workers=3, threads_per_worker=1) + e = DaskExecutionEngine(client, conf=dict(test=True, **_CONF)) return e + def test_get_parallelism(self): + assert fa.get_current_parallelism(self.engine) == 3 + def test__join_outer_pandas_incompatible(self): return @@ -159,7 +167,7 @@ def tr(df: List[List[Any]], add: Optional[callable]) -> List[List[Any]]: schema="b:binary", callback=cb.add, as_local=True, - force_output_fugue_dataframe=True, + as_fugue=True, 
engine="dask", engine_conf=_CONF, ) @@ -171,7 +179,7 @@ def tr(df: List[List[Any]], add: Optional[callable]) -> List[List[Any]]: pdf, tr, schema="b:binary", - force_output_fugue_dataframe=True, + as_fugue=True, engine="dask", ) assert not res.is_local @@ -184,7 +192,7 @@ def tr(df: List[List[Any]], add: Optional[callable]) -> List[List[Any]]: tr, schema="b:binary", callback=cb.add, - force_output_fugue_dataframe=True, + as_fugue=True, engine="dask", engine_conf=_CONF, persist=True, # when you have a persist, you can use callback @@ -192,6 +200,3 @@ def tr(df: List[List[Any]], add: Optional[callable]) -> List[List[Any]]: assert not res.is_local assert 5 == res.count() assert 5 == cb.n - - - diff --git a/tests/fugue_dask/test_ibis.py b/tests/fugue_dask/test_ibis.py index 1141e68d..2274ddd9 100644 --- a/tests/fugue_dask/test_ibis.py +++ b/tests/fugue_dask/test_ibis.py @@ -1,11 +1,11 @@ import pytest ibis = pytest.importorskip("ibis") +from fugue_dask import DaskExecutionEngine +from fugue_dask.ibis_engine import DaskIbisEngine from fugue_ibis import IbisEngine from fugue_test.ibis_suite import IbisTests -from fugue_dask import DaskExecutionEngine, DaskIbisEngine - class DaskIbisTests(IbisTests.Tests): def make_engine(self): diff --git a/tests/fugue_duckdb/test_dask.py b/tests/fugue_duckdb/test_dask.py index 85081e49..d498dce8 100644 --- a/tests/fugue_duckdb/test_dask.py +++ b/tests/fugue_duckdb/test_dask.py @@ -4,17 +4,17 @@ import duckdb import pandas as pd import pyarrow as pa -from fugue import ArrowDataFrame, DataFrame, FugueWorkflow, PartitionSpec +from dask.distributed import Client +from pytest import raises + +import fugue.api as fa +from fugue import ArrowDataFrame, DataFrame, FugueWorkflow, PartitionSpec, fsql from fugue.dataframe.utils import _df_eq as df_eq from fugue_dask import DaskDataFrame -from fugue import fsql -from fugue_test.builtin_suite import BuiltInTests -from fugue_test.execution_suite import ExecutionEngineTests -from pytest import raises -from dask.distributed import Client from fugue_duckdb import DuckDaskExecutionEngine from fugue_duckdb.dataframe import DuckDataFrame - +from fugue_test.builtin_suite import BuiltInTests +from fugue_test.execution_suite import ExecutionEngineTests _CONF = { "fugue.rpc.server": "fugue.rpc.flask.FlaskRPCServer", @@ -29,20 +29,26 @@ class DuckDaskExecutionEngineTests(ExecutionEngineTests.Tests): def setUpClass(cls): cls._con = duckdb.connect() cls._engine = cls.make_engine(cls) + fa.set_global_engine(cls._engine) @classmethod def tearDownClass(cls): + fa.clear_global_engine() cls._con.close() cls._engine.dask_client.close() def make_engine(self): + client = Client(processes=True, n_workers=2, threads_per_worker=1) e = DuckDaskExecutionEngine( conf={"test": True, "fugue.duckdb.pragma.threads": 2}, connection=self._con, - dask_client=Client(), + dask_client=client, ) return e + def test_get_parallelism(self): + assert fa.get_current_parallelism(self.engine) == 2 + def test_to_df_dask(self): pdf = pd.DataFrame([[1.1]], columns=["a"]) df = dd.from_pandas(pdf, npartitions=2) @@ -118,13 +124,11 @@ def test_yield_2(self): def assert_data(df: DataFrame) -> None: assert df.schema == "a:datetime,b:bytes,c:[long]" - df = pd.DataFrame( - [[1,2,3]], columns=list("abc") - ) + df = pd.DataFrame([[1, 2, 3]], columns=list("abc")) with FugueWorkflow() as dag: x = dag.df(df) result = dag.select("SELECT * FROM ", x) result.yield_dataframe_as("x") res = dag.run(self.engine) - assert res["x"].as_array() == [[1,2,3]] + assert res["x"].as_array() == [[1, 2, 
3]] diff --git a/tests/fugue_duckdb/test_dataframe.py b/tests/fugue_duckdb/test_dataframe.py index fa0dab7e..db25da60 100644 --- a/tests/fugue_duckdb/test_dataframe.py +++ b/tests/fugue_duckdb/test_dataframe.py @@ -2,14 +2,12 @@ from typing import Any import duckdb -import numpy as np import pandas as pd -from fugue import ArrowDataFrame -from fugue.dataframe.utils import _df_eq as df_eq -from fugue_test.dataframe_suite import DataFrameTests -from pytest import raises +import fugue.api as fa +from fugue import ArrowDataFrame from fugue_duckdb.dataframe import DuckDataFrame +from fugue_test.dataframe_suite import DataFrameTests class DuckDataFrameTests(DataFrameTests.Tests): @@ -64,6 +62,22 @@ def test_init(self): assert df.is_bounded assert df.is_local - def test_duck_as_locak(self): + def test_duck_as_local(self): df = self.df([[2.1, 1]], "a:double,b:int") assert isinstance(df.as_local(), ArrowDataFrame) + + +class NativeDuckDataFrameTests(DataFrameTests.NativeTests): + @classmethod + def setUpClass(cls): + cls._con = duckdb.connect() + + def df(self, data: Any = None, schema: Any = None) -> DuckDataFrame: + df = ArrowDataFrame(data, schema) + return DuckDataFrame(duckdb.arrow(df.native, self._con)).native + + def to_native_df(self, pdf: pd.DataFrame) -> Any: + return duckdb.from_df(pdf) + + def test_num_partitions(self): + assert fa.get_num_partitions(self.df([[0, 1]], "a:int,b:int")) == 1 diff --git a/tests/fugue_duckdb/test_execution_engine.py b/tests/fugue_duckdb/test_execution_engine.py index 5e2323a6..49830e14 100644 --- a/tests/fugue_duckdb/test_execution_engine.py +++ b/tests/fugue_duckdb/test_execution_engine.py @@ -3,15 +3,16 @@ import duckdb import pandas as pd import pyarrow as pa -from fugue import ArrowDataFrame, DataFrame, FugueWorkflow, infer_execution_engine -from fugue.dataframe.utils import _df_eq as df_eq -from fugue import fsql -from fugue_test.builtin_suite import BuiltInTests -from fugue_test.execution_suite import ExecutionEngineTests from pytest import raises +import fugue.api as fa +from fugue import ArrowDataFrame, DataFrame, FugueWorkflow, fsql +from fugue.api import engine_context +from fugue.plugins import infer_execution_engine from fugue_duckdb import DuckExecutionEngine from fugue_duckdb.dataframe import DuckDataFrame +from fugue_test.builtin_suite import BuiltInTests +from fugue_test.execution_suite import ExecutionEngineTests class DuckExecutionEngineTests(ExecutionEngineTests.Tests): @@ -19,9 +20,11 @@ class DuckExecutionEngineTests(ExecutionEngineTests.Tests): def setUpClass(cls): cls._con = duckdb.connect() cls._engine = cls.make_engine(cls) + fa.set_global_engine(cls._engine) @classmethod def tearDownClass(cls): + fa.clear_global_engine() cls._con.close() def make_engine(self): @@ -30,6 +33,13 @@ def make_engine(self): ) return e + def test_duck_to_df(self): + e = self.engine + a = e.to_df([[1, 2, 3]], "a:double,b:double,c:int") + assert isinstance(a, DuckDataFrame) + b = e.to_df(a.native_as_df()) + assert isinstance(b, DuckDataFrame) + def test_intersect_all(self): e = self.engine a = e.to_df([[1, 2, 3], [4, None, 6], [4, None, 6]], "a:double,b:double,c:int") @@ -174,13 +184,27 @@ def test_sql_yield(): assert isinstance(res["a"], ArrowDataFrame) assert isinstance(res["b"], ArrowDataFrame) + # in context + with engine_context("duck"): + res = fsql( + """ + CREATE [[0]] SCHEMA a:int + YIELD DATAFRAME AS a + CREATE [[0]] SCHEMA b:int + YIELD LOCAL DATAFRAME AS b + """ + ).run() + + assert isinstance(res["a"], DuckDataFrame) + assert 
isinstance(res["b"], ArrowDataFrame) + def test_infer_engine(): con = duckdb.connect() df = con.from_df(pd.DataFrame([[0]], columns=["a"])) - assert infer_execution_engine([df])=="duckdb" + assert infer_execution_engine([df]) == "duckdb" fdf = DuckDataFrame(df) - assert infer_execution_engine([fdf])=="duckdb" + assert infer_execution_engine([fdf]) == "duckdb" - con.close() \ No newline at end of file + con.close() diff --git a/tests/fugue_duckdb/test_utils.py b/tests/fugue_duckdb/test_utils.py index 1cc2deaa..c515ece5 100644 --- a/tests/fugue_duckdb/test_utils.py +++ b/tests/fugue_duckdb/test_utils.py @@ -1,10 +1,11 @@ -from fugue_duckdb._utils import to_pa_type, to_duck_type, encode_value_to_expr -import pyarrow as pa import duckdb -from triad.utils.pyarrow import TRIAD_DEFAULT_TIMESTAMP -from pytest import raises -import pandas as pd import numpy as np +import pandas as pd +import pyarrow as pa +from pytest import raises +from triad.utils.pyarrow import TRIAD_DEFAULT_TIMESTAMP + +from fugue_duckdb._utils import encode_value_to_expr, to_duck_type, to_pa_type def test_encode_value_to_expr(): @@ -13,7 +14,7 @@ def test_encode_value_to_expr(): assert "1" == encode_value_to_expr(1) assert "1" == encode_value_to_expr(np.int32(1)) assert "FALSE" == encode_value_to_expr(False) - assert "TRUE" == encode_value_to_expr(np.bool(1)) + assert "TRUE" == encode_value_to_expr(np.bool_(1)) assert "E'abc'" == encode_value_to_expr("abc") assert "E'abc\\n;def'" == encode_value_to_expr("abc\n;def") assert "'\\xcaABC'::BLOB" == encode_value_to_expr(b"\xCAABC") diff --git a/tests/fugue_ibis/mock/dataframe.py b/tests/fugue_ibis/mock/dataframe.py index 4cf03587..1d8e442b 100644 --- a/tests/fugue_ibis/mock/dataframe.py +++ b/tests/fugue_ibis/mock/dataframe.py @@ -1,8 +1,8 @@ from typing import Any from fugue import ArrowDataFrame, DataFrame, LocalDataFrame +from fugue.plugins import as_fugue_dataset, as_local_bounded from fugue_ibis import IbisDataFrame, IbisTable -from fugue_ibis._utils import to_schema class MockDuckDataFrame(IbisDataFrame): @@ -14,3 +14,15 @@ def _to_local_df(self, table: IbisTable, schema: Any = None) -> LocalDataFrame: def _to_iterable_df(self, table: IbisTable, schema: Any = None) -> LocalDataFrame: return self._to_local_df(table, schema=schema) + + +# should also check the df._findbackend is duckdb +@as_fugue_dataset.candidate(lambda df, **kwargs: isinstance(df, IbisTable)) +def _ibis_as_fugue(df: IbisTable, **kwargs: Any) -> bool: + return MockDuckDataFrame(df, **kwargs) + + +# should also check the df._findbackend is duckdb +@as_local_bounded.candidate(lambda df, **kwargs: isinstance(df, IbisTable)) +def _ibis_as_local(df: IbisTable, **kwargs: Any) -> bool: + return df.execute() diff --git a/tests/fugue_ibis/mock/execution_engine.py b/tests/fugue_ibis/mock/execution_engine.py index 21dfe7ae..64523876 100644 --- a/tests/fugue_ibis/mock/execution_engine.py +++ b/tests/fugue_ibis/mock/execution_engine.py @@ -12,7 +12,6 @@ PartitionCursor, PartitionSpec, ) -from fugue.collections.partition import EMPTY_PARTITION_SPEC from fugue_ibis import IbisDataFrame, IbisExecutionEngine, IbisTable from triad import FileSystem, assert_or_throw @@ -124,10 +123,11 @@ def save_df( path: str, format_hint: Any = None, mode: str = "overwrite", - partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC, + partition_spec: Optional[PartitionSpec] = None, force_single: bool = False, **kwargs: Any, ) -> None: + partition_spec = partition_spec or PartitionSpec() return self._native_engine.save_df( df, path, format_hint, 
mode, partition_spec, force_single, **kwargs ) diff --git a/tests/fugue_ibis/test_dataframe.py b/tests/fugue_ibis/test_dataframe.py index 67ca52bc..f863c83c 100644 --- a/tests/fugue_ibis/test_dataframe.py +++ b/tests/fugue_ibis/test_dataframe.py @@ -4,7 +4,10 @@ import ibis import pandas as pd +import pyarrow as pa import pytest + +import fugue.api as fe from fugue import ArrowDataFrame from fugue_duckdb.dataframe import DuckDataFrame from fugue_test.dataframe_suite import DataFrameTests @@ -31,13 +34,8 @@ def test_init_df(self): def test_is_local(self): df = self.df([["x", 1]], "a:str,b:int") - assert not df.is_local - assert df.is_bounded - - def _test_as_arrow(self): - # empty - df = self.df([["a", 1]], "a:str,b:int") - assert [["a", 1]] == list(ArrowDataFrame(df.as_arrow()).as_array()) + assert not fe.is_local(df) + assert fe.is_bounded(df) def test_map_type(self): pass @@ -56,3 +54,44 @@ def test_as_arrow(self): assert [dict(a=datetime(2020, 1, 1), b=1)] == list( ArrowDataFrame(df.as_arrow()).as_dict_iterable() ) + + def test_deep_nested_types(self): + pass + + def test_list_type(self): + pass + + +@pytest.mark.skipif(sys.version_info < (3, 8), reason="< 3.8") +class NativeIbisDataFrameTests(DataFrameTests.NativeTests): + @classmethod + def setUpClass(cls): + cls._con = ibis.duckdb.connect() + + def df(self, data: Any = None, schema: Any = None): + df = ArrowDataFrame(data, schema) + name = f"_{id(df.native)}" + self._con.con.execute("register", (name, df.native)) + return MockDuckDataFrame(self._con.table(name), schema=schema).native + + def to_native_df(self, pdf: pd.DataFrame) -> Any: + name = f"_{id(pdf)}" + self._con.con.execute("register", (name, pa.Table.from_pandas(pdf))) + return self._con.table(name) + + def test_is_local(self): + df = self.df([["x", 1]], "a:str,b:int") + assert not fe.is_local(df) + assert fe.is_bounded(df) + + def test_map_type(self): + pass + + def test_as_arrow(self): + pass + + def test_deep_nested_types(self): + pass + + def test_list_type(self): + pass diff --git a/tests/fugue_ibis/test_extensions.py b/tests/fugue_ibis/test_extensions.py index 1a01afb1..5e55b216 100644 --- a/tests/fugue_ibis/test_extensions.py +++ b/tests/fugue_ibis/test_extensions.py @@ -1,21 +1,19 @@ import pytest ibis = pytest.importorskip("ibis") -from fugue import FugueWorkflow, NativeExecutionEngine - -from fugue_ibis import as_fugue, as_ibis, run_ibis -from fugue_ibis.execution.ibis_engine import to_ibis_engine -from fugue_ibis.execution.pandas_backend import PandasIbisEngine from pytest import raises +from fugue import FugueWorkflow, NativeExecutionEngine +from fugue_ibis import PandasIbisEngine, as_fugue, as_ibis, parse_ibis_engine, run_ibis + -def test_to_ibis_engine(): +def test_parse_ibis_engine(): e = NativeExecutionEngine() ie = PandasIbisEngine(e) - assert isinstance(to_ibis_engine(e, None), PandasIbisEngine) - assert isinstance(to_ibis_engine(e, ie), PandasIbisEngine) + assert isinstance(parse_ibis_engine(e, e), PandasIbisEngine) + assert isinstance(parse_ibis_engine(ie, e), PandasIbisEngine) with raises(NotImplementedError): - to_ibis_engine(e, "dummy") + parse_ibis_engine("dummy", e) def test_run_ibis(): diff --git a/tests/fugue_ray/test_dataframe.py b/tests/fugue_ray/test_dataframe.py index 411ec20e..8ca3bdcd 100644 --- a/tests/fugue_ray/test_dataframe.py +++ b/tests/fugue_ray/test_dataframe.py @@ -7,8 +7,8 @@ from fugue.dataframe.array_dataframe import ArrayDataFrame from fugue.dataframe.arrow_dataframe import _build_empty_arrow from fugue.dataframe.utils import ( 
-    get_dataframe_column_names,
-    rename_dataframe_column_names,
+    get_column_names,
+    rename,
 )
 from fugue_test.dataframe_suite import DataFrameTests
 from pytest import raises
@@ -101,22 +101,40 @@ def test_ray_num_partitions(self):
         df = RayDataFrame(rdf.repartition(5))
         assert 5 == df.num_partitions
 
-    def test_get_dataframe_column_names(self):
+    def _test_get_column_names(self):
         df = rd.from_pandas(pd.DataFrame([[0, 10, 20]], columns=["0", "1", "2"]))
-        assert get_dataframe_column_names(df) == ["0", "1", "2"]
+        assert get_column_names(df) == ["0", "1", "2"]
         df = rd.from_arrow(
             pa.Table.from_pandas(pd.DataFrame([[0, 10, 20]], columns=["0", "1", "2"]))
         )
-        assert get_dataframe_column_names(df) == ["0", "1", "2"]
+        assert get_column_names(df) == ["0", "1", "2"]
 
-    def test_rename_dataframe_column_names(self):
+    def _test_rename(self):
         rdf = rd.from_pandas(pd.DataFrame([[0, 10, 20]], columns=["a", "b", "c"]))
-        df = rename_dataframe_column_names(rdf, {})
+        df = rename(rdf, {})
         assert isinstance(df, rd.Dataset)
-        assert get_dataframe_column_names(df) == ["a", "b", "c"]
+        assert get_column_names(df) == ["a", "b", "c"]
         pdf = rd.from_pandas(pd.DataFrame([[0, 10, 20]], columns=["0", "1", "2"]))
-        df = rename_dataframe_column_names(pdf, {"0": "_0", "1": "_1", "2": "_2"})
+        df = rename(pdf, {"0": "_0", "1": "_1", "2": "_2"})
         assert isinstance(df, rd.Dataset)
-        assert get_dataframe_column_names(df) == ["_0", "_1", "_2"]
+        assert get_column_names(df) == ["_0", "_1", "_2"]
+
+
+class NativeRayDataFrameTests(DataFrameTests.NativeTests):
+    @classmethod
+    def setUpClass(cls):
+        ray.init(num_cpus=2)
+
+    @classmethod
+    def tearDownClass(cls):
+        ray.shutdown()
+
+    def df(self, data: Any = None, schema: Any = None):
+        res = RayDataFrame(data, schema)
+        # native ray dataset can't handle the schema when empty
+        return res if res.empty else res.native
+
+    def to_native_df(self, pdf: pd.DataFrame) -> Any:
+        return rd.from_pandas(pdf)
diff --git a/tests/fugue_ray/test_execution_engine.py b/tests/fugue_ray/test_execution_engine.py
index 49aa99b1..9008980a 100644
--- a/tests/fugue_ray/test_execution_engine.py
+++ b/tests/fugue_ray/test_execution_engine.py
@@ -4,21 +4,16 @@
 import pandas as pd
 import ray
 import ray.data as rd
-from fugue import (
-    ArrayDataFrame,
-    FugueWorkflow,
-    transform,
-    DataFrame,
-    infer_execution_engine,
-)
-from fugue.dataframe.utils import _df_eq as df_eq
-from fugue import fsql
-from fugue_test.builtin_suite import BuiltInTests
-from fugue_test.execution_suite import ExecutionEngineTests
 from pytest import raises
 from triad import FileSystem
 
-from fugue_ray import RayExecutionEngine, RayDataFrame
+import fugue.api as fa
+from fugue import ArrayDataFrame, DataFrame, FugueWorkflow, fsql, transform
+from fugue.dataframe.utils import _df_eq as df_eq
+from fugue.plugins import infer_execution_engine
+from fugue_ray import RayDataFrame, RayExecutionEngine
+from fugue_test.builtin_suite import BuiltInTests
+from fugue_test.execution_suite import ExecutionEngineTests
 
 _CONF = {
     "fugue.rpc.server": "fugue.rpc.flask.FlaskRPCServer",
@@ -34,9 +29,11 @@ def setUpClass(cls):
         ray.init(num_cpus=2)
         cls._con = duckdb.connect()
         cls._engine = cls.make_engine(cls)
+        fa.set_global_engine(cls._engine)
 
     @classmethod
     def tearDownClass(cls):
+        fa.clear_global_engine()
         cls._con.close()
         ray.shutdown()
 
@@ -47,6 +44,9 @@ def make_engine(self):
         )
         return e
 
+    def test_get_parallelism(self):
+        assert fa.get_current_parallelism(self.engine) == 2
+
     def test_repartitioning(self):
         # schema: *
         def t(df: pd.DataFrame) -> pd.DataFrame:
@@ -61,7 +61,7 @@ def t(df: pd.DataFrame) -> pd.DataFrame:
             partition="per_row",
             engine="ray",
             as_local=True,
-            force_output_fugue_dataframe=True,
+            as_fugue=True,
         )
         df_eq(
             res,
@@ -77,7 +77,7 @@ def t(df: pd.DataFrame) -> pd.DataFrame:
             partition=dict(num=3, algo="rand"),
             engine="ray",
             as_local=True,
-            force_output_fugue_dataframe=True,
+            as_fugue=True,
         )
         df_eq(
             res,
@@ -93,7 +93,7 @@ def t(df: pd.DataFrame) -> pd.DataFrame:
             partition=dict(num=40),
             engine="ray",
             as_local=True,
-            force_output_fugue_dataframe=True,
+            as_fugue=True,
         )
         df_eq(
             res,
@@ -119,7 +119,7 @@ def t(df: pd.DataFrame) -> pd.DataFrame:
                 "fugue.ray.remote.num_cpus": 1,
             },
             as_local=True,
-            force_output_fugue_dataframe=True,
+            as_fugue=True,
         )
         df_eq(
             res,
diff --git a/tests/fugue_spark/test_dataframe.py b/tests/fugue_spark/test_dataframe.py
index 4a988019..d9fafc21 100644
--- a/tests/fugue_spark/test_dataframe.py
+++ b/tests/fugue_spark/test_dataframe.py
@@ -5,18 +5,16 @@
 import pyspark
 import pyspark.sql as ps
 import pytest
-from fugue.dataframe.pandas_dataframe import PandasDataFrame
-from fugue.dataframe.utils import (
-    get_dataframe_column_names,
-    rename_dataframe_column_names,
-)
-from fugue_test.dataframe_suite import DataFrameTests
 from pyspark.sql import SparkSession
 from triad.collections.schema import Schema
 
+import fugue.api as fi
+from fugue.dataframe.pandas_dataframe import PandasDataFrame
+from fugue.plugins import get_column_names, rename
 from fugue_spark import SparkExecutionEngine
 from fugue_spark._utils.convert import to_schema, to_spark_schema
 from fugue_spark.dataframe import SparkDataFrame
+from fugue_test.dataframe_suite import DataFrameTests
 
 
 class SparkDataFrameTests(DataFrameTests.Tests):
@@ -38,6 +36,30 @@ def test_map_type(self):
             return super().test_map_type()
 
 
+class NativeSparkDataFrameTests(DataFrameTests.NativeTests):
+    @pytest.fixture(autouse=True)
+    def init_session(self, spark_session):
+        self.spark_session = spark_session
+
+    def df(self, data: Any = None, schema: Any = None):
+        engine = SparkExecutionEngine(self.spark_session)
+        return engine.to_df(data, schema=schema).native
+
+    def to_native_df(self, pdf: pd.DataFrame) -> Any:
+        return self.spark_session.createDataFrame(pdf)
+
+    def test_not_local(self):
+        assert not fi.is_local(self.df([], "a:int,b:str"))
+
+    def test_alter_columns_invalid(self):
+        # TODO: Spark will silently cast invalid data to nulls without exceptions
+        pass
+
+    def test_map_type(self):
+        if pyspark.__version__ >= "3":
+            return super().test_map_type()
+
+
 def test_init(spark_session):
     sdf = spark_session.createDataFrame([["a", 1]])
     df = SparkDataFrame(sdf, "a:str,b:double")
@@ -123,24 +145,24 @@ def _df(data, schema=None):
     return SparkDataFrame(df, schema)
 
 
-def test_get_dataframe_column_names(spark_session):
+def _test_get_column_names(spark_session):
     df = spark_session.createDataFrame(
         pd.DataFrame([[0, 1, 2]], columns=["0", "1", "2"])
     )
-    assert get_dataframe_column_names(df) == ["0", "1", "2"]
+    assert get_column_names(df) == ["0", "1", "2"]
 
 
-def test_rename_dataframe_column_names(spark_session):
+def _test_rename(spark_session):
     pdf = spark_session.createDataFrame(
         pd.DataFrame([[0, 1, 2]], columns=["a", "b", "c"])
     )
-    df = rename_dataframe_column_names(pdf, {})
+    df = rename(pdf, {})
     assert isinstance(df, ps.DataFrame)
-    assert get_dataframe_column_names(df) == ["a", "b", "c"]
+    assert get_column_names(df) == ["a", "b", "c"]
     pdf = spark_session.createDataFrame(
         pd.DataFrame([[0, 1, 2]], columns=["0", "1", "2"])
     )
-    df = rename_dataframe_column_names(pdf, {"0": "_0", "1": "_1", "2": "_2"})
+    df = rename(pdf, {"0": "_0", "1": "_1", "2": "_2"})
     assert isinstance(df, ps.DataFrame)
-    assert get_dataframe_column_names(df) == ["_0", "_1", "_2"]
+    assert get_column_names(df) == ["_0", "_1", "_2"]
diff --git a/tests/fugue_spark/test_execution_engine.py b/tests/fugue_spark/test_execution_engine.py
index 61fb8a13..b488547d 100644
--- a/tests/fugue_spark/test_execution_engine.py
+++ b/tests/fugue_spark/test_execution_engine.py
@@ -6,7 +6,14 @@
 import pyspark.rdd as pr
 import pyspark.sql as ps
 import pytest
-from fugue import infer_execution_engine, transform
+from pyspark import SparkContext, StorageLevel
+from pyspark.sql import DataFrame as SDataFrame
+from pyspark.sql import SparkSession
+from pytest import raises
+from triad import Schema
+
+import fugue.api as fa
+from fugue import transform
 from fugue.collections.partition import PartitionSpec
 from fugue.dataframe import (
     ArrayDataFrame,
@@ -16,17 +23,12 @@
 )
 from fugue.dataframe.utils import _df_eq as df_eq
 from fugue.extensions.transformer import Transformer, transformer
+from fugue.plugins import infer_execution_engine
 from fugue.workflow.workflow import FugueWorkflow
-from fugue_test.builtin_suite import BuiltInTests
-from fugue_test.execution_suite import ExecutionEngineTests
-from pyspark import SparkContext, StorageLevel
-from pyspark.sql import DataFrame as SDataFrame
-from pyspark.sql import SparkSession
-from pytest import raises
-from triad import Schema
-
 from fugue_spark.dataframe import SparkDataFrame
 from fugue_spark.execution_engine import SparkExecutionEngine
+from fugue_test.builtin_suite import BuiltInTests
+from fugue_test.execution_suite import ExecutionEngineTests
 
 
 class SparkExecutionEngineTests(ExecutionEngineTests.Tests):
@@ -41,6 +43,9 @@ def make_engine(self):
         )
         return e
 
+    def test_get_parallelism(self):
+        assert fa.get_current_parallelism(self.engine) == 4
+
     def test_not_using_pandas_udf(self):
         assert not self.engine.create_default_map_engine()._should_use_pandas_udf(
             Schema("a:int")
         )
@@ -121,6 +126,9 @@ def make_engine(self):
         e = SparkExecutionEngine(session, {"test": True})
         return e
 
+    def test_get_parallelism(self):
+        assert fa.get_current_parallelism(self.engine) == 4
+
     def test__join_outer_pandas_incompatible(self):
         return
diff --git a/tests/fugue_spark/test_ibis.py b/tests/fugue_spark/test_ibis.py
index 664461c7..cf9561ad 100644
--- a/tests/fugue_spark/test_ibis.py
+++ b/tests/fugue_spark/test_ibis.py
@@ -1,11 +1,12 @@
 import pytest
 
 ibis = pytest.importorskip("ibis")
-from fugue_ibis import IbisEngine
-from fugue_test.ibis_suite import IbisTests
 from pyspark.sql import SparkSession
 
-from fugue_spark import SparkExecutionEngine, SparkIbisEngine
+from fugue_ibis import IbisEngine
+from fugue_spark import SparkExecutionEngine
+from fugue_spark.ibis_engine import SparkIbisEngine
+from fugue_test.ibis_suite import IbisTests
 
 
 class SparkIbisTests(IbisTests.Tests):
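
A minimal usage sketch follows (an illustrative addition, not part of the patch). It shows, against a plain pandas DataFrame, the renamed helpers that these tests now exercise: `get_column_names` / `rename` (formerly `get_dataframe_column_names` / `rename_dataframe_column_names`), `is_local` / `is_bounded` as `fugue.api` functions, and `transform(..., as_fugue=True)` replacing `force_output_fugue_dataframe=True`. The pandas input and the `add_one` transformer are assumptions made only for this example.

```python
import pandas as pd

import fugue.api as fa

pdf = pd.DataFrame([[0, 10, 20]], columns=["0", "1", "2"])

# column utilities, renamed in this release
assert fa.get_column_names(pdf) == ["0", "1", "2"]
renamed = fa.rename(pdf, {"0": "_0", "1": "_1", "2": "_2"})
assert fa.get_column_names(renamed) == ["_0", "_1", "_2"]

# dataset predicates, formerly exposed as DataFrame properties (df.is_local / df.is_bounded)
assert fa.is_local(pdf) and fa.is_bounded(pdf)


# hypothetical transformer, used only for this sketch
def add_one(df: pd.DataFrame) -> pd.DataFrame:
    return df.assign(x=1)


# as_fugue=True replaces force_output_fugue_dataframe=True in transform
res = fa.transform(pdf, add_one, schema="*,x:int", as_fugue=True)
```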