From d67e319b9c16fd60aff7574b25352b7c48d142d5 Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Thu, 19 Dec 2024 17:53:13 +0100 Subject: [PATCH 1/4] refactor(rust): Temporarily disable common subplan elim for new-streaming --- crates/polars-lazy/src/frame/mod.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/crates/polars-lazy/src/frame/mod.rs b/crates/polars-lazy/src/frame/mod.rs index 154726e6a27e..d0860f3faccd 100644 --- a/crates/polars-lazy/src/frame/mod.rs +++ b/crates/polars-lazy/src/frame/mod.rs @@ -601,11 +601,15 @@ impl LazyFrame { opt_state &= !OptFlags::COMM_SUBPLAN_ELIM; } - // The new streaming engine can't deal with the way the common - // subexpression elimination adds length-incorrect with_columns. #[cfg(feature = "cse")] if new_streaming { + // The new streaming engine can't deal with the way the common + // subexpression elimination adds length-incorrect with_columns. opt_state &= !OptFlags::COMM_SUBEXPR_ELIM; + + // The new streaming engine can't yet deal with the cache nodes + // introduced by common subplan elimination. + opt_state &= !OptFlags::COMM_SUBPLAN_ELIM; } let lp_top = optimize( From 772f5a6d6ecbab05aa6a08016ccb90f197711bfc Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Fri, 20 Dec 2024 14:37:42 +0100 Subject: [PATCH 2/4] fixed tests --- py-polars/tests/unit/operations/test_join.py | 4 +-- py-polars/tests/unit/sql/test_joins.py | 27 ++++++++++++-------- py-polars/tests/unit/test_cse.py | 1 + 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/py-polars/tests/unit/operations/test_join.py b/py-polars/tests/unit/operations/test_join.py index 9a3b763b5f6f..d42258a26562 100644 --- a/py-polars/tests/unit/operations/test_join.py +++ b/py-polars/tests/unit/operations/test_join.py @@ -195,14 +195,14 @@ def test_join_lazy_frame_on_expression() -> None: def test_join() -> None: df_left = pl.DataFrame( { - "a": ["a", "b", "a", "z"], + "a": ["a", "b", "c", "z"], "b": [1, 2, 3, 4], "c": [6, 5, 4, 3], } ) df_right = pl.DataFrame( { - "a": ["b", "c", "b", "a"], + "a": ["b", "c", "d", "a"], "k": [0, 3, 9, 6], "c": [1, 0, 2, 1], } diff --git a/py-polars/tests/unit/sql/test_joins.py b/py-polars/tests/unit/sql/test_joins.py index c423fc4c45f4..fa7a4ce152f7 100644 --- a/py-polars/tests/unit/sql/test_joins.py +++ b/py-polars/tests/unit/sql/test_joins.py @@ -124,16 +124,23 @@ def test_join_inner(foods_ipc_path: Path, join_clause: str) -> None: eager=True, ) - assert out.to_dict(as_series=False) == { - "category": ["vegetables", "vegetables"], - "calories": [45, 20], - "fats_g": [0.5, 0.0], - "sugars_g": [2, 2], - "category:foods2": ["vegetables", "vegetables"], - "calories:foods2": [45, 45], - "fats_g:foods2": [0.5, 0.5], - "sugars_g:foods2": [2, 2], - } + assert_frame_equal( + out, + pl.DataFrame( + { + "category": ["vegetables", "vegetables"], + "calories": [45, 20], + "fats_g": [0.5, 0.0], + "sugars_g": [2, 2], + "category:foods2": ["vegetables", "vegetables"], + "calories:foods2": [45, 45], + "fats_g:foods2": [0.5, 0.5], + "sugars_g:foods2": [2, 2], + } + ), + check_dtypes=False, + check_row_order=False, + ) @pytest.mark.parametrize( diff --git a/py-polars/tests/unit/test_cse.py b/py-polars/tests/unit/test_cse.py index 06efd675c84f..47e60bdb0fd6 100644 --- a/py-polars/tests/unit/test_cse.py +++ b/py-polars/tests/unit/test_cse.py @@ -647,6 +647,7 @@ def test_cse_and_schema_update_projection_pd() -> None: @pytest.mark.debug +@pytest.mark.may_fail_auto_streaming def test_cse_predicate_self_join(capfd: Any, monkeypatch: Any) -> None: monkeypatch.setenv("POLARS_VERBOSE", "1") y = pl.LazyFrame({"a": [1], "b": [2], "y": [3]}) From fc0bd50cb9885aa5a7952a0f09ee1682d5954139 Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Fri, 20 Dec 2024 15:17:24 +0100 Subject: [PATCH 3/4] fix test (take 2) --- py-polars/tests/unit/operations/test_join.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/py-polars/tests/unit/operations/test_join.py b/py-polars/tests/unit/operations/test_join.py index d42258a26562..3bc0649a72ea 100644 --- a/py-polars/tests/unit/operations/test_join.py +++ b/py-polars/tests/unit/operations/test_join.py @@ -195,23 +195,27 @@ def test_join_lazy_frame_on_expression() -> None: def test_join() -> None: df_left = pl.DataFrame( { - "a": ["a", "b", "c", "z"], + "a": ["a", "b", "a", "z"], "b": [1, 2, 3, 4], "c": [6, 5, 4, 3], } ) df_right = pl.DataFrame( { - "a": ["b", "c", "d", "a"], + "a": ["b", "c", "b", "a"], "k": [0, 3, 9, 6], "c": [1, 0, 2, 1], } ) - joined = df_left.join(df_right, left_on="a", right_on="a").sort("a") + joined = df_left.join( + df_right, left_on="a", right_on="a", maintain_order="left_right" + ).sort("a") assert_series_equal(joined["b"], pl.Series("b", [1, 3, 2, 2])) - joined = df_left.join(df_right, left_on="a", right_on="a", how="left").sort("a") + joined = df_left.join( + df_right, left_on="a", right_on="a", how="left", maintain_order="left_right" + ).sort("a") assert joined["c_right"].is_null().sum() == 1 assert_series_equal(joined["b"], pl.Series("b", [1, 3, 2, 2, 4])) From cbab50f6b73229d0e95ae6db7075cf455b4df773 Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Fri, 20 Dec 2024 16:50:46 +0100 Subject: [PATCH 4/4] fix test (take 3) --- py-polars/tests/unit/sql/test_joins.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/py-polars/tests/unit/sql/test_joins.py b/py-polars/tests/unit/sql/test_joins.py index fa7a4ce152f7..8b3b968991a2 100644 --- a/py-polars/tests/unit/sql/test_joins.py +++ b/py-polars/tests/unit/sql/test_joins.py @@ -113,12 +113,15 @@ def test_join_cross_11927() -> None: def test_join_inner(foods_ipc_path: Path, join_clause: str) -> None: foods1 = pl.scan_ipc(foods_ipc_path) foods2 = foods1 # noqa: F841 + schema = foods1.collect_schema() + sort_clause = ", ".join(f'{c} ASC, "{c}:foods2" DESC' for c in schema) out = pl.sql( f""" SELECT * FROM foods1 INNER JOIN foods2 {join_clause} + ORDER BY {sort_clause} LIMIT 2 """, eager=True, @@ -128,18 +131,17 @@ def test_join_inner(foods_ipc_path: Path, join_clause: str) -> None: out, pl.DataFrame( { - "category": ["vegetables", "vegetables"], - "calories": [45, 20], - "fats_g": [0.5, 0.0], - "sugars_g": [2, 2], - "category:foods2": ["vegetables", "vegetables"], - "calories:foods2": [45, 45], - "fats_g:foods2": [0.5, 0.5], - "sugars_g:foods2": [2, 2], + "category": ["fruit", "fruit"], + "calories": [30, 30], + "fats_g": [0.0, 0.0], + "sugars_g": [3, 5], + "category:foods2": ["fruit", "fruit"], + "calories:foods2": [130, 130], + "fats_g:foods2": [0.0, 0.0], + "sugars_g:foods2": [25, 25], } ), check_dtypes=False, - check_row_order=False, )