From 6748a50b020e9f0ee84ecb339621e596697a74d2 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 2 Aug 2023 19:31:24 -0700 Subject: [PATCH 1/8] start to refactor tests --- python/cudf/cudf/tests/test_groupby.py | 158 +++++++++++++++++++------ 1 file changed, 119 insertions(+), 39 deletions(-) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 48092be390d..2c07d191efd 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -381,17 +381,75 @@ def emulate(df): @pytest.fixture(scope="module") -def groupby_jit_data(): +def groupby_jit_data_small(): + """ + Small dataset for testing groupby apply with jit + returns a dataframe whose keys columns define + 4 groups of size 1, 2, 3, 4 as well as an additional + key column that can be used to test subgroups + + useful for very basic testing of result values + + """ np.random.seed(0) df = DataFrame() - nelem = 20 - df["key1"] = np.random.randint(0, 3, nelem) - df["key2"] = np.random.randint(0, 2, nelem) - df["val1"] = np.random.random(nelem) - df["val2"] = np.random.random(nelem) + key1 = ( + [1] # group of size 1 + + [2, 2] # group of size 2 + + [3, 3, 3] # group of size 3 + + [4, 4, 4, 4] # group of size 4 + ) + key2 = [1, 2] * 5 + df["key1"] = key1 + df["key2"] = key2 + + df["val1"] = np.random.random(len(key1)) + df["val2"] = np.random.random(len(key1)) + + # randomly permute data + df = df.sample(frac=1).reset_index(drop=True) + return df + + +@pytest.fixture(scope="module") +def groupby_jit_data_large(groupby_jit_data_small): + """ + Large dataset for testing groupby apply with jit + useful for validating that block level algorithms + return the correct result for groups larger than + the maximum thread per block size + """ + max_tpb = 1024 + factor = ( + max_tpb + 1 + ) # bigger than a block but not always an exact multiple + df = cudf.concat([groupby_jit_data_small] * factor) return df +@pytest.fixture(scope="module") 
+def groupby_jit_data_nans(groupby_jit_data_small): + """ + Small dataset containing nans + """ + + df = groupby_jit_data_small.sort_values(["key1", "key2"]) + df["val1"][::2] = np.nan + df = df.sample(frac=1).reset_index(drop=True) + return df + + +@pytest.fixture(scope="module") +def groupby_jit_datasets( + groupby_jit_data_small, groupby_jit_data_large, groupby_jit_data_nans +): + return { + "small": groupby_jit_data_small, + "large": groupby_jit_data_large, + "nans": groupby_jit_data_nans, + } + + def run_groupby_apply_jit_test(data, func, keys, *args): expect_groupby_obj = data.to_pandas().groupby(keys, as_index=False) got_groupby_obj = data.groupby(keys) @@ -402,15 +460,7 @@ def run_groupby_apply_jit_test(data, func, keys, *args): assert_groupby_results_equal(cudf_jit_result, pandas_result) -@pytest.mark.parametrize( - "dtype", - SUPPORTED_GROUPBY_NUMPY_TYPES, - ids=[str(t) for t in SUPPORTED_GROUPBY_NUMPY_TYPES], -) -@pytest.mark.parametrize( - "func", ["min", "max", "sum", "mean", "var", "std", "idxmin", "idxmax"] -) -def test_groupby_apply_jit_reductions(func, groupby_jit_data, dtype): +def groupby_apply_jit_reductions_test_inner(func, data, dtype): # ideally we'd just have: # lambda group: getattr(group, func)() # but the current kernel caching mechanism relies on pickle which @@ -427,41 +477,67 @@ def func(df): exec(funcstr, lcl) func = lcl["func"] - groupby_jit_data["val1"] = groupby_jit_data["val1"].astype(dtype) - groupby_jit_data["val2"] = groupby_jit_data["val2"].astype(dtype) + data["val1"] = data["val1"].astype(dtype) + data["val2"] = data["val2"].astype(dtype) - run_groupby_apply_jit_test(groupby_jit_data, func, ["key1"]) + run_groupby_apply_jit_test(data, func, ["key1"]) -@pytest.mark.parametrize("dtype", ["float64"]) -@pytest.mark.parametrize("func", ["min", "max", "sum", "mean", "var", "std"]) -@pytest.mark.parametrize("special_val", [np.nan, np.inf, -np.inf]) -def test_groupby_apply_jit_reductions_special_vals( - func, groupby_jit_data, 
dtype, special_val +# test unary reductions +@pytest.mark.parametrize( + "dtype", + SUPPORTED_GROUPBY_NUMPY_TYPES, + ids=[str(t) for t in SUPPORTED_GROUPBY_NUMPY_TYPES], +) +@pytest.mark.parametrize( + "func", ["min", "max", "sum", "mean", "var", "std", "idxmin", "idxmax"] +) +@pytest.mark.parametrize("dataset", ["small", "large", "nans"]) +def test_groupby_apply_jit_unary_reductions( + func, dtype, dataset, groupby_jit_datasets +): + dataset = groupby_jit_datasets[dataset] + groupby_apply_jit_reductions_test_inner(func, dataset, dtype) + + +# test unary reductions for special values +def groupby_apply_jit_reductions_special_vals_inner( + func, data, dtype, special_val ): - # dynamically generate to avoid pickling error. - # see test_groupby_apply_jit_reductions for details. funcstr = textwrap.dedent( f""" - def func(df): - return df['val1'].{func}() + def func(df): + return df['val1'].{func}() """ ) lcl = {} exec(funcstr, lcl) func = lcl["func"] - groupby_jit_data["val1"] = special_val - groupby_jit_data["val1"] = groupby_jit_data["val1"].astype(dtype) + data["val1"] = data["val1"].astype(dtype) + data["val2"] = data["val2"].astype(dtype) + data["val1"] = special_val + data["val2"] = special_val - run_groupby_apply_jit_test(groupby_jit_data, func, ["key1"]) + run_groupby_apply_jit_test(data, func, ["key1"]) + + +@pytest.mark.parametrize("dtype", ["float64"]) +@pytest.mark.parametrize("func", ["min", "max", "sum", "mean", "var", "std"]) +@pytest.mark.parametrize("special_val", [np.nan, np.inf, -np.inf]) +@pytest.mark.parametrize("dataset", ["small", "large", "nans"]) +def test_groupby_apply_jit_reductions_special_vals( + func, dataset, dtype, special_val +): + dataset = groupby_jit_datasets[dataset] + groupby_apply_jit_reductions_test_inner(func, dataset, dtype, special_val) @pytest.mark.parametrize("dtype", ["float64"]) @pytest.mark.parametrize("func", ["idxmax", "idxmin"]) @pytest.mark.parametrize("special_val", [np.nan, np.inf, -np.inf]) def 
test_groupby_apply_jit_idx_reductions_special_vals( - func, groupby_jit_data, dtype, special_val + func, groupby_jit_data_small, dtype, special_val ): # dynamically generate to avoid pickling error. # see test_groupby_apply_jit_reductions for details. @@ -475,16 +551,18 @@ def func(df): exec(funcstr, lcl) func = lcl["func"] - groupby_jit_data["val1"] = special_val - groupby_jit_data["val1"] = groupby_jit_data["val1"].astype(dtype) + groupby_jit_data_small["val1"] = special_val + groupby_jit_data_small["val1"] = groupby_jit_data_small["val1"].astype( + dtype + ) expect = ( - groupby_jit_data.to_pandas() + groupby_jit_data_small.to_pandas() .groupby("key1", as_index=False) .apply(func) ) - grouped = groupby_jit_data.groupby("key1") + grouped = groupby_jit_data_small.groupby("key1") sorted = grouped._grouped()[3].to_pandas() expect_vals = sorted["key1"].drop_duplicates().index expect[None] = expect_vals @@ -501,8 +579,8 @@ def func(df): lambda df: df["val1"].mean() + df["val2"].std(), ], ) -def test_groupby_apply_jit_basic(func, groupby_jit_data): - run_groupby_apply_jit_test(groupby_jit_data, func, ["key1", "key2"]) +def test_groupby_apply_jit_basic(func, groupby_jit_data_small): + run_groupby_apply_jit_test(groupby_jit_data_small, func, ["key1", "key2"]) def create_test_groupby_apply_jit_args_params(): @@ -521,8 +599,10 @@ def f3(df, k, L, m): @pytest.mark.parametrize( "func,args", create_test_groupby_apply_jit_args_params() ) -def test_groupby_apply_jit_args(func, args, groupby_jit_data): - run_groupby_apply_jit_test(groupby_jit_data, func, ["key1", "key2"], *args) +def test_groupby_apply_jit_args(func, args, groupby_jit_data_small): + run_groupby_apply_jit_test( + groupby_jit_data_small, func, ["key1", "key2"], *args + ) def test_groupby_apply_jit_block_divergence(): From 301b8fa34183182c94f2e3a923d6132031b9661e Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 10 Aug 2023 07:02:15 -0700 Subject: [PATCH 2/8] refactor correlaton test --- 
python/cudf/cudf/tests/test_groupby.py | 35 ++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 783275d91c6..70dafc77e78 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -585,6 +585,41 @@ def test_groupby_apply_jit_idx_reductions_special_vals( ) +@pytest.mark.parametrize( + "dtype", + [ + "int32", + "int64", + pytest.param( + "float32", + marks=pytest.mark.xfail( + reason="https://github.com/rapidsai/cudf/issues/13839" + ), + ), + pytest.param( + "float64", + marks=pytest.mark.xfail( + reason="https://github.com/rapidsai/cudf/issues/13839" + ), + ), + ], +) +@pytest.mark.parametrize("dataset", ["large", "nans"]) +def test_groupby_apply_jit_correlation(dataset, groupby_jit_datasets, dtype): + + dataset = groupby_jit_datasets[dataset] + + dataset["val3"] = dataset["val3"].astype(dtype) + dataset["val4"] = dataset["val4"].astype(dtype) + + keys = ["key1"] + + def func(group): + return group["val3"].corr(group["val4"]) + + run_groupby_apply_jit_test(dataset, func, keys) + + @pytest.mark.parametrize( "func", [ From 37c5934be560363a6e7d68e9a2514cc8b22e759d Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 14 Aug 2023 09:41:38 -0700 Subject: [PATCH 3/8] finish filling out xfails and xwarns --- python/cudf/cudf/core/groupby/groupby.py | 1 + python/cudf/cudf/tests/test_groupby.py | 45 +++++++++++++++++++----- 2 files changed, 37 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 2ed9bed5b49..01d775d9d0b 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1223,6 +1223,7 @@ def _iterative_groupby_apply( chunks = [ grouped_values[s:e] for s, e in zip(offsets[:-1], offsets[1:]) ] + breakpoint() chunk_results = [function(chk, *args) for chk in chunks] return 
self._post_process_chunk_results( chunk_results, group_names, group_keys, grouped_values diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 70dafc77e78..529e7fdafb1 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -403,10 +403,8 @@ def groupby_jit_data_small(): df["key1"] = key1 df["key2"] = key2 - df["val1"] = np.random.random(len(key1)) - df["val2"] = np.random.random(len(key1)) - df["val3"] = np.random.randint(0, 10, len(key1)) - df["val4"] = np.random.randint(0, 10, len(key1)) + df["val1"] = np.random.randint(0, 10, len(key1)) + df["val2"] = np.random.randint(0, 10, len(key1)) # randomly permute data df = df.sample(frac=1).reset_index(drop=True) @@ -500,7 +498,15 @@ def test_groupby_apply_jit_unary_reductions( func, dtype, dataset, groupby_jit_datasets ): dataset = groupby_jit_datasets[dataset] - groupby_apply_jit_reductions_test_inner(func, dataset, dtype) + + if func in {"sum", "mean", "var", "std"} and dtype == "int32": + with pytest.xfail( + reason="https://github.com/rapidsai/cudf/issues/13873" + ): + groupby_apply_jit_reductions_test_inner(func, dataset, dtype) + return + else: + groupby_apply_jit_reductions_test_inner(func, dataset, dtype) # test unary reductions for special values @@ -604,18 +610,39 @@ def test_groupby_apply_jit_idx_reductions_special_vals( ), ], ) -@pytest.mark.parametrize("dataset", ["large", "nans"]) +@pytest.mark.parametrize( + "dataset", + [ + pytest.param( + "small", + marks=[ + pytest.mark.filterwarnings( + "ignore:Degrees of Freedom <= 0 for slice" + ), + pytest.mark.filterwarnings( + "ignore:divide by zero encountered in divide" + ), + ], + ), + pytest.param( + "large", + marks=pytest.mark.xfail( + reason="https://github.com/rapidsai/cudf/issues/13875" + ), + ), + ], +) def test_groupby_apply_jit_correlation(dataset, groupby_jit_datasets, dtype): dataset = groupby_jit_datasets[dataset] - dataset["val3"] = 
dataset["val3"].astype(dtype) - dataset["val4"] = dataset["val4"].astype(dtype) + dataset["val1"] = dataset["val1"].astype(dtype) + dataset["val2"] = dataset["val2"].astype(dtype) keys = ["key1"] def func(group): - return group["val3"].corr(group["val4"]) + return group["val1"].corr(group["val2"]) run_groupby_apply_jit_test(dataset, func, keys) From c4187ffdfabec52adacaad86bfdbce14b2fb822c Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 15 Aug 2023 11:53:27 -0500 Subject: [PATCH 4/8] Apply suggestions from code review Co-authored-by: Bradley Dice --- python/cudf/cudf/core/groupby/groupby.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 01d775d9d0b..2ed9bed5b49 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1223,7 +1223,6 @@ def _iterative_groupby_apply( chunks = [ grouped_values[s:e] for s, e in zip(offsets[:-1], offsets[1:]) ] - breakpoint() chunk_results = [function(chk, *args) for chk in chunks] return self._post_process_chunk_results( chunk_results, group_names, group_keys, grouped_values From 9adff4eaa2769f1ab2864e0fbecfe76b0fabd0b1 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 15 Dec 2023 08:57:11 -0800 Subject: [PATCH 5/8] fix bugs/tests/remove xfails --- python/cudf/cudf/tests/test_groupby.py | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 96a1b9e1fb5..7f609d2b4a7 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -446,6 +446,7 @@ def groupby_jit_data_nans(groupby_jit_data_small): """ df = groupby_jit_data_small.sort_values(["key1", "key2"]) + df["val1"] = df["val1"].astype("float64") df["val1"][::2] = np.nan df = df.sample(frac=1).reset_index(drop=True) 
return df @@ -510,14 +511,7 @@ def test_groupby_apply_jit_unary_reductions( ): dataset = groupby_jit_datasets[dataset] - if func in {"sum", "mean", "var", "std"} and dtype == "int32": - with pytest.xfail( - reason="https://github.com/rapidsai/cudf/issues/13873" - ): - groupby_apply_jit_reductions_test_inner(func, dataset, dtype) - return - else: - groupby_apply_jit_reductions_test_inner(func, dataset, dtype) + groupby_apply_jit_reductions_test_inner(func, dataset, dtype) # test unary reductions for special values @@ -634,12 +628,7 @@ def func(group): ), ], ), - pytest.param( - "large", - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/13875" - ), - ), + "large", ], ) def test_groupby_apply_jit_correlation(dataset, groupby_jit_datasets, dtype): @@ -654,7 +643,7 @@ def test_groupby_apply_jit_correlation(dataset, groupby_jit_datasets, dtype): def func(group): return group["val1"].corr(group["val2"]) - if dtype.kind == "f": + if np.dtype(dtype).kind == "f": # Feature Request: # https://github.com/rapidsai/cudf/issues/13839 m = ( @@ -697,7 +686,9 @@ def func(group): @pytest.mark.parametrize("op", arith_ops + comparison_ops) -def test_groupby_apply_jit_invalid_binary_ops_error(groupby_jit_data_small, op): +def test_groupby_apply_jit_invalid_binary_ops_error( + groupby_jit_data_small, op +): keys = ["key1"] def func(group): From 6de1e14160286eb74abe4832dc6e61da7fbed06a Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 4 Jan 2024 08:38:36 -0800 Subject: [PATCH 6/8] address reviews --- python/cudf/cudf/tests/test_groupby.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 7f609d2b4a7..694d7beec71 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
import collections import datetime @@ -402,23 +402,18 @@ def groupby_jit_data_small(): useful for very basic testing of result values """ - np.random.seed(0) + rng = np.random.default_rng(42) df = DataFrame() - key1 = ( - [1] # group of size 1 - + [2, 2] # group of size 2 - + [3, 3, 3] # group of size 3 - + [4, 4, 4, 4] # group of size 4 - ) + key1 = [1] + [2] * 2 + [3] * 3 + [4] * 4 key2 = [1, 2] * 5 df["key1"] = key1 df["key2"] = key2 - df["val1"] = np.random.randint(0, 10, len(key1)) - df["val2"] = np.random.randint(0, 10, len(key1)) + df["val1"] = rng.integers(0, 10, len(key1)) + df["val2"] = rng.integers(0, 10, len(key1)) # randomly permute data - df = df.sample(frac=1).reset_index(drop=True) + df = df.sample(frac=1, ignore_index=True) return df @@ -448,7 +443,7 @@ def groupby_jit_data_nans(groupby_jit_data_small): df = groupby_jit_data_small.sort_values(["key1", "key2"]) df["val1"] = df["val1"].astype("float64") df["val1"][::2] = np.nan - df = df.sample(frac=1).reset_index(drop=True) + df = df.sample(frac=1, ignore_index=True) return df @@ -558,7 +553,7 @@ def func(df): run_groupby_apply_jit_test(data, func, ["key1"]) -@pytest.mark.parametrize("dtype", ["float64"]) +@pytest.mark.parametrize("dtype", ["float64", "float32"]) @pytest.mark.parametrize("func", ["min", "max", "sum", "mean", "var", "std"]) @pytest.mark.parametrize("special_val", [np.nan, np.inf, -np.inf]) @pytest.mark.parametrize("dataset", ["small", "large", "nans"]) From 2c9cdf35dbecfce62abe44a868266a3b881d4376 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Mon, 8 Jan 2024 09:13:48 -0600 Subject: [PATCH 7/8] Update python/cudf/cudf/tests/test_groupby.py Co-authored-by: Bradley Dice --- python/cudf/cudf/tests/test_groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 694d7beec71..f910509a660 100644 --- 
a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -639,7 +639,7 @@ def func(group): return group["val1"].corr(group["val2"]) if np.dtype(dtype).kind == "f": - # Feature Request: + # Correlation of floating types is not yet supported: # https://github.com/rapidsai/cudf/issues/13839 m = ( f"Series.corr\\(Series\\) is not " From ffdeb85c0e59611e49bb3076066b562976fb412c Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 8 Jan 2024 07:15:54 -0800 Subject: [PATCH 8/8] update comments --- python/cudf/cudf/tests/test_groupby.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index f910509a660..b46949faa06 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -394,13 +394,10 @@ def emulate(df): @pytest.fixture(scope="module") def groupby_jit_data_small(): """ - Small dataset for testing groupby apply with jit - returns a dataframe whose keys columns define - 4 groups of size 1, 2, 3, 4 as well as an additional - key column that can be used to test subgroups - - useful for very basic testing of result values - + Return a small dataset for testing JIT Groupby Apply. The dataframe + contains 4 groups of size 1, 2, 3, 4 as well as an additional key + column that can be used to test subgroups within groups. This data + is useful for smoke testing basic numeric results """ rng = np.random.default_rng(42) df = DataFrame() @@ -420,10 +417,10 @@ def groupby_jit_data_small(): @pytest.fixture(scope="module") def groupby_jit_data_large(groupby_jit_data_small): """ - Large dataset for testing groupby apply with jit - useful for validating that block level algorithms - return the correct result for groups larger than - the maximum thread per block size + Larger version of groupby_jit_data_small which contains enough data + to require more than one block per group. 
This data is useful for + testing if JIT GroupBy algorithms scale to larger datasets without + manifesting numerical issues such as overflow. """ max_tpb = 1024 factor = ( @@ -437,7 +434,8 @@ def groupby_jit_data_large(groupby_jit_data_small): @pytest.fixture(scope="module") def groupby_jit_data_nans(groupby_jit_data_small): """ - Small dataset containing nans + Returns a modified version of groupby_jit_data_small which contains + nan values. """ df = groupby_jit_data_small.sort_values(["key1", "key2"])