From 6748a50b020e9f0ee84ecb339621e596697a74d2 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 2 Aug 2023 19:31:24 -0700 Subject: [PATCH 1/8] start to refactor tests --- python/cudf/cudf/tests/test_groupby.py | 158 +++++++++++++++++++------ 1 file changed, 119 insertions(+), 39 deletions(-) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 48092be390d..2c07d191efd 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -381,17 +381,75 @@ def emulate(df): @pytest.fixture(scope="module") -def groupby_jit_data(): +def groupby_jit_data_small(): + """ + Small dataset for testing groupby apply with jit + returns a dataframe whose keys columns define + 4 groups of size 1, 2, 3, 4 as well as an additional + key column that can be used to test subgroups + + useful for very basic testing of result values + + """ np.random.seed(0) df = DataFrame() - nelem = 20 - df["key1"] = np.random.randint(0, 3, nelem) - df["key2"] = np.random.randint(0, 2, nelem) - df["val1"] = np.random.random(nelem) - df["val2"] = np.random.random(nelem) + key1 = ( + [1] # group of size 1 + + [2, 2] # group of size 2 + + [3, 3, 3] # group of size 3 + + [4, 4, 4, 4] # group of size 4 + ) + key2 = [1, 2] * 5 + df["key1"] = key1 + df["key2"] = key2 + + df["val1"] = np.random.random(len(key1)) + df["val2"] = np.random.random(len(key1)) + + # randomly permute data + df = df.sample(frac=1).reset_index(drop=True) + return df + + +@pytest.fixture(scope="module") +def groupby_jit_data_large(groupby_jit_data_small): + """ + Large dataset for testing groupby apply with jit + useful for validating that block level algorithms + return the correct result for groups larger than + the maximum thread per block size + """ + max_tpb = 1024 + factor = ( + max_tpb + 1 + ) # bigger than a block but not always an exact multiple + df = cudf.concat([groupby_jit_data_small] * factor) return df +@pytest.fixture(scope="module") 
+def groupby_jit_data_nans(groupby_jit_data_small): + """ + Small dataset containing nans + """ + + df = groupby_jit_data_small.sort_values(["key1", "key2"]) + df["val1"][::2] = np.nan + df = df.sample(frac=1).reset_index(drop=True) + return df + + +@pytest.fixture(scope="module") +def groupby_jit_datasets( + groupby_jit_data_small, groupby_jit_data_large, groupby_jit_data_nans +): + return { + "small": groupby_jit_data_small, + "large": groupby_jit_data_large, + "nans": groupby_jit_data_nans, + } + + def run_groupby_apply_jit_test(data, func, keys, *args): expect_groupby_obj = data.to_pandas().groupby(keys, as_index=False) got_groupby_obj = data.groupby(keys) @@ -402,15 +460,7 @@ def run_groupby_apply_jit_test(data, func, keys, *args): assert_groupby_results_equal(cudf_jit_result, pandas_result) -@pytest.mark.parametrize( - "dtype", - SUPPORTED_GROUPBY_NUMPY_TYPES, - ids=[str(t) for t in SUPPORTED_GROUPBY_NUMPY_TYPES], -) -@pytest.mark.parametrize( - "func", ["min", "max", "sum", "mean", "var", "std", "idxmin", "idxmax"] -) -def test_groupby_apply_jit_reductions(func, groupby_jit_data, dtype): +def groupby_apply_jit_reductions_test_inner(func, data, dtype): # ideally we'd just have: # lambda group: getattr(group, func)() # but the current kernel caching mechanism relies on pickle which @@ -427,41 +477,67 @@ def func(df): exec(funcstr, lcl) func = lcl["func"] - groupby_jit_data["val1"] = groupby_jit_data["val1"].astype(dtype) - groupby_jit_data["val2"] = groupby_jit_data["val2"].astype(dtype) + data["val1"] = data["val1"].astype(dtype) + data["val2"] = data["val2"].astype(dtype) - run_groupby_apply_jit_test(groupby_jit_data, func, ["key1"]) + run_groupby_apply_jit_test(data, func, ["key1"]) -@pytest.mark.parametrize("dtype", ["float64"]) -@pytest.mark.parametrize("func", ["min", "max", "sum", "mean", "var", "std"]) -@pytest.mark.parametrize("special_val", [np.nan, np.inf, -np.inf]) -def test_groupby_apply_jit_reductions_special_vals( - func, groupby_jit_data, 
dtype, special_val +# test unary reductions +@pytest.mark.parametrize( + "dtype", + SUPPORTED_GROUPBY_NUMPY_TYPES, + ids=[str(t) for t in SUPPORTED_GROUPBY_NUMPY_TYPES], +) +@pytest.mark.parametrize( + "func", ["min", "max", "sum", "mean", "var", "std", "idxmin", "idxmax"] +) +@pytest.mark.parametrize("dataset", ["small", "large", "nans"]) +def test_groupby_apply_jit_unary_reductions( + func, dtype, dataset, groupby_jit_datasets +): + dataset = groupby_jit_datasets[dataset] + groupby_apply_jit_reductions_test_inner(func, dataset, dtype) + + +# test unary reductions for special values +def groupby_apply_jit_reductions_special_vals_inner( + func, data, dtype, special_val ): - # dynamically generate to avoid pickling error. - # see test_groupby_apply_jit_reductions for details. funcstr = textwrap.dedent( f""" - def func(df): - return df['val1'].{func}() + def func(df): + return df['val1'].{func}() """ ) lcl = {} exec(funcstr, lcl) func = lcl["func"] - groupby_jit_data["val1"] = special_val - groupby_jit_data["val1"] = groupby_jit_data["val1"].astype(dtype) + data["val1"] = data["val1"].astype(dtype) + data["val2"] = data["val2"].astype(dtype) + data["val1"] = special_val + data["val2"] = special_val - run_groupby_apply_jit_test(groupby_jit_data, func, ["key1"]) + run_groupby_apply_jit_test(data, func, ["key1"]) + + +@pytest.mark.parametrize("dtype", ["float64"]) +@pytest.mark.parametrize("func", ["min", "max", "sum", "mean", "var", "std"]) +@pytest.mark.parametrize("special_val", [np.nan, np.inf, -np.inf]) +@pytest.mark.parametrize("dataset", ["small", "large", "nans"]) +def test_groupby_apply_jit_reductions_special_vals( + func, dataset, dtype, special_val +): + dataset = groupby_jit_datasets[dataset] + groupby_apply_jit_reductions_test_inner(func, dataset, dtype, special_val) @pytest.mark.parametrize("dtype", ["float64"]) @pytest.mark.parametrize("func", ["idxmax", "idxmin"]) @pytest.mark.parametrize("special_val", [np.nan, np.inf, -np.inf]) def 
test_groupby_apply_jit_idx_reductions_special_vals( - func, groupby_jit_data, dtype, special_val + func, groupby_jit_data_small, dtype, special_val ): # dynamically generate to avoid pickling error. # see test_groupby_apply_jit_reductions for details. @@ -475,16 +551,18 @@ def func(df): exec(funcstr, lcl) func = lcl["func"] - groupby_jit_data["val1"] = special_val - groupby_jit_data["val1"] = groupby_jit_data["val1"].astype(dtype) + groupby_jit_data_small["val1"] = special_val + groupby_jit_data_small["val1"] = groupby_jit_data_small["val1"].astype( + dtype + ) expect = ( - groupby_jit_data.to_pandas() + groupby_jit_data_small.to_pandas() .groupby("key1", as_index=False) .apply(func) ) - grouped = groupby_jit_data.groupby("key1") + grouped = groupby_jit_data_small.groupby("key1") sorted = grouped._grouped()[3].to_pandas() expect_vals = sorted["key1"].drop_duplicates().index expect[None] = expect_vals @@ -501,8 +579,8 @@ def func(df): lambda df: df["val1"].mean() + df["val2"].std(), ], ) -def test_groupby_apply_jit_basic(func, groupby_jit_data): - run_groupby_apply_jit_test(groupby_jit_data, func, ["key1", "key2"]) +def test_groupby_apply_jit_basic(func, groupby_jit_data_small): + run_groupby_apply_jit_test(groupby_jit_data_small, func, ["key1", "key2"]) def create_test_groupby_apply_jit_args_params(): @@ -521,8 +599,10 @@ def f3(df, k, L, m): @pytest.mark.parametrize( "func,args", create_test_groupby_apply_jit_args_params() ) -def test_groupby_apply_jit_args(func, args, groupby_jit_data): - run_groupby_apply_jit_test(groupby_jit_data, func, ["key1", "key2"], *args) +def test_groupby_apply_jit_args(func, args, groupby_jit_data_small): + run_groupby_apply_jit_test( + groupby_jit_data_small, func, ["key1", "key2"], *args + ) def test_groupby_apply_jit_block_divergence(): From 301b8fa34183182c94f2e3a923d6132031b9661e Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 10 Aug 2023 07:02:15 -0700 Subject: [PATCH 2/8] refactor correlaton test --- 
python/cudf/cudf/tests/test_groupby.py | 35 ++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 783275d91c6..70dafc77e78 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -585,6 +585,41 @@ def test_groupby_apply_jit_idx_reductions_special_vals( ) +@pytest.mark.parametrize( + "dtype", + [ + "int32", + "int64", + pytest.param( + "float32", + marks=pytest.mark.xfail( + reason="https://github.com/rapidsai/cudf/issues/13839" + ), + ), + pytest.param( + "float64", + marks=pytest.mark.xfail( + reason="https://github.com/rapidsai/cudf/issues/13839" + ), + ), + ], +) +@pytest.mark.parametrize("dataset", ["large", "nans"]) +def test_groupby_apply_jit_correlation(dataset, groupby_jit_datasets, dtype): + + dataset = groupby_jit_datasets[dataset] + + dataset["val3"] = dataset["val3"].astype(dtype) + dataset["val4"] = dataset["val4"].astype(dtype) + + keys = ["key1"] + + def func(group): + return group["val3"].corr(group["val4"]) + + run_groupby_apply_jit_test(dataset, func, keys) + + @pytest.mark.parametrize( "func", [ From 37c5934be560363a6e7d68e9a2514cc8b22e759d Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 14 Aug 2023 09:41:38 -0700 Subject: [PATCH 3/8] finish filling out xfails and xwarns --- python/cudf/cudf/core/groupby/groupby.py | 1 + python/cudf/cudf/tests/test_groupby.py | 45 +++++++++++++++++++----- 2 files changed, 37 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 2ed9bed5b49..01d775d9d0b 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1223,6 +1223,7 @@ def _iterative_groupby_apply( chunks = [ grouped_values[s:e] for s, e in zip(offsets[:-1], offsets[1:]) ] + breakpoint() chunk_results = [function(chk, *args) for chk in chunks] return 
self._post_process_chunk_results( chunk_results, group_names, group_keys, grouped_values diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 70dafc77e78..529e7fdafb1 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -403,10 +403,8 @@ def groupby_jit_data_small(): df["key1"] = key1 df["key2"] = key2 - df["val1"] = np.random.random(len(key1)) - df["val2"] = np.random.random(len(key1)) - df["val3"] = np.random.randint(0, 10, len(key1)) - df["val4"] = np.random.randint(0, 10, len(key1)) + df["val1"] = np.random.randint(0, 10, len(key1)) + df["val2"] = np.random.randint(0, 10, len(key1)) # randomly permute data df = df.sample(frac=1).reset_index(drop=True) @@ -500,7 +498,15 @@ def test_groupby_apply_jit_unary_reductions( func, dtype, dataset, groupby_jit_datasets ): dataset = groupby_jit_datasets[dataset] - groupby_apply_jit_reductions_test_inner(func, dataset, dtype) + + if func in {"sum", "mean", "var", "std"} and dtype == "int32": + with pytest.xfail( + reason="https://github.com/rapidsai/cudf/issues/13873" + ): + groupby_apply_jit_reductions_test_inner(func, dataset, dtype) + return + else: + groupby_apply_jit_reductions_test_inner(func, dataset, dtype) # test unary reductions for special values @@ -604,18 +610,39 @@ def test_groupby_apply_jit_idx_reductions_special_vals( ), ], ) -@pytest.mark.parametrize("dataset", ["large", "nans"]) +@pytest.mark.parametrize( + "dataset", + [ + pytest.param( + "small", + marks=[ + pytest.mark.filterwarnings( + "ignore:Degrees of Freedom <= 0 for slice" + ), + pytest.mark.filterwarnings( + "ignore:divide by zero encountered in divide" + ), + ], + ), + pytest.param( + "large", + marks=pytest.mark.xfail( + reason="https://github.com/rapidsai/cudf/issues/13875" + ), + ), + ], +) def test_groupby_apply_jit_correlation(dataset, groupby_jit_datasets, dtype): dataset = groupby_jit_datasets[dataset] - dataset["val3"] = 
dataset["val3"].astype(dtype) - dataset["val4"] = dataset["val4"].astype(dtype) + dataset["val1"] = dataset["val1"].astype(dtype) + dataset["val2"] = dataset["val2"].astype(dtype) keys = ["key1"] def func(group): - return group["val3"].corr(group["val4"]) + return group["val1"].corr(group["val2"]) run_groupby_apply_jit_test(dataset, func, keys) From c4187ffdfabec52adacaad86bfdbce14b2fb822c Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 15 Aug 2023 11:53:27 -0500 Subject: [PATCH 4/8] Apply suggestions from code review Co-authored-by: Bradley Dice --- python/cudf/cudf/core/groupby/groupby.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 01d775d9d0b..2ed9bed5b49 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1223,7 +1223,6 @@ def _iterative_groupby_apply( chunks = [ grouped_values[s:e] for s, e in zip(offsets[:-1], offsets[1:]) ] - breakpoint() chunk_results = [function(chk, *args) for chk in chunks] return self._post_process_chunk_results( chunk_results, group_names, group_keys, grouped_values From 9adff4eaa2769f1ab2864e0fbecfe76b0fabd0b1 Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Fri, 15 Dec 2023 08:57:11 -0800 Subject: [PATCH 5/8] fix bugs/tests/remove xfails --- python/cudf/cudf/tests/test_groupby.py | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 96a1b9e1fb5..7f609d2b4a7 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -446,6 +446,7 @@ def groupby_jit_data_nans(groupby_jit_data_small): """ df = groupby_jit_data_small.sort_values(["key1", "key2"]) + df["val1"] = df["val1"].astype("float64") df["val1"][::2] = np.nan df = df.sample(frac=1).reset_index(drop=True) 
return df @@ -510,14 +511,7 @@ def test_groupby_apply_jit_unary_reductions( ): dataset = groupby_jit_datasets[dataset] - if func in {"sum", "mean", "var", "std"} and dtype == "int32": - with pytest.xfail( - reason="https://github.com/rapidsai/cudf/issues/13873" - ): - groupby_apply_jit_reductions_test_inner(func, dataset, dtype) - return - else: - groupby_apply_jit_reductions_test_inner(func, dataset, dtype) + groupby_apply_jit_reductions_test_inner(func, dataset, dtype) # test unary reductions for special values @@ -634,12 +628,7 @@ def func(group): ), ], ), - pytest.param( - "large", - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/13875" - ), - ), + "large", ], ) def test_groupby_apply_jit_correlation(dataset, groupby_jit_datasets, dtype): @@ -654,7 +643,7 @@ def test_groupby_apply_jit_correlation(dataset, groupby_jit_datasets, dtype): def func(group): return group["val1"].corr(group["val2"]) - if dtype.kind == "f": + if np.dtype(dtype).kind == "f": # Feature Request: # https://github.com/rapidsai/cudf/issues/13839 m = ( @@ -697,7 +686,9 @@ def func(group): @pytest.mark.parametrize("op", arith_ops + comparison_ops) -def test_groupby_apply_jit_invalid_binary_ops_error(groupby_jit_data_small, op): +def test_groupby_apply_jit_invalid_binary_ops_error( + groupby_jit_data_small, op +): keys = ["key1"] def func(group): From 6de1e14160286eb74abe4832dc6e61da7fbed06a Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Thu, 4 Jan 2024 08:38:36 -0800 Subject: [PATCH 6/8] address reviews --- python/cudf/cudf/tests/test_groupby.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 7f609d2b4a7..694d7beec71 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
import collections import datetime @@ -402,23 +402,18 @@ def groupby_jit_data_small(): useful for very basic testing of result values """ - np.random.seed(0) + rng = np.random.default_rng(42) df = DataFrame() - key1 = ( - [1] # group of size 1 - + [2, 2] # group of size 2 - + [3, 3, 3] # group of size 3 - + [4, 4, 4, 4] # group of size 4 - ) + key1 = [1] + [2] * 2 + [3] * 3 + [4] * 4 key2 = [1, 2] * 5 df["key1"] = key1 df["key2"] = key2 - df["val1"] = np.random.randint(0, 10, len(key1)) - df["val2"] = np.random.randint(0, 10, len(key1)) + df["val1"] = rng.integers(0, 10, len(key1)) + df["val2"] = rng.integers(0, 10, len(key1)) # randomly permute data - df = df.sample(frac=1).reset_index(drop=True) + df = df.sample(frac=1, ignore_index=True) return df @@ -448,7 +443,7 @@ def groupby_jit_data_nans(groupby_jit_data_small): df = groupby_jit_data_small.sort_values(["key1", "key2"]) df["val1"] = df["val1"].astype("float64") df["val1"][::2] = np.nan - df = df.sample(frac=1).reset_index(drop=True) + df = df.sample(frac=1, ignore_index=True) return df @@ -558,7 +553,7 @@ def func(df): run_groupby_apply_jit_test(data, func, ["key1"]) -@pytest.mark.parametrize("dtype", ["float64"]) +@pytest.mark.parametrize("dtype", ["float64", "float32"]) @pytest.mark.parametrize("func", ["min", "max", "sum", "mean", "var", "std"]) @pytest.mark.parametrize("special_val", [np.nan, np.inf, -np.inf]) @pytest.mark.parametrize("dataset", ["small", "large", "nans"]) From 2c9cdf35dbecfce62abe44a868266a3b881d4376 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Mon, 8 Jan 2024 09:13:48 -0600 Subject: [PATCH 7/8] Update python/cudf/cudf/tests/test_groupby.py Co-authored-by: Bradley Dice --- python/cudf/cudf/tests/test_groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 694d7beec71..f910509a660 100644 --- 
a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -639,7 +639,7 @@ def func(group): return group["val1"].corr(group["val2"]) if np.dtype(dtype).kind == "f": - # Feature Request: + # Correlation of floating types is not yet supported: # https://github.com/rapidsai/cudf/issues/13839 m = ( f"Series.corr\\(Series\\) is not " From ffdeb85c0e59611e49bb3076066b562976fb412c Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Mon, 8 Jan 2024 07:15:54 -0800 Subject: [PATCH 8/8] update comments --- python/cudf/cudf/tests/test_groupby.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index f910509a660..b46949faa06 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -394,13 +394,10 @@ def emulate(df): @pytest.fixture(scope="module") def groupby_jit_data_small(): """ - Small dataset for testing groupby apply with jit - returns a dataframe whose keys columns define - 4 groups of size 1, 2, 3, 4 as well as an additional - key column that can be used to test subgroups - - useful for very basic testing of result values - + Return a small dataset for testing JIT Groupby Apply. The dataframe + contains 4 groups of size 1, 2, 3, 4 as well as an additional key + column that can be used to test subgroups within groups. This data + is useful for smoke testing basic numeric results """ rng = np.random.default_rng(42) df = DataFrame() @@ -420,10 +417,10 @@ def groupby_jit_data_small(): @pytest.fixture(scope="module") def groupby_jit_data_large(groupby_jit_data_small): """ - Large dataset for testing groupby apply with jit - useful for validating that block level algorithms - return the correct result for groups larger than - the maximum thread per block size + Larger version of groupby_jit_data_small which contains enough data + to require more than one block per group. 
This data is useful for + testing if JIT GroupBy algorithms scale to larger datasets without + manifesting numerical issues such as overflow. """ max_tpb = 1024 factor = ( @@ -437,7 +434,8 @@ def groupby_jit_data_large(groupby_jit_data_small): @pytest.fixture(scope="module") def groupby_jit_data_nans(groupby_jit_data_small): """ - Small dataset containing nans + Returns a modified version of groupby_jit_data_small which contains + nan values. """ df = groupby_jit_data_small.sort_values(["key1", "key2"])