From 62daaf00192275820c91d3c28af8f7a93db7c99b Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Fri, 14 Jun 2024 14:22:51 +0200 Subject: [PATCH 01/88] added cluster test api, first commit --- .../76_new_cluster_test_api.py | 467 ++++++++++++++++++ 1 file changed, 467 insertions(+) create mode 100644 tutorials/stats-sensor-space/76_new_cluster_test_api.py diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py new file mode 100644 index 00000000000..4e2b3af8f6d --- /dev/null +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -0,0 +1,467 @@ +from pathlib import Path +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from mpl_toolkits.axes_grid1 import make_axes_locatable +import mne + +# eventually we want to use the _permutation_cluster_test function + +# import and load dataset +path_to_p3 = Path("C:/Users/Carina/mne_data/ERP_CORE_P3") + +def prep_sample_data(plot_evokeds: bool = False): + """ + Load the P3 dataset and extract the target, non-target and contrast evokeds. + """ + # Define the range of participant IDs + participant_ids = range(15, 20) # This will cover 015 to 019 + + evokeds_allsubs = [] + + # Loop over each participant ID and generate the corresponding filename + for pid in participant_ids: + # Create the filename using an f-string, ensuring the participant ID is zero-padded to 3 digits + filename_p3 = f"sub-{pid:03d}_ses-P3_task-P3_ave.fif" + + # Print the filename (or perform your desired operations on it) + print(filename_p3) + + p3_file_path = Path(path_to_p3) / filename_p3 + + evokeds = mne.read_evokeds(p3_file_path) + + # add to list + evokeds_allsubs.append(evokeds) + + target_only = [evoked[0] for evoked in evokeds_allsubs] + non_target_only = [evoked[1] for evoked in evokeds_allsubs] + contrast = [evoked[2] for evoked in evokeds_allsubs] + + if plot_evokeds: + # plot the grand average + mne.grand_average(target_only).plot() + mne.grand_average(non_target_only).plot() + mne.grand_average(contrast).plot() + + # create contrast from evokeds target and non-target + diff_evoked = [ + mne.combine_evoked([evokeds_a, evokeds_b], weights=[1, -1]) + for evokeds_a, evokeds_b in zip(target_only, non_target_only) + ] + + if plot_evokeds: + mne.grand_average(diff_evoked).plot() + + # crop the evokeds in the post stimulus window + contrast = [evokeds.crop(tmin=-0.1, tmax=0.6) for evokeds in contrast] + target_only = [evokeds.crop(tmin=-0.1, tmax=0.6) for evokeds in target_only] + non_target_only = [evokeds.crop(tmin=-0.1, tmax=0.6) for evokeds in non_target_only] + + return contrast, target_only, non_target_only + + +def old_api_cluster(n_permutations: int = 10000, seed: int = 1234): + """ + Run the cluster test using the old API to get a bechmark result for the new API. + Currently implementing a paired t-test with contrast between participants. 
+ """ + contrast, target_only, non_target_only = prep_sample_data() + + # extract the data for each evoked and store in numpy array + data = np.array([evoked.data for evoked in contrast]) + + # shape should be (n_subjects, n_channels, n_times) + data.shape + + # reshape to channels as last dimension + data = data.transpose(0, 2, 1) + + data.shape + + adjacency, _ = mne.channels.find_ch_adjacency(contrast[0].info, ch_type="eeg") + + stat_fun, threshold = mne.stats.cluster_level._check_fun( + X=data, stat_fun=None, threshold=None, tail=0, kind="within" + ) + + # adjacency = mne.channels.find_ch_adjacency(contrast[0].info, ch_type='eeg') + # Run the analysis + T_obs, clusters, cluster_p_values, H0 = ( + mne.stats.cluster_level._permutation_cluster_test( + [data], + threshold=threshold, + stat_fun=stat_fun, + n_jobs=-1, # takes all CPU cores + max_step=1, # maximum distance between samples (time points) + exclude=None, # exclude no time points or channels + step_down_p=0, # step down in jumps test + t_power=1, # weigh each location by its stats score + out_type="indices", + check_disjoint=False, + buffer_size=None, # block size for chunking the data + n_permutations=n_permutations, + tail=0, + adjacency=adjacency, + seed=seed, + ) + ) + + print(min(cluster_p_values)) + + plot_cluster( + contrast, target_only, non_target_only, T_obs, clusters, cluster_p_values + ) + + return T_obs, clusters, cluster_p_values, H0 + + +# fit cluster test with dataframe as input +# create condition list that repeats 5times 1 and then 5 times 0 +# 1 = target, 0 = non-target +# condition = 5 * [1] + 5 * [0] + +# 1 = target, 0 = non-target +# contrast, target_only, non_target_only = prep_sample_data() + +# evokeds_list = target_only + non_target_only + + +def create_random_evokeds_id_condition_list(evoked_data_a: list, evoked_data_b: list): + """ + Create a list of shuffled participant IDs, conditions, and evoked data. + # Keep the participant IDs and conditions paired but shuffle the order of the evoked data. + """ + import random + + # Example participant IDs + participant_ids = ["p1", "p2", "p3", "p4", "p5"] * 2 + + # Combine the evoked data into a single list + all_evoked_data = evoked_data_a + evoked_data_b + + # Create a corresponding list of conditions + conditions = [1] * len(evoked_data_a) + [0] * len(evoked_data_b) + + # Combine the participant IDs, conditions, and evoked data into a list of tuples + combined_list = list(zip(participant_ids, conditions, all_evoked_data)) + + # Shuffle the combined list + random.shuffle(combined_list) + + # Separate the shuffled list back into participant IDs, conditions, and evoked data + shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data = zip( + *combined_list + ) + + # Convert the tuples back to lists + shuffled_participant_ids = list(shuffled_participant_ids) + shuffled_conditions = list(shuffled_conditions) + shuffled_evoked_data = list(shuffled_evoked_data) + + return shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data + + +def create_random_paired_evokeds_list(evoked_data_a: list, evoked_data_b: list): + """ + Create a list of shuffled evoked data where each pair of target and non-target evoked data is shuffled together. 
+ """ + import random + + # Create a list of tuples where each tuple contains an evoked data and its corresponding label + evoked_pairs = [(evoked, 1) for evoked in evoked_data_a] + [ + (evoked, 0) for evoked in evoked_data_b + ] + + # Shuffle the list of tuples + random.shuffle(evoked_pairs) + + # Separate the shuffled list back into evoked data and labels + shuffled_evoked_data, shuffled_labels = zip(*evoked_pairs) + + # Convert the tuples back to lists + shuffled_evoked_data = list(shuffled_evoked_data) + + return shuffled_evoked_data + + +# shuffle order of pairs +shuffled_evokeds_list = create_random_paired_evokeds_list(target_only, non_target_only) +# shouldn't change the results (p-value is different though?) + +shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data = ( + create_random_evokeds_id_condition_list( + evoked_data_a=target_only, evoked_data_b=non_target_only + ) +) + + +def prepare_dataframe_for_cluster_function( + contrast: bool = False, + evokeds: list = None, + condition: list = None, + subject_index: list = None, +): + """ + Prepare a dataframe for the cluster test function. + + Parameters + ---------- + contrast : bool, optional + If True, a contrast is calculated. Default is False. + evokeds : list, optional + List of evoked objects. Default is None. + condition : list, optional + List of conditions for each evoked object. Default is None. + subject_index : list, optional + List of subject IDs. Default is None. + + """ + # create an empty dataframe + df = pd.DataFrame() + + if contrast == True: + # check if evoked list is dividable by 2 + if len(evokeds) % 2 != 0: + raise ValueError("evokeds list needs to be dividable by 2") + if condition is not None: + # Convert lists to DataFrame for easier manipulation + df = pd.DataFrame( + { + "evoked": evokeds, + "condition": condition, + "subject_index": subject_index, + } + ) + + return df + + +def cluster_test( + df: pd.DataFrame, + n_permutations: int = 10000, + seed: int = 1234, + contrast_weights: list = [1, -1], +): + """ + Run the cluster test using the new API. + # currently supports paired t-test with contrast or with list of conditions + + Parameters + ---------- + dataframe : pd.DataFrame + Dataframe with evoked data, conditions and subject IDs. + n_permutations : int, optional + Number of permutations. Default is 10000. + seed : int, optional + Random seed. Default is 1234. + + Returns + ------- + T_obs : array + The observed test statistic. + clusters : list + List of clusters. + cluster_p_values : array + Array of cluster p-values. + H0 : array + The permuted test statistics. 
+ """ + if df.condition is not None: + # Extract unique conditions + unique_conditions = np.unique(df.condition) + if len(unique_conditions) != 2: + raise ValueError("Condition list needs to contain 2 unique values") + if df.subject_index is not None: + # Initialize a list to hold the combined evoked data + evokeds_data = [] + + # Process each subject's evoked data + for sub_id in df.subject_index.unique(): + sub_df = df[df.subject_index == sub_id] + + # Split evokeds list based on condition list for this subject + evokeds_a = sub_df[sub_df.condition == unique_conditions[0]][ + "evoked" + ].tolist() + evokeds_b = sub_df[sub_df.condition == unique_conditions[1]][ + "evoked" + ].tolist() + + if len(evokeds_a) != 1 or len(evokeds_b) != 1: + raise ValueError( + f"Subject {sub_id}: Each subject must have exactly one evoked for each condition" + ) + + # Calculate contrast based on condition list + diff_evoked = mne.combine_evoked( + [evokeds_a[0], evokeds_b[0]], weights=contrast_weights + ) + evokeds_data.append(diff_evoked) + else: + # calculate length of evokeds list + n_evokeds = len(df.evokeds) + # now split evokeds list in two lists + evokeds_a = df.evokeds[: n_evokeds // 2] + evokeds_b = df.evokeds[n_evokeds // 2 :] + # create contrast from evokeds_a and evokeds_b + diff_evoked = [ + mne.combine_evoked([evo_a, evo_b], weights=contrast_weights) + for evo_a, evo_b in zip(evokeds_a, evokeds_b) + ] + evokeds_data = diff_evoked + else: + evokeds_data = df.evokeds + + # extract number of channels + n_channels = evokeds_data[0].info["nchan"] + + # loop over rows and extract data from evokeds + data_array = np.array([evoked.data for evoked in evokeds_data]) + + # find the dimension that is equal to n_channels + if data_array.shape[1] == n_channels: + # reshape to channels as last dimension + data = data_array.transpose(0, 2, 1) + + adjacency, _ = mne.channels.find_ch_adjacency(evokeds_data[0].info, ch_type="eeg") + + stat_fun, threshold = mne.stats.cluster_level._check_fun( + X=data, stat_fun=None, threshold=None, tail=0, kind="within" + ) + + T_obs, clusters, cluster_p_values, H0 = ( + mne.stats.cluster_level._permutation_cluster_test( + [data], + threshold=threshold, + stat_fun=stat_fun, + n_jobs=-1, + max_step=1, + exclude=None, + step_down_p=0.05, + t_power=1, + out_type="indices", + check_disjoint=True, + buffer_size=None, + n_permutations=n_permutations, + tail=0, + adjacency=adjacency, + seed=seed, + ) + ) + + print(min(cluster_p_values)) + + # need to adjust plotting function for contrast only data + contrast, evokeds_a, evokeds_b = prep_sample_data() + + # plot cluster + plot_cluster(contrast, evokeds_a, evokeds_b, T_obs, clusters, cluster_p_values) + + return T_obs, clusters, cluster_p_values, H0 + + +def plot_cluster( + contrast, target_only, non_target_only, T_obs, clusters, cluster_p_values +): + """ + Plot the cluster with the lowest p-value. + + Parameters + ---------- + contrast : list + List of contrast evoked objects. + target_only : list + List of target evoked objects. + non_target_only : list + List of non-target evoked objects. + T_obs : array + The observed test statistic. + clusters : list + List of clusters. + cluster_p_values : array + Array of cluster p-values. 
+ + Returns + ------- + None + + """ + # configure variables for visualization + colors = {"target": "crimson", "non-target": "steelblue"} + + # organize data for plotting + evokeds = {"target": target_only, "non-target": non_target_only} + + lowest_p_cluster = np.argmin(cluster_p_values) + + # plot the cluster with the lowest p-value + time_inds, space_inds = np.squeeze(clusters[lowest_p_cluster]) + ch_inds = np.unique(space_inds) + time_inds = np.unique(time_inds) + + # get topography for F stat + t_map = T_obs[time_inds, ...].mean(axis=0) + + # get signals at the sensors contributing to the cluster + sig_times = contrast[0].times[time_inds] + + # create spatial mask + mask = np.zeros((t_map.shape[0], 1), dtype=bool) + mask[ch_inds, :] = True + + # initialize figure + fig, ax_topo = plt.subplots(1, 1, figsize=(10, 3), layout="constrained") + + # plot average test statistic and mark significant sensors + t_evoked = mne.EvokedArray(t_map[:, np.newaxis], contrast[0].info, tmin=0) + t_evoked.plot_topomap( + times=0, + mask=mask, + axes=ax_topo, + cmap="Reds", + vlim=(np.min, np.max), + show=False, + colorbar=False, + mask_params=dict(markersize=10), + ) + image = ax_topo.images[0] + + # remove the title that would otherwise say "0.000 s" + ax_topo.set_title("") + + # create additional axes (for ERF and colorbar) + divider = make_axes_locatable(ax_topo) + + # add axes for colorbar + ax_colorbar = divider.append_axes("right", size="5%", pad=0.05) + plt.colorbar(image, cax=ax_colorbar) + ax_topo.set_xlabel( + "Averaged t-map ({:0.3f} - {:0.3f} s)".format(*sig_times[[0, -1]]) + ) + + # add new axis for time courses and plot time courses + ax_signals = divider.append_axes("right", size="300%", pad=1.2) + title = f"Cluster #1, {len(ch_inds)} sensor" + if len(ch_inds) > 1: + title += "s (mean)" + mne.viz.plot_compare_evokeds( + evokeds, + title=title, + picks=ch_inds, + axes=ax_signals, + colors=colors, + show=False, + split_legend=True, + truncate_yaxis="auto", + ) + + # plot temporal cluster extent + ymin, ymax = ax_signals.get_ylim() + ax_signals.fill_betweenx( + (ymin, ymax), sig_times[0], sig_times[-1], color="green", alpha=0.3 + ) + + plt.show() + + return None From d59978f575842ef148e814679bae7c746c1e2b4a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 14 Jun 2024 12:24:30 +0000 Subject: [PATCH 02/88] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tutorials/stats-sensor-space/76_new_cluster_test_api.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 4e2b3af8f6d..2bdae528448 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -1,8 +1,10 @@ from pathlib import Path + import matplotlib.pyplot as plt import numpy as np import pandas as pd from mpl_toolkits.axes_grid1 import make_axes_locatable + import mne # eventually we want to use the _permutation_cluster_test function @@ -10,6 +12,7 @@ # import and load dataset path_to_p3 = Path("C:/Users/Carina/mne_data/ERP_CORE_P3") + def prep_sample_data(plot_evokeds: bool = False): """ Load the P3 dataset and extract the target, non-target and contrast evokeds. 
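
At this point the intended entry point of the prototype is already usable end to end: build a dataframe of per-subject evokeds plus condition and subject labels, then hand it to cluster_test(). The driver below is a minimal sketch, not part of any commit in this series; it assumes the PATCH 01 helpers (prep_sample_data, prepare_dataframe_for_cluster_function, cluster_test) are in scope and that the ERP CORE P3 evokeds live at the path hard-coded above. The integer subject labels are an arbitrary choice for illustration.

# Minimal driver for the PATCH 01 prototype (illustrative sketch, not committed code)
_, target_only, non_target_only = prep_sample_data()

# 1 = target, 0 = non-target; each subject contributes one evoked per condition
evokeds = target_only + non_target_only
conditions = [1] * len(target_only) + [0] * len(non_target_only)
subjects = list(range(len(target_only))) * 2

df = prepare_dataframe_for_cluster_function(
    contrast=True, evokeds=evokeds, condition=conditions, subject_index=subjects
)

# Same permutation count and seed as the old-API benchmark above,
# so the cluster p-values can be compared directly
T_obs, clusters, cluster_p_values, H0 = cluster_test(
    df, n_permutations=10000, seed=1234
)
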
From 2843905c57bf2fe841607c88adb054fbc6ec322a Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Fri, 14 Jun 2024 19:02:45 +0200 Subject: [PATCH 03/88] tested dataframe function and results, cleaned up --- .../76_new_cluster_test_api.py | 187 +++++++++--------- 1 file changed, 95 insertions(+), 92 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 4e2b3af8f6d..3f001251ba5 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -5,7 +5,6 @@ from mpl_toolkits.axes_grid1 import make_axes_locatable import mne -# eventually we want to use the _permutation_cluster_test function # import and load dataset path_to_p3 = Path("C:/Users/Carina/mne_data/ERP_CORE_P3") @@ -85,7 +84,6 @@ def old_api_cluster(n_permutations: int = 10000, seed: int = 1234): X=data, stat_fun=None, threshold=None, tail=0, kind="within" ) - # adjacency = mne.channels.find_ch_adjacency(contrast[0].info, ch_type='eeg') # Run the analysis T_obs, clusters, cluster_p_values, H0 = ( mne.stats.cluster_level._permutation_cluster_test( @@ -115,25 +113,15 @@ def old_api_cluster(n_permutations: int = 10000, seed: int = 1234): return T_obs, clusters, cluster_p_values, H0 - -# fit cluster test with dataframe as input -# create condition list that repeats 5times 1 and then 5 times 0 -# 1 = target, 0 = non-target -# condition = 5 * [1] + 5 * [0] - -# 1 = target, 0 = non-target -# contrast, target_only, non_target_only = prep_sample_data() - -# evokeds_list = target_only + non_target_only - - -def create_random_evokeds_id_condition_list(evoked_data_a: list, evoked_data_b: list): +def create_random_evokeds_id_condition_list(): """ Create a list of shuffled participant IDs, conditions, and evoked data. # Keep the participant IDs and conditions paired but shuffle the order of the evoked data. """ import random + _ , evoked_data_a, evoked_data_b = prep_sample_data() + # Example participant IDs participant_ids = ["p1", "p2", "p3", "p4", "p5"] * 2 @@ -162,42 +150,42 @@ def create_random_evokeds_id_condition_list(evoked_data_a: list, evoked_data_b: return shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data -def create_random_paired_evokeds_list(evoked_data_a: list, evoked_data_b: list): +def create_random_paired_evokeds_list(): """ Create a list of shuffled evoked data where each pair of target and non-target evoked data is shuffled together. 
""" import random + _, evoked_data_a, evoked_data_b = prep_sample_data() - # Create a list of tuples where each tuple contains an evoked data and its corresponding label - evoked_pairs = [(evoked, 1) for evoked in evoked_data_a] + [ - (evoked, 0) for evoked in evoked_data_b - ] + # Ensure evoked_data_a and evoked_data_b are of the same length + assert len(evoked_data_a) == len(evoked_data_b), "evoked_data_a and evoked_data_b must have the same length" + + # Create a list of participant indices + participant_indices = list(range(len(evoked_data_a))) - # Shuffle the list of tuples - random.shuffle(evoked_pairs) + # Shuffle the list of participant indices + random.shuffle(participant_indices) - # Separate the shuffled list back into evoked data and labels - shuffled_evoked_data, shuffled_labels = zip(*evoked_pairs) + # Reorder evoked data according to the shuffled participant indices + shuffled_evoked_data_a = [evoked_data_a[i] for i in participant_indices] + shuffled_evoked_data_b = [evoked_data_b[i] for i in participant_indices] - # Convert the tuples back to lists - shuffled_evoked_data = list(shuffled_evoked_data) + # Combine the shuffled evoked data into a single list + shuffled_evoked_data = shuffled_evoked_data_a + shuffled_evoked_data_b + + # Combine the original evoked data into a single list + original_evoked_data = evoked_data_a + evoked_data_b - return shuffled_evoked_data + return original_evoked_data, shuffled_evoked_data # shuffle order of pairs -shuffled_evokeds_list = create_random_paired_evokeds_list(target_only, non_target_only) +original_evoked_data, shuffled_evoked_data = create_random_paired_evokeds_list() # shouldn't change the results (p-value is different though?) -shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data = ( - create_random_evokeds_id_condition_list( - evoked_data_a=target_only, evoked_data_b=non_target_only - ) -) - +shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data = create_random_evokeds_id_condition_list() def prepare_dataframe_for_cluster_function( - contrast: bool = False, evokeds: list = None, condition: list = None, subject_index: list = None, @@ -216,29 +204,39 @@ def prepare_dataframe_for_cluster_function( subject_index : list, optional List of subject IDs. Default is None. + Returns + ------- + df : DataFrame + The prepared DataFrame for the cluster test function. 
""" - # create an empty dataframe - df = pd.DataFrame() - - if contrast == True: - # check if evoked list is dividable by 2 - if len(evokeds) % 2 != 0: - raise ValueError("evokeds list needs to be dividable by 2") - if condition is not None: - # Convert lists to DataFrame for easier manipulation - df = pd.DataFrame( - { - "evoked": evokeds, - "condition": condition, - "subject_index": subject_index, - } - ) - - return df + # Initialize the DataFrame with evoked data + df = pd.DataFrame({ + "evoked": evokeds, + "condition": condition if condition is not None else np.nan, + "subject_index": subject_index if subject_index is not None else np.nan + }) + + return df +# run with original data +df = prepare_dataframe_for_cluster_function(evokeds=original_evoked_data, + condition=None, + subject_index=None) + +df = prepare_dataframe_for_cluster_function(evokeds=shuffled_evoked_data, + condition=None, + subject_index=None) + +df = prepare_dataframe_for_cluster_function(evokeds=shuffled_evoked_data, + condition=shuffled_conditions, + subject_index=shuffled_participant_ids) + + +cluster_test(df) def cluster_test( df: pd.DataFrame, + contrast: bool = True, n_permutations: int = 10000, seed: int = 1234, contrast_weights: list = [1, -1], @@ -267,43 +265,47 @@ def cluster_test( H0 : array The permuted test statistics. """ - if df.condition is not None: - # Extract unique conditions - unique_conditions = np.unique(df.condition) - if len(unique_conditions) != 2: - raise ValueError("Condition list needs to contain 2 unique values") - if df.subject_index is not None: + # Check if conditions and subject_index are present and valid + conditions_present = pd.notna(df['condition']).all() + subject_index_present = pd.notna(df['subject_index']).all() + + if contrast == 1: + if conditions_present: + # Extract unique conditions + unique_conditions = np.unique(df.condition) + if len(unique_conditions) != 2: + raise ValueError("Condition list needs to contain 2 unique values") # Initialize a list to hold the combined evoked data evokeds_data = [] - - # Process each subject's evoked data - for sub_id in df.subject_index.unique(): - sub_df = df[df.subject_index == sub_id] - - # Split evokeds list based on condition list for this subject - evokeds_a = sub_df[sub_df.condition == unique_conditions[0]][ - "evoked" - ].tolist() - evokeds_b = sub_df[sub_df.condition == unique_conditions[1]][ - "evoked" - ].tolist() - - if len(evokeds_a) != 1 or len(evokeds_b) != 1: - raise ValueError( - f"Subject {sub_id}: Each subject must have exactly one evoked for each condition" + if subject_index_present: + # Process each subject's evoked data + for sub_id in df.subject_index.unique(): + sub_df = df[df.subject_index == sub_id] + + # Split evokeds list based on condition list for this subject + evokeds_a = sub_df[sub_df.condition == unique_conditions[0]][ + "evoked" + ].tolist() + evokeds_b = sub_df[sub_df.condition == unique_conditions[1]][ + "evoked" + ].tolist() + + if len(evokeds_a) != 1 or len(evokeds_b) != 1: + raise ValueError( + f"Subject {sub_id}: Each subject must have exactly one evoked for each condition" + ) + + # Calculate contrast based on condition list + diff_evoked = mne.combine_evoked( + [evokeds_a[0], evokeds_b[0]], weights=contrast_weights ) - - # Calculate contrast based on condition list - diff_evoked = mne.combine_evoked( - [evokeds_a[0], evokeds_b[0]], weights=contrast_weights - ) - evokeds_data.append(diff_evoked) + evokeds_data.append(diff_evoked) else: # calculate length of evokeds list - n_evokeds = 
len(df.evokeds) + n_evokeds = len(df.evoked) # now split evokeds list in two lists - evokeds_a = df.evokeds[: n_evokeds // 2] - evokeds_b = df.evokeds[n_evokeds // 2 :] + evokeds_a = df.evoked[: n_evokeds // 2] + evokeds_b = df.evoked[n_evokeds // 2 :] # create contrast from evokeds_a and evokeds_b diff_evoked = [ mne.combine_evoked([evo_a, evo_b], weights=contrast_weights) @@ -311,7 +313,7 @@ def cluster_test( ] evokeds_data = diff_evoked else: - evokeds_data = df.evokeds + evokeds_data = df.evoked.tolist() # extract number of channels n_channels = evokeds_data[0].info["nchan"] @@ -330,19 +332,20 @@ def cluster_test( X=data, stat_fun=None, threshold=None, tail=0, kind="within" ) + # Run the analysis T_obs, clusters, cluster_p_values, H0 = ( mne.stats.cluster_level._permutation_cluster_test( [data], threshold=threshold, stat_fun=stat_fun, - n_jobs=-1, - max_step=1, - exclude=None, - step_down_p=0.05, - t_power=1, + n_jobs=-1, # takes all CPU cores + max_step=1, # maximum distance between samples (time points) + exclude=None, # exclude no time points or channels + step_down_p=0, # step down in jumps test + t_power=1, # weigh each location by its stats score out_type="indices", - check_disjoint=True, - buffer_size=None, + check_disjoint=False, + buffer_size=None, # block size for chunking the data n_permutations=n_permutations, tail=0, adjacency=adjacency, From fa5b215ded34da56ef72bccd8dd3fd6290c8fe2f Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Fri, 14 Jun 2024 19:04:48 +0200 Subject: [PATCH 04/88] added ToDos --- tutorials/stats-sensor-space/76_new_cluster_test_api.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 5d943985aa2..51ad611aa58 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -7,6 +7,8 @@ import mne +# TODO: implement formulaic design matrix for paired t-test +# TODO: @erik: add dataset to mne-data # import and load dataset path_to_p3 = Path("C:/Users/Carina/mne_data/ERP_CORE_P3") @@ -235,8 +237,6 @@ def prepare_dataframe_for_cluster_function( subject_index=shuffled_participant_ids) -cluster_test(df) - def cluster_test( df: pd.DataFrame, contrast: bool = True, @@ -471,3 +471,5 @@ def plot_cluster( plt.show() return None + +cluster_test(df) \ No newline at end of file From 1a1511ddec91aeea543b2e3c671b077d2711ef7e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 14 Jun 2024 17:04:48 +0000 Subject: [PATCH 05/88] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../76_new_cluster_test_api.py | 54 +++++++++++-------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 5d943985aa2..3396e3137ff 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -7,7 +7,6 @@ import mne - # import and load dataset path_to_p3 = Path("C:/Users/Carina/mne_data/ERP_CORE_P3") @@ -116,6 +115,7 @@ def old_api_cluster(n_permutations: int = 10000, seed: int = 1234): return T_obs, clusters, cluster_p_values, H0 + def create_random_evokeds_id_condition_list(): """ Create a list of shuffled participant IDs, conditions, and evoked data. 
@@ -123,7 +123,7 @@ def create_random_evokeds_id_condition_list(): """ import random - _ , evoked_data_a, evoked_data_b = prep_sample_data() + _, evoked_data_a, evoked_data_b = prep_sample_data() # Example participant IDs participant_ids = ["p1", "p2", "p3", "p4", "p5"] * 2 @@ -158,11 +158,14 @@ def create_random_paired_evokeds_list(): Create a list of shuffled evoked data where each pair of target and non-target evoked data is shuffled together. """ import random + _, evoked_data_a, evoked_data_b = prep_sample_data() # Ensure evoked_data_a and evoked_data_b are of the same length - assert len(evoked_data_a) == len(evoked_data_b), "evoked_data_a and evoked_data_b must have the same length" - + assert len(evoked_data_a) == len( + evoked_data_b + ), "evoked_data_a and evoked_data_b must have the same length" + # Create a list of participant indices participant_indices = list(range(len(evoked_data_a))) @@ -186,7 +189,10 @@ def create_random_paired_evokeds_list(): original_evoked_data, shuffled_evoked_data = create_random_paired_evokeds_list() # shouldn't change the results (p-value is different though?) -shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data = create_random_evokeds_id_condition_list() +shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data = ( + create_random_evokeds_id_condition_list() +) + def prepare_dataframe_for_cluster_function( evokeds: list = None, @@ -213,30 +219,36 @@ def prepare_dataframe_for_cluster_function( The prepared DataFrame for the cluster test function. """ # Initialize the DataFrame with evoked data - df = pd.DataFrame({ - "evoked": evokeds, - "condition": condition if condition is not None else np.nan, - "subject_index": subject_index if subject_index is not None else np.nan - }) + df = pd.DataFrame( + { + "evoked": evokeds, + "condition": condition if condition is not None else np.nan, + "subject_index": subject_index if subject_index is not None else np.nan, + } + ) return df + # run with original data -df = prepare_dataframe_for_cluster_function(evokeds=original_evoked_data, - condition=None, - subject_index=None) +df = prepare_dataframe_for_cluster_function( + evokeds=original_evoked_data, condition=None, subject_index=None +) -df = prepare_dataframe_for_cluster_function(evokeds=shuffled_evoked_data, - condition=None, - subject_index=None) +df = prepare_dataframe_for_cluster_function( + evokeds=shuffled_evoked_data, condition=None, subject_index=None +) -df = prepare_dataframe_for_cluster_function(evokeds=shuffled_evoked_data, - condition=shuffled_conditions, - subject_index=shuffled_participant_ids) +df = prepare_dataframe_for_cluster_function( + evokeds=shuffled_evoked_data, + condition=shuffled_conditions, + subject_index=shuffled_participant_ids, +) cluster_test(df) + def cluster_test( df: pd.DataFrame, contrast: bool = True, @@ -269,8 +281,8 @@ def cluster_test( The permuted test statistics. 
""" # Check if conditions and subject_index are present and valid - conditions_present = pd.notna(df['condition']).all() - subject_index_present = pd.notna(df['subject_index']).all() + conditions_present = pd.notna(df["condition"]).all() + subject_index_present = pd.notna(df["subject_index"]).all() if contrast == 1: if conditions_present: From a12cf951fda22bed14ffd18dd9f67e30627f5b00 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 14 Jun 2024 20:51:19 +0000 Subject: [PATCH 06/88] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tutorials/stats-sensor-space/76_new_cluster_test_api.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index a1630dcd8ee..011c2f69d7f 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -251,6 +251,7 @@ def prepare_dataframe_for_cluster_function( cluster_test(df) + def cluster_test( df: pd.DataFrame, contrast: bool = True, @@ -486,4 +487,5 @@ def plot_cluster( return None -cluster_test(df) \ No newline at end of file + +cluster_test(df) From 45ce63a75a1fbc5a3676a924fb181bb0e7e7e3f7 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Wed, 19 Jun 2024 19:28:07 +0200 Subject: [PATCH 07/88] added formula support and implemented suggestions --- .../76_new_cluster_test_api.py | 54 +++++++++++++++---- 1 file changed, 45 insertions(+), 9 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 011c2f69d7f..eef90a2612b 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -225,8 +225,8 @@ def prepare_dataframe_for_cluster_function( df = pd.DataFrame( { "evoked": evokeds, - "condition": condition if condition is not None else np.nan, - "subject_index": subject_index if subject_index is not None else np.nan, + "condition": condition if condition is not None else pd.NA, + "subject_index": subject_index if subject_index is not None else pd.NA, } ) @@ -249,15 +249,13 @@ def prepare_dataframe_for_cluster_function( ) -cluster_test(df) - - def cluster_test( df: pd.DataFrame, - contrast: bool = True, + formula: str = None, # Wilkinson notation formula for design matrix + contrast: bool = True, # will be replaced by formulaic design matrix n_permutations: int = 10000, - seed: int = 1234, - contrast_weights: list = [1, -1], + seed: None | int | np.random.RandomState = None, + contrast_weights: list = [1, -1], # will be replaced by formulaic design matrix ): """ Run the cluster test using the new API. @@ -287,6 +285,22 @@ def cluster_test( conditions_present = pd.notna(df["condition"]).all() subject_index_present = pd.notna(df["subject_index"]).all() + # add a data column to the dataframe (numpy array) + df["data"] = [evoked.data for evoked in df.evoked] + + # convert wide format to long format + df_long = convert_wide_to_long(df) + + # check if formula is present + if formula is not None: + import formulaic + + # create design matrix based on formula + # Create the design matrix using formulaic + y, X = formulaic.model_matrix(formula, df_long) + + # what to do with the design matrix? 
+ if contrast == 1: if conditions_present: # Extract unique conditions @@ -381,6 +395,29 @@ def cluster_test( return T_obs, clusters, cluster_p_values, H0 +# Convert wide format to long format +def convert_wide_to_long(df): + long_format_data = [] + for idx, row in df.iterrows(): + condition = row['condition'] + subject_index = row['subject_index'] + data_2d = row['data'] + + for channel in range(data_2d.shape[0]): + for timepoint in range(data_2d.shape[1]): + long_format_data.append({ + 'condition': condition, + 'subject_index': subject_index, + 'channel': channel, + 'timepoint': timepoint, + 'value': data_2d[channel, timepoint] + }) + + df_long = pd.DataFrame(long_format_data) + return df_long + +df_long = convert_wide_to_long(df) + def plot_cluster( contrast, target_only, non_target_only, T_obs, clusters, cluster_p_values @@ -485,7 +522,6 @@ def plot_cluster( plt.show() - return None cluster_test(df) From 2b7bae8cae58d9ee370edd48a1642e7f28a73aa8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 19 Jun 2024 17:28:23 +0000 Subject: [PATCH 08/88] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../76_new_cluster_test_api.py | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index eef90a2612b..7c0abc95fae 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -251,11 +251,11 @@ def prepare_dataframe_for_cluster_function( def cluster_test( df: pd.DataFrame, - formula: str = None, # Wilkinson notation formula for design matrix - contrast: bool = True, # will be replaced by formulaic design matrix + formula: str = None, # Wilkinson notation formula for design matrix + contrast: bool = True, # will be replaced by formulaic design matrix n_permutations: int = 10000, seed: None | int | np.random.RandomState = None, - contrast_weights: list = [1, -1], # will be replaced by formulaic design matrix + contrast_weights: list = [1, -1], # will be replaced by formulaic design matrix ): """ Run the cluster test using the new API. 
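
The formula argument surfaced in the signature above is a Wilkinson-notation string that PATCH 07 passes directly to formulaic.model_matrix together with the long-format frame from convert_wide_to_long. A toy illustration of that one step follows; it is a sketch only: the numbers are invented, the column names mirror convert_wide_to_long, and formulaic must be installed.

import formulaic
import pandas as pd

# Two subjects x two conditions at a single channel/timepoint, long format;
# the real frame also carries "channel" and "timepoint" columns
df_long = pd.DataFrame(
    {
        "value": [1.2, 0.4, 0.9, 0.1],
        "condition": [1, 0, 1, 0],
        "subject_index": ["p1", "p1", "p2", "p2"],
    }
)

# A two-sided formula splits into outcome (lhs) and design matrix (rhs):
# X gets an Intercept column plus the numeric condition regressor,
# while y holds the "value" column
y, X = formulaic.model_matrix("value ~ condition", df_long)
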
@@ -395,27 +395,31 @@ def cluster_test( return T_obs, clusters, cluster_p_values, H0 + # Convert wide format to long format def convert_wide_to_long(df): long_format_data = [] for idx, row in df.iterrows(): - condition = row['condition'] - subject_index = row['subject_index'] - data_2d = row['data'] - + condition = row["condition"] + subject_index = row["subject_index"] + data_2d = row["data"] + for channel in range(data_2d.shape[0]): for timepoint in range(data_2d.shape[1]): - long_format_data.append({ - 'condition': condition, - 'subject_index': subject_index, - 'channel': channel, - 'timepoint': timepoint, - 'value': data_2d[channel, timepoint] - }) - + long_format_data.append( + { + "condition": condition, + "subject_index": subject_index, + "channel": channel, + "timepoint": timepoint, + "value": data_2d[channel, timepoint], + } + ) + df_long = pd.DataFrame(long_format_data) return df_long + df_long = convert_wide_to_long(df) @@ -523,5 +527,4 @@ def plot_cluster( plt.show() - cluster_test(df) From 38834baeb64460c885988f85f5175f0ff8cdd84b Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sat, 22 Jun 2024 11:10:13 +0200 Subject: [PATCH 09/88] fixed linting errors --- .../76_new_cluster_test_api.py | 35 +++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 7c0abc95fae..2f1d55383d2 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -6,6 +6,7 @@ from mpl_toolkits.axes_grid1 import make_axes_locatable import mne +from mne.utils import _soft_import_ # TODO: implement formulaic design matrix for paired t-test # TODO: @erik: add dataset to mne-data @@ -15,9 +16,7 @@ def prep_sample_data(plot_evokeds: bool = False): - """ - Load the P3 dataset and extract the target, non-target and contrast evokeds. - """ + """Load the P3 dataset.""" # Define the range of participant IDs participant_ids = range(15, 20) # This will cover 015 to 019 @@ -25,7 +24,7 @@ def prep_sample_data(plot_evokeds: bool = False): # Loop over each participant ID and generate the corresponding filename for pid in participant_ids: - # Create the filename using an f-string, ensuring the participant ID is zero-padded to 3 digits + # Create the filename using an f-string, ID is zero-padded to 3 digits filename_p3 = f"sub-{pid:03d}_ses-P3_task-P3_ave.fif" # Print the filename (or perform your desired operations on it) @@ -67,7 +66,8 @@ def prep_sample_data(plot_evokeds: bool = False): def old_api_cluster(n_permutations: int = 10000, seed: int = 1234): """ - Run the cluster test using the old API to get a bechmark result for the new API. + Run the cluster test using the old API to get a benchmark result for the new API. + Currently implementing a paired t-test with contrast between participants. """ contrast, target_only, non_target_only = prep_sample_data() @@ -122,7 +122,8 @@ def old_api_cluster(n_permutations: int = 10000, seed: int = 1234): def create_random_evokeds_id_condition_list(): """ Create a list of shuffled participant IDs, conditions, and evoked data. - # Keep the participant IDs and conditions paired but shuffle the order of the evoked data. + + # Keep the participant IDs and conditions paired but shuffle the order of the data. 
""" import random @@ -158,7 +159,10 @@ def create_random_evokeds_id_condition_list(): def create_random_paired_evokeds_list(): """ - Create a list of shuffled evoked data where each pair of target and non-target evoked data is shuffled together. + Create shuffled paired evoked data. + + Create a list of shuffled evoked data where each pair of target + and non-target evoked data is shuffled together. """ import random @@ -255,10 +259,11 @@ def cluster_test( contrast: bool = True, # will be replaced by formulaic design matrix n_permutations: int = 10000, seed: None | int | np.random.RandomState = None, - contrast_weights: list = [1, -1], # will be replaced by formulaic design matrix + contrast_weights: list = (1, -1), # will be replaced by formulaic design matrix ): """ Run the cluster test using the new API. + # currently supports paired t-test with contrast or with list of conditions Parameters @@ -293,12 +298,14 @@ def cluster_test( # check if formula is present if formula is not None: - import formulaic + formulaic = _soft_import_("formulaic") # soft import # create design matrix based on formula # Create the design matrix using formulaic y, X = formulaic.model_matrix(formula, df_long) + # sign flip for paired t-test + # what to do with the design matrix? if contrast == 1: @@ -324,7 +331,7 @@ def cluster_test( if len(evokeds_a) != 1 or len(evokeds_b) != 1: raise ValueError( - f"Subject {sub_id}: Each subject must have exactly one evoked for each condition" + f"Subject {sub_id}: subject must have one evoked per cond" ) # Calculate contrast based on condition list @@ -398,6 +405,14 @@ def cluster_test( # Convert wide format to long format def convert_wide_to_long(df): + """ + Convert a DataFrame from wide to long. + + Parameters + ---------- + df : pd.DataFrame + DataFrame in wide format. 
+ """ long_format_data = [] for idx, row in df.iterrows(): condition = row["condition"] From c00859f79ecfcc889fb84a13bc0c1632a59b92c7 Mon Sep 17 00:00:00 2001 From: Eric Larson Date: Tue, 25 Jun 2024 12:25:21 -0400 Subject: [PATCH 10/88] ENH: Add dataset [skip azp] [skip actions] --- mne/datasets/config.py | 4 ++-- pyproject.toml | 3 +++ tutorials/stats-sensor-space/76_new_cluster_test_api.py | 7 +++---- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/mne/datasets/config.py b/mne/datasets/config.py index a2f2d7781b7..be6c4c49f70 100644 --- a/mne/datasets/config.py +++ b/mne/datasets/config.py @@ -90,7 +90,7 @@ # here: ↓↓↓↓↓↓↓↓ RELEASES = dict( testing="0.152", - misc="0.27", + misc="0.30", phantom_kit="0.2", ucl_opm_auditory="0.2", ) @@ -131,7 +131,7 @@ ) MNE_DATASETS["misc"] = dict( archive_name=f"{MISC_VERSIONED}.tar.gz", # 'mne-misc-data', - hash="md5:e343d3a00cb49f8a2f719d14f4758afe", + hash="md5:201d35531d3c03701cf50e38bb73481f", url=( "https://codeload.github.com/mne-tools/mne-misc-data/tar.gz/" f'{RELEASES["misc"]}' diff --git a/pyproject.toml b/pyproject.toml index 93bfb4abead..6c909263bcb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -107,6 +107,7 @@ full-no-qt = [ "snirf", "defusedxml", "neo", + "formulaic", ] full = ["mne[full-no-qt]", "PyQt6!=6.6.0", "PyQt6-Qt6!=6.6.0,!=6.7.0"] full-pyqt6 = ["mne[full]"] @@ -145,6 +146,7 @@ test_extra = [ "snirf", "neo", "mne-bids", + "formulaic", ] # Dependencies for building the documentation @@ -157,6 +159,7 @@ doc = [ "sphinxcontrib-towncrier", "memory_profiler", "neo", + "formulaic", "seaborn!=0.11.2", "sphinx_copybutton", "sphinx-design", diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 2f1d55383d2..8eb7637df53 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -6,13 +6,12 @@ from mpl_toolkits.axes_grid1 import make_axes_locatable import mne -from mne.utils import _soft_import_ +from mne.utils import _soft_import # TODO: implement formulaic design matrix for paired t-test -# TODO: @erik: add dataset to mne-data # import and load dataset -path_to_p3 = Path("C:/Users/Carina/mne_data/ERP_CORE_P3") +path_to_p3 = mne.datasets.misc.data_path() / "ERP_CORE" / "P3" def prep_sample_data(plot_evokeds: bool = False): @@ -298,7 +297,7 @@ def cluster_test( # check if formula is present if formula is not None: - formulaic = _soft_import_("formulaic") # soft import + formulaic = _soft_import("formulaic") # soft import # create design matrix based on formula # Create the design matrix using formulaic From 9c8ec900cf1ac02b13e1b8fdedeca05a8897a882 Mon Sep 17 00:00:00 2001 From: Eric Larson Date: Tue, 25 Jun 2024 12:26:57 -0400 Subject: [PATCH 11/88] FIX: One more [skip azp] [skip actions] --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index cc2f8e752d5..71cd307cca0 100644 --- a/environment.yml +++ b/environment.yml @@ -64,3 +64,4 @@ dependencies: - lazy_loader - defusedxml - python-neo + - formulaic From 47363b539250b63a654e05a3a3aebc0e89ac8b4d Mon Sep 17 00:00:00 2001 From: Eric Larson Date: Tue, 25 Jun 2024 12:39:41 -0400 Subject: [PATCH 12/88] FIX: Title [skip azp] [skip actions] --- .../stats-sensor-space/76_new_cluster_test_api.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 
8eb7637df53..f9c4f61ad5f 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -1,3 +1,15 @@ +""" +.. _tut-new-cluster-test-api: + +==================== +New cluster test API +==================== + +This tutorial shows how to use the new API for cluster testing. +""" +# License: BSD-3-Clause +# Copyright the MNE-Python contributors. + from pathlib import Path import matplotlib.pyplot as plt From 1f6221dccedc679b8af651420dbbb2068037eb26 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sun, 30 Jun 2024 20:11:28 +0200 Subject: [PATCH 13/88] first draft of formulaic paired t-test --- .../76_new_cluster_test_api.py | 342 ++++++++++++------ 1 file changed, 224 insertions(+), 118 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index f9c4f61ad5f..6a3a966bbcc 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -15,12 +15,13 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd +import scipy from mpl_toolkits.axes_grid1 import make_axes_locatable import mne from mne.utils import _soft_import -# TODO: implement formulaic design matrix for paired t-test +# TODO: test function and update docstrings # import and load dataset path_to_p3 = mne.datasets.misc.data_path() / "ERP_CORE" / "P3" @@ -248,15 +249,6 @@ def prepare_dataframe_for_cluster_function( return df -# run with original data -df = prepare_dataframe_for_cluster_function( - evokeds=original_evoked_data, condition=None, subject_index=None -) - -df = prepare_dataframe_for_cluster_function( - evokeds=shuffled_evoked_data, condition=None, subject_index=None -) - df = prepare_dataframe_for_cluster_function( evokeds=shuffled_evoked_data, condition=shuffled_conditions, @@ -267,24 +259,56 @@ def prepare_dataframe_for_cluster_function( def cluster_test( df: pd.DataFrame, formula: str = None, # Wilkinson notation formula for design matrix - contrast: bool = True, # will be replaced by formulaic design matrix n_permutations: int = 10000, seed: None | int | np.random.RandomState = None, - contrast_weights: list = (1, -1), # will be replaced by formulaic design matrix + tail: int = 0, # 0 for two-tailed, 1 for greater, -1 for less + n_jobs: int = 1, # how many cores to use + adjacency: tuple = None, + max_step: int = 1, # maximum distance between samples (time points) + exclude: list = None, # exclude no time points or channels + step_down_p: int = 0, # step down in jumps test + t_power: int = 1, # weigh each location by its stats score + out_type: str = "indices", + check_disjoint: bool = False, + buffer_size: int = None, # block size for chunking the data ): """ Run the cluster test using the new API. - # currently supports paired t-test with contrast or with list of conditions + # currently supports paired t-test Parameters ---------- dataframe : pd.DataFrame Dataframe with evoked data, conditions and subject IDs. + formula : str, optional + Wilkinson notation formula for design matrix. Default is None. n_permutations : int, optional Number of permutations. Default is 10000. + seed : None | int | np.random.RandomState, optional + Seed for the random number generator. Default is None. + tail : int, optional + 0 for two-tailed, 1 for greater, -1 for less. Default is 0. + n_jobs : int, optional + How many cores to use. Default is 1. + adjacency : None, optional + Adjacency matrix. 
Default is None. + max_step : int, optional + Maximum distance between samples (time points). Default is 1. + exclude : np.Array, optional + Exclude no time points or channels. Default is None. + step_down_p : int, optional + Step down in jumps test. Default is 0. + t_power : int, optional + Weigh each location by its stats score. Default is 1. + out_type : str, optional + Output type. Default is "indices". + check_disjoint : bool, optional + Check if clusters are disjoint. Default is False. + buffer_size : int, optional + Block size for chunking the data. Default is None. seed : int, optional - Random seed. Default is 1234. + Seed for the random number generator. Default is None. Returns ------- @@ -297,108 +321,78 @@ def cluster_test( H0 : array The permuted test statistics. """ - # Check if conditions and subject_index are present and valid - conditions_present = pd.notna(df["condition"]).all() - subject_index_present = pd.notna(df["subject_index"]).all() - + # for now this assumes a dataframe with a column for evoked data # add a data column to the dataframe (numpy array) df["data"] = [evoked.data for evoked in df.evoked] - # convert wide format to long format - df_long = convert_wide_to_long(df) + # extract number of channels and timepoints + # (eventually should also allow for frequency) + n_channels, n_timepoints = df["data"][0].shape + + # convert wide format to long format for formulaic + df_long = unpack_time_and_channels(df) + + # Pivot the DataFrame + pivot_df = df_long.pivot_table( + index=["subject_index", "channel", "timepoint"], + columns="condition", + values="value", + ).reset_index() + + # if not 2 unique conditions raise error + if len(pd.unique(df.condition)) != 2: + raise ValueError("Condition list needs to contain 2 unique values") + + # Compute the difference (assuming there are only 2 conditions) + pivot_df["y"] = pivot_df[0] - pivot_df[1] + + # Optional: Clean up the DataFrame + pivot_df = pivot_df[["subject_index", "channel", "timepoint", "y"]] # check if formula is present if formula is not None: - formulaic = _soft_import("formulaic") # soft import + formulaic = _soft_import( + "formulaic", purpose="set up Design Matrix" + ) # soft import (not a dependency for MNE) - # create design matrix based on formula + # for the paired t-test y is the difference between conditions + # X is the design matrix with a column with 1s and 0s for each participant # Create the design matrix using formulaic - y, X = formulaic.model_matrix(formula, df_long) - - # sign flip for paired t-test - - # what to do with the design matrix? 
- - if contrast == 1: - if conditions_present: - # Extract unique conditions - unique_conditions = np.unique(df.condition) - if len(unique_conditions) != 2: - raise ValueError("Condition list needs to contain 2 unique values") - # Initialize a list to hold the combined evoked data - evokeds_data = [] - if subject_index_present: - # Process each subject's evoked data - for sub_id in df.subject_index.unique(): - sub_df = df[df.subject_index == sub_id] - - # Split evokeds list based on condition list for this subject - evokeds_a = sub_df[sub_df.condition == unique_conditions[0]][ - "evoked" - ].tolist() - evokeds_b = sub_df[sub_df.condition == unique_conditions[1]][ - "evoked" - ].tolist() - - if len(evokeds_a) != 1 or len(evokeds_b) != 1: - raise ValueError( - f"Subject {sub_id}: subject must have one evoked per cond" - ) - - # Calculate contrast based on condition list - diff_evoked = mne.combine_evoked( - [evokeds_a[0], evokeds_b[0]], weights=contrast_weights - ) - evokeds_data.append(diff_evoked) - else: - # calculate length of evokeds list - n_evokeds = len(df.evoked) - # now split evokeds list in two lists - evokeds_a = df.evoked[: n_evokeds // 2] - evokeds_b = df.evoked[n_evokeds // 2 :] - # create contrast from evokeds_a and evokeds_b - diff_evoked = [ - mne.combine_evoked([evo_a, evo_b], weights=contrast_weights) - for evo_a, evo_b in zip(evokeds_a, evokeds_b) - ] - evokeds_data = diff_evoked + y, X = formulaic.model_matrix(formula, pivot_df) else: - evokeds_data = df.evoked.tolist() - - # extract number of channels - n_channels = evokeds_data[0].info["nchan"] - - # loop over rows and extract data from evokeds - data_array = np.array([evoked.data for evoked in evokeds_data]) + raise ValueError( + "Formula is required and needs to be a string in Wilkinson notation." 
+ ) - # find the dimension that is equal to n_channels - if data_array.shape[1] == n_channels: - # reshape to channels as last dimension - data = data_array.transpose(0, 2, 1) + # now prep design matrix outcome variable for input into MNE cluster function + # we initially had first channels, then timepoints, + # now we need first timepoints, then channels + y_for_cluster = y.values.reshape(-1, n_channels, n_timepoints).transpose(0, 2, 1) - adjacency, _ = mne.channels.find_ch_adjacency(evokeds_data[0].info, ch_type="eeg") + adjacency, _ = mne.channels.find_ch_adjacency(df["evoked"][0].info, ch_type="eeg") + # define stat function and threshold stat_fun, threshold = mne.stats.cluster_level._check_fun( - X=data, stat_fun=None, threshold=None, tail=0, kind="within" + X=y_for_cluster, stat_fun=None, threshold=None, tail=0, kind="within" ) - # Run the analysis + # Run the cluster-based permutation test T_obs, clusters, cluster_p_values, H0 = ( mne.stats.cluster_level._permutation_cluster_test( - [data], + [y_for_cluster], + n_permutations=10000, threshold=threshold, stat_fun=stat_fun, - n_jobs=-1, # takes all CPU cores - max_step=1, # maximum distance between samples (time points) - exclude=None, # exclude no time points or channels - step_down_p=0, # step down in jumps test - t_power=1, # weigh each location by its stats score - out_type="indices", - check_disjoint=False, - buffer_size=None, # block size for chunking the data - n_permutations=n_permutations, - tail=0, + tail=tail, + n_jobs=n_jobs, adjacency=adjacency, + max_step=max_step, # maximum distance between samples (time points) + exclude=exclude, # exclude no time points or channels + step_down_p=step_down_p, # step down in jumps test + t_power=t_power, # weigh each location by its stats score + out_type=out_type, + check_disjoint=check_disjoint, + buffer_size=buffer_size, # block size for chunking the data seed=seed, ) ) @@ -414,39 +408,44 @@ def cluster_test( return T_obs, clusters, cluster_p_values, H0 -# Convert wide format to long format -def convert_wide_to_long(df): +def unpack_time_and_channels(df): """ - Convert a DataFrame from wide to long. + Extract the time and channel data from the DataFrame. Parameters ---------- df : pd.DataFrame DataFrame in wide format. 
""" - long_format_data = [] - for idx, row in df.iterrows(): - condition = row["condition"] - subject_index = row["subject_index"] - data_2d = row["data"] - - for channel in range(data_2d.shape[0]): - for timepoint in range(data_2d.shape[1]): - long_format_data.append( - { - "condition": condition, - "subject_index": subject_index, - "channel": channel, - "timepoint": timepoint, - "value": data_2d[channel, timepoint], - } - ) + # Extracting all necessary data using list comprehensions for better performance + long_format_data = [ + { + "condition": row["condition"], + "subject_index": row["subject_index"], + "channel": channel, + "timepoint": timepoint, + "value": row["data"][channel, timepoint], + } + for idx, row in df.iterrows() + for channel in range(row["data"].shape[0]) + for timepoint in range(row["data"].shape[1]) + ] + # Creating the long format DataFrame df_long = pd.DataFrame(long_format_data) + return df_long -df_long = convert_wide_to_long(df) +# Example usage +# Sample wide format DataFrame +df_wide = pd.DataFrame( + { + "condition": ["A", "B"], + "subject_index": [1, 2], + "data": [np.array([[1, 2, 3], [4, 5, 6]]), np.array([[7, 8, 9], [10, 11, 12]])], + } +) def plot_cluster( @@ -553,4 +552,111 @@ def plot_cluster( plt.show() -cluster_test(df) +# translated the limo permutation ttest from matlab to python +def limo_ttest_permute(Data, n_perm=None): + """ + Pseudo one-sample t-test using sign-test with permutations. + + Parameters + ---------- + Data (numpy.ndarray): A matrix of data for the one-sample t-test. + Shape can be (n_channels, n_var, n_obs) or + (n_var, n_obs). + n_perm (int, optional): Number of permutations to perform. + If None, it defaults based on the number of observations. + + Returns + ------- + t_vals (numpy.ndarray): t-values under H0. + p_vals (numpy.ndarray): p-values under H0. + dfe (int): Degrees of freedom. + """ + # Check inputs and reshape if necessary + if Data.ndim == 3: + n_channels, n_var, n_obs = Data.shape + else: + n_channels = 1 + n_var, n_obs = Data.shape + Data = Data[np.newaxis, ...] + + # Warn if the number of observations is very small + if n_obs < 7: + n_psbl_prms = 2**n_obs + print( + f"Due to the very limited number of observations, " + f"the total number of possible permutations is small ({n_psbl_prms}). " + "Thus, only a limited number of p-values are possible " + "and the test might be overly conservative." + ) + + # Set up permutation test + if n_obs <= 12: + n_perm = 2**n_obs # total number of possible permutations + exact = True + print( + "Due to the limited number of observations, all possible permutations " + "of the data will be computed instead of random permutations." 
+ ) + else: + exact = False + if n_perm is None: + n_perm = 1000 + + print(f"Executing permutation test with {n_perm} permutations...") + + # Initialize variables + t_vals = np.full( + (n_channels, n_var, n_perm), np.nan + ) # Array to store t-values for each permutation + sqrt_nXnM1 = np.sqrt( + n_obs * (n_obs - 1) + ) # Precompute constant for t-value calculation + dfe = n_obs - 1 # Degrees of freedom + + if exact: + # Use all possible permutations + for perm in range(n_perm): + # Set sign of each trial / participant's data + temp = np.array( + [int(x) for x in bin(perm)[2:].zfill(n_obs)] + ) # Convert perm index to binary array + sn = np.where(temp == 0, -1, 1) # Map 0 to -1 and 1 to 1 + sn_mtrx = np.tile(sn, (n_var, 1)).T # Repeat sn for each variable + + for c in range(n_channels): + data = Data[c, :, :] + d_perm = data * sn_mtrx # Apply sign flip to data + + # Compute t-score of permuted data + sm = np.sum(d_perm, axis=1) # Sum of permuted data + mn = sm / n_obs # Mean of permuted data + sm_sqrs = ( + np.sum(d_perm**2, axis=1) - (sm**2) / n_obs + ) # Sum of squares for standard error + stder = np.sqrt(sm_sqrs) / sqrt_nXnM1 # Standard error + t_vals[c, :, perm] = mn / stder # Compute t-values + + else: + # Use random permutations + for perm in range(n_perm): + # Randomly set sign of each trial / participant's data + sn = (np.random.rand(n_obs) > 0.5) * 2 - 1 # Generate random sign flips + sn_mtrx = np.tile(sn, (n_var, 1)) # Repeat sn for each variable + + for c in range(n_channels): + data = Data[c, :, :] + d_perm = data * sn_mtrx # Apply sign flip to data + + # Compute t-score of permuted data + sm = np.sum(d_perm, axis=1) # Sum of permuted data + mn = sm / n_obs # Mean of permuted data + sm_sqrs = ( + np.sum(d_perm**2, axis=1) - (sm**2) / n_obs + ) # Sum of squares for standard error + stder = np.sqrt(sm_sqrs) / sqrt_nXnM1 # Standard error + t_vals[c, :, perm] = mn / stder # Compute t-values + + # Compute p-values from t-values + p_vals = 2 * scipy.stats.cdf(-np.abs(t_vals), dfe) + + return t_vals, p_vals, dfe From 37616e53d32b9bbf0371109ec1bc8aadd5de4e8a Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sat, 6 Jul 2024 10:36:55 +0200 Subject: [PATCH 14/88] first draft without cluster plotting class implemented --- mne/stats/cluster_level.py | 293 +++++++ .../76_new_cluster_test_api.py | 722 +++--------------- 2 files changed, 392 insertions(+), 623 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index 76ae11bab7c..d3813c57817 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -10,12 +10,17 @@ # License: BSD-3-Clause # Copyright the MNE-Python contributors. +import matplotlib.pyplot as plt import numpy as np +import pandas as pd +from mpl_toolkits.axes_grid1 import make_axes_locatable from scipy import ndimage, sparse from scipy.sparse.csgraph import connected_components from scipy.stats import f as fstat from scipy.stats import t as tstat +from .. 
import EvokedArray
+from ..channels import find_ch_adjacency
 from ..fixes import has_numba, jit
 from ..parallel import parallel_func
 from ..source_estimate import MixedSourceEstimate, SourceEstimate, VolSourceEstimate
@@ -24,6 +29,7 @@
     ProgressBar,
     _check_option,
     _pl,
+    _soft_import,
     _validate_type,
     check_random_state,
     logger,
@@ -31,6 +37,7 @@
     verbose,
     warn,
 )
+from ..viz import plot_compare_evokeds
 from .parametric import f_oneway, ttest_1samp_no_p


@@ -1729,3 +1736,289 @@ def summarize_clusters_stc(
     data_summary[:, 0] = np.sum(data_summary, axis=1)

     return klass(data_summary, vertices, tmin, tstep, subject)
+
+
+def cluster_test(
+    df: pd.DataFrame,
+    formula: str = None,  # Wilkinson notation formula for design matrix
+    n_permutations: int = 10000,
+    seed: None | int | np.random.RandomState = None,
+    tail: int = 0,  # 0 for two-tailed, 1 for greater, -1 for less
+    n_jobs: int = 1,  # how many cores to use
+    adjacency: tuple = None,
+    max_step: int = 1,  # maximum distance between samples (time points)
+    exclude: list = None,  # exclude no time points or channels
+    step_down_p: int = 0,  # step down in jumps test
+    t_power: int = 1,  # weigh each location by its stats score
+    out_type: str = "indices",
+    check_disjoint: bool = False,
+    buffer_size: int = None,  # block size for chunking the data
+):
+    """
+    Run the cluster test using the new API.
+
+    # currently supports paired t-test
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Dataframe with evoked data, conditions and subject IDs.
+    formula : str, optional
+        Wilkinson notation formula for design matrix. Default is None.
+    n_permutations : int, optional
+        Number of permutations. Default is 10000.
+    seed : None | int | np.random.RandomState, optional
+        Seed for the random number generator. Default is None.
+    tail : int, optional
+        0 for two-tailed, 1 for greater, -1 for less. Default is 0.
+    n_jobs : int, optional
+        How many cores to use. Default is 1.
+    adjacency : tuple | None, optional
+        Adjacency matrix. Default is None.
+    max_step : int, optional
+        Maximum distance between samples (time points). Default is 1.
+    exclude : list | None, optional
+        Time points or channels to exclude from the test. Default is None.
+    step_down_p : int, optional
+        Step down in jumps test. Default is 0.
+    t_power : int, optional
+        Weigh each location by its stats score. Default is 1.
+    out_type : str, optional
+        Output type. Default is "indices".
+    check_disjoint : bool, optional
+        Check if clusters are disjoint. Default is False.
+    buffer_size : int, optional
+        Block size for chunking the data. Default is None.
+
+    Returns
+    -------
+    T_obs : array
+        The observed test statistic.
+    clusters : list
+        List of clusters.
+    cluster_p_values : array
+        Array of cluster p-values.
+    H0 : array
+        The permuted test statistics.
+ """ + # for now this assumes a dataframe with a column for evoked data or epochs + # add a data column to the dataframe (numpy array) + df["data"] = [evoked.data for evoked in df.evoked] + + # extract number of channels and timepoints + # (eventually should also allow for frequency) + n_channels, n_timepoints = df["data"][0].shape + + # convert wide format to long format for formulaic + df_long = unpack_time_and_channels(df) + + # Pivot the DataFrame + pivot_df = df_long.pivot_table( + index=["subject_index", "channel", "timepoint"], + columns="condition", + values="value", + ).reset_index() + + # if not 2 unique conditions raise error + if len(pd.unique(df.condition)) != 2: + raise ValueError("Condition list needs to contain 2 unique values") + + # Get the unique conditions + conditions = np.unique(df.condition) + + # Compute the difference (assuming there are only 2 conditions) + pivot_df["evoked"] = pivot_df[conditions[0]] - pivot_df[conditions[1]] + + # Optional: Clean up the DataFrame + pivot_df = pivot_df[["subject_index", "channel", "timepoint", "evoked"]] + + # check if formula is present + if formula is not None: + formulaic = _soft_import( + "formulaic", purpose="set up Design Matrix" + ) # soft import (not a dependency for MNE) + + # for the paired t-test y is the difference between conditions + # X is the design matrix with a column with 1s and 0s for each participant + # Create the design matrix using formulaic + y, X = formulaic.model_matrix(formula, pivot_df) + else: + raise ValueError( + "Formula is required and needs to be a string in Wilkinson notation." + ) + + # now prep design matrix outcome variable for input into MNE cluster function + # we initially had first channels, then timepoints, + # now we need first timepoints, then channels + y_for_cluster = y.values.reshape(-1, n_channels, n_timepoints).transpose(0, 2, 1) + + adjacency, _ = find_ch_adjacency(df["evoked"][0].info, ch_type="eeg") + + # define stat function and threshold + stat_fun, threshold = _check_fun( + X=y_for_cluster, stat_fun=None, threshold=None, tail=0, kind="within" + ) + + # Run the cluster-based permutation test + T_obs, clusters, cluster_p_values, H0 = _permutation_cluster_test( + [y_for_cluster], + n_permutations=10000, + threshold=threshold, + stat_fun=stat_fun, + tail=tail, + n_jobs=n_jobs, + adjacency=adjacency, + max_step=max_step, # maximum distance between samples (time points) + exclude=exclude, # exclude no time points or channels + step_down_p=step_down_p, # step down in jumps test + t_power=t_power, # weigh each location by its stats score + out_type=out_type, + check_disjoint=check_disjoint, + buffer_size=buffer_size, # block size for chunking the data + seed=seed, + ) + + print(min(cluster_p_values)) + + return T_obs, clusters, cluster_p_values, H0 + + +def unpack_time_and_channels(df): + """ + Extract the time and channel data from the DataFrame. + + Parameters + ---------- + df : pd.DataFrame + DataFrame in wide format. 
+ """ + # Extracting all necessary data using list comprehensions for better performance + long_format_data = [ + { + "condition": row["condition"], + "subject_index": row["subject_index"], + "channel": channel, + "timepoint": timepoint, + "value": row["data"][channel, timepoint], + } + for idx, row in df.iterrows() + for channel in range(row["data"].shape[0]) + for timepoint in range(row["data"].shape[1]) + ] + + # Creating the long format DataFrame + df_long = pd.DataFrame(long_format_data) + + return df_long + + +def plot_cluster( + contrast, target_only, non_target_only, T_obs, clusters, cluster_p_values +): + """ + Plot the cluster with the lowest p-value. + + Parameters + ---------- + contrast : list + List of contrast evoked objects. + target_only : list + List of target evoked objects. + non_target_only : list + List of non-target evoked objects. + T_obs : array + The observed test statistic. + clusters : list + List of clusters. + cluster_p_values : array + Array of cluster p-values. + + Returns + ------- + None + + """ + # configure variables for visualization + colors = {"target": "crimson", "non-target": "steelblue"} + + # organize data for plotting + evokeds = {"target": target_only, "non-target": non_target_only} + + lowest_p_cluster = np.argmin(cluster_p_values) + + # plot the cluster with the lowest p-value + time_inds, space_inds = np.squeeze(clusters[lowest_p_cluster]) + ch_inds = np.unique(space_inds) + time_inds = np.unique(time_inds) + + # get topography for F stat + t_map = T_obs[time_inds, ...].mean(axis=0) + + # get signals at the sensors contributing to the cluster + sig_times = contrast[0].times[time_inds] + + # create spatial mask + mask = np.zeros((t_map.shape[0], 1), dtype=bool) + mask[ch_inds, :] = True + + # initialize figure + fig, ax_topo = plt.subplots(1, 1, figsize=(10, 3), layout="constrained") + + # plot average test statistic and mark significant sensors + t_evoked = EvokedArray(t_map[:, np.newaxis], contrast[0].info, tmin=0) + t_evoked.plot_topomap( + times=0, + mask=mask, + axes=ax_topo, + cmap="Reds", + vlim=(np.min, np.max), + show=False, + colorbar=False, + mask_params=dict(markersize=10), + ) + image = ax_topo.images[0] + + # remove the title that would otherwise say "0.000 s" + ax_topo.set_title("") + + # soft import? 
+ # make_axes_locatable = _soft_import( + # "mpl_toolkits.axes_grid1.make_axes_locatable", + # purpose="plot cluster results" + # ) # soft import (not a dependency for MNE) + + # create additional axes (for ERF and colorbar) + divider = make_axes_locatable(ax_topo) + + # add axes for colorbar + ax_colorbar = divider.append_axes("right", size="5%", pad=0.05) + plt.colorbar(image, cax=ax_colorbar) + ax_topo.set_xlabel( + "Averaged t-map ({:0.3f} - {:0.3f} s)".format(*sig_times[[0, -1]]) + ) + + # add new axis for time courses and plot time courses + ax_signals = divider.append_axes("right", size="300%", pad=1.2) + title = f"Cluster #1, {len(ch_inds)} sensor" + if len(ch_inds) > 1: + title += "s (mean)" + plot_compare_evokeds( + evokeds, + title=title, + picks=ch_inds, + axes=ax_signals, + colors=colors, + show=False, + split_legend=True, + truncate_yaxis="auto", + ) + + # plot temporal cluster extent + ymin, ymax = ax_signals.get_ylim() + ax_signals.fill_betweenx( + (ymin, ymax), sig_times[0], sig_times[-1], color="green", alpha=0.3 + ) + + plt.show() diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 6a3a966bbcc..ec8bd8275a1 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -1,662 +1,138 @@ """ .. _tut-new-cluster-test-api: -==================== -New cluster test API -==================== +=============================================================== +New cluster test API that allows for Wilkinson style formulas +=============================================================== This tutorial shows how to use the new API for cluster testing. +This script shows how to estimate significant clusters in +evoked contrast data of multiple subjects. +It uses a non-parametric statistical procedure based on permutations and +cluster level statistics. + +The procedure consists of: + + - loading evoked data from multiple subjects + - construct a dataframe that contains the difference between conditions + - run the new cluster test function + +Here, the unit of observation are evokeds from multiple subjects (2nd level analysis). + +For more information on cluster-based permutation testing in MNE-Python, +see also: :ref:`tut-cluster-one-samp-tfr`. """ +# Authors: Carina Forster +# # License: BSD-3-Clause # Copyright the MNE-Python contributors. 
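+# Editorial aside (not part of the original tutorial): the design-matrix step
+# used later in this tutorial relies on the optional ``formulaic`` package.
+# A minimal sketch of what a Wilkinson formula produces, kept commented out
+# here because ``formulaic`` is an optional dependency (the toy data is made
+# up purely for illustration):
+#
+#     import formulaic
+#     import pandas as pd
+#
+#     toy = pd.DataFrame(
+#         {"y": [0.1, -0.2, 0.3, 0.0], "subject": ["s1", "s1", "s2", "s2"]}
+#     )
+#     # unpacks into the outcome vector (left-hand side) and the design
+#     # matrix (an intercept plus a dummy-coded subject column)
+#     y, X = formulaic.model_matrix("y ~ 1 + C(subject)", toy)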
+# %% + from pathlib import Path -import matplotlib.pyplot as plt -import numpy as np import pandas as pd -import scipy -from mpl_toolkits.axes_grid1 import make_axes_locatable import mne -from mne.utils import _soft_import - -# TODO: test function and update docstrings -# import and load dataset +# Set parameters +# -------------- +# Define the path to the P3 dataset path_to_p3 = mne.datasets.misc.data_path() / "ERP_CORE" / "P3" +# Define the range of participant IDs +participant_ids = range(15, 20) # This will cover 015 to 019 -def prep_sample_data(plot_evokeds: bool = False): - """Load the P3 dataset.""" - # Define the range of participant IDs - participant_ids = range(15, 20) # This will cover 015 to 019 - - evokeds_allsubs = [] - - # Loop over each participant ID and generate the corresponding filename - for pid in participant_ids: - # Create the filename using an f-string, ID is zero-padded to 3 digits - filename_p3 = f"sub-{pid:03d}_ses-P3_task-P3_ave.fif" - - # Print the filename (or perform your desired operations on it) - print(filename_p3) - - p3_file_path = Path(path_to_p3) / filename_p3 - - evokeds = mne.read_evokeds(p3_file_path) - - # add to list - evokeds_allsubs.append(evokeds) - - target_only = [evoked[0] for evoked in evokeds_allsubs] - non_target_only = [evoked[1] for evoked in evokeds_allsubs] - contrast = [evoked[2] for evoked in evokeds_allsubs] - - if plot_evokeds: - # plot the grand average - mne.grand_average(target_only).plot() - mne.grand_average(non_target_only).plot() - mne.grand_average(contrast).plot() - - # create contrast from evokeds target and non-target - diff_evoked = [ - mne.combine_evoked([evokeds_a, evokeds_b], weights=[1, -1]) - for evokeds_a, evokeds_b in zip(target_only, non_target_only) - ] - - if plot_evokeds: - mne.grand_average(diff_evoked).plot() - - # crop the evokeds in the post stimulus window - contrast = [evokeds.crop(tmin=-0.1, tmax=0.6) for evokeds in contrast] - target_only = [evokeds.crop(tmin=-0.1, tmax=0.6) for evokeds in target_only] - non_target_only = [evokeds.crop(tmin=-0.1, tmax=0.6) for evokeds in non_target_only] - - return contrast, target_only, non_target_only - - -def old_api_cluster(n_permutations: int = 10000, seed: int = 1234): - """ - Run the cluster test using the old API to get a benchmark result for the new API. - - Currently implementing a paired t-test with contrast between participants. 
- """ - contrast, target_only, non_target_only = prep_sample_data() - - # extract the data for each evoked and store in numpy array - data = np.array([evoked.data for evoked in contrast]) - - # shape should be (n_subjects, n_channels, n_times) - data.shape - - # reshape to channels as last dimension - data = data.transpose(0, 2, 1) - - data.shape +# store the evoked data of all subjects +evokeds_allsubs = [] - adjacency, _ = mne.channels.find_ch_adjacency(contrast[0].info, ch_type="eeg") +# Loop over each participant ID and generate the corresponding filename +for pid in participant_ids: + # Create the filename using an f-string, ID is zero-padded to 3 digits + filename_p3 = f"sub-{pid:03d}_ses-P3_task-P3_ave.fif" - stat_fun, threshold = mne.stats.cluster_level._check_fun( - X=data, stat_fun=None, threshold=None, tail=0, kind="within" - ) + # Create the full path to the file + p3_file_path = Path(path_to_p3) / filename_p3 - # Run the analysis - T_obs, clusters, cluster_p_values, H0 = ( - mne.stats.cluster_level._permutation_cluster_test( - [data], - threshold=threshold, - stat_fun=stat_fun, - n_jobs=-1, # takes all CPU cores - max_step=1, # maximum distance between samples (time points) - exclude=None, # exclude no time points or channels - step_down_p=0, # step down in jumps test - t_power=1, # weigh each location by its stats score - out_type="indices", - check_disjoint=False, - buffer_size=None, # block size for chunking the data - n_permutations=n_permutations, - tail=0, - adjacency=adjacency, - seed=seed, - ) - ) + # load the evoked data + evokeds = mne.read_evokeds(p3_file_path) - print(min(cluster_p_values)) + # add subjects evoked data to list + evokeds_allsubs.append(evokeds) - plot_cluster( - contrast, target_only, non_target_only, T_obs, clusters, cluster_p_values - ) +# the P3b dataset is part of the freely available ERP CORE dataset +# participants were presented with a visual oddball task +# and the P3b component was analyzed +# the conditions of interest are the target (rare visual stimuli) +# and non-target stimuli (frequency visual stimuli) - return T_obs, clusters, cluster_p_values, H0 +# let's extract the target and non-target evokeds +target_only = [evoked[0] for evoked in evokeds_allsubs] +non_target_only = [evoked[1] for evoked in evokeds_allsubs] +# let's first have a look at the data +# create contrast from target and non-target evokeds +diff_evoked = [ + mne.combine_evoked([evokeds_a, evokeds_b], weights=[1, -1]) + for evokeds_a, evokeds_b in zip(target_only, non_target_only) +] -def create_random_evokeds_id_condition_list(): - """ - Create a list of shuffled participant IDs, conditions, and evoked data. +# plot the grand average of the difference signal +mne.grand_average(diff_evoked).plot() +# plot the topography of the difference signal +mne.grand_average(diff_evoked).plot_topomap() - # Keep the participant IDs and conditions paired but shuffle the order of the data. 
- """ - import random +# we can see that the strongest difference is around 400 ms in +# visual channels (occipital region) - _, evoked_data_a, evoked_data_b = prep_sample_data() +# Next we prepare a dataframe for the cluster test function +# the dataframe should contain the contrast evoked data and the subject index +# each row in the dataframe should represent one observation (evoked data) - # Example participant IDs - participant_ids = ["p1", "p2", "p3", "p4", "p5"] * 2 +# save the evoked data for both conditions in one list +evokeds_conditions = target_only + non_target_only - # Combine the evoked data into a single list - all_evoked_data = evoked_data_a + evoked_data_b +# set up a list that defines the condition for each evoked data +# this will be used to create the conditions column in the dataframe +conditions = ["target"] * len(target_only) + ["non-target"] * len(non_target_only) - # Create a corresponding list of conditions - conditions = [1] * len(evoked_data_a) + [0] * len(evoked_data_b) +# finally add a column that defines the subject index +# this will be used to create the subject_index column in the dataframe +# we multiply the participant_ids by 2 to account for the two conditions +subject_index = list(participant_ids) * 2 - # Combine the participant IDs, conditions, and evoked data into a list of tuples - combined_list = list(zip(participant_ids, conditions, all_evoked_data)) - - # Shuffle the combined list - random.shuffle(combined_list) - - # Separate the shuffled list back into participant IDs, conditions, and evoked data - shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data = zip( - *combined_list - ) - - # Convert the tuples back to lists - shuffled_participant_ids = list(shuffled_participant_ids) - shuffled_conditions = list(shuffled_conditions) - shuffled_evoked_data = list(shuffled_evoked_data) - - return shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data - - -def create_random_paired_evokeds_list(): - """ - Create shuffled paired evoked data. - - Create a list of shuffled evoked data where each pair of target - and non-target evoked data is shuffled together. - """ - import random - - _, evoked_data_a, evoked_data_b = prep_sample_data() - - # Ensure evoked_data_a and evoked_data_b are of the same length - assert len(evoked_data_a) == len( - evoked_data_b - ), "evoked_data_a and evoked_data_b must have the same length" - - # Create a list of participant indices - participant_indices = list(range(len(evoked_data_a))) - - # Shuffle the list of participant indices - random.shuffle(participant_indices) - - # Reorder evoked data according to the shuffled participant indices - shuffled_evoked_data_a = [evoked_data_a[i] for i in participant_indices] - shuffled_evoked_data_b = [evoked_data_b[i] for i in participant_indices] - - # Combine the shuffled evoked data into a single list - shuffled_evoked_data = shuffled_evoked_data_a + shuffled_evoked_data_b - - # Combine the original evoked data into a single list - original_evoked_data = evoked_data_a + evoked_data_b - - return original_evoked_data, shuffled_evoked_data - - -# shuffle order of pairs -original_evoked_data, shuffled_evoked_data = create_random_paired_evokeds_list() -# shouldn't change the results (p-value is different though?) 
- -shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data = ( - create_random_evokeds_id_condition_list() +# create the dataframe +df = pd.DataFrame( + { + "evoked": evokeds_conditions, + "condition": conditions, + "subject_index": subject_index, + } ) +# now we can run the cluster test function +# we will use the new API that allows for Wilkinson style formulas +# the formula should be a string in Wilkinson notation -def prepare_dataframe_for_cluster_function( - evokeds: list = None, - condition: list = None, - subject_index: list = None, -): - """ - Prepare a dataframe for the cluster test function. - - Parameters - ---------- - contrast : bool, optional - If True, a contrast is calculated. Default is False. - evokeds : list, optional - List of evoked objects. Default is None. - condition : list, optional - List of conditions for each evoked object. Default is None. - subject_index : list, optional - List of subject IDs. Default is None. - - Returns - ------- - df : DataFrame - The prepared DataFrame for the cluster test function. - """ - # Initialize the DataFrame with evoked data - df = pd.DataFrame( - { - "evoked": evokeds, - "condition": condition if condition is not None else pd.NA, - "subject_index": subject_index if subject_index is not None else pd.NA, - } - ) - - return df - - -df = prepare_dataframe_for_cluster_function( - evokeds=shuffled_evoked_data, - condition=shuffled_conditions, - subject_index=shuffled_participant_ids, -) +# we want to test whether there is a significant difference between +# target and non-target stimuli in the post-stimulus window +# we will use a cluster-based permutation paired t-test for this +# let's first define the formula based on Wilkinson notation +formula = "evoked ~ 1 + C(subject_index)" -def cluster_test( - df: pd.DataFrame, - formula: str = None, # Wilkinson notation formula for design matrix - n_permutations: int = 10000, - seed: None | int | np.random.RandomState = None, - tail: int = 0, # 0 for two-tailed, 1 for greater, -1 for less - n_jobs: int = 1, # how many cores to use - adjacency: tuple = None, - max_step: int = 1, # maximum distance between samples (time points) - exclude: list = None, # exclude no time points or channels - step_down_p: int = 0, # step down in jumps test - t_power: int = 1, # weigh each location by its stats score - out_type: str = "indices", - check_disjoint: bool = False, - buffer_size: int = None, # block size for chunking the data -): - """ - Run the cluster test using the new API. - - # currently supports paired t-test - - Parameters - ---------- - dataframe : pd.DataFrame - Dataframe with evoked data, conditions and subject IDs. - formula : str, optional - Wilkinson notation formula for design matrix. Default is None. - n_permutations : int, optional - Number of permutations. Default is 10000. - seed : None | int | np.random.RandomState, optional - Seed for the random number generator. Default is None. - tail : int, optional - 0 for two-tailed, 1 for greater, -1 for less. Default is 0. - n_jobs : int, optional - How many cores to use. Default is 1. - adjacency : None, optional - Adjacency matrix. Default is None. - max_step : int, optional - Maximum distance between samples (time points). Default is 1. - exclude : np.Array, optional - Exclude no time points or channels. Default is None. - step_down_p : int, optional - Step down in jumps test. Default is 0. - t_power : int, optional - Weigh each location by its stats score. Default is 1. - out_type : str, optional - Output type. 
Default is "indices". - check_disjoint : bool, optional - Check if clusters are disjoint. Default is False. - buffer_size : int, optional - Block size for chunking the data. Default is None. - seed : int, optional - Seed for the random number generator. Default is None. - - Returns - ------- - T_obs : array - The observed test statistic. - clusters : list - List of clusters. - cluster_p_values : array - Array of cluster p-values. - H0 : array - The permuted test statistics. - """ - # for now this assumes a dataframe with a column for evoked data - # add a data column to the dataframe (numpy array) - df["data"] = [evoked.data for evoked in df.evoked] - - # extract number of channels and timepoints - # (eventually should also allow for frequency) - n_channels, n_timepoints = df["data"][0].shape - - # convert wide format to long format for formulaic - df_long = unpack_time_and_channels(df) - - # Pivot the DataFrame - pivot_df = df_long.pivot_table( - index=["subject_index", "channel", "timepoint"], - columns="condition", - values="value", - ).reset_index() - - # if not 2 unique conditions raise error - if len(pd.unique(df.condition)) != 2: - raise ValueError("Condition list needs to contain 2 unique values") - - # Compute the difference (assuming there are only 2 conditions) - pivot_df["y"] = pivot_df[0] - pivot_df[1] - - # Optional: Clean up the DataFrame - pivot_df = pivot_df[["subject_index", "channel", "timepoint", "y"]] - - # check if formula is present - if formula is not None: - formulaic = _soft_import( - "formulaic", purpose="set up Design Matrix" - ) # soft import (not a dependency for MNE) - - # for the paired t-test y is the difference between conditions - # X is the design matrix with a column with 1s and 0s for each participant - # Create the design matrix using formulaic - y, X = formulaic.model_matrix(formula, pivot_df) - else: - raise ValueError( - "Formula is required and needs to be a string in Wilkinson notation." 
- ) - - # now prep design matrix outcome variable for input into MNE cluster function - # we initially had first channels, then timepoints, - # now we need first timepoints, then channels - y_for_cluster = y.values.reshape(-1, n_channels, n_timepoints).transpose(0, 2, 1) - - adjacency, _ = mne.channels.find_ch_adjacency(df["evoked"][0].info, ch_type="eeg") - - # define stat function and threshold - stat_fun, threshold = mne.stats.cluster_level._check_fun( - X=y_for_cluster, stat_fun=None, threshold=None, tail=0, kind="within" - ) - - # Run the cluster-based permutation test - T_obs, clusters, cluster_p_values, H0 = ( - mne.stats.cluster_level._permutation_cluster_test( - [y_for_cluster], - n_permutations=10000, - threshold=threshold, - stat_fun=stat_fun, - tail=tail, - n_jobs=n_jobs, - adjacency=adjacency, - max_step=max_step, # maximum distance between samples (time points) - exclude=exclude, # exclude no time points or channels - step_down_p=step_down_p, # step down in jumps test - t_power=t_power, # weigh each location by its stats score - out_type=out_type, - check_disjoint=check_disjoint, - buffer_size=buffer_size, # block size for chunking the data - seed=seed, - ) - ) - - print(min(cluster_p_values)) - - # need to adjust plotting function for contrast only data - contrast, evokeds_a, evokeds_b = prep_sample_data() - - # plot cluster - plot_cluster(contrast, evokeds_a, evokeds_b, T_obs, clusters, cluster_p_values) - - return T_obs, clusters, cluster_p_values, H0 - - -def unpack_time_and_channels(df): - """ - Extract the time and channel data from the DataFrame. - - Parameters - ---------- - df : pd.DataFrame - DataFrame in wide format. - """ - # Extracting all necessary data using list comprehensions for better performance - long_format_data = [ - { - "condition": row["condition"], - "subject_index": row["subject_index"], - "channel": channel, - "timepoint": timepoint, - "value": row["data"][channel, timepoint], - } - for idx, row in df.iterrows() - for channel in range(row["data"].shape[0]) - for timepoint in range(row["data"].shape[1]) - ] - - # Creating the long format DataFrame - df_long = pd.DataFrame(long_format_data) - - return df_long - - -# Example usage -# Sample wide format DataFrame -df_wide = pd.DataFrame( - { - "condition": ["A", "B"], - "subject_index": [1, 2], - "data": [np.array([[1, 2, 3], [4, 5, 6]]), np.array([[7, 8, 9], [10, 11, 12]])], - } +# run the cluster test +T_obs, clusters, cluster_p_values, H0 = mne.stats.cluster_level.cluster_test( + df=df, formula=formula ) - -def plot_cluster( - contrast, target_only, non_target_only, T_obs, clusters, cluster_p_values -): - """ - Plot the cluster with the lowest p-value. - - Parameters - ---------- - contrast : list - List of contrast evoked objects. - target_only : list - List of target evoked objects. - non_target_only : list - List of non-target evoked objects. - T_obs : array - The observed test statistic. - clusters : list - List of clusters. - cluster_p_values : array - Array of cluster p-values. 
- - Returns - ------- - None - - """ - # configure variables for visualization - colors = {"target": "crimson", "non-target": "steelblue"} - - # organize data for plotting - evokeds = {"target": target_only, "non-target": non_target_only} - - lowest_p_cluster = np.argmin(cluster_p_values) - - # plot the cluster with the lowest p-value - time_inds, space_inds = np.squeeze(clusters[lowest_p_cluster]) - ch_inds = np.unique(space_inds) - time_inds = np.unique(time_inds) - - # get topography for F stat - t_map = T_obs[time_inds, ...].mean(axis=0) - - # get signals at the sensors contributing to the cluster - sig_times = contrast[0].times[time_inds] - - # create spatial mask - mask = np.zeros((t_map.shape[0], 1), dtype=bool) - mask[ch_inds, :] = True - - # initialize figure - fig, ax_topo = plt.subplots(1, 1, figsize=(10, 3), layout="constrained") - - # plot average test statistic and mark significant sensors - t_evoked = mne.EvokedArray(t_map[:, np.newaxis], contrast[0].info, tmin=0) - t_evoked.plot_topomap( - times=0, - mask=mask, - axes=ax_topo, - cmap="Reds", - vlim=(np.min, np.max), - show=False, - colorbar=False, - mask_params=dict(markersize=10), - ) - image = ax_topo.images[0] - - # remove the title that would otherwise say "0.000 s" - ax_topo.set_title("") - - # create additional axes (for ERF and colorbar) - divider = make_axes_locatable(ax_topo) - - # add axes for colorbar - ax_colorbar = divider.append_axes("right", size="5%", pad=0.05) - plt.colorbar(image, cax=ax_colorbar) - ax_topo.set_xlabel( - "Averaged t-map ({:0.3f} - {:0.3f} s)".format(*sig_times[[0, -1]]) - ) - - # add new axis for time courses and plot time courses - ax_signals = divider.append_axes("right", size="300%", pad=1.2) - title = f"Cluster #1, {len(ch_inds)} sensor" - if len(ch_inds) > 1: - title += "s (mean)" - mne.viz.plot_compare_evokeds( - evokeds, - title=title, - picks=ch_inds, - axes=ax_signals, - colors=colors, - show=False, - split_legend=True, - truncate_yaxis="auto", - ) - - # plot temporal cluster extent - ymin, ymax = ax_signals.get_ylim() - ax_signals.fill_betweenx( - (ymin, ymax), sig_times[0], sig_times[-1], color="green", alpha=0.3 - ) - - plt.show() - - -# translated the limo permutation ttest from matlab to python -def limo_ttest_permute(Data, n_perm=None): - """ - Pseudo one-sample t-test using sign-test with permutations. - - Parameters - ---------- - Data (numpy.ndarray): A matrix of data for the one-sample t-test. - Shape can be (n_channels, n_var, n_obs) or - (n_var, n_obs). - n_perm (int, optional): Number of permutations to perform. - If None, it defaults based on the number of observations. - - Returns - ------- - t_vals (numpy.ndarray): t-values under H0. - p_vals (numpy.ndarray): p-values under H0. - dfe (int): Degrees of freedom. - """ - # Check inputs and reshape if necessary - if Data.ndim == 3: - n_channels, n_var, n_obs = Data.shape - else: - n_channels = 1 - n_var, n_obs = Data.shape - Data = Data[np.newaxis, ...] - - # Warn if the number of observations is very small - if n_obs < 7: - n_psbl_prms = 2**n_obs - print( - f"Due to the very limited number of observations, " - f"the total number of possible permutations is small ({n_psbl_prms}). " - "Thus, only a limited number of p-values are possible " - "and the test might be overly conservative." 
- ) - - # Set up permutation test - if n_obs <= 12: - n_perm = 2**n_obs # total number of possible permutations - exact = True - print( - "Due to the limited number of observations, all possible permutations " - "of the data will be computed instead of random permutations." - ) - else: - exact = False - if n_perm is None: - n_perm = 1000 - - print(f"Executing permutation test with {n_perm} permutations...") - - # Initialize variables - t_vals = np.full( - (n_channels, n_var, n_perm), np.nan - ) # Array to store t-values for each permutation - sqrt_nXnM1 = np.sqrt( - n_obs * (n_obs - 1) - ) # Precompute constant for t-value calculation - dfe = n_obs - 1 # Degrees of freedom - - if exact: - # Use all possible permutations - for perm in range(n_perm): - # Set sign of each trial / participant's data - temp = np.array( - [int(x) for x in bin(perm)[2:].zfill(n_obs)] - ) # Convert perm index to binary array - sn = np.where(temp == 0, -1, 1) # Map 0 to -1 and 1 to 1 - sn_mtrx = np.tile(sn, (n_var, 1)).T # Repeat sn for each variable - - for c in range(n_channels): - data = Data[c, :, :] - d_perm = data * sn_mtrx # Apply sign flip to data - - # Compute t-score of permuted data - sm = np.sum(d_perm, axis=1) # Sum of permuted data - mn = sm / n_obs # Mean of permuted data - sm_sqrs = ( - np.sum(d_perm**2, axis=1) - (sm**2) / n_obs - ) # Sum of squares for standard error - stder = np.sqrt(sm_sqrs) / sqrt_nXnM1 # Standard error - t_vals[c, :, perm] = mn / stder # Compute t-values - - else: - # Use random permutations - for perm in range(n_perm): - # Randomly set sign of each trial / participant's data - sn = (np.random.rand(n_obs) > 0.5) * 2 - 1 # Generate random sign flips - sn_mtrx = np.tile(sn, (n_var, 1)) # Repeat sn for each variable - - for c in range(n_channels): - data = Data[c, :, :] - d_perm = data * sn_mtrx # Apply sign flip to data - - # Compute t-score of permuted data - sm = np.sum(d_perm, axis=1) # Sum of permuted data - mn = sm / n_obs # Mean of permuted data - sm_sqrs = ( - np.sum(d_perm**2, axis=1) - (sm**2) / n_obs - ) # Sum of squares for standard error - stder = np.sqrt(sm_sqrs) / sqrt_nXnM1 # Standard error - t_vals[c, :, perm] = mn / stder # Compute t-values - - # Compute p-values from t-values - p_vals = 2 * scipy.stats.cdf(-np.abs(t_vals), dfe) - - return t_vals, p_vals, dfe +# finally let's plot the results +# we plot the cluster with the lowest p-value +# and the topomap of the significant cluster +# we can see that there is something going on around 400 ms +# in the visual channels +# however the cluster is not significant which is not surprising +# given the small sample size (only 5 subjects) +mne.stats.cluster_level.plot_cluster( + diff_evoked, target_only, non_target_only, T_obs, clusters, cluster_p_values +) From 6aaef9a7acd47e44f897c8759a346a215c59ae72 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sat, 6 Jul 2024 11:01:12 +0200 Subject: [PATCH 15/88] cleaned up plotting function --- mne/stats/cluster_level.py | 61 ++++++++++--------- .../76_new_cluster_test_api.py | 6 +- 2 files changed, 36 insertions(+), 31 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index d3813c57817..686f1097063 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1755,14 +1755,14 @@ def cluster_test( buffer_size: int = None, # block size for chunking the data ): """ - Run the cluster test using the new API. + Run a cluster permutation test based on formulaic input. 
-    # currently supports paired t-test
+    # currently only supports paired t-test on evokeds or epochs

     Parameters
     ----------
     df : pd.DataFrame
-        Dataframe with evoked data, conditions and subject IDs.
+        Dataframe with evoked/epoched data, conditions and subject IDs.
     formula : str, optional
         Wilkinson notation formula for design matrix. Default is None.
     n_permutations : int, optional
@@ -1794,6 +1794,7 @@

     Returns
     -------
+    TODO: turn this into a class for further plotting
     T_obs : array
         The observed test statistic.
     clusters : list
@@ -1814,7 +1815,7 @@
     # convert wide format to long format for formulaic
     df_long = unpack_time_and_channels(df)

-    # Pivot the DataFrame
+    # pivot the DataFrame
     pivot_df = df_long.pivot_table(
         index=["subject_index", "channel", "timepoint"],
         columns="condition",
@@ -1825,7 +1826,7 @@
     if len(pd.unique(df.condition)) != 2:
         raise ValueError("Condition list needs to contain 2 unique values")

-    # Get the unique conditions
+    # get the unique conditions
     conditions = np.unique(df.condition)

     # Compute the difference (assuming there are only 2 conditions)
@@ -1849,9 +1850,8 @@
             "Formula is required and needs to be a string in Wilkinson notation."
         )

-    # now prep design matrix outcome variable for input into MNE cluster function
-    # we initially had first channels, then timepoints,
-    # now we need first timepoints, then channels
+    # now prep design matrix for input into MNE cluster function
+    # the cluster function expects channels as the last dimension
     y_for_cluster = y.values.reshape(-1, n_channels, n_timepoints).transpose(0, 2, 1)

     adjacency, _ = find_ch_adjacency(df["evoked"][0].info, ch_type="eeg")
@@ -1864,7 +1864,7 @@
     # Run the cluster-based permutation test
     T_obs, clusters, cluster_p_values, H0 = _permutation_cluster_test(
         [y_for_cluster],
-        n_permutations=10000,
+        n_permutations=n_permutations,
         threshold=threshold,
         stat_fun=stat_fun,
         tail=tail,
@@ -1880,19 +1880,24 @@
         seed=seed,
     )

-    print(min(cluster_p_values))
+    print(f"smallest cluster p-value: {min(cluster_p_values)}")

     return T_obs, clusters, cluster_p_values, H0


-def unpack_time_and_channels(df):
+def unpack_time_and_channels(df: pd.DataFrame = None) -> pd.DataFrame:
     """
-    Extract the time and channel data from the DataFrame.
+    Extract timepoints and channels and convert to long.

     Parameters
     ----------
     df : pd.DataFrame
         DataFrame in wide format.
+
+    Returns
+    -------
+    df_long : pd.DataFrame
+        DataFrame in long format.
     """
     # Extracting all necessary data using list comprehensions for better performance
     long_format_data = [
@@ -1914,20 +1919,18 @@
     return df_long


-def plot_cluster(
-    contrast, target_only, non_target_only, T_obs, clusters, cluster_p_values
-):
+def plot_cluster(cond_dict, T_obs, clusters, cluster_p_values):
     """
     Plot the cluster with the lowest p-value.

+    2D cluster plotted with topoplot on the left and evoked signals on the right.
+    Timepoints that are part of the cluster are
+    highlighted in green on the evoked signals.
+
     Parameters
     ----------
-    contrast : list
-        List of contrast evoked objects.
-    target_only : list
-        List of target evoked objects.
-    non_target_only : list
-        List of non-target evoked objects.
+    cond_dict : dict
+        Dictionary with conditions as keys and evoked data as values.
     T_obs : array
         The observed test statistic.
    clusters : list
@@ -1940,11 +1943,13 @@
-    # configure variables for visualization
-    colors = {"target": "crimson", "non-target": "steelblue"}
+    # extract condition labels from the dictionary
+    cond_keys = list(cond_dict.keys())
+    # extract the evokeds from the dictionary
+    cond_values = list(cond_dict.values())

-    # organize data for plotting
-    evokeds = {"target": target_only, "non-target": non_target_only}
+    # configure variables for visualization
+    colors = {cond_keys[0]: "crimson", cond_keys[1]: "steelblue"}

     lowest_p_cluster = np.argmin(cluster_p_values)
@@ -1957,7 +1962,7 @@
     t_map = T_obs[time_inds, ...].mean(axis=0)

     # get signals at the sensors contributing to the cluster
-    sig_times = contrast[0].times[time_inds]
+    sig_times = cond_values[0][0].times[time_inds]
@@ -1967,7 +1972,7 @@
     fig, ax_topo = plt.subplots(1, 1, figsize=(10, 3), layout="constrained")

     # plot average test statistic and mark significant sensors
-    t_evoked = EvokedArray(t_map[:, np.newaxis], contrast[0].info, tmin=0)
+    t_evoked = EvokedArray(t_map[:, np.newaxis], cond_values[0][0].info, tmin=0)
     t_evoked.plot_topomap(
         times=0,
         mask=mask,
@@ -2005,7 +2010,7 @@
     if len(ch_inds) > 1:
         title += "s (mean)"
     plot_compare_evokeds(
-        evokeds,
+        cond_dict,
         title=title,
         picks=ch_inds,
         axes=ax_signals,
diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py
index ec8bd8275a1..a88904a5b5b 100644
--- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py
+++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py
@@ -125,6 +125,8 @@
 T_obs, clusters, cluster_p_values, H0 = mne.stats.cluster_level.cluster_test(
     df=df, formula=formula
 )
+# set up conditions dictionary for cluster plots
+conditions_dict = {"target": target_only, "non-target": non_target_only}

 # finally let's plot the results
 # we plot the cluster with the lowest p-value
 # and the topomap of the significant cluster
 # we can see that there is something going on around 400 ms
 # in the visual channels
 # however the cluster is not significant which is not surprising
 # given the small sample size (only 5 subjects)
-mne.stats.cluster_level.plot_cluster(
-    diff_evoked, target_only, non_target_only, T_obs, clusters, cluster_p_values
-)
+mne.stats.cluster_level.plot_cluster(conditions_dict, T_obs, clusters, cluster_p_values)

From 0f99c709e16207c1bc6e5a9a3e664030cb171d27 Mon Sep 17 00:00:00 2001
From: CarinaFo
Date: Sat, 6 Jul 2024 11:53:49 +0200
Subject: [PATCH 16/88] implemented cluster results class

---
 mne/stats/cluster_level.py                    | 213 +++++++++---------
 .../76_new_cluster_test_api.py                |  23 +-
 2 files changed, 124 insertions(+), 112 deletions(-)

diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py
index 686f1097063..146a6cd7c5f 100644
--- a/mne/stats/cluster_level.py
+++ b/mne/stats/cluster_level.py
@@ -1794,15 +1794,8 @@ def cluster_test(

     Returns
     -------
-    TODO: turn this into a class for further plotting
-    T_obs : array
-        The observed test statistic.
-    clusters : list
-        List of clusters.
-    cluster_p_values : array
-        Array of cluster p-values.
-    H0 : array
-        The permuted test statistics.
+    ClusterResult
+        Object containing the results of the cluster permutation test.
""" # for now this assumes a dataframe with a column for evoked data or epochs # add a data column to the dataframe (numpy array) @@ -1882,7 +1875,7 @@ def cluster_test( print(f"smallest cluster p-value: {min(cluster_p_values)}") - return T_obs, clusters, cluster_p_values, H0 + return ClusterResult(T_obs, clusters, cluster_p_values, H0) def unpack_time_and_channels(df: pd.DataFrame = None) -> pd.DataFrame: @@ -1919,111 +1912,127 @@ def unpack_time_and_channels(df: pd.DataFrame = None) -> pd.DataFrame: return df_long -def plot_cluster(cond_dict, T_obs, clusters, cluster_p_values): +class ClusterResult: """ - Plot the cluster with the lowest p-value. - - 2D cluster plotted with topoplot on the left and evoked signals on the right. - Timepoints that are part of the cluster are - highlighted in green on the evoked signals. + Object containing the results of the cluster permutation test. Parameters ---------- - cond_dict : dict - Dictionary with conditions as keys and evoked data as values. - T_obs : array + T_obs : np.ndarray The observed test statistic. clusters : list List of clusters. - cluster_p_values : array - Array of cluster p-values. - - Returns - ------- - None - + cluster_p_values : np.ndarray + P-values for each cluster. + H0 : np.ndarray + Max cluster level stats observed under permutation. """ - # extract condition labels from the dictionary - cond_keys = list(cond_dict.keys()) - # extract the evokeds from the dictionary - cond_values = list(cond_dict.values()) - - # configure variables for visualization - colors = {cond_keys[0]: "crimson", cond_keys[1]: "steelblue"} - - lowest_p_cluster = np.argmin(cluster_p_values) - - # plot the cluster with the lowest p-value - time_inds, space_inds = np.squeeze(clusters[lowest_p_cluster]) - ch_inds = np.unique(space_inds) - time_inds = np.unique(time_inds) - - # get topography for F stat - t_map = T_obs[time_inds, ...].mean(axis=0) - - # get signals at the sensors contributing to the cluster - sig_times = cond_values[0][0].times[time_inds] - - # create spatial mask - mask = np.zeros((t_map.shape[0], 1), dtype=bool) - mask[ch_inds, :] = True - - # initialize figure - fig, ax_topo = plt.subplots(1, 1, figsize=(10, 3), layout="constrained") - - # plot average test statistic and mark significant sensors - t_evoked = EvokedArray(t_map[:, np.newaxis], cond_values[0][0].info, tmin=0) - t_evoked.plot_topomap( - times=0, - mask=mask, - axes=ax_topo, - cmap="Reds", - vlim=(np.min, np.max), - show=False, - colorbar=False, - mask_params=dict(markersize=10), - ) - image = ax_topo.images[0] - # remove the title that would otherwise say "0.000 s" - ax_topo.set_title("") + def __init__(self, T_obs, clusters, cluster_p_values, H0): + self.T_obs = T_obs + self.clusters = clusters + self.cluster_p_values = cluster_p_values + self.H0 = H0 + + def plot_cluster(self, cond_dict: dict = None): + """ + Plot the cluster with the lowest p-value. + + 2D cluster plotted with topoplot on the left and evoked signals on the right. + Timepoints that are part of the cluster are + highlighted in green on the evoked signals. + + Parameters + ---------- + cond_dict : dict + Dictionary with condition labels as keys and evoked objects as values. 
+ + Returns + ------- + None + + """ + # extract condition labels from the dictionary + cond_keys = list(cond_dict.keys()) + # extract the evokeds from the dictionary + cond_values = list(cond_dict.values()) + + # configure variables for visualization + colors = {cond_keys[0]: "crimson", cond_keys[1]: "steelblue"} + + lowest_p_cluster = np.argmin(self.cluster_p_values) + + # plot the cluster with the lowest p-value + time_inds, space_inds = np.squeeze(self.clusters[lowest_p_cluster]) + ch_inds = np.unique(space_inds) + time_inds = np.unique(time_inds) + + # get topography for F stat + t_map = self.T_obs[time_inds, ...].mean(axis=0) + + # get signals at the sensors contributing to the cluster + sig_times = cond_values[0][0].times[time_inds] + + # create spatial mask + mask = np.zeros((t_map.shape[0], 1), dtype=bool) + mask[ch_inds, :] = True + + # initialize figure + fig, ax_topo = plt.subplots(1, 1, figsize=(10, 3), layout="constrained") + + # plot average test statistic and mark significant sensors + t_evoked = EvokedArray(t_map[:, np.newaxis], cond_values[0][0].info, tmin=0) + t_evoked.plot_topomap( + times=0, + mask=mask, + axes=ax_topo, + cmap="Reds", + vlim=(np.min, np.max), + show=False, + colorbar=False, + mask_params=dict(markersize=10), + ) + image = ax_topo.images[0] - # soft import? - # make_axes_locatable = _soft_import( - # "mpl_toolkits.axes_grid1.make_axes_locatable", - # purpose="plot cluster results" - # ) # soft import (not a dependency for MNE) + # remove the title that would otherwise say "0.000 s" + ax_topo.set_title("") - # create additional axes (for ERF and colorbar) - divider = make_axes_locatable(ax_topo) + # soft import? + # make_axes_locatable = _soft_import( + # "mpl_toolkits.axes_grid1.make_axes_locatable", + # purpose="plot cluster results" + # ) # soft import (not a dependency for MNE) - # add axes for colorbar - ax_colorbar = divider.append_axes("right", size="5%", pad=0.05) - plt.colorbar(image, cax=ax_colorbar) - ax_topo.set_xlabel( - "Averaged t-map ({:0.3f} - {:0.3f} s)".format(*sig_times[[0, -1]]) - ) + # create additional axes (for ERF and colorbar) + divider = make_axes_locatable(ax_topo) - # add new axis for time courses and plot time courses - ax_signals = divider.append_axes("right", size="300%", pad=1.2) - title = f"Cluster #1, {len(ch_inds)} sensor" - if len(ch_inds) > 1: - title += "s (mean)" - plot_compare_evokeds( - cond_dict, - title=title, - picks=ch_inds, - axes=ax_signals, - colors=colors, - show=False, - split_legend=True, - truncate_yaxis="auto", + # add axes for colorbar + ax_colorbar = divider.append_axes("right", size="5%", pad=0.05) + plt.colorbar(image, cax=ax_colorbar) + ax_topo.set_xlabel( + "Averaged t-map ({:0.3f} - {:0.3f} s)".format(*sig_times[[0, -1]]) ) - # plot temporal cluster extent - ymin, ymax = ax_signals.get_ylim() - ax_signals.fill_betweenx( - (ymin, ymax), sig_times[0], sig_times[-1], color="green", alpha=0.3 - ) + # add new axis for time courses and plot time courses + ax_signals = divider.append_axes("right", size="300%", pad=1.2) + title = f"Cluster #1, {len(ch_inds)} sensor" + if len(ch_inds) > 1: + title += "s (mean)" + plot_compare_evokeds( + cond_dict, + title=title, + picks=ch_inds, + axes=ax_signals, + colors=colors, + show=False, + split_legend=True, + truncate_yaxis="auto", + ) + + # plot temporal cluster extent + ymin, ymax = ax_signals.get_ylim() + ax_signals.fill_betweenx( + (ymin, ymax), sig_times[0], sig_times[-1], color="green", alpha=0.3 + ) - plt.show() + plt.show() diff --git 
a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index a88904a5b5b..3acfd21f7f0 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -15,7 +15,8 @@ - loading evoked data from multiple subjects - construct a dataframe that contains the difference between conditions - - run the new cluster test function + - run the new cluster test function with formula in Wilkinson notation + - plot the results with the ClusterResults Class Here, the unit of observation are evokeds from multiple subjects (2nd level analysis). @@ -121,18 +122,20 @@ # let's first define the formula based on Wilkinson notation formula = "evoked ~ 1 + C(subject_index)" -# run the cluster test -T_obs, clusters, cluster_p_values, H0 = mne.stats.cluster_level.cluster_test( - df=df, formula=formula -) +# run the cluster test and return the cluster_result object +cluster_result = mne.stats.cluster_level.cluster_test(df=df, formula=formula) + +# note that we ran an exact test due to the small sample size (only 15 permutations) + # set up conditions dictionary for cluster plots conditions_dict = {"target": target_only, "non-target": non_target_only} -# finally let's plot the results +# finally let's plot the results using the ClusterResults class + # we plot the cluster with the lowest p-value -# and the topomap of the significant cluster + # we can see that there is something going on around 400 ms -# in the visual channels -# however the cluster is not significant which is not surprising +# in the visual channels (topomap on the left) +# however the cluster is not significant which is unsurprising # given the small sample size (only 5 subjects) -mne.stats.cluster_level.plot_cluster(conditions_dict, T_obs, clusters, cluster_p_values) +cluster_result.plot_cluster(cond_dict=conditions_dict) From 4083691f928cc932213fc1a0610c769c53476e15 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sat, 6 Jul 2024 11:55:34 +0200 Subject: [PATCH 17/88] added contribution --- mne/stats/cluster_level.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index 146a6cd7c5f..9a223d715ae 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -6,6 +6,7 @@ # Eric Larson # Denis Engemann # Fernando Perez (bin_perm_rep function) +# Carina Forster # # License: BSD-3-Clause # Copyright the MNE-Python contributors. 
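Editor's note (recap, not part of the patch series): as of PATCH 16/88 the
intended call pattern is that ``cluster_test`` returns a ``ClusterResult``
object whose ``plot_cluster`` method replaces the old module-level plotting
call. A minimal usage sketch, reusing the variables built in the tutorial
above (``df``, ``formula``, ``target_only``, ``non_target_only``):

    cluster_result = mne.stats.cluster_level.cluster_test(df=df, formula=formula)
    print(cluster_result.cluster_p_values)  # one p-value per candidate cluster
    cluster_result.plot_cluster(
        cond_dict={"target": target_only, "non-target": non_target_only}
    )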
From 7e9b2e5d3b844bab1454ca16076a1dcf747992d9 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Thu, 18 Jul 2024 14:35:02 +0200 Subject: [PATCH 18/88] fixed codespell --- mne/stats/cluster_level.py | 55 +++++++++++-------- .../76_new_cluster_test_api.py | 34 +++++++----- 2 files changed, 50 insertions(+), 39 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index 9a223d715ae..0d3ecec2e58 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1820,8 +1820,14 @@ def cluster_test( if len(pd.unique(df.condition)) != 2: raise ValueError("Condition list needs to contain 2 unique values") - # get the unique conditions - conditions = np.unique(df.condition) + # Get unique elements and the indices of their first occurrences + unique_elements, indices = np.unique(df.condition, return_index=True) + + # Sort unique elements by the indices of their first occurrences + conditions = unique_elements[np.argsort(indices)] + + # print the contrast used for the paired t-test + print(f"Contrast used for paired t-test: {conditions[0]} - {conditions[1]}") # Compute the difference (assuming there are only 2 conditions) pivot_df["evoked"] = pivot_df[conditions[0]] - pivot_df[conditions[1]] @@ -1968,8 +1974,8 @@ def plot_cluster(self, cond_dict: dict = None): ch_inds = np.unique(space_inds) time_inds = np.unique(time_inds) - # get topography for F stat - t_map = self.T_obs[time_inds, ...].mean(axis=0) + # get topography for t stat + t_map = self.T_obs[time_inds, ...].mean(axis=0).astype(int) # get signals at the sensors contributing to the cluster sig_times = cond_values[0][0].times[time_inds] @@ -1987,11 +1993,11 @@ def plot_cluster(self, cond_dict: dict = None): times=0, mask=mask, axes=ax_topo, - cmap="Reds", - vlim=(np.min, np.max), + cmap="RdBu_r", show=False, colorbar=False, mask_params=dict(markersize=10), + scalings=1.00, ) image = ax_topo.images[0] @@ -2008,32 +2014,33 @@ def plot_cluster(self, cond_dict: dict = None): divider = make_axes_locatable(ax_topo) # add axes for colorbar - ax_colorbar = divider.append_axes("right", size="5%", pad=0.05) - plt.colorbar(image, cax=ax_colorbar) + ax_colorbar = divider.append_axes("right", size="5%", pad=0.1) + cbar = plt.colorbar(image, cax=ax_colorbar) + cbar.set_label("t-value") ax_topo.set_xlabel( - "Averaged t-map ({:0.3f} - {:0.3f} s)".format(*sig_times[[0, -1]]) + "average from {:0.3f} to {:0.3f} s".format(*sig_times[[0, -1]]) ) # add new axis for time courses and plot time courses - ax_signals = divider.append_axes("right", size="300%", pad=1.2) - title = f"Cluster #1, {len(ch_inds)} sensor" - if len(ch_inds) > 1: - title += "s (mean)" - plot_compare_evokeds( - cond_dict, - title=title, - picks=ch_inds, - axes=ax_signals, - colors=colors, - show=False, - split_legend=True, - truncate_yaxis="auto", - ) + ax_signals = divider.append_axes("right", size="300%", pad=1.3) + title = f"Signal averaged over {len(ch_inds)} sensor(s)" + plot_compare_evokeds( + cond_dict, + title=title, + picks=ch_inds, + axes=ax_signals, + colors=colors, + show=False, + split_legend=True, + truncate_yaxis="auto", + truncate_xaxis=False, + ) + plt.legend(frameon=False, loc="upper left") # plot temporal cluster extent ymin, ymax = ax_signals.get_ylim() ax_signals.fill_betweenx( - (ymin, ymax), sig_times[0], sig_times[-1], color="green", alpha=0.3 + (ymin, ymax), sig_times[0], sig_times[-1], color="grey", alpha=0.3 ) plt.show() diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py 
b/tutorials/stats-sensor-space/76_new_cluster_test_api.py
index 3acfd21f7f0..842e0543b0b 100644
--- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py
+++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py
@@ -6,8 +6,9 @@
 ===============================================================
 
 This tutorial shows how to use the new API for cluster testing.
-This script shows how to estimate significant clusters in
-evoked contrast data of multiple subjects.
+The new API supports Wilkinson-style formulas and allows for more flexibility in
+the design of the test. Here we will demonstrate how to use the new API for
+a standard paired t-test on evoked data from multiple subjects.
 It uses a non-parametric statistical procedure based on permutations and
 cluster level statistics.
 
@@ -16,7 +17,7 @@
   - loading evoked data from multiple subjects
   - construct a dataframe that contains the difference between conditions
   - run the new cluster test function with formula in Wilkinson notation
-  - plot the results with the ClusterResults Class
+  - plot the results with the new ClusterResults API
 
 Here, the unit of observation are evokeds
 from multiple subjects (2nd level analysis).
@@ -41,13 +42,14 @@
 # Define the path to the P3 dataset
 path_to_p3 = mne.datasets.misc.data_path() / "ERP_CORE" / "P3"
 
-# Define the range of participant IDs
-participant_ids = range(15, 20)  # This will cover 015 to 019
+# Define the range of participant IDs (we only have 5 participants in the dataset)
+participant_ids = range(15, 20)  # This will cover participants 15 to 19
 
 # store the evoked data of all subjects
 evokeds_allsubs = []
 
 # Loop over each participant ID and generate the corresponding filename
+# to load the evoked data
 for pid in participant_ids:
     # Create the filename using an f-string, ID is zero-padded to 3 digits
     filename_p3 = f"sub-{pid:03d}_ses-P3_task-P3_ave.fif"
@@ -58,21 +60,22 @@
     # load the evoked data
     evokeds = mne.read_evokeds(p3_file_path)
 
-    # add subjects evoked data to list
+    # add each single subject's evoked data to a list
     evokeds_allsubs.append(evokeds)
 
 # the P3b dataset is part of the freely available ERP CORE dataset
 # participants were presented with a visual oddball task
 # and the P3b component was analyzed
 # the conditions of interest are the target (rare visual stimuli)
-# and non-target stimuli (frequency visual stimuli)
+# and non-target stimuli (frequent visual stimuli)
 
 # let's extract the target and non-target evokeds
 target_only = [evoked[0] for evoked in evokeds_allsubs]
 non_target_only = [evoked[1] for evoked in evokeds_allsubs]
 
 # let's first have a look at the data
-# create contrast from target and non-target evokeds
+
+# create contrast target - non-target
 diff_evoked = [
     mne.combine_evoked([evokeds_a, evokeds_b], weights=[1, -1])
     for evokeds_a, evokeds_b in zip(target_only, non_target_only)
@@ -84,7 +87,7 @@
 mne.grand_average(diff_evoked).plot_topomap()
 
 # we can see that the strongest difference is around 400 ms in
-# visual channels (occipital region)
+# central-parietal channels with a stronger evoked signal for target stimuli
 
 # Next we prepare a dataframe for the cluster test function
 # the dataframe should contain the contrast evoked data and the subject index
@@ -93,7 +96,7 @@
 # save the evoked data for both conditions in one list
 evokeds_conditions = target_only + non_target_only
 
-# set up a list that defines the condition for each evoked data
+# create a list that defines the condition for each evoked data
 # this will be used to create the conditions column in the dataframe
conditions = ["target"] * len(target_only) + ["non-target"] * len(non_target_only) @@ -102,7 +105,7 @@ # we multiply the participant_ids by 2 to account for the two conditions subject_index = list(participant_ids) * 2 -# create the dataframe +# create the dataframe containing the evoked data, the condition and the subject index df = pd.DataFrame( { "evoked": evokeds_conditions, @@ -122,20 +125,21 @@ # let's first define the formula based on Wilkinson notation formula = "evoked ~ 1 + C(subject_index)" -# run the cluster test and return the cluster_result object +# run the new cluster test API and return the new cluster_result object cluster_result = mne.stats.cluster_level.cluster_test(df=df, formula=formula) # note that we ran an exact test due to the small sample size (only 15 permutations) # set up conditions dictionary for cluster plots +# this is necessary for plotting the evoked data and the cluster result on top conditions_dict = {"target": target_only, "non-target": non_target_only} # finally let's plot the results using the ClusterResults class # we plot the cluster with the lowest p-value - +cluster_result.plot_cluster(cond_dict=conditions_dict) # we can see that there is something going on around 400 ms -# in the visual channels (topomap on the left) +# with a stronger signal for target trials in right central-parietal channels + # however the cluster is not significant which is unsurprising # given the small sample size (only 5 subjects) -cluster_result.plot_cluster(cond_dict=conditions_dict) From 8f510a9fdca0ba6411795d79a871b3cfd9c21a6a Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Mon, 22 Jul 2024 20:22:20 +0200 Subject: [PATCH 19/88] first review --- mne/stats/cluster_level.py | 384 ++++++++++++++++++++++++++----------- 1 file changed, 272 insertions(+), 112 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index 0d3ecec2e58..ab0fd0daf69 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -11,6 +11,10 @@ # License: BSD-3-Clause # Copyright the MNE-Python contributors. +from __future__ import annotations + +from typing import Literal + import matplotlib.pyplot as plt import numpy as np import pandas as pd @@ -20,12 +24,13 @@ from scipy.stats import f as fstat from scipy.stats import t as tstat -from .. import EvokedArray -from ..channels import find_ch_adjacency +from .. import Epochs, Evoked +from ..epochs import EpochsArray, EvokedArray from ..fixes import has_numba, jit from ..parallel import parallel_func from ..source_estimate import MixedSourceEstimate, SourceEstimate, VolSourceEstimate from ..source_space import SourceSpaces +from ..time_frequency import AverageTFR, AverageTFRArray, EpochsTFR, EpochsTFRArray from ..utils import ( ProgressBar, _check_option, @@ -945,7 +950,7 @@ def _permutation_cluster_test( sample_shape = X[0].shape[1:] for x in X: if x.shape[1:] != sample_shape: - raise ValueError("All samples mush have the same size") + raise ValueError("All samples must have the same size") # flatten the last dimensions in case the data is high dimensional X = [np.reshape(x, (x.shape[0], -1)) for x in X] @@ -1739,21 +1744,186 @@ def summarize_clusters_stc( return klass(data_summary, vertices, tmin, tstep, subject) +def validate_input_dataframe(df: pd.DataFrame, formula: str): + """ + Validate the input dataframe for the cluster permutation test. + + Parameters + ---------- + df : pd.DataFrame + Dataframe with 3 columns (subject_index, condition, data). 
+ formula : formulaic.ModelSpec + Wilkinson style Formula for the design matrix. + + Returns + ------- + dv_name : str + Name of the dependent variable. + """ + # extract dependent variable name from formula + formulaic = _soft_import( + "formulaic", purpose="set up Design Matrix" + ) # soft import (not a dependency for MNE) + formula = formulaic.Formula(formula) + dv_name = str(formula.lhs) + + # check if all necessary columns are present + if dv_name not in df.columns: + raise ValueError("""DataFrame needs to contain a column + with the dependent variable name + as defined in the formula""") + if "condition" not in df.columns: + raise ValueError("DataFrame needs to contain a condition column") + if "subject_index" not in df.columns: + raise ValueError("DataFrame needs to contain a subject_index column") + + # check if the data column contains only valid types + check_column_types(df[dv_name]) + + # check if the shape of the data is consistent + if not all(data.data.shape == df[dv_name][0].data.shape for data in df[dv_name]): + raise ValueError("Data objects need to have the same shape") + + # check if the condition column contains only 2 unique values + if len(pd.unique(df.condition)) != 2: + raise ValueError("currently only supports 2 conditions.") + + return dv_name + + +def check_column_types(input_data: np.ndarray): + """ + Check if the column types are valid for the cluster permutation test. + + Parameters + ---------- + input_data : np.Array + Data to be checked for the cluster permutation test. + """ + # Get the type of the first element + first_type = type(input_data.iloc[0]) + + # Define the possible valid types + valid_types = ( + Evoked, + EvokedArray, + Epochs, + EpochsArray, + AverageTFR, + EpochsTFR, + EpochsTFRArray, + AverageTFRArray, + ) + + # Check if the type of the first element is a valid type + if first_type not in valid_types: + raise ValueError(f"Object type '{first_type}' is not a valid type.") + + # Check if all elements are of the same type as the first one + if not all(isinstance(data, first_type) for data in input_data): + raise ValueError("Data column must contain objects of the same type.") + + +def prepare_data_for_cluster_test(input_df: pd.DataFrame, dv_name: str): + """ + Prepare the data for the cluster permutation test. + + Parameters + ---------- + input_data : np.ndarray + Data to be prepared for the cluster permutation test. + + Returns + ------- + data : np.Array + Data prepared for the cluster permutation test. 
+ """ + # extract data and add to dataframe + input_df["data"] = [data.data for data in input_df[dv_name]] + + # extract dimensions from time series or time-frequency data + first_data_obj = input_df["data"].iloc[0] + if isinstance(first_data_obj, (Epochs, Evoked, EpochsArray, EvokedArray)): + n_channels, n_timepoints = first_data_obj.get_data().shape + if isinstance( + first_data_obj, (AverageTFR, EpochsTFR, AverageTFRArray, EpochsTFRArray) + ): + n_channels, n_freqs, n_timepoints = first_data_obj.get_data().shape + + reshaped_data = [] + + for idx, row in input_df.iterrows(): + subject_index = row["subject_index"] + condition = row["condition"] + data_array = row["data"] + + if data_array.ndim == 2: + n_channels, n_timepoints = data_array.shape + # timepoints are the columns + df_temp = pd.DataFrame( + data_array, columns=[f"timepoint_{i}" for i in range(n_timepoints)] + ) + df_temp["channel"] = range(n_channels) + df_temp["subject_index"] = subject_index + df_temp["condition"] = condition + + reshaped_data.append(df_temp) + + elif data_array.ndim == 3: + n_channels, n_freqs, n_timepoints = data_array.shape + # timepoints are the columns + df_temp = pd.DataFrame( + data_array.reshape(-1, n_timepoints), + columns=[f"timepoint_{i}" for i in range(n_timepoints)], + ) + df_temp["frequency"] = np.repeat(range(n_freqs), n_channels) + df_temp["channel"] = np.tile(range(n_channels), n_freqs) + df_temp["subject_index"] = subject_index + df_temp["condition"] = condition + + reshaped_data.append(df_temp) + + else: + raise ValueError(f"Unsupported data array dimensions: {data_array.ndim}") + # combine the reshaped data + combined_df = pd.concat(reshaped_data, ignore_index=True) + # Convert the dataframe to long format + id_vars = ["subject_index", "condition", "channel"] + if "frequency" in combined_df.columns: + id_vars.append("frequency") + + reshaped_df = pd.melt( + combined_df, id_vars=id_vars, var_name="timepoint", value_name="value" + ) + + # rename column and convert to integer + reshaped_df["timepoint"] = ( + reshaped_df["timepoint"].str.replace("timepoint_", "").astype(int) + ) + + # return the reshaped dataframe and dimensions + if data_array.ndim == 2: + return reshaped_df, data_array.ndim, n_channels, n_timepoints + elif data_array.ndim == 3: + return reshaped_df, data_array.ndim, n_channels, n_freqs, n_timepoints + + def cluster_test( df: pd.DataFrame, - formula: str = None, # Wilkinson notation formula for design matrix - n_permutations: int = 10000, + formula: str, # Wilkinson notation formula for design matrix + paired_test: bool, # whether to run a paired t-test or unpaired test + n_permutations: int = 1024, # same default as in old API seed: None | int | np.random.RandomState = None, - tail: int = 0, # 0 for two-tailed, 1 for greater, -1 for less + tail: Literal[-1, 0, 1] = 0, # 0 for two-tailed, 1 for greater, -1 for less n_jobs: int = 1, # how many cores to use - adjacency: tuple = None, + adjacency: tuple | None = None, max_step: int = 1, # maximum distance between samples (time points) - exclude: list = None, # exclude no time points or channels + exclude: list | None = None, # exclude no time points or channels step_down_p: int = 0, # step down in jumps test t_power: int = 1, # weigh each location by its stats score - out_type: str = "indices", + out_type: Literal["indices", "mask"] = "indices", check_disjoint: bool = False, - buffer_size: int = None, # block size for chunking the data + buffer_size: int | None = None, # block size for chunking the data ): """ Run a cluster 
permutation test based on formulaic input.
@@ -1762,12 +1932,14 @@
 
     Parameters
     ----------
-    dataframe : pd.DataFrame
-        Dataframe with evoked/epoched data, conditions and subject IDs.
-    formula : str, optional
-        Wilkinson notation formula for design matrix. Default is None.
+    df : pd.DataFrame
+        Dataframe with 3 columns (subject_index, condition, evoked).
+    formula : str
+        Wilkinson notation formula for design matrix.
+    paired_test : bool
+        Whether to run a paired t-test.
     n_permutations : int, optional
-        Number of permutations. Default is 10000.
+        Number of permutations. Default is 1024.
     seed : None | int | np.random.RandomState, optional
        Seed for the random number generator. Default is None.
     tail : int, optional
@@ -1775,7 +1947,7 @@
     n_jobs : int, optional
         How many cores to use. Default is 1.
     adjacency : None, optional
-        Adjacency matrix. Default is None.
+        Provide an adjacency matrix. Default is None.
     max_step : int, optional
         Maximum distance between samples (time points). Default is 1.
     exclude : np.Array, optional
@@ -1798,27 +1970,38 @@
     ClusterResult
         Object containing the results of the cluster permutation test.
     """
-    # for now this assumes a dataframe with a column for evoked data or epochs
-    # add a data column to the dataframe (numpy array)
-    df["data"] = [evoked.data for evoked in df.evoked]
-
-    # extract number of channels and timepoints
-    # (eventually should also allow for frequency)
-    n_channels, n_timepoints = df["data"][0].shape
-
-    # convert wide format to long format for formulaic
-    df_long = unpack_time_and_channels(df)
-
-    # pivot the DataFrame
-    pivot_df = df_long.pivot_table(
-        index=["subject_index", "channel", "timepoint"],
-        columns="condition",
-        values="value",
-    ).reset_index()
-
-    # if not 2 unique conditions raise error
-    if len(pd.unique(df.condition)) != 2:
-        raise ValueError("Condition list needs to contain 2 unique values")
+    # check if formula is present
+    if formula is None:
+        raise ValueError("Wilkinson style formula is required.")
+
+    # validate the input dataframe and return name of dependent variable
+    dv_name = validate_input_dataframe(df, formula)
+
+    # prepare the data for the cluster permutation test
+    prep_result = prepare_data_for_cluster_test(df, dv_name)
+
+    if prep_result[1] == 2:
+        # pivot the dataframe based on condition for later subtraction
+        pivot_df = (
+            prep_result[0]
+            .pivot_table(
+                index=["subject_index", "channel", "timepoint"],
+                columns="condition",
+                values="value",
+            )
+            .reset_index()
+        )
+    elif prep_result[1] == 3:
+        # pivot the dataframe based on condition for later subtraction
+        pivot_df = (
+            prep_result[0]
+            .pivot_table(
+                index=["subject_index", "channel", "frequency", "timepoint"],
+                columns="condition",
+                values="value",
+            )
+            .reset_index()
+        )
@@ -1826,41 +2009,51 @@
     # Get unique elements and the indices of their first occurrences
     unique_elements, indices = np.unique(df.condition, return_index=True)
 
     # Sort unique elements by the indices of their first occurrences
     conditions = unique_elements[np.argsort(indices)]
 
-    # print the contrast used for the paired t-test
-    print(f"Contrast used for paired t-test: {conditions[0]} - {conditions[1]}")
+    # store the contrast for the ClusterResult object
+    contrast = f"{conditions[0]} - {conditions[1]}"
 
-    # Compute the difference (assuming there are only 2 conditions)
-    pivot_df["evoked"] = pivot_df[conditions[0]] - pivot_df[conditions[1]]
+    # print the contrast used for the paired t-test so the user knows
+    # 
what is subtracted from what + logger.info(f"Contrast used for paired t-test: {contrast}") - # Optional: Clean up the DataFrame - pivot_df = pivot_df[["subject_index", "channel", "timepoint", "evoked"]] + # Compute the difference (assuming there are only 2 conditions) + pivot_df[dv_name] = pivot_df[conditions[0]] - pivot_df[conditions[1]] + + # for the paired t-test y is the difference between conditions + # X is the design matrix with a column with 1s and 0s for each participant + # Create the design matrix using formulaic + formulaic = _soft_import( + "formulaic", purpose="set up Design Matrix" + ) # soft import (not a dependency for MNE) + y, X = formulaic.model_matrix(formula, pivot_df) + + # Prepare design matrix for input into MNE cluster function + # MNE cluster functions expect channels as the last dimension + + if prep_result[1] == 2: + # Reshape y.values into a 3D array: (participants, n_channels, n_timepoints) + y_reshaped = y.values.reshape(-1, prep_result[2], prep_result[3]) + # Transpose the array to have channels as the last dimension + y_for_cluster = y_reshaped.transpose(0, 2, 1) + elif prep_result[1] == 3: + # Reshape y.values into a 4D array: + # (participants, n_channels, n_freqs, n_timepoints) + y_reshaped = y.values.reshape( + -1, prep_result[2], prep_result[3], prep_result[4] + ) + # Transpose the array to have channels as the last dimension + y_for_cluster = y_reshaped.transpose(0, 3, 2, 1) - # check if formula is present - if formula is not None: - formulaic = _soft_import( - "formulaic", purpose="set up Design Matrix" - ) # soft import (not a dependency for MNE) - - # for the paired t-test y is the difference between conditions - # X is the design matrix with a column with 1s and 0s for each participant - # Create the design matrix using formulaic - y, X = formulaic.model_matrix(formula, pivot_df) + if paired_test: + # define stat function and threshold + stat_fun, threshold = _check_fun( + X=y_for_cluster, stat_fun=None, threshold=None, tail=0, kind="within" + ) else: - raise ValueError( - "Formula is required and needs to be a string in Wilkinson notation." + # define stat function and threshold + stat_fun, threshold = _check_fun( + X=y_for_cluster, stat_fun=None, threshold=None, tail=0, kind="between" ) - - # now prep design matrix for input into MNE cluster function - # cluster functions expects channels as list dimension - y_for_cluster = y.values.reshape(-1, n_channels, n_timepoints).transpose(0, 2, 1) - - adjacency, _ = find_ch_adjacency(df["evoked"][0].info, ch_type="eeg") - - # define stat function and threshold - stat_fun, threshold = _check_fun( - X=y_for_cluster, stat_fun=None, threshold=None, tail=0, kind="within" - ) - # Run the cluster-based permutation test T_obs, clusters, cluster_p_values, H0 = _permutation_cluster_test( [y_for_cluster], @@ -1885,40 +2078,6 @@ def cluster_test( return ClusterResult(T_obs, clusters, cluster_p_values, H0) -def unpack_time_and_channels(df: pd.DataFrame = None) -> pd.DataFrame: - """ - Extract timepoints and channels and convert to long. - - Parameters - ---------- - df : pd.DataFrame - DataFrame in wide format. - - Returns - ------- - df_long : pd.DataFrame - DataFrame in long format. 
- """ - # Extracting all necessary data using list comprehensions for better performance - long_format_data = [ - { - "condition": row["condition"], - "subject_index": row["subject_index"], - "channel": channel, - "timepoint": timepoint, - "value": row["data"][channel, timepoint], - } - for idx, row in df.iterrows() - for channel in range(row["data"].shape[0]) - for timepoint in range(row["data"].shape[1]) - ] - - # Creating the long format DataFrame - df_long = pd.DataFrame(long_format_data) - - return df_long - - class ClusterResult: """ Object containing the results of the cluster permutation test. @@ -1935,13 +2094,19 @@ class ClusterResult: Max cluster level stats observed under permutation. """ - def __init__(self, T_obs, clusters, cluster_p_values, H0): + def __init__( + self, + T_obs: np.typing.NDArray, + clusters: list, + cluster_p_values: np.typing.NDArray, + H0: np.typing.NDArray, + ): self.T_obs = T_obs self.clusters = clusters self.cluster_p_values = cluster_p_values self.H0 = H0 - def plot_cluster(self, cond_dict: dict = None): + def plot_cluster(self, condition_labels: dict): """ Plot the cluster with the lowest p-value. @@ -1951,18 +2116,13 @@ def plot_cluster(self, cond_dict: dict = None): Parameters ---------- - cond_dict : dict + condition_labels : dict Dictionary with condition labels as keys and evoked objects as values. - - Returns - ------- - None - """ # extract condition labels from the dictionary - cond_keys = list(cond_dict.keys()) + cond_keys = list(condition_labels.keys()) # extract the evokeds from the dictionary - cond_values = list(cond_dict.values()) + cond_values = list(condition_labels.values()) # configure variables for visualization colors = {cond_keys[0]: "crimson", cond_keys[1]: "steelblue"} @@ -2025,7 +2185,7 @@ def plot_cluster(self, cond_dict: dict = None): ax_signals = divider.append_axes("right", size="300%", pad=1.3) title = f"Signal averaged over {len(ch_inds)} sensor(s)" plot_compare_evokeds( - cond_dict, + condition_labels, title=title, picks=ch_inds, axes=ax_signals, From d6c0c4c299b3aeaa9089f923aa9017de0b12e42d Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Mon, 22 Jul 2024 20:44:43 +0200 Subject: [PATCH 20/88] quick clean up --- .../76_new_cluster_test_api.py | 27 ++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 842e0543b0b..efbc6d5e3f0 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -29,7 +29,7 @@ # License: BSD-3-Clause # Copyright the MNE-Python contributors. 
-# %% +# %% Load the required packages from pathlib import Path @@ -37,6 +37,8 @@ import mne +# %% Load the P3 dataset + # Set parameters # -------------- # Define the path to the P3 dataset @@ -69,6 +71,8 @@ # the conditions of interest are the target (rare visual stimuli) # and non-target stimuli (frequent visual stimuli) +# %% visually inspect the evoked data for each condition + # let's extract the target and non-target evokeds target_only = [evoked[0] for evoked in evokeds_allsubs] non_target_only = [evoked[1] for evoked in evokeds_allsubs] @@ -89,7 +93,8 @@ # we can see that the strongest difference is around 400 ms in # central-parietal channels with a stronger evoked signal for target stimuli -# Next we prepare a dataframe for the cluster test function +# %% Prepare the dataframe for the new cluster test API + # the dataframe should contain the contrast evoked data and the subject index # each row in the dataframe should represent one observation (evoked data) @@ -114,7 +119,8 @@ } ) -# now we can run the cluster test function +# %% run the cluster test function with formulaic input + # we will use the new API that allows for Wilkinson style formulas # the formula should be a string in Wilkinson notation @@ -123,12 +129,21 @@ # we will use a cluster-based permutation paired t-test for this # let's first define the formula based on Wilkinson notation +# we want to predict the evoked difference signal based on the subject +# the cluster test randomly permutes the subject label +# the 1 in the formula represents the intercept which is always included +# C is a categorical variable that will be dummy coded formula = "evoked ~ 1 + C(subject_index)" # run the new cluster test API and return the new cluster_result object -cluster_result = mne.stats.cluster_level.cluster_test(df=df, formula=formula) +cluster_result = mne.stats.cluster_level.cluster_test( + df=df, formula=formula, paired_test=True, adjacency=None +) + +# note that we ran an exact test due to the small sample size +# (only 15 permutations) -# note that we ran an exact test due to the small sample size (only 15 permutations) +# %% plot the results # set up conditions dictionary for cluster plots # this is necessary for plotting the evoked data and the cluster result on top @@ -137,7 +152,7 @@ # finally let's plot the results using the ClusterResults class # we plot the cluster with the lowest p-value -cluster_result.plot_cluster(cond_dict=conditions_dict) +cluster_result.plot_cluster(condition_labels=conditions_dict) # we can see that there is something going on around 400 ms # with a stronger signal for target trials in right central-parietal channels From f17f38fffc5b7c8ae0b19dec319df68a9d75df0f Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Mon, 22 Jul 2024 21:52:00 +0200 Subject: [PATCH 21/88] test compare_old_vs_new_cluster_API --- mne/stats/tests/test_cluster_level.py | 150 +++++++++++++++++++++++++- 1 file changed, 149 insertions(+), 1 deletion(-) diff --git a/mne/stats/tests/test_cluster_level.py b/mne/stats/tests/test_cluster_level.py index 439045b8d08..2307f793dad 100644 --- a/mne/stats/tests/test_cluster_level.py +++ b/mne/stats/tests/test_cluster_level.py @@ -8,6 +8,7 @@ from functools import partial import numpy as np +import pandas as pd import pytest from numpy.testing import ( assert_allclose, @@ -17,10 +18,20 @@ ) from scipy import linalg, sparse, stats -from mne import MixedSourceEstimate, SourceEstimate, SourceSpaces, VolSourceEstimate +from mne import ( + EvokedArray, + MixedSourceEstimate, + 
SourceEstimate,
+    SourceSpaces,
+    VolSourceEstimate,
+    create_info,
+)
 from mne.fixes import _eye_array
 from mne.stats import combine_adjacency, ttest_ind_no_p
 from mne.stats.cluster_level import (
+    _check_fun,
+    _permutation_cluster_test,
+    cluster_test,
     f_oneway,
     permutation_cluster_1samp_test,
     permutation_cluster_test,
@@ -29,6 +40,7 @@
     summarize_clusters_stc,
     ttest_1samp_no_p,
 )
+from mne.time_frequency import AverageTFRArray
 from mne.utils import _record_warnings, catch_logging
 
 n_space = 50
@@ -869,3 +881,139 @@ def test_output_equiv(shape, out_type, adjacency, threshold):
     assert out_type == "indices"
     got_mask[np.ix_(*clu)] = n
     assert_array_equal(got_mask, want_mask)
+
+
+def create_sample_data_cluster_test():
+    """Create sample data to test new cluster API."""
+    # Prepare some dummy data
+    n_subjects = 20
+    n_conditions = 2
+    n_channels = 5
+    n_timepoints = 8
+    n_freqs = 3
+
+    # Create dummy data
+    dummy_data_2d = [
+        np.random.rand(n_channels, n_timepoints)
+        for _ in range(n_subjects * n_conditions)
+    ]
+    dummy_data_3d = [
+        np.random.rand(n_channels, n_freqs, n_timepoints)
+        for _ in range(n_subjects * n_conditions)
+    ]
+
+    # Create a DataFrame with dummy data
+    df_2d = pd.DataFrame(
+        {
+            "subject_index": np.repeat(range(n_subjects), n_conditions),
+            "condition": np.tile(["cond1", "cond2"], n_subjects),
+            "data": dummy_data_2d,
+        }
+    )
+
+    df_3d = pd.DataFrame(
+        {
+            "subject_index": np.repeat(range(n_subjects), n_conditions),
+            "condition": np.tile(["cond1", "cond2"], n_subjects),
+            "data": dummy_data_3d,
+        }
+    )
+
+    return df_2d, df_3d
+
+
+def compare_old_and_new_cluster_api():
+    """Make sure old and new cluster API results are the same."""
+    # load sample data
+    df_2d, df_3d = create_sample_data_cluster_test()
+
+    # mandatory parameters for new cluster API
+    formula = "evoked ~ 1 + C(subject_index)"
+
+    data_to_test = [df_2d, df_3d]
+
+    # save 2D and 3D data results for both old and new API
+    result_old_api_all = []
+    result_new_api_all = []
+    d_all = []
+
+    for df in data_to_test:
+        # Pivot the DataFrame to have conditions as columns for old API
+        pivot_df = df.pivot(index="subject_index", columns="condition", values="data")
+
+        # Subtract condition 2 data from condition 1 data for each subject
+        pivot_df["cond_diff"] = pivot_df.apply(
+            lambda row: row["cond1"] - row["cond2"], axis=1
+        )
+
+        # Extract the 'cond_diff' column as a numpy array
+        cond_diff_array = np.stack(pivot_df["cond_diff"].values)
+
+        # extract data and reshape for old API
+        if pivot_df.cond_diff[0].ndim == 2:
+            # reshape to channels as last dimension
+            d = cond_diff_array.transpose(0, 2, 1)
+        else:
+            # reshape 3D data to channels as last dimension
+            d = cond_diff_array.transpose(0, 3, 2, 1)
+
+        # define test statistic
+        stat_fun, threshold = _check_fun(
+            X=d, stat_fun=None, threshold=None, tail=0, kind="within"
+        )
+
+        # Run old cluster api
+        result_old_api = _permutation_cluster_test(
+            [d],
+            threshold=threshold,
+            stat_fun=stat_fun,
+            n_jobs=-1,  # takes all CPU cores
+            max_step=1,  # maximum distance between samples (time points)
+            exclude=None,  # exclude no time points or channels
+            step_down_p=0,  # step down in jumps test
+            t_power=1,  # weigh each location by its stats score
+            out_type="indices",
+            check_disjoint=False,
+            buffer_size=None,  # block size for chunking the data
+            n_permutations=1024,
+            tail=0,
+            adjacency=None,
+            seed=42,
+        )
+        result_old_api_all.append(result_old_api)
+        d_all.append(d)
+
+        if df.data[0].ndim == 2:
+            # convert each row in data column into evoked object
+            df["evoked"] = 
df["data"].apply( + lambda x: EvokedArray( + x, create_info(df.data[0].shape[0], 1000.0, "eeg") + ) + ) + else: + # convert each row in data column into evoked object + df["evoked"] = df["data"].apply( + lambda x: AverageTFRArray( + create_info(df.data[0].shape[0], 1000.0, "eeg"), + x, + times=np.arange(df.data[0].shape[2]), + freqs=np.arange(df.data[0].shape[1]), + ) + ) + + # run the new cluster test API and return the new cluster_result object + cluster_result = cluster_test( + df=df, formula=formula, paired_test=True, adjacency=None, seed=42 + ) + result_new_api_all.append(cluster_result) + + # compare old and new API results both for 2D and 3D data + for result_old_api, result_new_api in zip(result_old_api_all, result_new_api_all): + # compare the cluster statistics + assert_array_equal(result_old_api[0], result_new_api.T_obs) + + # compare the cluster indices + assert_array_equal(result_old_api[1], result_new_api.clusters) + + # compare the cluster p-values + assert_array_equal(result_old_api[2], result_new_api.cluster_p_values) From 9d592de86e56412dfc69433e0c9116589f4fde1f Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Thu, 25 Jul 2024 12:17:36 -0500 Subject: [PATCH 22/88] simplify tests Co-authored-by: Carina Forster --- mne/stats/tests/test_cluster_level.py | 136 ++++++++------------------ 1 file changed, 41 insertions(+), 95 deletions(-) diff --git a/mne/stats/tests/test_cluster_level.py b/mne/stats/tests/test_cluster_level.py index 2307f793dad..01fcd5adba6 100644 --- a/mne/stats/tests/test_cluster_level.py +++ b/mne/stats/tests/test_cluster_level.py @@ -19,6 +19,7 @@ from scipy import linalg, sparse, stats from mne import ( + EpochsArray, EvokedArray, MixedSourceEstimate, SourceEstimate, @@ -29,8 +30,6 @@ from mne.fixes import _eye_array from mne.stats import combine_adjacency, ttest_ind_no_p from mne.stats.cluster_level import ( - _check_fun, - _permutation_cluster_test, cluster_test, f_oneway, permutation_cluster_1samp_test, @@ -40,7 +39,7 @@ summarize_clusters_stc, ttest_1samp_no_p, ) -from mne.time_frequency import AverageTFRArray +from mne.time_frequency import AverageTFRArray, EpochsTFRArray from mne.utils import _record_warnings, catch_logging n_space = 50 @@ -922,98 +921,45 @@ def create_sample_data_cluster_test(): return df_2d, df_3d -def compare_old_and_new_cluster_api(): - """Make sure old and new cluster API results are the same.""" - # load sample data - df_2d, df_3d = create_sample_data_cluster_test() - - # mandatory parameters for new cluster API - formula = "evoked ~ 1 + C(subject_index)" - - data_to_test = [df_2d, df_3d] - - # save 2D and 3D data results for both old and new API - result_old_api_all = [] - result_new_api_all = [] - d_all = [] - - for df in data_to_test: - # Pivot the DataFrame to have conditions as columns for old API - pivot_df = df.pivot(index="subject_index", columns="condition", values="data") - - # Subtract condition 2 data from condition 1 data for each subject - pivot_df["cond_diff"] = pivot_df.apply( - lambda row: row["cond1"] - row["cond1"], axis=1 - ) - - # Extract the 'cond_diff' column as a numpy array - cond_diff_array = np.stack(pivot_df["cond_diff"].values) - - # extract data and reshape for old API - if pivot_df.cond_diff[0].ndim == 2: - # reshape to channels as last dimension - d = cond_diff_array.transpose(0, 2, 1) - else: - # reshape 3D data to channels as last dimension - d = cond_diff_array.transpose(0, 3, 2, 1) - - # define test statistic - stat_fun, threshold = _check_fun( - X=d, stat_fun=None, threshold=None, tail=0, 
kind="within" - ) - - # Run old cluster api - result_old_api = _permutation_cluster_test( - [d], - threshold=threshold, - stat_fun=stat_fun, - n_jobs=-1, # takes all CPU cores - max_step=1, # maximum distance between samples (time points) - exclude=None, # exclude no time points or channels - step_down_p=0, # step down in jumps test - t_power=1, # weigh each location by its stats score - out_type="indices", - check_disjoint=False, - buffer_size=None, # block size for chunking the data - n_permutations=1024, - tail=0, - adjacency=None, - seed=42, - ) - result_old_api_all.append(result_old_api) - d_all.append(d) - - if df.data[0].ndim == 2: - # convert each row in data column into evoked object - df["evoked"] = df["data"].apply( - lambda x: EvokedArray( - x, create_info(df.data[0].shape[0], 1000.0, "eeg") - ) - ) - else: - # convert each row in data column into evoked object - df["evoked"] = df["data"].apply( - lambda x: AverageTFRArray( - create_info(df.data[0].shape[0], 1000.0, "eeg"), - x, - times=np.arange(df.data[0].shape[2]), - freqs=np.arange(df.data[0].shape[1]), - ) - ) - - # run the new cluster test API and return the new cluster_result object - cluster_result = cluster_test( - df=df, formula=formula, paired_test=True, adjacency=None, seed=42 +def test_compare_old_and_new_cluster_api(): + """Test for same results from old and new APIs.""" + condition1_1d, condition2_1d, condition1_2d, condition2_2d = _get_conditions() + df_1d = pd.DataFrame( + dict( + data=[condition1_1d, condition2_1d], + condition=["a", "b"], ) - result_new_api_all.append(cluster_result) - - # compare old and new API results both for 2D and 3D data - for result_old_api, result_new_api in zip(result_old_api_all, result_new_api_all): - # compare the cluster statistics - assert_array_equal(result_old_api[0], result_new_api.T_obs) + ) + kwargs = dict(n_permutations=100, tail=1, seed=1, buffer_size=None, out_type="mask") + F_obs, clusters, cluster_pvals, H0 = permutation_cluster_test( + [condition1_1d, condition2_1d], **kwargs + ) + formula = "data ~ condition" + cluster_result = cluster_test(df_1d, formula, **kwargs) + assert_array_equal(cluster_result.H0, H0) + assert_array_equal(cluster_result.stat_obs, F_obs) + assert_array_equal(cluster_result.cluster_p_values, cluster_pvals) + assert cluster_result.clusters == clusters - # compare the cluster indices - assert_array_equal(result_old_api[1], result_new_api.clusters) - # compare the cluster p-values - assert_array_equal(result_old_api[2], result_new_api.cluster_p_values) +@pytest.mark.parametrize( + "Inst", (EpochsArray, EvokedArray, EpochsTFRArray, AverageTFRArray) +) +def test_new_cluster_api(Inst): + """Test handling different MNE objects in the cluster API.""" + pd = pytest.importorskip("pandas") + + n_epo, n_chan, n_freq, n_times = 2, 3, 5, 7 + shape = (n_chan, n_times) + if Inst in (EpochsArray, EpochsTFRArray): + shape = (n_epo,) + shape + if Inst in (EpochsTFRArray, AverageTFRArray): + shape = shape[:-1] + (n_freq, shape[-1]) + + info = create_info(...) + inst1 = Inst(np.random.normal(shape, ...), info=info) + inst2 = Inst(np.random.normal(shape, ...), info=info) + + df = pd.DataFrame(dict(data=[inst1, inst2], condition=["a", "b"])) + result = cluster_test(df, "data~condition", ...) 
+ assert result # TODO do something more interesting here From d64ef84d6a1885fe225e19fe99e7cf3087550854 Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Thu, 25 Jul 2024 12:23:49 -0500 Subject: [PATCH 23/88] refactor cluster_test Co-authored-by: Eric Larson Co-authored-by: Carina Forster --- mne/stats/cluster_level.py | 419 ++++++++++++------------------------- 1 file changed, 139 insertions(+), 280 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index ab0fd0daf69..a366b19ecc1 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -24,13 +24,12 @@ from scipy.stats import f as fstat from scipy.stats import t as tstat -from .. import Epochs, Evoked -from ..epochs import EpochsArray, EvokedArray +from .. import BaseEpochs, Evoked, EvokedArray from ..fixes import has_numba, jit from ..parallel import parallel_func from ..source_estimate import MixedSourceEstimate, SourceEstimate, VolSourceEstimate from ..source_space import SourceSpaces -from ..time_frequency import AverageTFR, AverageTFRArray, EpochsTFR, EpochsTFRArray +from ..time_frequency import BaseTFR from ..utils import ( ProgressBar, _check_option, @@ -1744,191 +1743,65 @@ def summarize_clusters_stc( return klass(data_summary, vertices, tmin, tstep, subject) -def validate_input_dataframe(df: pd.DataFrame, formula: str): - """ - Validate the input dataframe for the cluster permutation test. - - Parameters - ---------- - df : pd.DataFrame - Dataframe with 3 columns (subject_index, condition, data). - formula : formulaic.ModelSpec - Wilkinson style Formula for the design matrix. - - Returns - ------- - dv_name : str - Name of the dependent variable. - """ - # extract dependent variable name from formula - formulaic = _soft_import( - "formulaic", purpose="set up Design Matrix" - ) # soft import (not a dependency for MNE) - formula = formulaic.Formula(formula) - dv_name = str(formula.lhs) - +def _validate_cluster_df(df: pd.DataFrame, dv_name: str, iv_name: str): # check if all necessary columns are present - if dv_name not in df.columns: - raise ValueError("""DataFrame needs to contain a column - with the dependent variable name - as defined in the formula""") - if "condition" not in df.columns: - raise ValueError("DataFrame needs to contain a condition column") - if "subject_index" not in df.columns: - raise ValueError("DataFrame needs to contain a subject_index column") - - # check if the data column contains only valid types - check_column_types(df[dv_name]) - + missing = ({dv_name} | {iv_name}) - set(df.columns) + sep = '", "' + if missing: + raise ValueError( + f"DataFrame must contain a column named for each term in `formula`. " + f"Column{_pl(missing)} missing for term{_pl(missing)} " + f'"{sep.join(missing)}".' + ) + # check if the data column contains valid (and consistent) instance types + inst = df[dv_name].iloc[0] + valid_types = (Evoked, BaseEpochs, BaseTFR, np.ndarray) + _validate_type(inst, valid_types, f"Data in dependent variable column '{dv_name}'") + all_types = set(df[dv_name].map(type)) + all_type_names = ", ".join([type(x).__name__ for x in all_types]) + prologue = f"Data in dependent variable column '{dv_name}' must all have " + if len(all_types) > 1: + raise ValueError( + f"{prologue} the same type, but found types {{{all_type_names}}}." 
+ ) # check if the shape of the data is consistent - if not all(data.data.shape == df[dv_name][0].data.shape for data in df[dv_name]): - raise ValueError("Data objects need to have the same shape") - - # check if the condition column contains only 2 unique values - if len(pd.unique(df.condition)) != 2: - raise ValueError("currently only supports 2 conditions.") - - return dv_name - - -def check_column_types(input_data: np.ndarray): - """ - Check if the column types are valid for the cluster permutation test. - - Parameters - ---------- - input_data : np.Array - Data to be checked for the cluster permutation test. - """ - # Get the type of the first element - first_type = type(input_data.iloc[0]) - - # Define the possible valid types - valid_types = ( - Evoked, - EvokedArray, - Epochs, - EpochsArray, - AverageTFR, - EpochsTFR, - EpochsTFRArray, - AverageTFRArray, - ) - - # Check if the type of the first element is a valid type - if first_type not in valid_types: - raise ValueError(f"Object type '{first_type}' is not a valid type.") - - # Check if all elements are of the same type as the first one - if not all(isinstance(data, first_type) for data in input_data): - raise ValueError("Data column must contain objects of the same type.") - - -def prepare_data_for_cluster_test(input_df: pd.DataFrame, dv_name: str): - """ - Prepare the data for the cluster permutation test. - - Parameters - ---------- - input_data : np.ndarray - Data to be prepared for the cluster permutation test. - - Returns - ------- - data : np.Array - Data prepared for the cluster permutation test. - """ - # extract data and add to dataframe - input_df["data"] = [data.data for data in input_df[dv_name]] - - # extract dimensions from time series or time-frequency data - first_data_obj = input_df["data"].iloc[0] - if isinstance(first_data_obj, (Epochs, Evoked, EpochsArray, EvokedArray)): - n_channels, n_timepoints = first_data_obj.get_data().shape - if isinstance( - first_data_obj, (AverageTFR, EpochsTFR, AverageTFRArray, EpochsTFRArray) - ): - n_channels, n_freqs, n_timepoints = first_data_obj.get_data().shape - - reshaped_data = [] - - for idx, row in input_df.iterrows(): - subject_index = row["subject_index"] - condition = row["condition"] - data_array = row["data"] - - if data_array.ndim == 2: - n_channels, n_timepoints = data_array.shape - # timepoints are the columns - df_temp = pd.DataFrame( - data_array, columns=[f"timepoint_{i}" for i in range(n_timepoints)] - ) - df_temp["channel"] = range(n_channels) - df_temp["subject_index"] = subject_index - df_temp["condition"] = condition - - reshaped_data.append(df_temp) - - elif data_array.ndim == 3: - n_channels, n_freqs, n_timepoints = data_array.shape - # timepoints are the columns - df_temp = pd.DataFrame( - data_array.reshape(-1, n_timepoints), - columns=[f"timepoint_{i}" for i in range(n_timepoints)], - ) - df_temp["frequency"] = np.repeat(range(n_freqs), n_channels) - df_temp["channel"] = np.tile(range(n_channels), n_freqs) - df_temp["subject_index"] = subject_index - df_temp["condition"] = condition - - reshaped_data.append(df_temp) - - else: - raise ValueError(f"Unsupported data array dimensions: {data_array.ndim}") - # combine the reshaped data - combined_df = pd.concat(reshaped_data, ignore_index=True) - # Convert the dataframe to long format - id_vars = ["subject_index", "condition", "channel"] - if "frequency" in combined_df.columns: - id_vars.append("frequency") - - reshaped_df = pd.melt( - combined_df, id_vars=id_vars, var_name="timepoint", value_name="value" - 
) - - # rename column and convert to integer - reshaped_df["timepoint"] = ( - reshaped_df["timepoint"].str.replace("timepoint_", "").astype(int) - ) - - # return the reshaped dataframe and dimensions - if data_array.ndim == 2: - return reshaped_df, data_array.ndim, n_channels, n_timepoints - elif data_array.ndim == 3: - return reshaped_df, data_array.ndim, n_channels, n_freqs, n_timepoints + if isinstance(inst, np.ndarray): + all_shapes = set(df[dv_name].map(lambda x: x.shape[1:])) # first dim may vary + elif isinstance(inst, BaseEpochs): + all_shapes = set(df[dv_name].map(lambda x: x.get_data().shape[1:])) + else: + all_shapes = set(df[dv_name].map(lambda x: x.get_data().shape)) + if len(all_shapes) > 1: + raise ValueError( + f"{prologue} consistent shape, but {len(all_shapes)} different " + f"shapes were found: {'; '.join(all_shapes)}." + ) + return all_types.pop() +@verbose def cluster_test( df: pd.DataFrame, - formula: str, # Wilkinson notation formula for design matrix - paired_test: bool, # whether to run a paired t-test or unpaired test - n_permutations: int = 1024, # same default as in old API - seed: None | int | np.random.RandomState = None, - tail: Literal[-1, 0, 1] = 0, # 0 for two-tailed, 1 for greater, -1 for less - n_jobs: int = 1, # how many cores to use + formula: str, + *, + within_id: str | None = None, + stat_fun: callable | None = None, + tail: Literal[-1, 0, 1] = 0, + threshold=None, + n_permutations: int = 1024, adjacency: tuple | None = None, - max_step: int = 1, # maximum distance between samples (time points) - exclude: list | None = None, # exclude no time points or channels - step_down_p: int = 0, # step down in jumps test - t_power: int = 1, # weigh each location by its stats score - out_type: Literal["indices", "mask"] = "indices", + max_step: int = 1, + exclude: list | None = None, + step_down_p: int = 0, + t_power: int = 1, check_disjoint: bool = False, - buffer_size: int | None = None, # block size for chunking the data + out_type: Literal["indices", "mask"] = "indices", + seed: None | int | np.random.RandomState = None, + buffer_size: int | None = None, + n_jobs: int = 1, + verbose=None, ): - """ - Run a cluster permutation test based on formulaic input. - - # currently only supports paired t-test on evokeds or epochs + """Run a cluster permutation test from a DataFrame and a formula. Parameters ---------- @@ -1936,16 +1809,14 @@ def cluster_test( Dataframe with 3 columns (subject_index, condition, evoked). formula : str Wilkinson notation formula for design matrix. - paired_test: bool - Whether to run a paired t-test. - n_permutations : int, optional - Number of permutations. Default is 1024. - seed : None | int | np.random.RandomState, optional - Seed for the random number generator. Default is None. + within_id : None | str + Name of column in ``df`` to use in identifying within-group contrasts. + stat_fun : None | callable + Statistical function to use. tail : int, optional 0 for two-tailed, 1 for greater, -1 for less. Default is 0. - n_jobs : int, optional - How many cores to use. Default is 1. + n_permutations : int, optional + Number of permutations. Default is 1024. adjacency : None, optional Provide a adjacency matrix. Default is None. max_step : int, optional @@ -1956,107 +1827,86 @@ def cluster_test( Step down in jumps test. Default is 0. t_power : int, optional Weigh each location by its stats score. Default is 1. - out_type : str, optional - Output type. Default is "indices". check_disjoint : bool, optional Check if clusters are disjoint. 
Default is False. + out_type : str, optional + Output type. Default is "indices". + seed : None | int | np.random.RandomState, optional + Seed for the random number generator. Default is None. buffer_size : int, optional Block size for chunking the data. Default is None. - seed : int, optional - Seed for the random number generator. Default is None. + n_jobs : int, optional + How many cores to use. Default is 1. + %(verbose)s Returns ------- ClusterResult Object containing the results of the cluster permutation test. """ - # check if formula is present - if formula is None: - raise ValueError("Wilkinson style formula is required.") - - # validate the input dataframe and return name of dependent variable - dv_name = validate_input_dataframe(df, formula) - - # prepare the data for the cluster permutation test - prep_result = prepare_data_for_cluster_test(df, dv_name) - - if prep_result[1] == 2: - # pivot the dataframe based on condition for later subtraction - pivot_df = ( - prep_result[0] - .pivot_table( - index=["subject_index", "channel", "timepoint"], - columns="condition", - values="value", - ) - .reset_index() - ) - elif prep_result[1] == 3: - # pivot the dataframe based on condition for later subtraction - pivot_df = ( - prep_result[0] - .pivot_table( - index=["subject_index", "channel", "frequency", "timepoint"], - columns="condition", - values="value", - ) - .reset_index() + # parse formula + formulaic = _soft_import("formulaic", purpose="parse formula for clustering") + parser = formulaic.parser.DefaultFormulaParser(include_intercept=False) + formula = formulaic.Formula(formula, _parser=parser) + dv_name = str(np.array(formula.lhs.root).item()) + iv_name = str(np.array(formula.rhs.root).item()) + # validate the input dataframe and return the type of the data column entries + _dtype = _validate_cluster_df(df, dv_name, iv_name) + + # for within_subject + _validate_type(within_id, (str, None), "within_id") + if within_id: + df = df.copy(deep=False) # Don't mutate input dataframe row order! 
+ df.sort_values([iv_name, within_id], inplace=True) + counts = df[within_id].value_counts() + if any(counts != 2): + raise ValueError("Badness 10000") + + # extract the data + + def _extract_data_array(series): + return np.concatenate(series.values) + + def _extract_data_mne(series): + return np.array( + series.map(lambda inst: inst.get_data().swapaxes(-2, -1)).to_list() ) - # Get unique elements and the indices of their first occurrences - unique_elements, indices = np.unique(df.condition, return_index=True) - - # Sort unique elements by the indices of their first occurrences - conditions = unique_elements[np.argsort(indices)] - - # store the contrast for the clusterResults object - contrast = f"{conditions[0]} - {conditions[1]}" - - # print the contrast used for the paired t-test so the user knows - # what is subtracted from what - logger.info(f"Contrast used for paired t-test: {contrast}") - - # Compute the difference (assuming there are only 2 conditions) - pivot_df[dv_name] = pivot_df[conditions[0]] - pivot_df[conditions[1]] - - # for the paired t-test y is the difference between conditions - # X is the design matrix with a column with 1s and 0s for each participant - # Create the design matrix using formulaic - formulaic = _soft_import( - "formulaic", purpose="set up Design Matrix" - ) # soft import (not a dependency for MNE) - y, X = formulaic.model_matrix(formula, pivot_df) - - # Prepare design matrix for input into MNE cluster function - # MNE cluster functions expect channels as the last dimension - - if prep_result[1] == 2: - # Reshape y.values into a 3D array: (participants, n_channels, n_timepoints) - y_reshaped = y.values.reshape(-1, prep_result[2], prep_result[3]) - # Transpose the array to have channels as the last dimension - y_for_cluster = y_reshaped.transpose(0, 2, 1) - elif prep_result[1] == 3: - # Reshape y.values into a 4D array: - # (participants, n_channels, n_freqs, n_timepoints) - y_reshaped = y.values.reshape( - -1, prep_result[2], prep_result[3], prep_result[4] - ) - # Transpose the array to have channels as the last dimension - y_for_cluster = y_reshaped.transpose(0, 3, 2, 1) + def _extract_data_tfr(series): + return series.map(lambda inst: inst.get_data().swapaxes(-3, -1)).to_list() - if paired_test: - # define stat function and threshold - stat_fun, threshold = _check_fun( - X=y_for_cluster, stat_fun=None, threshold=None, tail=0, kind="within" - ) + if _dtype is np.ndarray: + func = _extract_data_array + elif _dtype is BaseTFR: + func = _extract_data_tfr else: - # define stat function and threshold - stat_fun, threshold = _check_fun( - X=y_for_cluster, stat_fun=None, threshold=None, tail=0, kind="between" - ) + func = _extract_data_mne + # convert to a list-like X for clustering + X = df.groupby(iv_name).agg({dv_name: func})[dv_name].to_list() + + # determine test type + if len(X) == 1: + kind = "within" + elif len(X) > 2: + kind = "between" + elif len(set(x.shape for x in X)) > 1: + kind = "between" + # by now we know there are exactly 2 elements in X, and their shapes match + elif within_id in df: + kind = "within" + X = X[0] - X[1] + else: + kind = "between" + + # define stat function and threshold + stat_fun, threshold = _check_fun( + X=X, stat_fun=stat_fun, threshold=threshold, tail=tail, kind=kind + ) + if kind == "within": + X = [X] # Run the cluster-based permutation test - T_obs, clusters, cluster_p_values, H0 = _permutation_cluster_test( - [y_for_cluster], + stat_obs, clusters, cluster_p_values, H0 = _permutation_cluster_test( + X, 
n_permutations=n_permutations, threshold=threshold, stat_fun=stat_fun, @@ -2073,9 +1923,9 @@ def cluster_test( seed=seed, ) - print(f"smallest cluster p-value: {min(cluster_p_values)}") + # print(f"smallest cluster p-value: {min(cluster_p_values)}") - return ClusterResult(T_obs, clusters, cluster_p_values, H0) + return ClusterResult(stat_obs, clusters, cluster_p_values, H0, stat_fun) class ClusterResult: @@ -2084,7 +1934,7 @@ class ClusterResult: Parameters ---------- - T_obs : np.ndarray + stat_obs : np.ndarray The observed test statistic. clusters : list List of clusters. @@ -2096,15 +1946,24 @@ class ClusterResult: def __init__( self, - T_obs: np.typing.NDArray, + stat_obs: np.typing.NDArray, clusters: list, cluster_p_values: np.typing.NDArray, H0: np.typing.NDArray, + stat_fun: callable, ): - self.T_obs = T_obs + self.stat_obs = stat_obs self.clusters = clusters self.cluster_p_values = cluster_p_values self.H0 = H0 + self.stat_fun = stat_fun + # TODO improve detection of stat name (e.g. unpaired T)? + if stat_fun is f_oneway: + self.stat_name = "F-statistic" + elif stat_fun is ttest_1samp_no_p: + self.stat_name = "paired T-statistic" + else: + self.stat_name = "test statistic" def plot_cluster(self, condition_labels: dict): """ @@ -2135,7 +1994,7 @@ def plot_cluster(self, condition_labels: dict): time_inds = np.unique(time_inds) # get topography for t stat - t_map = self.T_obs[time_inds, ...].mean(axis=0).astype(int) + t_map = self.stat_obs[time_inds, ...].mean(axis=0).astype(int) # get signals at the sensors contributing to the cluster sig_times = cond_values[0][0].times[time_inds] @@ -2176,7 +2035,7 @@ def plot_cluster(self, condition_labels: dict): # add axes for colorbar ax_colorbar = divider.append_axes("right", size="5%", pad=0.1) cbar = plt.colorbar(image, cax=ax_colorbar) - cbar.set_label("t-value") + cbar.set_label(self.stat_name) ax_topo.set_xlabel( "average from {:0.3f} to {:0.3f} s".format(*sig_times[[0, -1]]) ) From dc8a799b16f5871c6fb0352f414265d26ad20d29 Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Thu, 25 Jul 2024 12:25:12 -0500 Subject: [PATCH 24/88] make tutorial match modified API Co-authored-by: Carina Forster --- tutorials/stats-sensor-space/76_new_cluster_test_api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index efbc6d5e3f0..83b4f019b6f 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -133,11 +133,11 @@ # the cluster test randomly permutes the subject label # the 1 in the formula represents the intercept which is always included # C is a categorical variable that will be dummy coded -formula = "evoked ~ 1 + C(subject_index)" +formula = "evoked ~ condition" # run the new cluster test API and return the new cluster_result object cluster_result = mne.stats.cluster_level.cluster_test( - df=df, formula=formula, paired_test=True, adjacency=None + df=df, formula=formula, within_id="subject_index" ) # note that we ran an exact test due to the small sample size From f12cf6e574eaa08d9dd9f59d93a66b3e3f49bcf2 Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Thu, 25 Jul 2024 12:30:30 -0500 Subject: [PATCH 25/88] remove unused test helper func --- mne/stats/tests/test_cluster_level.py | 39 --------------------------- 1 file changed, 39 deletions(-) diff --git a/mne/stats/tests/test_cluster_level.py b/mne/stats/tests/test_cluster_level.py index 
01fcd5adba6..24f3ee687ca 100644 --- a/mne/stats/tests/test_cluster_level.py +++ b/mne/stats/tests/test_cluster_level.py @@ -882,45 +882,6 @@ def test_output_equiv(shape, out_type, adjacency, threshold): assert_array_equal(got_mask, want_mask) -def create_sample_data_cluster_test(): - """Create sample data to test new cluster API.""" - # Prepare some dummy data - n_subjects = 20 - n_conditions = 2 - n_channels = 5 - n_timepoints = 8 - n_freqs = 3 - - # Create dummy data - dummy_data_2d = [ - np.random.rand(n_channels, n_timepoints) - for _ in range(n_subjects * n_conditions) - ] - dummy_data_3d = [ - np.random.rand(n_channels, n_freqs, n_timepoints) - for _ in range(n_subjects * n_conditions) - ] - - # Create a DataFrame with dummy data - df_2d = pd.DataFrame( - { - "subject_index": np.repeat(range(n_subjects), n_conditions), - "condition": np.tile(["cond1", "cond2"], n_subjects), - "data": dummy_data_2d, - } - ) - - df_3d = pd.DataFrame( - { - "subject_index": np.repeat(range(n_subjects), n_conditions), - "condition": np.tile(["cond1", "cond2"], n_subjects), - "data": dummy_data_3d, - } - ) - - return df_2d, df_3d - - def test_compare_old_and_new_cluster_api(): """Test for same results from old and new APIs.""" condition1_1d, condition2_1d, condition1_2d, condition2_2d = _get_conditions() From 5b97971602d761e7f2b181f7f0914032aacb8e96 Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Thu, 25 Jul 2024 12:33:11 -0500 Subject: [PATCH 26/88] vulture allowlist update --- tools/vulture_allowlist.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/vulture_allowlist.py b/tools/vulture_allowlist.py index 3de48b3b906..c41ea610880 100644 --- a/tools/vulture_allowlist.py +++ b/tools/vulture_allowlist.py @@ -136,3 +136,6 @@ _qt_raise_window _qt_disable_paint _qt_get_stylesheet + +# used in tutorial, not sure why shows up +plot_cluster From 5f5b0fc3262ce29ff19139756f10143df12a4804 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sun, 28 Jul 2024 13:00:41 +0200 Subject: [PATCH 27/88] included BaseTFR in validate_cluster_df --- mne/stats/cluster_level.py | 43 +++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index a366b19ecc1..fc41b3a5506 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1744,18 +1744,24 @@ def summarize_clusters_stc( def _validate_cluster_df(df: pd.DataFrame, dv_name: str, iv_name: str): + """Validate the input DataFrame for cluster tests.""" # check if all necessary columns are present - missing = ({dv_name} | {iv_name}) - set(df.columns) + missing = ({dv_name} | {iv_name}) - set(df.columns) # should be empty sep = '", "' - if missing: + if missing: # if not empty, there are missing columns raise ValueError( f"DataFrame must contain a column named for each term in `formula`. " - f"Column{_pl(missing)} missing for term{_pl(missing)} " + f"Column{_pl(missing)} missing for term{_pl(missing)} " # _pl = pluralize f'"{sep.join(missing)}".' 
)
    # check if the data column contains valid (and consistent) instance types
    inst = df[dv_name].iloc[0]
-    valid_types = (Evoked, BaseEpochs, BaseTFR, np.ndarray)
+    valid_types = (
+        Evoked,
+        BaseEpochs,
+        BaseTFR,
+        np.ndarray,
+    )  # Base covers all Epochs and TFRs
     _validate_type(inst, valid_types, f"Data in dependent variable column '{dv_name}'")
     all_types = set(df[dv_name].map(type))
     all_type_names = ", ".join([type(x).__name__ for x in all_types])
@@ -1766,8 +1772,10 @@ def _validate_cluster_df(df: pd.DataFrame, dv_name: str, iv_name: str):
         )
     # check if the shape of the data is consistent
     if isinstance(inst, np.ndarray):
-        all_shapes = set(df[dv_name].map(lambda x: x.shape[1:]))  # first dim may vary
-    elif isinstance(inst, BaseEpochs):
+        all_shapes = set(
+            df[dv_name].map(lambda x: x.shape[1:])
+        )  # first dim may vary (participants or epochs)
+    elif isinstance(inst, (BaseEpochs | BaseTFR)):  # should include BaseTFR?
         all_shapes = set(df[dv_name].map(lambda x: x.get_data().shape[1:]))
     else:
         all_shapes = set(df[dv_name].map(lambda x: x.get_data().shape))
@@ -1776,14 +1784,14 @@ def _validate_cluster_df(df: pd.DataFrame, dv_name: str, iv_name: str):
         f"{prologue} consistent shape, but {len(all_shapes)} different "
         f"shapes were found: {'; '.join(all_shapes)}."
     )
-    return all_types.pop()
+    return all_types.pop()  # return the type of the data column entries


 @verbose
 def cluster_test(
     df: pd.DataFrame,
     formula: str,
-    *,
+    *,  # all parameters after this are keyword-only
     within_id: str | None = None,
     stat_fun: callable | None = None,
     tail: Literal[-1, 0, 1] = 0,
@@ -1806,9 +1814,10 @@ def cluster_test(
     Parameters
     ----------
     df : pd.DataFrame
-        Dataframe with 3 columns (subject_index, condition, evoked).
+        Dataframe containing the data, dependent and independent variables.
     formula : str
-        Wilkinson notation formula for design matrix.
+        Wilkinson notation formula for design matrix. The names of the dependent
+        and independent variable should match the columns in the dataframe.
     within_id : None | str
         Name of column in ``df`` to use in identifying within-group contrasts.
stat_fun : None | callable @@ -1848,8 +1857,10 @@ def cluster_test( formulaic = _soft_import("formulaic", purpose="parse formula for clustering") parser = formulaic.parser.DefaultFormulaParser(include_intercept=False) formula = formulaic.Formula(formula, _parser=parser) + # extract the dependent and independent variable names dv_name = str(np.array(formula.lhs.root).item()) iv_name = str(np.array(formula.rhs.root).item()) + # validate the input dataframe and return the type of the data column entries _dtype = _validate_cluster_df(df, dv_name, iv_name) @@ -1860,10 +1871,9 @@ def cluster_test( df.sort_values([iv_name, within_id], inplace=True) counts = df[within_id].value_counts() if any(counts != 2): - raise ValueError("Badness 10000") - - # extract the data + raise ValueError("for paired tttest, each subject must have 2 observations") + # extract the data from the dataframe def _extract_data_array(series): return np.concatenate(series.values) @@ -1881,15 +1891,16 @@ def _extract_data_tfr(series): func = _extract_data_tfr else: func = _extract_data_mne + # convert to a list-like X for clustering X = df.groupby(iv_name).agg({dv_name: func})[dv_name].to_list() # determine test type if len(X) == 1: - kind = "within" + kind = "within" # data already subtracted elif len(X) > 2: kind = "between" - elif len(set(x.shape for x in X)) > 1: + elif len(set(x.shape for x in X)) > 1: # check if shapes match kind = "between" # by now we know there are exactly 2 elements in X, and their shapes match elif within_id in df: @@ -1923,8 +1934,6 @@ def _extract_data_tfr(series): seed=seed, ) - # print(f"smallest cluster p-value: {min(cluster_p_values)}") - return ClusterResult(stat_obs, clusters, cluster_p_values, H0, stat_fun) From ccccb5bfa19a71be0f95dc243e7fb59dfc4a267d Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sun, 28 Jul 2024 13:35:40 +0200 Subject: [PATCH 28/88] comments on cluster_test function --- mne/stats/cluster_level.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index fc41b3a5506..7b44641a3ec 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1864,7 +1864,7 @@ def cluster_test( # validate the input dataframe and return the type of the data column entries _dtype = _validate_cluster_df(df, dv_name, iv_name) - # for within_subject + # for within_subject designs, check if each subject has 2 observations _validate_type(within_id, (str, None), "within_id") if within_id: df = df.copy(deep=False) # Don't mutate input dataframe row order! 
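The within-subject path above (and the extraction helpers in the next hunk) assume a long-format DataFrame: one row per observation, with the dependent-variable column holding Evoked/Epochs/TFR objects or plain arrays. A minimal sketch of input that satisfies the paired-design check, using NumPy arrays and illustrative column names (`evoked`, `condition`, and `subject_index` are not required by the API; they only have to match the formula terms and `within_id`):

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)
    # two conditions per subject; (n_channels, n_times) arrays stand in for Evokeds
    rows = [
        dict(subject_index=subj, condition=cond, evoked=rng.normal(size=(4, 10)))
        for subj in ("s01", "s02", "s03")
        for cond in ("target", "non-target")
    ]
    df = pd.DataFrame(rows)
    # the invariant enforced by the value_counts() check: exactly 2 rows per subject
    assert (df["subject_index"].value_counts() == 2).all()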
@@ -1877,7 +1877,7 @@ def cluster_test( def _extract_data_array(series): return np.concatenate(series.values) - def _extract_data_mne(series): + def _extract_data_mne(series): # 2D data return np.array( series.map(lambda inst: inst.get_data().swapaxes(-2, -1)).to_list() ) @@ -1900,21 +1900,26 @@ def _extract_data_tfr(series): kind = "within" # data already subtracted elif len(X) > 2: kind = "between" - elif len(set(x.shape for x in X)) > 1: # check if shapes match + elif ( + len(set(x.shape for x in X)) > 1 + ): # check if there are unequal observations in each group kind = "between" # by now we know there are exactly 2 elements in X, and their shapes match elif within_id in df: kind = "within" X = X[0] - X[1] - else: + else: # what would be another else cas kind = "between" # define stat function and threshold stat_fun, threshold = _check_fun( X=X, stat_fun=stat_fun, threshold=threshold, tail=tail, kind=kind ) - if kind == "within": + + # check_fun doesn't work with list input` + if kind == "within": # will this create an issue for already subtracted data? X = [X] + # Run the cluster-based permutation test stat_obs, clusters, cluster_p_values, H0 = _permutation_cluster_test( X, From 59b1a3a7850681df957d3fcb35818f4c9d7b5911 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sun, 28 Jul 2024 14:11:54 +0200 Subject: [PATCH 29/88] updated clusterResult class and plot function --- mne/stats/cluster_level.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index 7b44641a3ec..7f48c999f5f 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1971,7 +1971,8 @@ def __init__( self.cluster_p_values = cluster_p_values self.H0 = H0 self.stat_fun = stat_fun - # TODO improve detection of stat name (e.g. unpaired T)? + + # unpaired t-test is f_oneway if stat_fun is f_oneway: self.stat_name = "F-statistic" elif stat_fun is ttest_1samp_no_p: @@ -1979,7 +1980,7 @@ def __init__( else: self.stat_name = "test statistic" - def plot_cluster(self, condition_labels: dict): + def plot_cluster_time_sensor(self, condition_labels: dict): """ Plot the cluster with the lowest p-value. @@ -1992,13 +1993,20 @@ def plot_cluster(self, condition_labels: dict): condition_labels : dict Dictionary with condition labels as keys and evoked objects as values. 
""" + # define colorblind friendly colors + colorblind_palette = ["#4daf4a", "#f781bf"] + # extract condition labels from the dictionary cond_keys = list(condition_labels.keys()) # extract the evokeds from the dictionary cond_values = list(condition_labels.values()) # configure variables for visualization - colors = {cond_keys[0]: "crimson", cond_keys[1]: "steelblue"} + colors = { + cond_keys[0]: colorblind_palette[0], + cond_keys[1]: colorblind_palette[1], + } + line_styles = {cond_keys[0]: "-", cond_keys[1]: "--"} lowest_p_cluster = np.argmin(self.cluster_p_values) @@ -2051,18 +2059,23 @@ def plot_cluster(self, condition_labels: dict): cbar = plt.colorbar(image, cax=ax_colorbar) cbar.set_label(self.stat_name) ax_topo.set_xlabel( - "average from {:0.3f} to {:0.3f} s".format(*sig_times[[0, -1]]) + "Spatial cluster extent:\n averaged from {:0.3f} to {:0.3f} s".format( + *sig_times[[0, -1]] + ) ) # add new axis for time courses and plot time courses ax_signals = divider.append_axes("right", size="300%", pad=1.3) - title = f"Signal averaged over {len(ch_inds)} sensor(s)" + title = ( + f"Temporal cluster extent:\nSignal averaged over {len(ch_inds)} sensor(s)" + ) plot_compare_evokeds( condition_labels, title=title, picks=ch_inds, axes=ax_signals, colors=colors, + linestyles=line_styles, show=False, split_legend=True, truncate_yaxis="auto", From 98d08797fb3a943878a1b432202effee4db2c796 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sun, 28 Jul 2024 14:12:12 +0200 Subject: [PATCH 30/88] updated function call for plotting --- tutorials/stats-sensor-space/76_new_cluster_test_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 83b4f019b6f..b7f933d127b 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -152,7 +152,7 @@ # finally let's plot the results using the ClusterResults class # we plot the cluster with the lowest p-value -cluster_result.plot_cluster(condition_labels=conditions_dict) +cluster_result.plot_cluster_time_sensor(condition_labels=conditions_dict) # we can see that there is something going on around 400 ms # with a stronger signal for target trials in right central-parietal channels From ec0324207428b720920908e5a25bb7f685d3e676 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sun, 28 Jul 2024 14:14:18 +0200 Subject: [PATCH 31/88] changed color --- mne/stats/cluster_level.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index 7f48c999f5f..8e5b73d6474 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1994,7 +1994,7 @@ def plot_cluster_time_sensor(self, condition_labels: dict): Dictionary with condition labels as keys and evoked objects as values. 
""" # define colorblind friendly colors - colorblind_palette = ["#4daf4a", "#f781bf"] + colorblind_palette = ["#4daf4a", "#984ea3"] # extract condition labels from the dictionary cond_keys = list(condition_labels.keys()) From 5941f61f9b26f06e830cd83c513b9c046c1235d6 Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Thu, 1 Aug 2024 12:30:11 -0500 Subject: [PATCH 32/88] docstring/docdict cleanups and fixes --- mne/stats/cluster_level.py | 68 +++++++++--------- mne/utils/docs.py | 138 ++++++++++++++++++++++--------------- 2 files changed, 118 insertions(+), 88 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index c763f01da91..9f6a3a8d343 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1796,12 +1796,12 @@ def cluster_test( stat_fun: callable | None = None, tail: Literal[-1, 0, 1] = 0, threshold=None, - n_permutations: int = 1024, - adjacency: tuple | None = None, - max_step: int = 1, - exclude: list | None = None, - step_down_p: int = 0, - t_power: int = 1, + n_permutations: str | int = 1024, + adjacency: sparse.spmatrix | False = False, + max_step: int = 1, # TODO may need to provide `max_step_time` and `max_step_freq` + exclude: list | None = None, # TODO needs rethink because user passes MNE objects + step_down_p: float = 0.0, + t_power: float = 1.0, check_disjoint: bool = False, out_type: Literal["indices", "mask"] = "indices", seed: None | int | np.random.RandomState = None, @@ -1819,35 +1819,41 @@ def cluster_test( Wilkinson notation formula for design matrix. The names of the dependent and independent variable should match the columns in the dataframe. within_id : None | str - Name of column in ``df`` to use in identifying within-group contrasts. - stat_fun : None | callable - Statistical function to use. - tail : int, optional - 0 for two-tailed, 1 for greater, -1 for less. Default is 0. - n_permutations : int, optional - Number of permutations. Default is 1024. - adjacency : None, optional - Provide a adjacency matrix. Default is None. + Name of column in ``df`` to use in identifying within-group contrasts. If + ``None``, will perform a between-group test. Ignored if the number of groups + (unique values in the independent variable column of ``df``) is greater than 2. + %(stat_fun_clust_both)s + %(tail_clust)s + %(threshold_clust_both)s + %(n_permutations_clust_all)s + %(adjacency_clust_both)s max_step : int, optional Maximum distance between samples (time points). Default is 1. - exclude : np.Array, optional - Exclude no time points or channels. Default is None. - step_down_p : int, optional - Step down in jumps test. Default is 0. - t_power : int, optional - Weigh each location by its stats score. Default is 1. - check_disjoint : bool, optional - Check if clusters are disjoint. Default is False. - out_type : str, optional - Output type. Default is "indices". - seed : None | int | np.random.RandomState, optional - Seed for the random number generator. Default is None. - buffer_size : int, optional - Block size for chunking the data. Default is None. - n_jobs : int, optional - How many cores to use. Default is 1. + exclude : array-like of bool | None + Mask to apply to the data to exclude certain points from clustering + (e.g., medial wall vertices). Should be the same shape as the channels/vertices + dimension of the data objects. If ``None``, no points are excluded. 
+ %(step_down_p_clust)s + %(t_power_clust)s + check_disjoint : bool + Whether to check if the ``adjacency`` matrix can be separated into disjoint + sets before clustering. This may lead to faster clustering, especially if + the "time" and/or "frequency" dimensions are large. + %(out_type_clust)s + %(seed)s + buffer_size : int | None + Block size to use when computing test statistics. This can significantly + reduce memory usage when ``n_jobs > 1`` and memory sharing between + processes is enabled (see :func:`mne.set_cache_dir`), because the data will be + shared between processes and each process only needs to allocate space for + a small block of locations at a time. + %(n_jobs)s %(verbose)s + Notes + ----- + %(threshold_clust_t_or_f_notes)s + Returns ------- ClusterResult diff --git a/mne/utils/docs.py b/mne/utils/docs.py index ff9e11ee776..464e7e3e84c 100644 --- a/mne/utils/docs.py +++ b/mne/utils/docs.py @@ -144,61 +144,54 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75): formatting. This can add overhead so is meant only for debugging. """ -docdict["adjacency_clust"] = """ -adjacency : scipy.sparse.spmatrix | None | False +_adjacency_clust_template = """ +adjacency : scipy.sparse.spmatrix | {param_none}False Defines adjacency between locations in the data, where "locations" can be spatial vertices, frequency bins, time points, etc. For spatial vertices (i.e. sensor space data), see :func:`mne.channels.find_ch_adjacency` or :func:`mne.spatial_inter_hemi_adjacency`. For source space data, see - :func:`mne.spatial_src_adjacency` or - :func:`mne.spatio_temporal_src_adjacency`. If ``False``, assumes - no adjacency (each location is treated as independent and unconnected). - If ``None``, a regular lattice adjacency is assumed, connecting - each {sp} location to its neighbor(s) along the last dimension - of {{eachgrp}} ``{{x}}``{lastdim}. + :func:`mne.spatial_src_adjacency` or :func:`mne.spatio_temporal_src_adjacency`. + If ``False``, assumes no adjacency (each location is treated as independent and + unconnected).{if_none} If ``adjacency`` is a matrix, it is assumed to be symmetric (only the upper triangular half is used) and must be square with dimension equal to - ``{{x}}.shape[-1]`` {parone} or ``{{x}}.shape[-1] * {{x}}.shape[-2]`` - {partwo} or (optionally) - ``{{x}}.shape[-1] * {{x}}.shape[-2] * {{x}}.shape[-3]`` - {parthree}.{memory} + the product of the last 1, 2, or 3 data dimensions (e.g., for time-frequency data: + n_channels, n_channels * n_freqs, or n_channels * n_freqs * n_times).{memory} +""" +_if_none = """ If ``None``, a regular lattice adjacency is assumed, connecting + each {spatial}location to its neighbor(s) along the last dimension + of {the_data}. """ - -mem = ( - " If spatial adjacency is uniform in time, it is recommended to use " - "a square matrix with dimension ``{x}.shape[-1]`` (n_vertices) to save " - "memory and computation, and to use ``max_step`` to define the extent " - "of temporal adjacency to consider when clustering." -) -comb = " The function `mne.stats.combine_adjacency` may be useful for 4D data." 
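The removed ``comb`` text points at `mne.stats.combine_adjacency`, which assembles the combined matrix whose dimension is the product of the trailing data dimensions. A hedged sketch of that bookkeeping (the sizes are invented, and the fully-connected channel matrix is a toy; in practice it would come from `find_ch_adjacency`):

    import numpy as np
    from scipy import sparse
    from mne.stats import combine_adjacency

    n_times, n_freqs, n_chan = 10, 5, 4
    chan_adj = sparse.csr_matrix(np.ones((n_chan, n_chan)))  # toy: all channels adjacent
    # integer entries get lattice (neighbor-to-neighbor) adjacency along that axis
    adjacency = combine_adjacency(n_times, n_freqs, chan_adj)
    assert adjacency.shape == (n_times * n_freqs * n_chan,) * 2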
st = dict( - sp="spatial", - lastdim="", - parone="(n_vertices)", - partwo="(n_times * n_vertices)", - parthree="(n_times * n_freqs * n_vertices)", - memory=mem, + param_none="None | ", + if_none=_if_none.format(spatial="spatial ", the_data="{eachgrp} ``{x}``"), + memory=""" + If spatial adjacency is uniform in time, it is recommended to use a square matrix + with dimension ``{x}.shape[-1]`` (n_vertices) to save memory and computation, + and to use ``max_step`` to define the extent of temporal adjacency to consider when + clustering. +""", ) tf = dict( - sp="", - lastdim=" (or the last two dimensions if ``{x}`` is 2D)", - parone="(for 2D data)", - partwo="(for 3D data)", - parthree="(for 4D data)", - memory=comb, + param_none="None | ", + if_none=_if_none.format( + spatial="", + the_data="{eachgrp} ``{x}`` (or the last two dimensions if ``{x}`` is 2D)", + ), + memory=""" + The function `mne.stats.combine_adjacency` may be useful for 4D data. +""", ) -nogroups = dict(eachgrp="", x="X") +nogrps = dict(eachgrp="", x="X") groups = dict(eachgrp="each group ", x="X[k]") -docdict["adjacency_clust_1"] = ( - docdict["adjacency_clust"].format(**tf).format(**nogroups) -) -docdict["adjacency_clust_n"] = docdict["adjacency_clust"].format(**tf).format(**groups) -docdict["adjacency_clust_st1"] = ( - docdict["adjacency_clust"].format(**st).format(**nogroups) -) -docdict["adjacency_clust_stn"] = ( - docdict["adjacency_clust"].format(**st).format(**groups) + +docdict["adjacency_clust_1"] = _adjacency_clust_template.format(**tf).format(**nogrps) +docdict["adjacency_clust_both"] = _adjacency_clust_template.format( + param_none="", if_none="", memory="" ) +docdict["adjacency_clust_n"] = _adjacency_clust_template.format(**tf).format(**groups) +docdict["adjacency_clust_st1"] = _adjacency_clust_template.format(**st).format(**nogrps) +docdict["adjacency_clust_stn"] = _adjacency_clust_template.format(**st).format(**groups) docdict["adjust_dig_chpi"] = """ adjust_dig : bool @@ -708,7 +701,7 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75): docdict["check_disjoint_clust"] = """ check_disjoint : bool - Whether to check if the connectivity matrix can be separated into disjoint + Whether to check if the ``adjacency`` matrix can be separated into disjoint sets before clustering. This may lead to faster clustering, especially if the second dimension of ``X`` (usually the "time" dimension) is large. """ @@ -1416,7 +1409,7 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75): """ docdict["exclude_clust"] = """ -exclude : bool array or None +exclude : array-like of bool | None Mask to apply to the data to exclude certain points from clustering (e.g., medial wall vertices). Should be the same shape as ``X``. If ``None``, no points are excluded. @@ -3958,7 +3951,7 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75): seed : None | int | instance of ~numpy.random.RandomState A seed for the NumPy random number generator (RNG). If ``None`` (default), the seed will be obtained from the operating system - (see :class:`~numpy.random.RandomState` for details), meaning it will most + (see :class:`~numpy.random.RandomState` for details), meaning it will most likely produce different output every time this function or method is run. To achieve reproducible results, pass a value here to explicitly initialize the RNG with a defined state. 
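The chained ``.format()`` calls above perform two-stage templating: the first call picks the variant (and deliberately re-inserts the ``{eachgrp}``/``{x}`` placeholders), and the second fills in the signature-specific names. A toy reconstruction of the mechanism, with shortened strings that are not the real docdict text:

    inner = "connecting each {spatial}location to its neighbor(s) along {the_data}"
    # stage 1: choose the variant, re-injecting placeholders for stage 2
    stage1 = inner.format(spatial="spatial ", the_data="{eachgrp}``{x}``")
    # stage 2: fill the per-signature names
    print(stage1.format(eachgrp="each group ", x="X[k]"))
    # connecting each spatial location to its neighbor(s) along each group ``X[k]``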
@@ -4249,16 +4242,23 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75): channel names in the file will be used when possible. """ -_stat_fun_clust_base = """ +_stat_fun_template = """ stat_fun : callable | None Function called to calculate the test statistic. Must accept 1D-array as - input and return a 1D array. If ``None`` (the default), uses - `mne.stats.{}`. + input and return a 1D array. If ``None`` (the default), uses {}. """ -docdict["stat_fun_clust_f"] = _stat_fun_clust_base.format("f_oneway") +docdict["stat_fun_clust_both"] = _stat_fun_template.format( + """:func:`mne.stats.ttest_1samp_no_p` + for paired tests and :func:`mne.stats.f_oneway` for unpaired tests or tests of + more than 2 groups.""" +) + +docdict["stat_fun_clust_f"] = _stat_fun_template.format(":func:`mne.stats.f_oneway`") -docdict["stat_fun_clust_t"] = _stat_fun_clust_base.format("ttest_1samp_no_p") +docdict["stat_fun_clust_t"] = _stat_fun_template.format( + ":func:`mne.stats.ttest_1samp_no_p`" +) docdict["static"] = """ static : instance of SpatialImage @@ -4469,10 +4469,10 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75): threshold : float | dict | None The so-called "cluster forming threshold" in the form of a test statistic (note: this is not an alpha level / "p-value"). - If numeric, vertices with data values more extreme than ``threshold`` will - be used to form clusters. If ``None``, {} will be chosen + If numeric, vertices with stat values more extreme than ``threshold`` will + be used to form clusters. If ``None``, {which_thresh} will be chosen automatically that corresponds to a p-value of 0.05 for the given number of - observations (only valid when using {}). If ``threshold`` is a + observations (only valid when using {which_stat}). If ``threshold`` is a :class:`dict` (with keys ``'start'`` and ``'step'``) then threshold-free cluster enhancement (TFCE) will be used (see the :ref:`TFCE example ` and :footcite:`SmithNichols2009`). @@ -4480,8 +4480,14 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75): a particular p-value for one-tailed or two-tailed tests. """ -f_test = ("an F-threshold", "an F-statistic") -docdict["threshold_clust_f"] = _threshold_clust_base.format(*f_test) +docdict["threshold_clust_both"] = _threshold_clust_base.format( + which_thresh="a t- or F-threshold", + which_stat="``stat_fun=None``, i.e., a paired t-test or one-way F-test", +) + +docdict["threshold_clust_f"] = _threshold_clust_base.format( + which_thresh="an F-threshold", which_stat="an F-statistic" +) docdict["threshold_clust_f_notes"] = """ For computing a ``threshold`` based on a p-value, use the conversion @@ -4493,8 +4499,9 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75): thresh = scipy.stats.f.ppf(1 - pval, dfn=dfn, dfd=dfd) # F distribution """ -t_test = ("a t-threshold", "a t-statistic") -docdict["threshold_clust_t"] = _threshold_clust_base.format(*t_test) +docdict["threshold_clust_t"] = _threshold_clust_base.format( + which_thresh="a t-threshold", which_stat="a t-statistic" +) docdict["threshold_clust_t_notes"] = """ For computing a ``threshold`` based on a p-value, use the conversion @@ -4508,6 +4515,23 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75): For testing the lower tail (``tail=-1``), don't subtract ``pval`` from 1. 
""" +docdict["threshold_clust_t_or_f_notes"] = """ +For computing a ``threshold`` based on a p-value, use the conversion +from :meth:`scipy.stats.rv_continuous.ppf`:: + + pval = 0.001 # arbitrary + # for t-statistic + df = n_observations - 1 # degrees of freedom for the t-test + thresh = scipy.stats.t.ppf(1 - pval / 2, df) # two-tailed, t distribution + # for f-statistic + dfn = n_conditions - 1 # degrees of freedom numerator + dfd = n_observations - n_conditions # degrees of freedom denominator + thresh = scipy.stats.f.ppf(1 - pval, dfn=dfn, dfd=dfd) # F distribution + +For a one-tailed test (``tail=1``), don't divide the p-value by 2. +For testing the lower tail (``tail=-1``), don't subtract ``pval`` from 1. +""" + docdict["time_bandwidth_tfr"] = """ time_bandwidth : float ``≥ 2.0`` Product between the temporal window length (in seconds) and the *full* From 368fa44bc8b0772ba514ecd9532d588446c7a7a3 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Mon, 5 Aug 2024 13:15:37 +0200 Subject: [PATCH 33/88] implemented Dan's comments --- mne/stats/cluster_level.py | 65 ++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 30 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index 9f6a3a8d343..804d035ff51 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1775,7 +1775,7 @@ def _validate_cluster_df(df: pd.DataFrame, dv_name: str, iv_name: str): all_shapes = set( df[dv_name].map(lambda x: x.shape[1:]) ) # first dim may vary (participants or epochs) - elif isinstance(inst, (BaseEpochs | BaseTFR)): # should include BaseTFR? + elif isinstance(inst, (BaseEpochs | BaseTFR)): all_shapes = set(df[dv_name].map(lambda x: x.get_data().shape[1:])) else: all_shapes = set(df[dv_name].map(lambda x: x.get_data().shape)) @@ -1797,7 +1797,7 @@ def cluster_test( tail: Literal[-1, 0, 1] = 0, threshold=None, n_permutations: str | int = 1024, - adjacency: sparse.spmatrix | False = False, + adjacency: sparse.spmatrix | None | False = None, # should be None (default) max_step: int = 1, # TODO may need to provide `max_step_time` and `max_step_freq` exclude: list | None = None, # TODO needs rethink because user passes MNE objects step_down_p: float = 0.0, @@ -1817,7 +1817,7 @@ def cluster_test( Dataframe containing the data, dependent and independent variables. formula : str Wilkinson notation formula for design matrix. The names of the dependent - and independent variable should match the columns in the dataframe. + and independent variable should match the columns in ``df``. within_id : None | str Name of column in ``df`` to use in identifying within-group contrasts. If ``None``, will perform a between-group test. 
Ignored if the number of groups @@ -1877,7 +1877,7 @@ def cluster_test( df.sort_values([iv_name, within_id], inplace=True) counts = df[within_id].value_counts() if any(counts != 2): - raise ValueError("for paired tttest, each subject must have 2 observations") + raise ValueError("for paired t-test, each subject must have 2 observations") # extract the data from the dataframe def _extract_data_array(series): @@ -1914,7 +1914,7 @@ def _extract_data_tfr(series): elif within_id in df: kind = "within" X = X[0] - X[1] - else: # what would be another else cas + else: # 2 elements in X but no within_id provided → unpaired test kind = "between" # define stat function and threshold @@ -1978,7 +1978,7 @@ def __init__( self.H0 = H0 self.stat_fun = stat_fun - # unpaired t-test is f_oneway + # unpaired t-test equivalent to f_oneway w/ 2 groups if stat_fun is f_oneway: self.stat_name = "F-statistic" elif stat_fun is ttest_1samp_no_p: @@ -1986,7 +1986,15 @@ def __init__( else: self.stat_name = "test statistic" - def plot_cluster_time_sensor(self, condition_labels: dict): + def plot_cluster_time_sensor( + self, + condition_labels: dict, + colors: list | dict | None = None, + linestyles: list | dict | None = None, + cmap_evokeds: None | str | tuple = None, + cmap_topo: None | str | tuple = None, + ci: float | bool | callable() | None = None, + ): """ Plot the cluster with the lowest p-value. @@ -1998,21 +2006,23 @@ def plot_cluster_time_sensor(self, condition_labels: dict): ---------- condition_labels : dict Dictionary with condition labels as keys and evoked objects as values. + colors : list|dict|None + Colors to use when plotting the ERP lines and confidence bands. + linestyles : list|dict|None + Styles to use when plotting the ERP lines. + cmap_evokeds : None|str|tuple + Colormap from which to draw color values when plotting the ERP lines. + cmap_topo: matplotlib colormap + Colormap to use for the topomap. + ci : float|bool|callable()|None + Confidence band around each ERP time series. """ - # define colorblind friendly colors - colorblind_palette = ["#4daf4a", "#984ea3"] - # extract condition labels from the dictionary cond_keys = list(condition_labels.keys()) # extract the evokeds from the dictionary cond_values = list(condition_labels.values()) - # configure variables for visualization - colors = { - cond_keys[0]: colorblind_palette[0], - cond_keys[1]: colorblind_palette[1], - } - line_styles = {cond_keys[0]: "-", cond_keys[1]: "--"} + linestyles = {cond_keys[0]: "-", cond_keys[1]: "--"} lowest_p_cluster = np.argmin(self.cluster_p_values) @@ -2040,7 +2050,7 @@ def plot_cluster_time_sensor(self, condition_labels: dict): times=0, mask=mask, axes=ax_topo, - cmap="RdBu_r", + cmap=cmap_topo, show=False, colorbar=False, mask_params=dict(markersize=10), @@ -2049,13 +2059,11 @@ def plot_cluster_time_sensor(self, condition_labels: dict): image = ax_topo.images[0] # remove the title that would otherwise say "0.000 s" - ax_topo.set_title("") - - # soft import? 
- # make_axes_locatable = _soft_import( - # "mpl_toolkits.axes_grid1.make_axes_locatable", - # purpose="plot cluster results" - # ) # soft import (not a dependency for MNE) + ax_topo.set_title( + "Spatial cluster extent:\n averaged from {:0.3f} to {:0.3f} s".format( + *sig_times[[0, -1]] + ) + ) # create additional axes (for ERF and colorbar) divider = make_axes_locatable(ax_topo) @@ -2064,11 +2072,6 @@ def plot_cluster_time_sensor(self, condition_labels: dict): ax_colorbar = divider.append_axes("right", size="5%", pad=0.1) cbar = plt.colorbar(image, cax=ax_colorbar) cbar.set_label(self.stat_name) - ax_topo.set_xlabel( - "Spatial cluster extent:\n averaged from {:0.3f} to {:0.3f} s".format( - *sig_times[[0, -1]] - ) - ) # add new axis for time courses and plot time courses ax_signals = divider.append_axes("right", size="300%", pad=1.3) @@ -2081,11 +2084,13 @@ def plot_cluster_time_sensor(self, condition_labels: dict): picks=ch_inds, axes=ax_signals, colors=colors, - linestyles=line_styles, + linestyles=linestyles, + cmap=cmap_evokeds, show=False, split_legend=True, truncate_yaxis="auto", truncate_xaxis=False, + ci=ci, ) plt.legend(frameon=False, loc="upper left") From 3aa32b699932d188768dfc60f33b4e6d0df2645d Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Mon, 5 Aug 2024 13:22:41 +0200 Subject: [PATCH 34/88] implemented Dan's comments --- tutorials/stats-sensor-space/76_new_cluster_test_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index b7f933d127b..fb928f89d0a 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -152,7 +152,7 @@ # finally let's plot the results using the ClusterResults class # we plot the cluster with the lowest p-value -cluster_result.plot_cluster_time_sensor(condition_labels=conditions_dict) +cluster_result.plot_cluster_time_sensor(condition_labels=conditions_dict, ci=True) # we can see that there is something going on around 400 ms # with a stronger signal for target trials in right central-parietal channels From a76afd31ecdc8a55aec3003ab12eff433dc0616d Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Mon, 5 Aug 2024 16:11:23 +0200 Subject: [PATCH 35/88] test for handling different MNE objects - test is failing --- mne/stats/tests/test_cluster_level.py | 101 +++++++++++++++++++++++--- 1 file changed, 90 insertions(+), 11 deletions(-) diff --git a/mne/stats/tests/test_cluster_level.py b/mne/stats/tests/test_cluster_level.py index 24f3ee687ca..1c126494250 100644 --- a/mne/stats/tests/test_cluster_level.py +++ b/mne/stats/tests/test_cluster_level.py @@ -906,21 +906,100 @@ def test_compare_old_and_new_cluster_api(): @pytest.mark.parametrize( "Inst", (EpochsArray, EvokedArray, EpochsTFRArray, AverageTFRArray) ) +@pytest.mark.filterwarnings('ignore:Ignoring argument "tail":RuntimeWarning') def test_new_cluster_api(Inst): """Test handling different MNE objects in the cluster API.""" pd = pytest.importorskip("pandas") - n_epo, n_chan, n_freq, n_times = 2, 3, 5, 7 - shape = (n_chan, n_times) - if Inst in (EpochsArray, EpochsTFRArray): - shape = (n_epo,) + shape - if Inst in (EpochsTFRArray, AverageTFRArray): - shape = shape[:-1] + (n_freq, shape[-1]) + n_epo, n_chan, n_freq, n_times = 2, 3, 4, 5 + info = create_info(ch_names=n_chan, sfreq=1000, ch_types="eeg") + # Introduce a significant difference in a specific region, time, and frequency + region_start = 1 + 
region_end = 2 + time_start = 2 + time_end = 4 + freq_start = 2 + freq_end = 4 + + if Inst == EpochsArray: + # Create random data for EpochsArray + inst1 = Inst(np.random.randn(n_epo, n_chan, n_times), info=info) + # Adding a constant to create a difference + data_copy = inst1.get_data().copy() # no data attribute for EpochsArray + data_copy[:, region_start:region_end, time_start:time_end] += ( + 2 # Modify the copy + ) + inst2 = Inst( + data=data_copy, info=info + ) # Use the modified copy as a new instance + + elif Inst == EvokedArray: + # Create random data for EvokedArray + inst1 = Inst(np.random.randn(n_chan, n_times), info=info) + data_copy = inst1.data.copy() + data_copy[region_start:region_end, time_start:time_end] += 2 + inst2 = Inst(data=data_copy, info=info) + + elif Inst == EpochsTFRArray: + # Create random data for EpochsTFRArray + data_tfr1 = np.random.randn(n_epo, n_chan, n_freq, n_times) + data_tfr2 = np.random.randn(n_epo, n_chan, n_freq, n_times) + inst1 = Inst( + data=data_tfr1, info=info, times=np.arange(n_times), freqs=np.arange(n_freq) + ) + inst2 = Inst( + data=data_tfr2, info=info, times=np.arange(n_times), freqs=np.arange(n_freq) + ) + data_tfr2 = inst2.data.copy() + data_tfr2[ + :, region_start:region_end, freq_start:freq_end, time_start:time_end + ] += 2 + inst2 = Inst( + data=data_tfr2, info=info, times=np.arange(n_times), freqs=np.arange(n_freq) + ) - info = create_info(...) - inst1 = Inst(np.random.normal(shape, ...), info=info) - inst2 = Inst(np.random.normal(shape, ...), info=info) + elif Inst == AverageTFRArray: + # Create random data for AverageTFRArray + data_tfr1 = np.random.randn(n_chan, n_freq, n_times) + data_tfr2 = np.random.randn(n_chan, n_freq, n_times) + inst1 = Inst( + data=data_tfr1, info=info, times=np.arange(n_times), freqs=np.arange(n_freq) + ) + inst2 = Inst( + data=data_tfr2, info=info, times=np.arange(n_times), freqs=np.arange(n_freq) + ) + data_tfr2 = inst2.data.copy() + data_tfr2[ + region_start:region_end, freq_start:freq_end, time_start:time_end + ] += 2 + inst2 = Inst( + data=data_tfr2, info=info, times=np.arange(n_times), freqs=np.arange(n_freq) + ) + # test old and new API with sample data df = pd.DataFrame(dict(data=[inst1, inst2], condition=["a", "b"])) - result = cluster_test(df, "data~condition", ...) 
- assert result # TODO do something more interesting here + kwargs = dict(n_permutations=100, seed=1, tail=1, buffer_size=None, out_type="mask") + + result_new_api = cluster_test(df, "data~condition", **kwargs) + + # make sure channels are last dimension for old API + if Inst == EpochsArray: + inst1 = inst1.get_data().transpose(0, 2, 1) + inst2 = inst2.get_data().transpose(0, 2, 1) + elif Inst == EpochsTFRArray: + inst1 = inst1.data.transpose(0, 3, 2, 1) + inst2 = inst2.data.transpose(0, 3, 2, 1) + elif Inst == AverageTFRArray: + inst1 = inst1.data.transpose(2, 1, 0) + inst2 = inst2.data.transpose(2, 1, 0) + else: + inst1 = inst1.data.transpose(1, 0) + inst2 = inst2.data.transpose(1, 0) + + F_obs, clusters, cluster_pvals, H0 = permutation_cluster_test( + [inst1, inst2], **kwargs + ) + assert_array_equal(result_new_api.H0, H0) + assert_array_equal(result_new_api.stat_obs, F_obs) + assert_array_equal(result_new_api.cluster_p_values, cluster_pvals) + assert result_new_api.clusters == clusters From b5fce8b7f23d96e79e64cfcac732b7a32dd7aa0a Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Tue, 6 Aug 2024 16:54:31 +0200 Subject: [PATCH 36/88] adjusted test to account for multiple subjects --- mne/stats/tests/test_cluster_level.py | 40 ++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/mne/stats/tests/test_cluster_level.py b/mne/stats/tests/test_cluster_level.py index 1c126494250..e3a701d3691 100644 --- a/mne/stats/tests/test_cluster_level.py +++ b/mne/stats/tests/test_cluster_level.py @@ -911,7 +911,7 @@ def test_new_cluster_api(Inst): """Test handling different MNE objects in the cluster API.""" pd = pytest.importorskip("pandas") - n_epo, n_chan, n_freq, n_times = 2, 3, 4, 5 + n_subs, n_epo, n_chan, n_freq, n_times = 2, 2, 3, 4, 5 info = create_info(ch_names=n_chan, sfreq=1000, ch_types="eeg") # Introduce a significant difference in a specific region, time, and frequency region_start = 1 @@ -976,9 +976,25 @@ def test_new_cluster_api(Inst): data=data_tfr2, info=info, times=np.arange(n_times), freqs=np.arange(n_freq) ) - # test old and new API with sample data - df = pd.DataFrame(dict(data=[inst1, inst2], condition=["a", "b"])) - kwargs = dict(n_permutations=100, seed=1, tail=1, buffer_size=None, out_type="mask") + if Inst == EvokedArray or Inst == AverageTFRArray: + # Generate random noise + noise = np.random.normal(loc=0, scale=0.1, size=inst1.data.shape) + # add noise to the data of the second subject + inst1_n = inst1.copy() + inst1_n.data = inst1.data + noise + inst2_n = inst2.copy() + inst2_n.data = inst2.data + noise + data = [inst1, inst2, inst1_n, inst2_n] + conds = ["a", "b"] * n_subs + else: + data = [inst1, inst2] + conds = ["a", "b"] + + df = pd.DataFrame(dict(data=data, condition=conds)) + + kwargs = dict( + n_permutations=100, seed=42, tail=1, buffer_size=None, out_type="mask" + ) result_new_api = cluster_test(df, "data~condition", **kwargs) @@ -992,14 +1008,24 @@ def test_new_cluster_api(Inst): elif Inst == AverageTFRArray: inst1 = inst1.data.transpose(2, 1, 0) inst2 = inst2.data.transpose(2, 1, 0) + inst1_n = inst1_n.data.transpose(2, 1, 0) + inst2_n = inst2_n.data.transpose(2, 1, 0) + # combine the data of the two subjects + inst1 = np.concatenate([inst1[np.newaxis, :], inst1_n[np.newaxis, :]], axis=0) + inst2 = np.concatenate([inst2[np.newaxis, :], inst2_n[np.newaxis, :]], axis=0) else: inst1 = inst1.data.transpose(1, 0) inst2 = inst2.data.transpose(1, 0) + inst1_n = inst1_n.data.transpose(1, 0) + inst2_n = inst2_n.data.transpose(1, 0) + # 
combine the data of the two subjects + inst1 = np.concatenate([inst1[np.newaxis, :], inst1_n[np.newaxis, :]], axis=0) + inst2 = np.concatenate([inst2[np.newaxis, :], inst2_n[np.newaxis, :]], axis=0) F_obs, clusters, cluster_pvals, H0 = permutation_cluster_test( [inst1, inst2], **kwargs ) - assert_array_equal(result_new_api.H0, H0) - assert_array_equal(result_new_api.stat_obs, F_obs) - assert_array_equal(result_new_api.cluster_p_values, cluster_pvals) + assert_array_almost_equal(result_new_api.H0, H0) + assert_array_almost_equal(result_new_api.stat_obs, F_obs) + assert_array_almost_equal(result_new_api.cluster_p_values, cluster_pvals) assert result_new_api.clusters == clusters From 3ce510c1e5b53c7cdd123b468cf3ef4d6e55e428 Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Sat, 10 Aug 2024 17:39:40 -0500 Subject: [PATCH 37/88] refactor df validation to return bools --- mne/stats/cluster_level.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index 804d035ff51..821d12cfd8f 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -42,6 +42,7 @@ verbose, warn, ) +from ..utils.mixin import GetEpochsMixin from ..viz import plot_compare_evokeds from .parametric import f_oneway, ttest_1samp_no_p @@ -1784,7 +1785,11 @@ def _validate_cluster_df(df: pd.DataFrame, dv_name: str, iv_name: str): f"{prologue} consistent shape, but {len(all_shapes)} different " f"shapes were found: {'; '.join(all_shapes)}." ) - return all_types.pop() # return the type of the data column entries + obj_type = all_types.pop() + is_epo = GetEpochsMixin in obj_type.__mro__ + is_tfr = BaseTFR in obj_type.__mro__ + is_arr = np.ndarray in obj_type.__mro__ + return is_epo, is_tfr, is_arr @verbose @@ -1868,7 +1873,7 @@ def cluster_test( iv_name = str(np.array(formula.rhs.root).item()) # validate the input dataframe and return the type of the data column entries - _dtype = _validate_cluster_df(df, dv_name, iv_name) + is_epo, is_tfr, is_arr = _validate_cluster_df(df, dv_name, iv_name) # for within_subject designs, check if each subject has 2 observations _validate_type(within_id, (str, None), "within_id") @@ -1880,23 +1885,18 @@ def cluster_test( raise ValueError("for paired t-test, each subject must have 2 observations") # extract the data from the dataframe - def _extract_data_array(series): - return np.concatenate(series.values) + outer_func = np.concatenate if is_epo or is_arr else np.array + axes = (-3, -1) if is_tfr else (-2, -1) - def _extract_data_mne(series): # 2D data - return np.array( - series.map(lambda inst: inst.get_data().swapaxes(-2, -1)).to_list() + def func_mne(series): + return outer_func( + series.map(lambda inst: inst.get_data().swapaxes(*axes)).to_list() ) - def _extract_data_tfr(series): - return series.map(lambda inst: inst.get_data().swapaxes(-3, -1)).to_list() + def func_array(series): + return outer_func(series.values) - if _dtype is np.ndarray: - func = _extract_data_array - elif _dtype is BaseTFR: - func = _extract_data_tfr - else: - func = _extract_data_mne + func = func_array if is_arr else func_mne # convert to a list-like X for clustering X = df.groupby(iv_name).agg({dv_name: func})[dv_name].to_list() From feb1911773fa9aeec7528f1e31be44599a6f4c89 Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Sat, 10 Aug 2024 17:40:14 -0500 Subject: [PATCH 38/88] unrelated typing fix --- mne/stats/cluster_level.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index 821d12cfd8f..2ab8917226d 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1993,7 +1993,7 @@ def plot_cluster_time_sensor( linestyles: list | dict | None = None, cmap_evokeds: None | str | tuple = None, cmap_topo: None | str | tuple = None, - ci: float | bool | callable() | None = None, + ci: float | bool | callable | None = None, ): """ Plot the cluster with the lowest p-value. From 6f9781197e29c410d9a2cef41f1e7228da9ab740 Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Sat, 10 Aug 2024 17:41:19 -0500 Subject: [PATCH 39/88] rework test --- mne/stats/tests/test_cluster_level.py | 164 ++++++++++---------------- 1 file changed, 60 insertions(+), 104 deletions(-) diff --git a/mne/stats/tests/test_cluster_level.py b/mne/stats/tests/test_cluster_level.py index e3a701d3691..654f0c552f2 100644 --- a/mne/stats/tests/test_cluster_level.py +++ b/mne/stats/tests/test_cluster_level.py @@ -911,121 +911,77 @@ def test_new_cluster_api(Inst): """Test handling different MNE objects in the cluster API.""" pd = pytest.importorskip("pandas") - n_subs, n_epo, n_chan, n_freq, n_times = 2, 2, 3, 4, 5 + rng = np.random.default_rng(seed=8675309) + is_epo = Inst in (EpochsTFRArray, EpochsArray) + is_tfr = Inst in (EpochsTFRArray, AverageTFRArray) + + n_epo, n_chan, n_freq, n_times = 6, 3, 4, 5 + + # prepare the dimensions of the simulated data, then simulate + size = (n_chan,) + if is_epo: + size = (n_epo, *size) + if is_tfr: + size = (*size, n_freq) + size = (*size, n_times) + data = rng.normal(size=size) + + # construct the instance info = create_info(ch_names=n_chan, sfreq=1000, ch_types="eeg") - # Introduce a significant difference in a specific region, time, and frequency - region_start = 1 - region_end = 2 - time_start = 2 - time_end = 4 - freq_start = 2 - freq_end = 4 - - if Inst == EpochsArray: - # Create random data for EpochsArray - inst1 = Inst(np.random.randn(n_epo, n_chan, n_times), info=info) - # Adding a constant to create a difference - data_copy = inst1.get_data().copy() # no data attribute for EpochsArray - data_copy[:, region_start:region_end, time_start:time_end] += ( - 2 # Modify the copy - ) - inst2 = Inst( - data=data_copy, info=info - ) # Use the modified copy as a new instance - - elif Inst == EvokedArray: - # Create random data for EvokedArray - inst1 = Inst(np.random.randn(n_chan, n_times), info=info) - data_copy = inst1.data.copy() - data_copy[region_start:region_end, time_start:time_end] += 2 - inst2 = Inst(data=data_copy, info=info) - - elif Inst == EpochsTFRArray: - # Create random data for EpochsTFRArray - data_tfr1 = np.random.randn(n_epo, n_chan, n_freq, n_times) - data_tfr2 = np.random.randn(n_epo, n_chan, n_freq, n_times) - inst1 = Inst( - data=data_tfr1, info=info, times=np.arange(n_times), freqs=np.arange(n_freq) - ) - inst2 = Inst( - data=data_tfr2, info=info, times=np.arange(n_times), freqs=np.arange(n_freq) - ) - data_tfr2 = inst2.data.copy() - data_tfr2[ - :, region_start:region_end, freq_start:freq_end, time_start:time_end - ] += 2 - inst2 = Inst( - data=data_tfr2, info=info, times=np.arange(n_times), freqs=np.arange(n_freq) - ) - - elif Inst == AverageTFRArray: - # Create random data for AverageTFRArray - data_tfr1 = np.random.randn(n_chan, n_freq, n_times) - data_tfr2 = np.random.randn(n_chan, n_freq, n_times) - inst1 = Inst( - data=data_tfr1, info=info, times=np.arange(n_times), freqs=np.arange(n_freq) - ) - inst2 = Inst( - data=data_tfr2, info=info, times=np.arange(n_times), 
freqs=np.arange(n_freq) - ) - data_tfr2 = inst2.data.copy() - data_tfr2[ - region_start:region_end, freq_start:freq_end, time_start:time_end - ] += 2 - inst2 = Inst( - data=data_tfr2, info=info, times=np.arange(n_times), freqs=np.arange(n_freq) - ) - - if Inst == EvokedArray or Inst == AverageTFRArray: - # Generate random noise - noise = np.random.normal(loc=0, scale=0.1, size=inst1.data.shape) - # add noise to the data of the second subject - inst1_n = inst1.copy() - inst1_n.data = inst1.data + noise - inst2_n = inst2.copy() - inst2_n.data = inst2.data + noise - data = [inst1, inst2, inst1_n, inst2_n] - conds = ["a", "b"] * n_subs + kw = dict(times=np.arange(n_times), freqs=np.arange(n_freq)) if is_tfr else dict() + cond_a = Inst(data=data, info=info, **kw) + cond_b = cond_a.copy() + # introduce a significant difference in a specific region, time, and frequency + ch_start, ch_end = 0, 2 # 2 channels + t_start, t_end = 2, 4 # 2 times + f_start, f_end = 2, 4 # 2 freqs + if is_tfr: + cond_b._data[..., ch_start:ch_end, f_start:f_end, t_start:t_end] += 2 + else: + cond_b._data[..., ch_start:ch_end, t_start:t_end] += 2 + # for Evokeds/AverageTFRs, we create fake "subjects" as our observations within each + # condition. We add a bit of noise while we do so. + if not is_epo: + insts = list() + for cond in cond_a, cond_b: + for _n in range(n_epo): + if not _n: + insts.append(cond) + continue + _cond = cond.copy() + _cond.data += rng.normal(scale=0.1, size=_cond.data.shape) + insts.append(_cond) + conds = np.repeat(["a", "b"], n_epo).tolist() else: - data = [inst1, inst2] + # For Epochs(TFR)Array, each epoch is an observation and they're already + # noisy/non-identical, so no duplication / noise-addition necessary. + insts = [cond_a, cond_b] conds = ["a", "b"] - df = pd.DataFrame(dict(data=data, condition=conds)) - + # run new clustering API + df = pd.DataFrame(dict(data=insts, condition=conds)) kwargs = dict( n_permutations=100, seed=42, tail=1, buffer_size=None, out_type="mask" ) - result_new_api = cluster_test(df, "data~condition", **kwargs) # make sure channels are last dimension for old API - if Inst == EpochsArray: - inst1 = inst1.get_data().transpose(0, 2, 1) - inst2 = inst2.get_data().transpose(0, 2, 1) - elif Inst == EpochsTFRArray: - inst1 = inst1.data.transpose(0, 3, 2, 1) - inst2 = inst2.data.transpose(0, 3, 2, 1) - elif Inst == AverageTFRArray: - inst1 = inst1.data.transpose(2, 1, 0) - inst2 = inst2.data.transpose(2, 1, 0) - inst1_n = inst1_n.data.transpose(2, 1, 0) - inst2_n = inst2_n.data.transpose(2, 1, 0) - # combine the data of the two subjects - inst1 = np.concatenate([inst1[np.newaxis, :], inst1_n[np.newaxis, :]], axis=0) - inst2 = np.concatenate([inst2[np.newaxis, :], inst2_n[np.newaxis, :]], axis=0) + if is_epo: + axes = (0, 3, 2, 1) if is_tfr else (0, 2, 1) + X = [cond_a.get_data().transpose(*axes), cond_b.get_data().transpose(*axes)] else: - inst1 = inst1.data.transpose(1, 0) - inst2 = inst2.data.transpose(1, 0) - inst1_n = inst1_n.data.transpose(1, 0) - inst2_n = inst2_n.data.transpose(1, 0) - # combine the data of the two subjects - inst1 = np.concatenate([inst1[np.newaxis, :], inst1_n[np.newaxis, :]], axis=0) - inst2 = np.concatenate([inst2[np.newaxis, :], inst2_n[np.newaxis, :]], axis=0) - - F_obs, clusters, cluster_pvals, H0 = permutation_cluster_test( - [inst1, inst2], **kwargs - ) + axes = (2, 1, 0) if is_tfr else (1, 0) + Xa = list() + Xb = list() + for inst, cond in zip(insts, conds): + container = Xa if cond == "a" else Xb + 
container.append(inst.get_data().transpose(*axes)) + X = [np.stack(Xa), np.stack(Xb)] + + F_obs, clusters, cluster_pvals, H0 = permutation_cluster_test(X, **kwargs) assert_array_almost_equal(result_new_api.H0, H0) assert_array_almost_equal(result_new_api.stat_obs, F_obs) assert_array_almost_equal(result_new_api.cluster_p_values, cluster_pvals) - assert result_new_api.clusters == clusters + assert len(result_new_api.clusters) == len(clusters) + for clu1, clu2 in zip(result_new_api.clusters, clusters): + assert_array_equal(clu1, clu2) From b09d20a6759882268c32a5f4f0069620f8bb0a3a Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Mon, 12 Aug 2024 09:08:27 -0500 Subject: [PATCH 40/88] minor cleanup --- mne/stats/cluster_level.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index 2ab8917226d..79efde4be4f 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1885,18 +1885,18 @@ def cluster_test( raise ValueError("for paired t-test, each subject must have 2 observations") # extract the data from the dataframe - outer_func = np.concatenate if is_epo or is_arr else np.array + outer_func = np.concatenate if is_epo else np.array axes = (-3, -1) if is_tfr else (-2, -1) + def func_arr(series): + return np.concatenate(series.values) + def func_mne(series): return outer_func( series.map(lambda inst: inst.get_data().swapaxes(*axes)).to_list() ) - def func_array(series): - return outer_func(series.values) - - func = func_array if is_arr else func_mne + func = func_arr if is_arr else func_mne # convert to a list-like X for clustering X = df.groupby(iv_name).agg({dv_name: func})[dv_name].to_list() From 977e153d6b0948b52c9a0ae25eca9ad20c3e71e9 Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Mon, 12 Aug 2024 09:16:24 -0500 Subject: [PATCH 41/88] fix imports --- mne/stats/cluster_level.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index 79efde4be4f..141f7c299d4 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -24,13 +24,15 @@ from scipy.stats import f as fstat from scipy.stats import t as tstat -from .. 
import BaseEpochs, Evoked, EvokedArray
+from ..epochs import BaseEpochs, EvokedArray
+from ..evoked import Evoked
 from ..fixes import has_numba, jit
 from ..parallel import parallel_func
 from ..source_estimate import MixedSourceEstimate, SourceEstimate, VolSourceEstimate
 from ..source_space import SourceSpaces
 from ..time_frequency import BaseTFR
 from ..utils import (
+    GetEpochsMixin,
     ProgressBar,
     _check_option,
     _pl,
@@ -42,7 +44,6 @@
     verbose,
     warn,
 )
-from ..utils.mixin import GetEpochsMixin
 from ..viz import plot_compare_evokeds
 from .parametric import f_oneway, ttest_1samp_no_p

From a288d8579546adcf20750ff974bedb043d82ca0d Mon Sep 17 00:00:00 2001
From: Daniel McCloy
Date: Mon, 12 Aug 2024 09:16:35 -0500
Subject: [PATCH 42/88] use MRO in test too

---
 mne/stats/tests/test_cluster_level.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mne/stats/tests/test_cluster_level.py b/mne/stats/tests/test_cluster_level.py
index 654f0c552f2..b4d676abe91 100644
--- a/mne/stats/tests/test_cluster_level.py
+++ b/mne/stats/tests/test_cluster_level.py
@@ -39,8 +39,8 @@
     summarize_clusters_stc,
     ttest_1samp_no_p,
 )
-from mne.time_frequency import AverageTFRArray, EpochsTFRArray
-from mne.utils import _record_warnings, catch_logging
+from mne.time_frequency import AverageTFRArray, BaseTFR, EpochsTFRArray
+from mne.utils import GetEpochsMixin, _record_warnings, catch_logging

 n_space = 50

@@ -912,8 +912,8 @@ def test_new_cluster_api(Inst):
     pd = pytest.importorskip("pandas")

     rng = np.random.default_rng(seed=8675309)
-    is_epo = Inst in (EpochsTFRArray, EpochsArray)
-    is_tfr = Inst in (EpochsTFRArray, AverageTFRArray)
+    is_epo = GetEpochsMixin in Inst.__mro__
+    is_tfr = BaseTFR in Inst.__mro__

     n_epo, n_chan, n_freq, n_times = 6, 3, 4, 5
+ + Returns + ------- + None + + """ + # configure variables for visualization + colors = {"target": "crimson", "non-target": "steelblue"} + + # organize data for plotting + evokeds = {"target": target_only, "non-target": non_target_only} + + lowest_p_cluster = np.argmin(cluster_p_values) + + # plot the cluster with the lowest p-value + time_inds, space_inds = np.squeeze(clusters[lowest_p_cluster]) + ch_inds = np.unique(space_inds) + time_inds = np.unique(time_inds) + + # get topography for F stat + t_map = T_obs[time_inds, ...].mean(axis=0) + + # get signals at the sensors contributing to the cluster + sig_times = contrast[0].times[time_inds] + + # create spatial mask + mask = np.zeros((t_map.shape[0], 1), dtype=bool) + mask[ch_inds, :] = True + + # initialize figure + fig, ax_topo = plt.subplots(1, 1, figsize=(10, 3), layout="constrained") + + # plot average test statistic and mark significant sensors + t_evoked = mne.EvokedArray(t_map[:, np.newaxis], contrast[0].info, tmin=0) + t_evoked.plot_topomap( + times=0, + mask=mask, + axes=ax_topo, + cmap="Reds", + vlim=(np.min, np.max), + show=False, + colorbar=False, + mask_params=dict(markersize=10), + ) + image = ax_topo.images[0] + + # remove the title that would otherwise say "0.000 s" + ax_topo.set_title("") + + # create additional axes (for ERF and colorbar) + divider = make_axes_locatable(ax_topo) + + # add axes for colorbar + ax_colorbar = divider.append_axes("right", size="5%", pad=0.05) + plt.colorbar(image, cax=ax_colorbar) + ax_topo.set_xlabel( + "Averaged t-map ({:0.3f} - {:0.3f} s)".format(*sig_times[[0, -1]]) + ) + + # add new axis for time courses and plot time courses + ax_signals = divider.append_axes("right", size="300%", pad=1.2) + title = f"Cluster #1, {len(ch_inds)} sensor" + if len(ch_inds) > 1: + title += "s (mean)" + mne.viz.plot_compare_evokeds( + evokeds, + title=title, + picks=ch_inds, + axes=ax_signals, + colors=colors, + show=False, + split_legend=True, + truncate_yaxis="auto", + ) + + # plot temporal cluster extent + ymin, ymax = ax_signals.get_ylim() + ax_signals.fill_betweenx( + (ymin, ymax), sig_times[0], sig_times[-1], color="green", alpha=0.3 + ) + + plt.show() + + return None From e8770fd0d7f1854cddc59b8fdf5bab414202eff6 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Fri, 14 Jun 2024 19:02:45 +0200 Subject: [PATCH 44/88] tested dataframe function and results, cleaned up --- .../76_new_cluster_test_api.py | 187 +++++++++--------- 1 file changed, 95 insertions(+), 92 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 4e2b3af8f6d..3f001251ba5 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -5,7 +5,6 @@ from mpl_toolkits.axes_grid1 import make_axes_locatable import mne -# eventually we want to use the _permutation_cluster_test function # import and load dataset path_to_p3 = Path("C:/Users/Carina/mne_data/ERP_CORE_P3") @@ -85,7 +84,6 @@ def old_api_cluster(n_permutations: int = 10000, seed: int = 1234): X=data, stat_fun=None, threshold=None, tail=0, kind="within" ) - # adjacency = mne.channels.find_ch_adjacency(contrast[0].info, ch_type='eeg') # Run the analysis T_obs, clusters, cluster_p_values, H0 = ( mne.stats.cluster_level._permutation_cluster_test( @@ -115,25 +113,15 @@ def old_api_cluster(n_permutations: int = 10000, seed: int = 1234): return T_obs, clusters, cluster_p_values, H0 - -# fit cluster test with dataframe as input -# 
create condition list that repeats 5times 1 and then 5 times 0 -# 1 = target, 0 = non-target -# condition = 5 * [1] + 5 * [0] - -# 1 = target, 0 = non-target -# contrast, target_only, non_target_only = prep_sample_data() - -# evokeds_list = target_only + non_target_only - - -def create_random_evokeds_id_condition_list(evoked_data_a: list, evoked_data_b: list): +def create_random_evokeds_id_condition_list(): """ Create a list of shuffled participant IDs, conditions, and evoked data. # Keep the participant IDs and conditions paired but shuffle the order of the evoked data. """ import random + _ , evoked_data_a, evoked_data_b = prep_sample_data() + # Example participant IDs participant_ids = ["p1", "p2", "p3", "p4", "p5"] * 2 @@ -162,42 +150,42 @@ def create_random_evokeds_id_condition_list(evoked_data_a: list, evoked_data_b: return shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data -def create_random_paired_evokeds_list(evoked_data_a: list, evoked_data_b: list): +def create_random_paired_evokeds_list(): """ Create a list of shuffled evoked data where each pair of target and non-target evoked data is shuffled together. """ import random + _, evoked_data_a, evoked_data_b = prep_sample_data() - # Create a list of tuples where each tuple contains an evoked data and its corresponding label - evoked_pairs = [(evoked, 1) for evoked in evoked_data_a] + [ - (evoked, 0) for evoked in evoked_data_b - ] + # Ensure evoked_data_a and evoked_data_b are of the same length + assert len(evoked_data_a) == len(evoked_data_b), "evoked_data_a and evoked_data_b must have the same length" + + # Create a list of participant indices + participant_indices = list(range(len(evoked_data_a))) - # Shuffle the list of tuples - random.shuffle(evoked_pairs) + # Shuffle the list of participant indices + random.shuffle(participant_indices) - # Separate the shuffled list back into evoked data and labels - shuffled_evoked_data, shuffled_labels = zip(*evoked_pairs) + # Reorder evoked data according to the shuffled participant indices + shuffled_evoked_data_a = [evoked_data_a[i] for i in participant_indices] + shuffled_evoked_data_b = [evoked_data_b[i] for i in participant_indices] - # Convert the tuples back to lists - shuffled_evoked_data = list(shuffled_evoked_data) + # Combine the shuffled evoked data into a single list + shuffled_evoked_data = shuffled_evoked_data_a + shuffled_evoked_data_b + + # Combine the original evoked data into a single list + original_evoked_data = evoked_data_a + evoked_data_b - return shuffled_evoked_data + return original_evoked_data, shuffled_evoked_data # shuffle order of pairs -shuffled_evokeds_list = create_random_paired_evokeds_list(target_only, non_target_only) +original_evoked_data, shuffled_evoked_data = create_random_paired_evokeds_list() # shouldn't change the results (p-value is different though?) -shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data = ( - create_random_evokeds_id_condition_list( - evoked_data_a=target_only, evoked_data_b=non_target_only - ) -) - +shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data = create_random_evokeds_id_condition_list() def prepare_dataframe_for_cluster_function( - contrast: bool = False, evokeds: list = None, condition: list = None, subject_index: list = None, @@ -216,29 +204,39 @@ def prepare_dataframe_for_cluster_function( subject_index : list, optional List of subject IDs. Default is None. + Returns + ------- + df : DataFrame + The prepared DataFrame for the cluster test function. 
""" - # create an empty dataframe - df = pd.DataFrame() - - if contrast == True: - # check if evoked list is dividable by 2 - if len(evokeds) % 2 != 0: - raise ValueError("evokeds list needs to be dividable by 2") - if condition is not None: - # Convert lists to DataFrame for easier manipulation - df = pd.DataFrame( - { - "evoked": evokeds, - "condition": condition, - "subject_index": subject_index, - } - ) - - return df + # Initialize the DataFrame with evoked data + df = pd.DataFrame({ + "evoked": evokeds, + "condition": condition if condition is not None else np.nan, + "subject_index": subject_index if subject_index is not None else np.nan + }) + + return df +# run with original data +df = prepare_dataframe_for_cluster_function(evokeds=original_evoked_data, + condition=None, + subject_index=None) + +df = prepare_dataframe_for_cluster_function(evokeds=shuffled_evoked_data, + condition=None, + subject_index=None) + +df = prepare_dataframe_for_cluster_function(evokeds=shuffled_evoked_data, + condition=shuffled_conditions, + subject_index=shuffled_participant_ids) + + +cluster_test(df) def cluster_test( df: pd.DataFrame, + contrast: bool = True, n_permutations: int = 10000, seed: int = 1234, contrast_weights: list = [1, -1], @@ -267,43 +265,47 @@ def cluster_test( H0 : array The permuted test statistics. """ - if df.condition is not None: - # Extract unique conditions - unique_conditions = np.unique(df.condition) - if len(unique_conditions) != 2: - raise ValueError("Condition list needs to contain 2 unique values") - if df.subject_index is not None: + # Check if conditions and subject_index are present and valid + conditions_present = pd.notna(df['condition']).all() + subject_index_present = pd.notna(df['subject_index']).all() + + if contrast == 1: + if conditions_present: + # Extract unique conditions + unique_conditions = np.unique(df.condition) + if len(unique_conditions) != 2: + raise ValueError("Condition list needs to contain 2 unique values") # Initialize a list to hold the combined evoked data evokeds_data = [] - - # Process each subject's evoked data - for sub_id in df.subject_index.unique(): - sub_df = df[df.subject_index == sub_id] - - # Split evokeds list based on condition list for this subject - evokeds_a = sub_df[sub_df.condition == unique_conditions[0]][ - "evoked" - ].tolist() - evokeds_b = sub_df[sub_df.condition == unique_conditions[1]][ - "evoked" - ].tolist() - - if len(evokeds_a) != 1 or len(evokeds_b) != 1: - raise ValueError( - f"Subject {sub_id}: Each subject must have exactly one evoked for each condition" + if subject_index_present: + # Process each subject's evoked data + for sub_id in df.subject_index.unique(): + sub_df = df[df.subject_index == sub_id] + + # Split evokeds list based on condition list for this subject + evokeds_a = sub_df[sub_df.condition == unique_conditions[0]][ + "evoked" + ].tolist() + evokeds_b = sub_df[sub_df.condition == unique_conditions[1]][ + "evoked" + ].tolist() + + if len(evokeds_a) != 1 or len(evokeds_b) != 1: + raise ValueError( + f"Subject {sub_id}: Each subject must have exactly one evoked for each condition" + ) + + # Calculate contrast based on condition list + diff_evoked = mne.combine_evoked( + [evokeds_a[0], evokeds_b[0]], weights=contrast_weights ) - - # Calculate contrast based on condition list - diff_evoked = mne.combine_evoked( - [evokeds_a[0], evokeds_b[0]], weights=contrast_weights - ) - evokeds_data.append(diff_evoked) + evokeds_data.append(diff_evoked) else: # calculate length of evokeds list - n_evokeds = 
len(df.evokeds) + n_evokeds = len(df.evoked) # now split evokeds list in two lists - evokeds_a = df.evokeds[: n_evokeds // 2] - evokeds_b = df.evokeds[n_evokeds // 2 :] + evokeds_a = df.evoked[: n_evokeds // 2] + evokeds_b = df.evoked[n_evokeds // 2 :] # create contrast from evokeds_a and evokeds_b diff_evoked = [ mne.combine_evoked([evo_a, evo_b], weights=contrast_weights) @@ -311,7 +313,7 @@ def cluster_test( ] evokeds_data = diff_evoked else: - evokeds_data = df.evokeds + evokeds_data = df.evoked.tolist() # extract number of channels n_channels = evokeds_data[0].info["nchan"] @@ -330,19 +332,20 @@ def cluster_test( X=data, stat_fun=None, threshold=None, tail=0, kind="within" ) + # Run the analysis T_obs, clusters, cluster_p_values, H0 = ( mne.stats.cluster_level._permutation_cluster_test( [data], threshold=threshold, stat_fun=stat_fun, - n_jobs=-1, - max_step=1, - exclude=None, - step_down_p=0.05, - t_power=1, + n_jobs=-1, # takes all CPU cores + max_step=1, # maximum distance between samples (time points) + exclude=None, # exclude no time points or channels + step_down_p=0, # step down in jumps test + t_power=1, # weigh each location by its stats score out_type="indices", - check_disjoint=True, - buffer_size=None, + check_disjoint=False, + buffer_size=None, # block size for chunking the data n_permutations=n_permutations, tail=0, adjacency=adjacency, From a081d7d4ff53520abdc910fc75e3fa108d794a15 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 14 Jun 2024 12:24:30 +0000 Subject: [PATCH 45/88] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tutorials/stats-sensor-space/76_new_cluster_test_api.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 3f001251ba5..5d943985aa2 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -1,14 +1,17 @@ from pathlib import Path + import matplotlib.pyplot as plt import numpy as np import pandas as pd from mpl_toolkits.axes_grid1 import make_axes_locatable + import mne # import and load dataset path_to_p3 = Path("C:/Users/Carina/mne_data/ERP_CORE_P3") + def prep_sample_data(plot_evokeds: bool = False): """ Load the P3 dataset and extract the target, non-target and contrast evokeds. 
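The dataframe layout that prepare_dataframe_for_cluster_function and the reworked cluster_test above expect — one row per evoked, with paired condition and subject_index labels — can be exercised end-to-end on synthetic data. The following sketch is illustrative only and not part of the patch series; the channel names, sampling rate, and random seed are arbitrary assumptions:

import numpy as np
import pandas as pd

import mne

# Toy evokeds: 3 subjects x 2 conditions of random EEG-like data (illustrative).
info = mne.create_info(ch_names=["EEG 001", "EEG 002"], sfreq=100.0, ch_types="eeg")
rng = np.random.default_rng(42)
evokeds = [mne.EvokedArray(rng.normal(size=(2, 50)) * 1e-6, info) for _ in range(6)]

df = pd.DataFrame(
    {
        "evoked": evokeds,
        "condition": [1, 1, 1, 0, 0, 0],  # 1 = target, 0 = non-target
        "subject_index": ["p1", "p2", "p3", "p1", "p2", "p3"],
    }
)

# Per-subject paired contrast (condition 1 minus condition 0), mirroring the
# combine_evoked call in the diff above.
contrasts = []
for sub in df["subject_index"].unique():
    sub_df = df[df["subject_index"] == sub]
    evo_a = sub_df.loc[sub_df["condition"] == 1, "evoked"].iloc[0]
    evo_b = sub_df.loc[sub_df["condition"] == 0, "evoked"].iloc[0]
    contrasts.append(mne.combine_evoked([evo_a, evo_b], weights=[1, -1]))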
From d6d70c8b461523c87c472ad7d9c0b6e1e4403689 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Fri, 14 Jun 2024 19:04:48 +0200 Subject: [PATCH 46/88] added ToDos --- tutorials/stats-sensor-space/76_new_cluster_test_api.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 5d943985aa2..51ad611aa58 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -7,6 +7,8 @@ import mne +# TODO: implement formulaic design matrix for paired t-test +# TODO: @erik: add dataset to mne-data # import and load dataset path_to_p3 = Path("C:/Users/Carina/mne_data/ERP_CORE_P3") @@ -235,8 +237,6 @@ def prepare_dataframe_for_cluster_function( subject_index=shuffled_participant_ids) -cluster_test(df) - def cluster_test( df: pd.DataFrame, contrast: bool = True, @@ -471,3 +471,5 @@ def plot_cluster( plt.show() return None + +cluster_test(df) \ No newline at end of file From 834526146fd68d6bc243fcbd824543d055458938 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 14 Jun 2024 17:04:48 +0000 Subject: [PATCH 47/88] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../76_new_cluster_test_api.py | 55 ++++++++++++------- 1 file changed, 34 insertions(+), 21 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 51ad611aa58..08917f78f03 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -118,6 +118,7 @@ def old_api_cluster(n_permutations: int = 10000, seed: int = 1234): return T_obs, clusters, cluster_p_values, H0 + def create_random_evokeds_id_condition_list(): """ Create a list of shuffled participant IDs, conditions, and evoked data. @@ -125,7 +126,7 @@ def create_random_evokeds_id_condition_list(): """ import random - _ , evoked_data_a, evoked_data_b = prep_sample_data() + _, evoked_data_a, evoked_data_b = prep_sample_data() # Example participant IDs participant_ids = ["p1", "p2", "p3", "p4", "p5"] * 2 @@ -160,11 +161,14 @@ def create_random_paired_evokeds_list(): Create a list of shuffled evoked data where each pair of target and non-target evoked data is shuffled together. """ import random + _, evoked_data_a, evoked_data_b = prep_sample_data() # Ensure evoked_data_a and evoked_data_b are of the same length - assert len(evoked_data_a) == len(evoked_data_b), "evoked_data_a and evoked_data_b must have the same length" - + assert len(evoked_data_a) == len( + evoked_data_b + ), "evoked_data_a and evoked_data_b must have the same length" + # Create a list of participant indices participant_indices = list(range(len(evoked_data_a))) @@ -188,7 +192,10 @@ def create_random_paired_evokeds_list(): original_evoked_data, shuffled_evoked_data = create_random_paired_evokeds_list() # shouldn't change the results (p-value is different though?) 
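The open question in the comment above (reordering intact pairs should not change a paired test, yet the reported p-value moved) can be sanity-checked outside the cluster machinery: the parametric paired statistic is invariant to the order of the pairs, so any drift must come from the Monte Carlo permutations, which only reproduce exactly when the seed is fixed. A minimal check, assuming only NumPy and SciPy:

import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
a = rng.normal(size=10)
b = rng.normal(size=10)

order = rng.permutation(10)  # same (a_i, b_i) pairs, new order
t_orig = stats.ttest_rel(a, b)
t_shuf = stats.ttest_rel(a[order], b[order])

# Identical up to floating-point roundoff, because each pair stays intact.
print(np.isclose(t_orig.statistic, t_shuf.statistic))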
-shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data = create_random_evokeds_id_condition_list() +shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data = ( + create_random_evokeds_id_condition_list() +) + def prepare_dataframe_for_cluster_function( evokeds: list = None, @@ -215,26 +222,31 @@ def prepare_dataframe_for_cluster_function( The prepared DataFrame for the cluster test function. """ # Initialize the DataFrame with evoked data - df = pd.DataFrame({ - "evoked": evokeds, - "condition": condition if condition is not None else np.nan, - "subject_index": subject_index if subject_index is not None else np.nan - }) + df = pd.DataFrame( + { + "evoked": evokeds, + "condition": condition if condition is not None else np.nan, + "subject_index": subject_index if subject_index is not None else np.nan, + } + ) return df + # run with original data -df = prepare_dataframe_for_cluster_function(evokeds=original_evoked_data, - condition=None, - subject_index=None) +df = prepare_dataframe_for_cluster_function( + evokeds=original_evoked_data, condition=None, subject_index=None +) -df = prepare_dataframe_for_cluster_function(evokeds=shuffled_evoked_data, - condition=None, - subject_index=None) +df = prepare_dataframe_for_cluster_function( + evokeds=shuffled_evoked_data, condition=None, subject_index=None +) -df = prepare_dataframe_for_cluster_function(evokeds=shuffled_evoked_data, - condition=shuffled_conditions, - subject_index=shuffled_participant_ids) +df = prepare_dataframe_for_cluster_function( + evokeds=shuffled_evoked_data, + condition=shuffled_conditions, + subject_index=shuffled_participant_ids, +) def cluster_test( @@ -269,8 +281,8 @@ def cluster_test( The permuted test statistics. """ # Check if conditions and subject_index are present and valid - conditions_present = pd.notna(df['condition']).all() - subject_index_present = pd.notna(df['subject_index']).all() + conditions_present = pd.notna(df["condition"]).all() + subject_index_present = pd.notna(df["subject_index"]).all() if contrast == 1: if conditions_present: @@ -472,4 +484,5 @@ def plot_cluster( return None -cluster_test(df) \ No newline at end of file + +cluster_test(df) From 0373195a6cfba2bed5299aa2d883e64c5134de82 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Wed, 19 Jun 2024 19:28:07 +0200 Subject: [PATCH 48/88] added formula support and implemented suggestions --- .../76_new_cluster_test_api.py | 51 ++++++++++++++++--- 1 file changed, 45 insertions(+), 6 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 08917f78f03..eef90a2612b 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -225,8 +225,8 @@ def prepare_dataframe_for_cluster_function( df = pd.DataFrame( { "evoked": evokeds, - "condition": condition if condition is not None else np.nan, - "subject_index": subject_index if subject_index is not None else np.nan, + "condition": condition if condition is not None else pd.NA, + "subject_index": subject_index if subject_index is not None else pd.NA, } ) @@ -251,10 +251,11 @@ def prepare_dataframe_for_cluster_function( def cluster_test( df: pd.DataFrame, - contrast: bool = True, + formula: str = None, # Wilkinson notation formula for design matrix + contrast: bool = True, # will be replaced by formulaic design matrix n_permutations: int = 10000, - seed: int = 1234, - contrast_weights: list = [1, -1], + seed: None | int | 
np.random.RandomState = None, + contrast_weights: list = [1, -1], # will be replaced by formulaic design matrix ): """ Run the cluster test using the new API. @@ -284,6 +285,22 @@ def cluster_test( conditions_present = pd.notna(df["condition"]).all() subject_index_present = pd.notna(df["subject_index"]).all() + # add a data column to the dataframe (numpy array) + df["data"] = [evoked.data for evoked in df.evoked] + + # convert wide format to long format + df_long = convert_wide_to_long(df) + + # check if formula is present + if formula is not None: + import formulaic + + # create design matrix based on formula + # Create the design matrix using formulaic + y, X = formulaic.model_matrix(formula, df_long) + + # what to do with the design matrix? + if contrast == 1: if conditions_present: # Extract unique conditions @@ -378,6 +395,29 @@ def cluster_test( return T_obs, clusters, cluster_p_values, H0 +# Convert wide format to long format +def convert_wide_to_long(df): + long_format_data = [] + for idx, row in df.iterrows(): + condition = row['condition'] + subject_index = row['subject_index'] + data_2d = row['data'] + + for channel in range(data_2d.shape[0]): + for timepoint in range(data_2d.shape[1]): + long_format_data.append({ + 'condition': condition, + 'subject_index': subject_index, + 'channel': channel, + 'timepoint': timepoint, + 'value': data_2d[channel, timepoint] + }) + + df_long = pd.DataFrame(long_format_data) + return df_long + +df_long = convert_wide_to_long(df) + def plot_cluster( contrast, target_only, non_target_only, T_obs, clusters, cluster_p_values @@ -482,7 +522,6 @@ def plot_cluster( plt.show() - return None cluster_test(df) From 8bc44f968614f217d398fdc1894d8af1a0787115 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 19 Jun 2024 17:28:23 +0000 Subject: [PATCH 49/88] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../76_new_cluster_test_api.py | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index eef90a2612b..7c0abc95fae 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -251,11 +251,11 @@ def prepare_dataframe_for_cluster_function( def cluster_test( df: pd.DataFrame, - formula: str = None, # Wilkinson notation formula for design matrix - contrast: bool = True, # will be replaced by formulaic design matrix + formula: str = None, # Wilkinson notation formula for design matrix + contrast: bool = True, # will be replaced by formulaic design matrix n_permutations: int = 10000, seed: None | int | np.random.RandomState = None, - contrast_weights: list = [1, -1], # will be replaced by formulaic design matrix + contrast_weights: list = [1, -1], # will be replaced by formulaic design matrix ): """ Run the cluster test using the new API. 
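For readers unfamiliar with formulaic, the model_matrix call introduced above takes a Wilkinson-style formula plus a long-format table and returns the outcome vector and design matrix as a pair, which is exactly how the patch unpacks it. A toy illustration (column names mirror convert_wide_to_long; the formula string is an assumption, since the patch has not fixed one yet):

import pandas as pd

import formulaic

# Long-format table like the one convert_wide_to_long() produces.
df_long = pd.DataFrame(
    {
        "value": [0.1, 0.2, 0.3, 0.4],
        "condition": [0, 1, 0, 1],
        "subject_index": ["p1", "p1", "p2", "p2"],
        "channel": [0, 0, 0, 0],
        "timepoint": [0, 1, 0, 1],
    }
)

# The left side of "~" becomes the outcome y, the right side becomes the
# design matrix X (here an intercept column plus the condition regressor).
y, X = formulaic.model_matrix("value ~ condition", df_long)
print(X)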
@@ -395,27 +395,31 @@ def cluster_test( return T_obs, clusters, cluster_p_values, H0 + # Convert wide format to long format def convert_wide_to_long(df): long_format_data = [] for idx, row in df.iterrows(): - condition = row['condition'] - subject_index = row['subject_index'] - data_2d = row['data'] - + condition = row["condition"] + subject_index = row["subject_index"] + data_2d = row["data"] + for channel in range(data_2d.shape[0]): for timepoint in range(data_2d.shape[1]): - long_format_data.append({ - 'condition': condition, - 'subject_index': subject_index, - 'channel': channel, - 'timepoint': timepoint, - 'value': data_2d[channel, timepoint] - }) - + long_format_data.append( + { + "condition": condition, + "subject_index": subject_index, + "channel": channel, + "timepoint": timepoint, + "value": data_2d[channel, timepoint], + } + ) + df_long = pd.DataFrame(long_format_data) return df_long + df_long = convert_wide_to_long(df) @@ -523,5 +527,4 @@ def plot_cluster( plt.show() - cluster_test(df) From 654a3504412571b4fc72f10be71a9bb686048c05 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sat, 22 Jun 2024 11:10:13 +0200 Subject: [PATCH 50/88] fixed linting errors --- .../76_new_cluster_test_api.py | 35 +++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 7c0abc95fae..2f1d55383d2 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -6,6 +6,7 @@ from mpl_toolkits.axes_grid1 import make_axes_locatable import mne +from mne.utils import _soft_import_ # TODO: implement formulaic design matrix for paired t-test # TODO: @erik: add dataset to mne-data @@ -15,9 +16,7 @@ def prep_sample_data(plot_evokeds: bool = False): - """ - Load the P3 dataset and extract the target, non-target and contrast evokeds. - """ + """Load the P3 dataset.""" # Define the range of participant IDs participant_ids = range(15, 20) # This will cover 015 to 019 @@ -25,7 +24,7 @@ def prep_sample_data(plot_evokeds: bool = False): # Loop over each participant ID and generate the corresponding filename for pid in participant_ids: - # Create the filename using an f-string, ensuring the participant ID is zero-padded to 3 digits + # Create the filename using an f-string, ID is zero-padded to 3 digits filename_p3 = f"sub-{pid:03d}_ses-P3_task-P3_ave.fif" # Print the filename (or perform your desired operations on it) @@ -67,7 +66,8 @@ def prep_sample_data(plot_evokeds: bool = False): def old_api_cluster(n_permutations: int = 10000, seed: int = 1234): """ - Run the cluster test using the old API to get a bechmark result for the new API. + Run the cluster test using the old API to get a benchmark result for the new API. + Currently implementing a paired t-test with contrast between participants. """ contrast, target_only, non_target_only = prep_sample_data() @@ -122,7 +122,8 @@ def old_api_cluster(n_permutations: int = 10000, seed: int = 1234): def create_random_evokeds_id_condition_list(): """ Create a list of shuffled participant IDs, conditions, and evoked data. - # Keep the participant IDs and conditions paired but shuffle the order of the evoked data. + + # Keep the participant IDs and conditions paired but shuffle the order of the data. 
""" import random @@ -158,7 +159,10 @@ def create_random_evokeds_id_condition_list(): def create_random_paired_evokeds_list(): """ - Create a list of shuffled evoked data where each pair of target and non-target evoked data is shuffled together. + Create shuffled paired evoked data. + + Create a list of shuffled evoked data where each pair of target + and non-target evoked data is shuffled together. """ import random @@ -255,10 +259,11 @@ def cluster_test( contrast: bool = True, # will be replaced by formulaic design matrix n_permutations: int = 10000, seed: None | int | np.random.RandomState = None, - contrast_weights: list = [1, -1], # will be replaced by formulaic design matrix + contrast_weights: list = (1, -1), # will be replaced by formulaic design matrix ): """ Run the cluster test using the new API. + # currently supports paired t-test with contrast or with list of conditions Parameters @@ -293,12 +298,14 @@ def cluster_test( # check if formula is present if formula is not None: - import formulaic + formulaic = _soft_import_("formulaic") # soft import # create design matrix based on formula # Create the design matrix using formulaic y, X = formulaic.model_matrix(formula, df_long) + # sign flip for paired t-test + # what to do with the design matrix? if contrast == 1: @@ -324,7 +331,7 @@ def cluster_test( if len(evokeds_a) != 1 or len(evokeds_b) != 1: raise ValueError( - f"Subject {sub_id}: Each subject must have exactly one evoked for each condition" + f"Subject {sub_id}: subject must have one evoked per cond" ) # Calculate contrast based on condition list @@ -398,6 +405,14 @@ def cluster_test( # Convert wide format to long format def convert_wide_to_long(df): + """ + Convert a DataFrame from wide to long. + + Parameters + ---------- + df : pd.DataFrame + DataFrame in wide format. 
+ """ long_format_data = [] for idx, row in df.iterrows(): condition = row["condition"] From d1ed8a104375b77d7bccbd115235490b9668f712 Mon Sep 17 00:00:00 2001 From: Eric Larson Date: Tue, 25 Jun 2024 12:25:21 -0400 Subject: [PATCH 51/88] ENH: Add dataset [skip azp] [skip actions] --- mne/datasets/config.py | 4 ++-- pyproject.toml | 3 +++ tutorials/stats-sensor-space/76_new_cluster_test_api.py | 7 +++---- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/mne/datasets/config.py b/mne/datasets/config.py index 2cd937dbdee..1fb4282c513 100644 --- a/mne/datasets/config.py +++ b/mne/datasets/config.py @@ -88,7 +88,7 @@ # here: ↓↓↓↓↓↓↓↓ RELEASES = dict( testing="0.154", - misc="0.27", + misc="0.30", phantom_kit="0.2", ucl_opm_auditory="0.2", ) @@ -129,7 +129,7 @@ ) MNE_DATASETS["misc"] = dict( archive_name=f"{MISC_VERSIONED}.tar.gz", # 'mne-misc-data', - hash="md5:e343d3a00cb49f8a2f719d14f4758afe", + hash="md5:201d35531d3c03701cf50e38bb73481f", url=( "https://codeload.github.com/mne-tools/mne-misc-data/tar.gz/" f'{RELEASES["misc"]}' diff --git a/pyproject.toml b/pyproject.toml index 5427bfe16dc..47e54f4a5a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -108,6 +108,7 @@ full-no-qt = [ "defusedxml", "neo", "antio", + "formulaic", ] full = ["mne[full-no-qt]", "PyQt6!=6.6.0", "PyQt6-Qt6!=6.6.0,!=6.7.0"] full-pyqt6 = ["mne[full]"] @@ -146,6 +147,7 @@ test_extra = [ "snirf", "neo", "mne-bids", + "formulaic", ] # Dependencies for building the documentation @@ -158,6 +160,7 @@ doc = [ "sphinxcontrib-towncrier", "memory_profiler", "neo", + "formulaic", "seaborn!=0.11.2", "sphinx_copybutton", "sphinx-design", diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 2f1d55383d2..8eb7637df53 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -6,13 +6,12 @@ from mpl_toolkits.axes_grid1 import make_axes_locatable import mne -from mne.utils import _soft_import_ +from mne.utils import _soft_import # TODO: implement formulaic design matrix for paired t-test -# TODO: @erik: add dataset to mne-data # import and load dataset -path_to_p3 = Path("C:/Users/Carina/mne_data/ERP_CORE_P3") +path_to_p3 = mne.datasets.misc.data_path() / "ERP_CORE" / "P3" def prep_sample_data(plot_evokeds: bool = False): @@ -298,7 +297,7 @@ def cluster_test( # check if formula is present if formula is not None: - formulaic = _soft_import_("formulaic") # soft import + formulaic = _soft_import("formulaic") # soft import # create design matrix based on formula # Create the design matrix using formulaic From c634a44ffee8c7de971008ba6f2d4fa088f9874e Mon Sep 17 00:00:00 2001 From: Eric Larson Date: Tue, 25 Jun 2024 12:26:57 -0400 Subject: [PATCH 52/88] FIX: One more [skip azp] [skip actions] --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index a0dbdf5ec49..45898f4fd5b 100644 --- a/environment.yml +++ b/environment.yml @@ -65,3 +65,4 @@ dependencies: - lazy_loader - defusedxml - python-neo + - formulaic From 0c2eb4f7736fcc085e93befbe7be9aaea97d9f60 Mon Sep 17 00:00:00 2001 From: Eric Larson Date: Tue, 25 Jun 2024 12:39:41 -0400 Subject: [PATCH 53/88] FIX: Title [skip azp] [skip actions] --- .../stats-sensor-space/76_new_cluster_test_api.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 
8eb7637df53..f9c4f61ad5f 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -1,3 +1,15 @@ +""" +.. _tut-new-cluster-test-api: + +==================== +New cluster test API +==================== + +This tutorial shows how to use the new API for cluster testing. +""" +# License: BSD-3-Clause +# Copyright the MNE-Python contributors. + from pathlib import Path import matplotlib.pyplot as plt From f46a79c1e94ad8eacc310243b542d220951dd068 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sun, 30 Jun 2024 20:11:28 +0200 Subject: [PATCH 54/88] first draft of formulaic paired t-test --- .../76_new_cluster_test_api.py | 342 ++++++++++++------ 1 file changed, 224 insertions(+), 118 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index f9c4f61ad5f..6a3a966bbcc 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -15,12 +15,13 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd +import scipy from mpl_toolkits.axes_grid1 import make_axes_locatable import mne from mne.utils import _soft_import -# TODO: implement formulaic design matrix for paired t-test +# TODO: test function and update docstrings # import and load dataset path_to_p3 = mne.datasets.misc.data_path() / "ERP_CORE" / "P3" @@ -248,15 +249,6 @@ def prepare_dataframe_for_cluster_function( return df -# run with original data -df = prepare_dataframe_for_cluster_function( - evokeds=original_evoked_data, condition=None, subject_index=None -) - -df = prepare_dataframe_for_cluster_function( - evokeds=shuffled_evoked_data, condition=None, subject_index=None -) - df = prepare_dataframe_for_cluster_function( evokeds=shuffled_evoked_data, condition=shuffled_conditions, @@ -267,24 +259,56 @@ def prepare_dataframe_for_cluster_function( def cluster_test( df: pd.DataFrame, formula: str = None, # Wilkinson notation formula for design matrix - contrast: bool = True, # will be replaced by formulaic design matrix n_permutations: int = 10000, seed: None | int | np.random.RandomState = None, - contrast_weights: list = (1, -1), # will be replaced by formulaic design matrix + tail: int = 0, # 0 for two-tailed, 1 for greater, -1 for less + n_jobs: int = 1, # how many cores to use + adjacency: tuple = None, + max_step: int = 1, # maximum distance between samples (time points) + exclude: list = None, # exclude no time points or channels + step_down_p: int = 0, # step down in jumps test + t_power: int = 1, # weigh each location by its stats score + out_type: str = "indices", + check_disjoint: bool = False, + buffer_size: int = None, # block size for chunking the data ): """ Run the cluster test using the new API. - # currently supports paired t-test with contrast or with list of conditions + # currently supports paired t-test Parameters ---------- dataframe : pd.DataFrame Dataframe with evoked data, conditions and subject IDs. + formula : str, optional + Wilkinson notation formula for design matrix. Default is None. n_permutations : int, optional Number of permutations. Default is 10000. + seed : None | int | np.random.RandomState, optional + Seed for the random number generator. Default is None. + tail : int, optional + 0 for two-tailed, 1 for greater, -1 for less. Default is 0. + n_jobs : int, optional + How many cores to use. Default is 1. + adjacency : None, optional + Adjacency matrix. 
Default is None. + max_step : int, optional + Maximum distance between samples (time points). Default is 1. + exclude : np.Array, optional + Exclude no time points or channels. Default is None. + step_down_p : int, optional + Step down in jumps test. Default is 0. + t_power : int, optional + Weigh each location by its stats score. Default is 1. + out_type : str, optional + Output type. Default is "indices". + check_disjoint : bool, optional + Check if clusters are disjoint. Default is False. + buffer_size : int, optional + Block size for chunking the data. Default is None. seed : int, optional - Random seed. Default is 1234. + Seed for the random number generator. Default is None. Returns ------- @@ -297,108 +321,78 @@ def cluster_test( H0 : array The permuted test statistics. """ - # Check if conditions and subject_index are present and valid - conditions_present = pd.notna(df["condition"]).all() - subject_index_present = pd.notna(df["subject_index"]).all() - + # for now this assumes a dataframe with a column for evoked data # add a data column to the dataframe (numpy array) df["data"] = [evoked.data for evoked in df.evoked] - # convert wide format to long format - df_long = convert_wide_to_long(df) + # extract number of channels and timepoints + # (eventually should also allow for frequency) + n_channels, n_timepoints = df["data"][0].shape + + # convert wide format to long format for formulaic + df_long = unpack_time_and_channels(df) + + # Pivot the DataFrame + pivot_df = df_long.pivot_table( + index=["subject_index", "channel", "timepoint"], + columns="condition", + values="value", + ).reset_index() + + # if not 2 unique conditions raise error + if len(pd.unique(df.condition)) != 2: + raise ValueError("Condition list needs to contain 2 unique values") + + # Compute the difference (assuming there are only 2 conditions) + pivot_df["y"] = pivot_df[0] - pivot_df[1] + + # Optional: Clean up the DataFrame + pivot_df = pivot_df[["subject_index", "channel", "timepoint", "y"]] # check if formula is present if formula is not None: - formulaic = _soft_import("formulaic") # soft import + formulaic = _soft_import( + "formulaic", purpose="set up Design Matrix" + ) # soft import (not a dependency for MNE) - # create design matrix based on formula + # for the paired t-test y is the difference between conditions + # X is the design matrix with a column with 1s and 0s for each participant # Create the design matrix using formulaic - y, X = formulaic.model_matrix(formula, df_long) - - # sign flip for paired t-test - - # what to do with the design matrix? 
- - if contrast == 1: - if conditions_present: - # Extract unique conditions - unique_conditions = np.unique(df.condition) - if len(unique_conditions) != 2: - raise ValueError("Condition list needs to contain 2 unique values") - # Initialize a list to hold the combined evoked data - evokeds_data = [] - if subject_index_present: - # Process each subject's evoked data - for sub_id in df.subject_index.unique(): - sub_df = df[df.subject_index == sub_id] - - # Split evokeds list based on condition list for this subject - evokeds_a = sub_df[sub_df.condition == unique_conditions[0]][ - "evoked" - ].tolist() - evokeds_b = sub_df[sub_df.condition == unique_conditions[1]][ - "evoked" - ].tolist() - - if len(evokeds_a) != 1 or len(evokeds_b) != 1: - raise ValueError( - f"Subject {sub_id}: subject must have one evoked per cond" - ) - - # Calculate contrast based on condition list - diff_evoked = mne.combine_evoked( - [evokeds_a[0], evokeds_b[0]], weights=contrast_weights - ) - evokeds_data.append(diff_evoked) - else: - # calculate length of evokeds list - n_evokeds = len(df.evoked) - # now split evokeds list in two lists - evokeds_a = df.evoked[: n_evokeds // 2] - evokeds_b = df.evoked[n_evokeds // 2 :] - # create contrast from evokeds_a and evokeds_b - diff_evoked = [ - mne.combine_evoked([evo_a, evo_b], weights=contrast_weights) - for evo_a, evo_b in zip(evokeds_a, evokeds_b) - ] - evokeds_data = diff_evoked + y, X = formulaic.model_matrix(formula, pivot_df) else: - evokeds_data = df.evoked.tolist() - - # extract number of channels - n_channels = evokeds_data[0].info["nchan"] - - # loop over rows and extract data from evokeds - data_array = np.array([evoked.data for evoked in evokeds_data]) + raise ValueError( + "Formula is required and needs to be a string in Wilkinson notation." 
+ ) - # find the dimension that is equal to n_channels - if data_array.shape[1] == n_channels: - # reshape to channels as last dimension - data = data_array.transpose(0, 2, 1) + # now prep design matrix outcome variable for input into MNE cluster function + # we initially had first channels, then timepoints, + # now we need first timepoints, then channels + y_for_cluster = y.values.reshape(-1, n_channels, n_timepoints).transpose(0, 2, 1) - adjacency, _ = mne.channels.find_ch_adjacency(evokeds_data[0].info, ch_type="eeg") + adjacency, _ = mne.channels.find_ch_adjacency(df["evoked"][0].info, ch_type="eeg") + # define stat function and threshold stat_fun, threshold = mne.stats.cluster_level._check_fun( - X=data, stat_fun=None, threshold=None, tail=0, kind="within" + X=y_for_cluster, stat_fun=None, threshold=None, tail=0, kind="within" ) - # Run the analysis + # Run the cluster-based permutation test T_obs, clusters, cluster_p_values, H0 = ( mne.stats.cluster_level._permutation_cluster_test( - [data], + [y_for_cluster], + n_permutations=10000, threshold=threshold, stat_fun=stat_fun, - n_jobs=-1, # takes all CPU cores - max_step=1, # maximum distance between samples (time points) - exclude=None, # exclude no time points or channels - step_down_p=0, # step down in jumps test - t_power=1, # weigh each location by its stats score - out_type="indices", - check_disjoint=False, - buffer_size=None, # block size for chunking the data - n_permutations=n_permutations, - tail=0, + tail=tail, + n_jobs=n_jobs, adjacency=adjacency, + max_step=max_step, # maximum distance between samples (time points) + exclude=exclude, # exclude no time points or channels + step_down_p=step_down_p, # step down in jumps test + t_power=t_power, # weigh each location by its stats score + out_type=out_type, + check_disjoint=check_disjoint, + buffer_size=buffer_size, # block size for chunking the data seed=seed, ) ) @@ -414,39 +408,44 @@ def cluster_test( return T_obs, clusters, cluster_p_values, H0 -# Convert wide format to long format -def convert_wide_to_long(df): +def unpack_time_and_channels(df): """ - Convert a DataFrame from wide to long. + Extract the time and channel data from the DataFrame. Parameters ---------- df : pd.DataFrame DataFrame in wide format. 
""" - long_format_data = [] - for idx, row in df.iterrows(): - condition = row["condition"] - subject_index = row["subject_index"] - data_2d = row["data"] - - for channel in range(data_2d.shape[0]): - for timepoint in range(data_2d.shape[1]): - long_format_data.append( - { - "condition": condition, - "subject_index": subject_index, - "channel": channel, - "timepoint": timepoint, - "value": data_2d[channel, timepoint], - } - ) + # Extracting all necessary data using list comprehensions for better performance + long_format_data = [ + { + "condition": row["condition"], + "subject_index": row["subject_index"], + "channel": channel, + "timepoint": timepoint, + "value": row["data"][channel, timepoint], + } + for idx, row in df.iterrows() + for channel in range(row["data"].shape[0]) + for timepoint in range(row["data"].shape[1]) + ] + # Creating the long format DataFrame df_long = pd.DataFrame(long_format_data) + return df_long -df_long = convert_wide_to_long(df) +# Example usage +# Sample wide format DataFrame +df_wide = pd.DataFrame( + { + "condition": ["A", "B"], + "subject_index": [1, 2], + "data": [np.array([[1, 2, 3], [4, 5, 6]]), np.array([[7, 8, 9], [10, 11, 12]])], + } +) def plot_cluster( @@ -553,4 +552,111 @@ def plot_cluster( plt.show() -cluster_test(df) +# translated the limo permutation ttest from matlab to python +def limo_ttest_permute(Data, n_perm=None): + """ + Pseudo one-sample t-test using sign-test with permutations. + + Parameters + ---------- + Data (numpy.ndarray): A matrix of data for the one-sample t-test. + Shape can be (n_channels, n_var, n_obs) or + (n_var, n_obs). + n_perm (int, optional): Number of permutations to perform. + If None, it defaults based on the number of observations. + + Returns + ------- + t_vals (numpy.ndarray): t-values under H0. + p_vals (numpy.ndarray): p-values under H0. + dfe (int): Degrees of freedom. + """ + # Check inputs and reshape if necessary + if Data.ndim == 3: + n_channels, n_var, n_obs = Data.shape + else: + n_channels = 1 + n_var, n_obs = Data.shape + Data = Data[np.newaxis, ...] + + # Warn if the number of observations is very small + if n_obs < 7: + n_psbl_prms = 2**n_obs + print( + f"Due to the very limited number of observations, " + f"the total number of possible permutations is small ({n_psbl_prms}). " + "Thus, only a limited number of p-values are possible " + "and the test might be overly conservative." + ) + + # Set up permutation test + if n_obs <= 12: + n_perm = 2**n_obs # total number of possible permutations + exact = True + print( + "Due to the limited number of observations, all possible permutations " + "of the data will be computed instead of random permutations." 
+        )
+    else:
+        exact = False
+        if n_perm is None:
+            n_perm = 1000
+
+    print(f"Executing permutation test with {n_perm} permutations...")
+
+    # Initialize variables
+    t_vals = np.full(
+        (n_channels, n_var, n_perm), np.nan
+    )  # Array to store t-values for each permutation
+    sqrt_nXnM1 = np.sqrt(
+        n_obs * (n_obs - 1)
+    )  # Precompute constant for t-value calculation
+    dfe = n_obs - 1  # Degrees of freedom
+
+    if exact:
+        # Use all possible permutations
+        for perm in range(n_perm):
+            # Set sign of each trial / participant's data
+            temp = np.array(
+                [int(x) for x in bin(perm)[2:].zfill(n_obs)]
+            )  # Convert perm index to binary array
+            sn = np.where(temp == 0, -1, 1)  # Map 0 to -1 and 1 to 1
+            sn_mtrx = np.tile(sn, (n_var, 1))  # Repeat sn for each variable
+
+            for c in range(n_channels):
+                data = Data[c, :, :]
+                d_perm = data * sn_mtrx  # Apply sign flip to data
+
+                # Compute t-score of permuted data
+                sm = np.sum(d_perm, axis=1)  # Sum of permuted data
+                mn = sm / n_obs  # Mean of permuted data
+                sm_sqrs = (
+                    np.sum(d_perm**2, axis=1) - (sm**2) / n_obs
+                )  # Sum of squares for standard error
+                stder = np.sqrt(sm_sqrs) / sqrt_nXnM1  # Standard error
+                t_vals[c, :, perm] = mn / stder  # Compute t-values
+
+    else:
+        # Use random permutations
+        for perm in range(n_perm):
+            # Randomly set sign of each trial / participant's data
+            sn = (np.random.rand(n_obs) > 0.5) * 2 - 1  # Generate random sign flips
+            sn_mtrx = np.tile(sn, (n_var, 1))  # Repeat sn for each variable
+
+            for c in range(n_channels):
+                data = Data[c, :, :]
+                d_perm = data * sn_mtrx  # Apply sign flip to data
+
+                # Compute t-score of permuted data
+                sm = np.sum(d_perm, axis=1)  # Sum of permuted data
+                mn = sm / n_obs  # Mean of permuted data
+                sm_sqrs = (
+                    np.sum(d_perm**2, axis=1) - (sm**2) / n_obs
+                )  # Sum of squares for standard error
+                stder = np.sqrt(sm_sqrs) / sqrt_nXnM1  # Standard error
+                t_vals[c, :, perm] = mn / stder  # Compute t-values
+
+    # Compute p-values from t-values (two-tailed, t distribution)
+    p_vals = 2 * scipy.stats.t.cdf(-np.abs(t_vals), dfe)
+
+    return t_vals, p_vals, dfe

From 5d1cbae78a354321aef1f7f2fa5ecf6881f1533c Mon Sep 17 00:00:00 2001
From: CarinaFo
Date: Sat, 6 Jul 2024 10:36:55 +0200
Subject: [PATCH 55/88] first draft without cluster plotting class implemented

---
 mne/stats/cluster_level.py                    | 293 +++++++
 .../76_new_cluster_test_api.py | 722 +++--------
 2 files changed, 392 insertions(+), 623 deletions(-)

diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py
index 50743c104ef..e50991254fe 100644
--- a/mne/stats/cluster_level.py
+++ b/mne/stats/cluster_level.py
@@ -4,12 +4,17 @@
 # License: BSD-3-Clause
 # Copyright the MNE-Python contributors.
 
+import matplotlib.pyplot as plt
 import numpy as np
+import pandas as pd
+from mpl_toolkits.axes_grid1 import make_axes_locatable
 from scipy import ndimage, sparse
 from scipy.sparse.csgraph import connected_components
 from scipy.stats import f as fstat
 from scipy.stats import t as tstat
 
 from .. 
import EvokedArray +from ..channels import find_ch_adjacency from ..fixes import has_numba, jit from ..parallel import parallel_func from ..source_estimate import MixedSourceEstimate, SourceEstimate, VolSourceEstimate @@ -18,6 +23,7 @@ ProgressBar, _check_option, _pl, + _soft_import, _validate_type, check_random_state, logger, @@ -25,6 +31,7 @@ verbose, warn, ) +from ..viz import plot_compare_evokeds from .parametric import f_oneway, ttest_1samp_no_p @@ -1723,3 +1730,289 @@ def summarize_clusters_stc( data_summary[:, 0] = np.sum(data_summary, axis=1) return klass(data_summary, vertices, tmin, tstep, subject) + + +def cluster_test( + df: pd.DataFrame, + formula: str = None, # Wilkinson notation formula for design matrix + n_permutations: int = 10000, + seed: None | int | np.random.RandomState = None, + tail: int = 0, # 0 for two-tailed, 1 for greater, -1 for less + n_jobs: int = 1, # how many cores to use + adjacency: tuple = None, + max_step: int = 1, # maximum distance between samples (time points) + exclude: list = None, # exclude no time points or channels + step_down_p: int = 0, # step down in jumps test + t_power: int = 1, # weigh each location by its stats score + out_type: str = "indices", + check_disjoint: bool = False, + buffer_size: int = None, # block size for chunking the data +): + """ + Run the cluster test using the new API. + + # currently supports paired t-test + + Parameters + ---------- + dataframe : pd.DataFrame + Dataframe with evoked data, conditions and subject IDs. + formula : str, optional + Wilkinson notation formula for design matrix. Default is None. + n_permutations : int, optional + Number of permutations. Default is 10000. + seed : None | int | np.random.RandomState, optional + Seed for the random number generator. Default is None. + tail : int, optional + 0 for two-tailed, 1 for greater, -1 for less. Default is 0. + n_jobs : int, optional + How many cores to use. Default is 1. + adjacency : None, optional + Adjacency matrix. Default is None. + max_step : int, optional + Maximum distance between samples (time points). Default is 1. + exclude : np.Array, optional + Exclude no time points or channels. Default is None. + step_down_p : int, optional + Step down in jumps test. Default is 0. + t_power : int, optional + Weigh each location by its stats score. Default is 1. + out_type : str, optional + Output type. Default is "indices". + check_disjoint : bool, optional + Check if clusters are disjoint. Default is False. + buffer_size : int, optional + Block size for chunking the data. Default is None. + seed : int, optional + Seed for the random number generator. Default is None. + + Returns + ------- + T_obs : array + The observed test statistic. + clusters : list + List of clusters. + cluster_p_values : array + Array of cluster p-values. + H0 : array + The permuted test statistics. 
+ """ + # for now this assumes a dataframe with a column for evoked data or epochs + # add a data column to the dataframe (numpy array) + df["data"] = [evoked.data for evoked in df.evoked] + + # extract number of channels and timepoints + # (eventually should also allow for frequency) + n_channels, n_timepoints = df["data"][0].shape + + # convert wide format to long format for formulaic + df_long = unpack_time_and_channels(df) + + # Pivot the DataFrame + pivot_df = df_long.pivot_table( + index=["subject_index", "channel", "timepoint"], + columns="condition", + values="value", + ).reset_index() + + # if not 2 unique conditions raise error + if len(pd.unique(df.condition)) != 2: + raise ValueError("Condition list needs to contain 2 unique values") + + # Get the unique conditions + conditions = np.unique(df.condition) + + # Compute the difference (assuming there are only 2 conditions) + pivot_df["evoked"] = pivot_df[conditions[0]] - pivot_df[conditions[1]] + + # Optional: Clean up the DataFrame + pivot_df = pivot_df[["subject_index", "channel", "timepoint", "evoked"]] + + # check if formula is present + if formula is not None: + formulaic = _soft_import( + "formulaic", purpose="set up Design Matrix" + ) # soft import (not a dependency for MNE) + + # for the paired t-test y is the difference between conditions + # X is the design matrix with a column with 1s and 0s for each participant + # Create the design matrix using formulaic + y, X = formulaic.model_matrix(formula, pivot_df) + else: + raise ValueError( + "Formula is required and needs to be a string in Wilkinson notation." + ) + + # now prep design matrix outcome variable for input into MNE cluster function + # we initially had first channels, then timepoints, + # now we need first timepoints, then channels + y_for_cluster = y.values.reshape(-1, n_channels, n_timepoints).transpose(0, 2, 1) + + adjacency, _ = find_ch_adjacency(df["evoked"][0].info, ch_type="eeg") + + # define stat function and threshold + stat_fun, threshold = _check_fun( + X=y_for_cluster, stat_fun=None, threshold=None, tail=0, kind="within" + ) + + # Run the cluster-based permutation test + T_obs, clusters, cluster_p_values, H0 = _permutation_cluster_test( + [y_for_cluster], + n_permutations=10000, + threshold=threshold, + stat_fun=stat_fun, + tail=tail, + n_jobs=n_jobs, + adjacency=adjacency, + max_step=max_step, # maximum distance between samples (time points) + exclude=exclude, # exclude no time points or channels + step_down_p=step_down_p, # step down in jumps test + t_power=t_power, # weigh each location by its stats score + out_type=out_type, + check_disjoint=check_disjoint, + buffer_size=buffer_size, # block size for chunking the data + seed=seed, + ) + + print(min(cluster_p_values)) + + return T_obs, clusters, cluster_p_values, H0 + + +def unpack_time_and_channels(df): + """ + Extract the time and channel data from the DataFrame. + + Parameters + ---------- + df : pd.DataFrame + DataFrame in wide format. 
+ """ + # Extracting all necessary data using list comprehensions for better performance + long_format_data = [ + { + "condition": row["condition"], + "subject_index": row["subject_index"], + "channel": channel, + "timepoint": timepoint, + "value": row["data"][channel, timepoint], + } + for idx, row in df.iterrows() + for channel in range(row["data"].shape[0]) + for timepoint in range(row["data"].shape[1]) + ] + + # Creating the long format DataFrame + df_long = pd.DataFrame(long_format_data) + + return df_long + + +def plot_cluster( + contrast, target_only, non_target_only, T_obs, clusters, cluster_p_values +): + """ + Plot the cluster with the lowest p-value. + + Parameters + ---------- + contrast : list + List of contrast evoked objects. + target_only : list + List of target evoked objects. + non_target_only : list + List of non-target evoked objects. + T_obs : array + The observed test statistic. + clusters : list + List of clusters. + cluster_p_values : array + Array of cluster p-values. + + Returns + ------- + None + + """ + # configure variables for visualization + colors = {"target": "crimson", "non-target": "steelblue"} + + # organize data for plotting + evokeds = {"target": target_only, "non-target": non_target_only} + + lowest_p_cluster = np.argmin(cluster_p_values) + + # plot the cluster with the lowest p-value + time_inds, space_inds = np.squeeze(clusters[lowest_p_cluster]) + ch_inds = np.unique(space_inds) + time_inds = np.unique(time_inds) + + # get topography for F stat + t_map = T_obs[time_inds, ...].mean(axis=0) + + # get signals at the sensors contributing to the cluster + sig_times = contrast[0].times[time_inds] + + # create spatial mask + mask = np.zeros((t_map.shape[0], 1), dtype=bool) + mask[ch_inds, :] = True + + # initialize figure + fig, ax_topo = plt.subplots(1, 1, figsize=(10, 3), layout="constrained") + + # plot average test statistic and mark significant sensors + t_evoked = EvokedArray(t_map[:, np.newaxis], contrast[0].info, tmin=0) + t_evoked.plot_topomap( + times=0, + mask=mask, + axes=ax_topo, + cmap="Reds", + vlim=(np.min, np.max), + show=False, + colorbar=False, + mask_params=dict(markersize=10), + ) + image = ax_topo.images[0] + + # remove the title that would otherwise say "0.000 s" + ax_topo.set_title("") + + # soft import? 
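(An editorial aside on this "soft import?" question: the commented-out attempt just below passes a submodule attribute path to MNE's _soft_import helper, which — as far as I can tell — is meant for optional top-level packages, roughly

    formulaic = _soft_import("formulaic", purpose="set up design matrix")

and since mpl_toolkits.axes_grid1 ships with matplotlib itself, which MNE already depends on, the plain module-level import used here should be fine.)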
+ # make_axes_locatable = _soft_import( + # "mpl_toolkits.axes_grid1.make_axes_locatable", + # purpose="plot cluster results" + # ) # soft import (not a dependency for MNE) + + # create additional axes (for ERF and colorbar) + divider = make_axes_locatable(ax_topo) + + # add axes for colorbar + ax_colorbar = divider.append_axes("right", size="5%", pad=0.05) + plt.colorbar(image, cax=ax_colorbar) + ax_topo.set_xlabel( + "Averaged t-map ({:0.3f} - {:0.3f} s)".format(*sig_times[[0, -1]]) + ) + + # add new axis for time courses and plot time courses + ax_signals = divider.append_axes("right", size="300%", pad=1.2) + title = f"Cluster #1, {len(ch_inds)} sensor" + if len(ch_inds) > 1: + title += "s (mean)" + plot_compare_evokeds( + evokeds, + title=title, + picks=ch_inds, + axes=ax_signals, + colors=colors, + show=False, + split_legend=True, + truncate_yaxis="auto", + ) + + # plot temporal cluster extent + ymin, ymax = ax_signals.get_ylim() + ax_signals.fill_betweenx( + (ymin, ymax), sig_times[0], sig_times[-1], color="green", alpha=0.3 + ) + + plt.show() diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 6a3a966bbcc..ec8bd8275a1 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -1,662 +1,138 @@ """ .. _tut-new-cluster-test-api: -==================== -New cluster test API -==================== +=============================================================== +New cluster test API that allows for Wilkinson style formulas +=============================================================== This tutorial shows how to use the new API for cluster testing. +This script shows how to estimate significant clusters in +evoked contrast data of multiple subjects. +It uses a non-parametric statistical procedure based on permutations and +cluster level statistics. + +The procedure consists of: + + - loading evoked data from multiple subjects + - construct a dataframe that contains the difference between conditions + - run the new cluster test function + +Here, the unit of observation are evokeds from multiple subjects (2nd level analysis). + +For more information on cluster-based permutation testing in MNE-Python, +see also: :ref:`tut-cluster-one-samp-tfr`. """ +# Authors: Carina Forster +# # License: BSD-3-Clause # Copyright the MNE-Python contributors. 
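(Editorial aside, before the tutorial code: the core idea of the Wilkinson formula here is that formulaic turns the formula string into an outcome vector and a design matrix. A minimal sketch on made-up toy data, assuming the optional formulaic package is installed — not part of the tutorial itself:

    import pandas as pd
    from formulaic import model_matrix

    # toy per-subject condition differences, one row per subject
    toy = pd.DataFrame(
        {
            "evoked": [0.2, -0.1, 0.4, 0.0],
            "subject_index": [1, 2, 3, 4],
        }
    )

    # lhs (y) is the outcome, rhs (X) is an intercept plus subject dummy columns
    y, X = model_matrix("evoked ~ 1 + C(subject_index)", toy)
    print(X)

This mirrors the y, X = formulaic.model_matrix(formula, pivot_df) call used inside cluster_test.)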
+# %% + from pathlib import Path -import matplotlib.pyplot as plt -import numpy as np import pandas as pd -import scipy -from mpl_toolkits.axes_grid1 import make_axes_locatable import mne -from mne.utils import _soft_import - -# TODO: test function and update docstrings -# import and load dataset +# Set parameters +# -------------- +# Define the path to the P3 dataset path_to_p3 = mne.datasets.misc.data_path() / "ERP_CORE" / "P3" +# Define the range of participant IDs +participant_ids = range(15, 20) # This will cover 015 to 019 -def prep_sample_data(plot_evokeds: bool = False): - """Load the P3 dataset.""" - # Define the range of participant IDs - participant_ids = range(15, 20) # This will cover 015 to 019 - - evokeds_allsubs = [] - - # Loop over each participant ID and generate the corresponding filename - for pid in participant_ids: - # Create the filename using an f-string, ID is zero-padded to 3 digits - filename_p3 = f"sub-{pid:03d}_ses-P3_task-P3_ave.fif" - - # Print the filename (or perform your desired operations on it) - print(filename_p3) - - p3_file_path = Path(path_to_p3) / filename_p3 - - evokeds = mne.read_evokeds(p3_file_path) - - # add to list - evokeds_allsubs.append(evokeds) - - target_only = [evoked[0] for evoked in evokeds_allsubs] - non_target_only = [evoked[1] for evoked in evokeds_allsubs] - contrast = [evoked[2] for evoked in evokeds_allsubs] - - if plot_evokeds: - # plot the grand average - mne.grand_average(target_only).plot() - mne.grand_average(non_target_only).plot() - mne.grand_average(contrast).plot() - - # create contrast from evokeds target and non-target - diff_evoked = [ - mne.combine_evoked([evokeds_a, evokeds_b], weights=[1, -1]) - for evokeds_a, evokeds_b in zip(target_only, non_target_only) - ] - - if plot_evokeds: - mne.grand_average(diff_evoked).plot() - - # crop the evokeds in the post stimulus window - contrast = [evokeds.crop(tmin=-0.1, tmax=0.6) for evokeds in contrast] - target_only = [evokeds.crop(tmin=-0.1, tmax=0.6) for evokeds in target_only] - non_target_only = [evokeds.crop(tmin=-0.1, tmax=0.6) for evokeds in non_target_only] - - return contrast, target_only, non_target_only - - -def old_api_cluster(n_permutations: int = 10000, seed: int = 1234): - """ - Run the cluster test using the old API to get a benchmark result for the new API. - - Currently implementing a paired t-test with contrast between participants. 
- """ - contrast, target_only, non_target_only = prep_sample_data() - - # extract the data for each evoked and store in numpy array - data = np.array([evoked.data for evoked in contrast]) - - # shape should be (n_subjects, n_channels, n_times) - data.shape - - # reshape to channels as last dimension - data = data.transpose(0, 2, 1) - - data.shape +# store the evoked data of all subjects +evokeds_allsubs = [] - adjacency, _ = mne.channels.find_ch_adjacency(contrast[0].info, ch_type="eeg") +# Loop over each participant ID and generate the corresponding filename +for pid in participant_ids: + # Create the filename using an f-string, ID is zero-padded to 3 digits + filename_p3 = f"sub-{pid:03d}_ses-P3_task-P3_ave.fif" - stat_fun, threshold = mne.stats.cluster_level._check_fun( - X=data, stat_fun=None, threshold=None, tail=0, kind="within" - ) + # Create the full path to the file + p3_file_path = Path(path_to_p3) / filename_p3 - # Run the analysis - T_obs, clusters, cluster_p_values, H0 = ( - mne.stats.cluster_level._permutation_cluster_test( - [data], - threshold=threshold, - stat_fun=stat_fun, - n_jobs=-1, # takes all CPU cores - max_step=1, # maximum distance between samples (time points) - exclude=None, # exclude no time points or channels - step_down_p=0, # step down in jumps test - t_power=1, # weigh each location by its stats score - out_type="indices", - check_disjoint=False, - buffer_size=None, # block size for chunking the data - n_permutations=n_permutations, - tail=0, - adjacency=adjacency, - seed=seed, - ) - ) + # load the evoked data + evokeds = mne.read_evokeds(p3_file_path) - print(min(cluster_p_values)) + # add subjects evoked data to list + evokeds_allsubs.append(evokeds) - plot_cluster( - contrast, target_only, non_target_only, T_obs, clusters, cluster_p_values - ) +# the P3b dataset is part of the freely available ERP CORE dataset +# participants were presented with a visual oddball task +# and the P3b component was analyzed +# the conditions of interest are the target (rare visual stimuli) +# and non-target stimuli (frequency visual stimuli) - return T_obs, clusters, cluster_p_values, H0 +# let's extract the target and non-target evokeds +target_only = [evoked[0] for evoked in evokeds_allsubs] +non_target_only = [evoked[1] for evoked in evokeds_allsubs] +# let's first have a look at the data +# create contrast from target and non-target evokeds +diff_evoked = [ + mne.combine_evoked([evokeds_a, evokeds_b], weights=[1, -1]) + for evokeds_a, evokeds_b in zip(target_only, non_target_only) +] -def create_random_evokeds_id_condition_list(): - """ - Create a list of shuffled participant IDs, conditions, and evoked data. +# plot the grand average of the difference signal +mne.grand_average(diff_evoked).plot() +# plot the topography of the difference signal +mne.grand_average(diff_evoked).plot_topomap() - # Keep the participant IDs and conditions paired but shuffle the order of the data. 
- """ - import random +# we can see that the strongest difference is around 400 ms in +# visual channels (occipital region) - _, evoked_data_a, evoked_data_b = prep_sample_data() +# Next we prepare a dataframe for the cluster test function +# the dataframe should contain the contrast evoked data and the subject index +# each row in the dataframe should represent one observation (evoked data) - # Example participant IDs - participant_ids = ["p1", "p2", "p3", "p4", "p5"] * 2 +# save the evoked data for both conditions in one list +evokeds_conditions = target_only + non_target_only - # Combine the evoked data into a single list - all_evoked_data = evoked_data_a + evoked_data_b +# set up a list that defines the condition for each evoked data +# this will be used to create the conditions column in the dataframe +conditions = ["target"] * len(target_only) + ["non-target"] * len(non_target_only) - # Create a corresponding list of conditions - conditions = [1] * len(evoked_data_a) + [0] * len(evoked_data_b) +# finally add a column that defines the subject index +# this will be used to create the subject_index column in the dataframe +# we multiply the participant_ids by 2 to account for the two conditions +subject_index = list(participant_ids) * 2 - # Combine the participant IDs, conditions, and evoked data into a list of tuples - combined_list = list(zip(participant_ids, conditions, all_evoked_data)) - - # Shuffle the combined list - random.shuffle(combined_list) - - # Separate the shuffled list back into participant IDs, conditions, and evoked data - shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data = zip( - *combined_list - ) - - # Convert the tuples back to lists - shuffled_participant_ids = list(shuffled_participant_ids) - shuffled_conditions = list(shuffled_conditions) - shuffled_evoked_data = list(shuffled_evoked_data) - - return shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data - - -def create_random_paired_evokeds_list(): - """ - Create shuffled paired evoked data. - - Create a list of shuffled evoked data where each pair of target - and non-target evoked data is shuffled together. - """ - import random - - _, evoked_data_a, evoked_data_b = prep_sample_data() - - # Ensure evoked_data_a and evoked_data_b are of the same length - assert len(evoked_data_a) == len( - evoked_data_b - ), "evoked_data_a and evoked_data_b must have the same length" - - # Create a list of participant indices - participant_indices = list(range(len(evoked_data_a))) - - # Shuffle the list of participant indices - random.shuffle(participant_indices) - - # Reorder evoked data according to the shuffled participant indices - shuffled_evoked_data_a = [evoked_data_a[i] for i in participant_indices] - shuffled_evoked_data_b = [evoked_data_b[i] for i in participant_indices] - - # Combine the shuffled evoked data into a single list - shuffled_evoked_data = shuffled_evoked_data_a + shuffled_evoked_data_b - - # Combine the original evoked data into a single list - original_evoked_data = evoked_data_a + evoked_data_b - - return original_evoked_data, shuffled_evoked_data - - -# shuffle order of pairs -original_evoked_data, shuffled_evoked_data = create_random_paired_evokeds_list() -# shouldn't change the results (p-value is different though?) 
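(A brief check on the "(p-value is different though?)" question in the removed helper above: reordering whole pairs cannot change a paired statistic, so any difference most likely comes from the permutation schedule — with a random, non-exhaustive set of sign flips, reordering subjects changes which flips are drawn for a given seed. A toy verification with SciPy, independent of the MNE code:

    import numpy as np
    from scipy.stats import ttest_rel

    rng = np.random.default_rng(0)
    a = rng.normal(size=10)
    b = a + rng.normal(loc=0.3, size=10)

    # shuffle the pair order, keeping each (a_i, b_i) pair intact
    order = rng.permutation(10)

    t1, p1 = ttest_rel(a, b)
    t2, p2 = ttest_rel(a[order], b[order])
    assert np.isclose(t1, t2) and np.isclose(p1, p2)  # identical results
)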
- -shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data = ( - create_random_evokeds_id_condition_list() +# create the dataframe +df = pd.DataFrame( + { + "evoked": evokeds_conditions, + "condition": conditions, + "subject_index": subject_index, + } ) +# now we can run the cluster test function +# we will use the new API that allows for Wilkinson style formulas +# the formula should be a string in Wilkinson notation -def prepare_dataframe_for_cluster_function( - evokeds: list = None, - condition: list = None, - subject_index: list = None, -): - """ - Prepare a dataframe for the cluster test function. - - Parameters - ---------- - contrast : bool, optional - If True, a contrast is calculated. Default is False. - evokeds : list, optional - List of evoked objects. Default is None. - condition : list, optional - List of conditions for each evoked object. Default is None. - subject_index : list, optional - List of subject IDs. Default is None. - - Returns - ------- - df : DataFrame - The prepared DataFrame for the cluster test function. - """ - # Initialize the DataFrame with evoked data - df = pd.DataFrame( - { - "evoked": evokeds, - "condition": condition if condition is not None else pd.NA, - "subject_index": subject_index if subject_index is not None else pd.NA, - } - ) - - return df - - -df = prepare_dataframe_for_cluster_function( - evokeds=shuffled_evoked_data, - condition=shuffled_conditions, - subject_index=shuffled_participant_ids, -) +# we want to test whether there is a significant difference between +# target and non-target stimuli in the post-stimulus window +# we will use a cluster-based permutation paired t-test for this +# let's first define the formula based on Wilkinson notation +formula = "evoked ~ 1 + C(subject_index)" -def cluster_test( - df: pd.DataFrame, - formula: str = None, # Wilkinson notation formula for design matrix - n_permutations: int = 10000, - seed: None | int | np.random.RandomState = None, - tail: int = 0, # 0 for two-tailed, 1 for greater, -1 for less - n_jobs: int = 1, # how many cores to use - adjacency: tuple = None, - max_step: int = 1, # maximum distance between samples (time points) - exclude: list = None, # exclude no time points or channels - step_down_p: int = 0, # step down in jumps test - t_power: int = 1, # weigh each location by its stats score - out_type: str = "indices", - check_disjoint: bool = False, - buffer_size: int = None, # block size for chunking the data -): - """ - Run the cluster test using the new API. - - # currently supports paired t-test - - Parameters - ---------- - dataframe : pd.DataFrame - Dataframe with evoked data, conditions and subject IDs. - formula : str, optional - Wilkinson notation formula for design matrix. Default is None. - n_permutations : int, optional - Number of permutations. Default is 10000. - seed : None | int | np.random.RandomState, optional - Seed for the random number generator. Default is None. - tail : int, optional - 0 for two-tailed, 1 for greater, -1 for less. Default is 0. - n_jobs : int, optional - How many cores to use. Default is 1. - adjacency : None, optional - Adjacency matrix. Default is None. - max_step : int, optional - Maximum distance between samples (time points). Default is 1. - exclude : np.Array, optional - Exclude no time points or channels. Default is None. - step_down_p : int, optional - Step down in jumps test. Default is 0. - t_power : int, optional - Weigh each location by its stats score. Default is 1. - out_type : str, optional - Output type. 
Default is "indices". - check_disjoint : bool, optional - Check if clusters are disjoint. Default is False. - buffer_size : int, optional - Block size for chunking the data. Default is None. - seed : int, optional - Seed for the random number generator. Default is None. - - Returns - ------- - T_obs : array - The observed test statistic. - clusters : list - List of clusters. - cluster_p_values : array - Array of cluster p-values. - H0 : array - The permuted test statistics. - """ - # for now this assumes a dataframe with a column for evoked data - # add a data column to the dataframe (numpy array) - df["data"] = [evoked.data for evoked in df.evoked] - - # extract number of channels and timepoints - # (eventually should also allow for frequency) - n_channels, n_timepoints = df["data"][0].shape - - # convert wide format to long format for formulaic - df_long = unpack_time_and_channels(df) - - # Pivot the DataFrame - pivot_df = df_long.pivot_table( - index=["subject_index", "channel", "timepoint"], - columns="condition", - values="value", - ).reset_index() - - # if not 2 unique conditions raise error - if len(pd.unique(df.condition)) != 2: - raise ValueError("Condition list needs to contain 2 unique values") - - # Compute the difference (assuming there are only 2 conditions) - pivot_df["y"] = pivot_df[0] - pivot_df[1] - - # Optional: Clean up the DataFrame - pivot_df = pivot_df[["subject_index", "channel", "timepoint", "y"]] - - # check if formula is present - if formula is not None: - formulaic = _soft_import( - "formulaic", purpose="set up Design Matrix" - ) # soft import (not a dependency for MNE) - - # for the paired t-test y is the difference between conditions - # X is the design matrix with a column with 1s and 0s for each participant - # Create the design matrix using formulaic - y, X = formulaic.model_matrix(formula, pivot_df) - else: - raise ValueError( - "Formula is required and needs to be a string in Wilkinson notation." 
- ) - - # now prep design matrix outcome variable for input into MNE cluster function - # we initially had first channels, then timepoints, - # now we need first timepoints, then channels - y_for_cluster = y.values.reshape(-1, n_channels, n_timepoints).transpose(0, 2, 1) - - adjacency, _ = mne.channels.find_ch_adjacency(df["evoked"][0].info, ch_type="eeg") - - # define stat function and threshold - stat_fun, threshold = mne.stats.cluster_level._check_fun( - X=y_for_cluster, stat_fun=None, threshold=None, tail=0, kind="within" - ) - - # Run the cluster-based permutation test - T_obs, clusters, cluster_p_values, H0 = ( - mne.stats.cluster_level._permutation_cluster_test( - [y_for_cluster], - n_permutations=10000, - threshold=threshold, - stat_fun=stat_fun, - tail=tail, - n_jobs=n_jobs, - adjacency=adjacency, - max_step=max_step, # maximum distance between samples (time points) - exclude=exclude, # exclude no time points or channels - step_down_p=step_down_p, # step down in jumps test - t_power=t_power, # weigh each location by its stats score - out_type=out_type, - check_disjoint=check_disjoint, - buffer_size=buffer_size, # block size for chunking the data - seed=seed, - ) - ) - - print(min(cluster_p_values)) - - # need to adjust plotting function for contrast only data - contrast, evokeds_a, evokeds_b = prep_sample_data() - - # plot cluster - plot_cluster(contrast, evokeds_a, evokeds_b, T_obs, clusters, cluster_p_values) - - return T_obs, clusters, cluster_p_values, H0 - - -def unpack_time_and_channels(df): - """ - Extract the time and channel data from the DataFrame. - - Parameters - ---------- - df : pd.DataFrame - DataFrame in wide format. - """ - # Extracting all necessary data using list comprehensions for better performance - long_format_data = [ - { - "condition": row["condition"], - "subject_index": row["subject_index"], - "channel": channel, - "timepoint": timepoint, - "value": row["data"][channel, timepoint], - } - for idx, row in df.iterrows() - for channel in range(row["data"].shape[0]) - for timepoint in range(row["data"].shape[1]) - ] - - # Creating the long format DataFrame - df_long = pd.DataFrame(long_format_data) - - return df_long - - -# Example usage -# Sample wide format DataFrame -df_wide = pd.DataFrame( - { - "condition": ["A", "B"], - "subject_index": [1, 2], - "data": [np.array([[1, 2, 3], [4, 5, 6]]), np.array([[7, 8, 9], [10, 11, 12]])], - } +# run the cluster test +T_obs, clusters, cluster_p_values, H0 = mne.stats.cluster_level.cluster_test( + df=df, formula=formula ) - -def plot_cluster( - contrast, target_only, non_target_only, T_obs, clusters, cluster_p_values -): - """ - Plot the cluster with the lowest p-value. - - Parameters - ---------- - contrast : list - List of contrast evoked objects. - target_only : list - List of target evoked objects. - non_target_only : list - List of non-target evoked objects. - T_obs : array - The observed test statistic. - clusters : list - List of clusters. - cluster_p_values : array - Array of cluster p-values. 
- - Returns - ------- - None - - """ - # configure variables for visualization - colors = {"target": "crimson", "non-target": "steelblue"} - - # organize data for plotting - evokeds = {"target": target_only, "non-target": non_target_only} - - lowest_p_cluster = np.argmin(cluster_p_values) - - # plot the cluster with the lowest p-value - time_inds, space_inds = np.squeeze(clusters[lowest_p_cluster]) - ch_inds = np.unique(space_inds) - time_inds = np.unique(time_inds) - - # get topography for F stat - t_map = T_obs[time_inds, ...].mean(axis=0) - - # get signals at the sensors contributing to the cluster - sig_times = contrast[0].times[time_inds] - - # create spatial mask - mask = np.zeros((t_map.shape[0], 1), dtype=bool) - mask[ch_inds, :] = True - - # initialize figure - fig, ax_topo = plt.subplots(1, 1, figsize=(10, 3), layout="constrained") - - # plot average test statistic and mark significant sensors - t_evoked = mne.EvokedArray(t_map[:, np.newaxis], contrast[0].info, tmin=0) - t_evoked.plot_topomap( - times=0, - mask=mask, - axes=ax_topo, - cmap="Reds", - vlim=(np.min, np.max), - show=False, - colorbar=False, - mask_params=dict(markersize=10), - ) - image = ax_topo.images[0] - - # remove the title that would otherwise say "0.000 s" - ax_topo.set_title("") - - # create additional axes (for ERF and colorbar) - divider = make_axes_locatable(ax_topo) - - # add axes for colorbar - ax_colorbar = divider.append_axes("right", size="5%", pad=0.05) - plt.colorbar(image, cax=ax_colorbar) - ax_topo.set_xlabel( - "Averaged t-map ({:0.3f} - {:0.3f} s)".format(*sig_times[[0, -1]]) - ) - - # add new axis for time courses and plot time courses - ax_signals = divider.append_axes("right", size="300%", pad=1.2) - title = f"Cluster #1, {len(ch_inds)} sensor" - if len(ch_inds) > 1: - title += "s (mean)" - mne.viz.plot_compare_evokeds( - evokeds, - title=title, - picks=ch_inds, - axes=ax_signals, - colors=colors, - show=False, - split_legend=True, - truncate_yaxis="auto", - ) - - # plot temporal cluster extent - ymin, ymax = ax_signals.get_ylim() - ax_signals.fill_betweenx( - (ymin, ymax), sig_times[0], sig_times[-1], color="green", alpha=0.3 - ) - - plt.show() - - -# translated the limo permutation ttest from matlab to python -def limo_ttest_permute(Data, n_perm=None): - """ - Pseudo one-sample t-test using sign-test with permutations. - - Parameters - ---------- - Data (numpy.ndarray): A matrix of data for the one-sample t-test. - Shape can be (n_channels, n_var, n_obs) or - (n_var, n_obs). - n_perm (int, optional): Number of permutations to perform. - If None, it defaults based on the number of observations. - - Returns - ------- - t_vals (numpy.ndarray): t-values under H0. - p_vals (numpy.ndarray): p-values under H0. - dfe (int): Degrees of freedom. - """ - # Check inputs and reshape if necessary - if Data.ndim == 3: - n_channels, n_var, n_obs = Data.shape - else: - n_channels = 1 - n_var, n_obs = Data.shape - Data = Data[np.newaxis, ...] - - # Warn if the number of observations is very small - if n_obs < 7: - n_psbl_prms = 2**n_obs - print( - f"Due to the very limited number of observations, " - f"the total number of possible permutations is small ({n_psbl_prms}). " - "Thus, only a limited number of p-values are possible " - "and the test might be overly conservative." 
- ) - - # Set up permutation test - if n_obs <= 12: - n_perm = 2**n_obs # total number of possible permutations - exact = True - print( - "Due to the limited number of observations, all possible permutations " - "of the data will be computed instead of random permutations." - ) - else: - exact = False - if n_perm is None: - n_perm = 1000 - - print(f"Executing permutation test with {n_perm} permutations...") - - # Initialize variables - t_vals = np.full( - (n_channels, n_var, n_perm), np.nan - ) # Array to store t-values for each permutation - sqrt_nXnM1 = np.sqrt( - n_obs * (n_obs - 1) - ) # Precompute constant for t-value calculation - dfe = n_obs - 1 # Degrees of freedom - - if exact: - # Use all possible permutations - for perm in range(n_perm): - # Set sign of each trial / participant's data - temp = np.array( - [int(x) for x in bin(perm)[2:].zfill(n_obs)] - ) # Convert perm index to binary array - sn = np.where(temp == 0, -1, 1) # Map 0 to -1 and 1 to 1 - sn_mtrx = np.tile(sn, (n_var, 1)).T # Repeat sn for each variable - - for c in range(n_channels): - data = Data[c, :, :] - d_perm = data * sn_mtrx # Apply sign flip to data - - # Compute t-score of permuted data - sm = np.sum(d_perm, axis=1) # Sum of permuted data - mn = sm / n_obs # Mean of permuted data - sm_sqrs = ( - np.sum(d_perm**2, axis=1) - (sm**2) / n_obs - ) # Sum of squares for standard error - stder = np.sqrt(sm_sqrs) / sqrt_nXnM1 # Standard error - t_vals[c, :, perm] = mn / stder # Compute t-values - - else: - # Use random permutations - for perm in range(n_perm): - # Randomly set sign of each trial / participant's data - sn = (np.random.rand(n_obs) > 0.5) * 2 - 1 # Generate random sign flips - sn_mtrx = np.tile(sn, (n_var, 1)) # Repeat sn for each variable - - for c in range(n_channels): - data = Data[c, :, :] - d_perm = data * sn_mtrx # Apply sign flip to data - - # Compute t-score of permuted data - sm = np.sum(d_perm, axis=1) # Sum of permuted data - mn = sm / n_obs # Mean of permuted data - sm_sqrs = ( - np.sum(d_perm**2, axis=1) - (sm**2) / n_obs - ) # Sum of squares for standard error - stder = np.sqrt(sm_sqrs) / sqrt_nXnM1 # Standard error - t_vals[c, :, perm] = mn / stder # Compute t-values - - # Compute p-values from t-values - p_vals = 2 * scipy.stats.cdf(-np.abs(t_vals), dfe) - - return t_vals, p_vals, dfe +# finally let's plot the results +# we plot the cluster with the lowest p-value +# and the topomap of the significant cluster +# we can see that there is something going on around 400 ms +# in the visual channels +# however the cluster is not significant which is not surprising +# given the small sample size (only 5 subjects) +mne.stats.cluster_level.plot_cluster( + diff_evoked, target_only, non_target_only, T_obs, clusters, cluster_p_values +) From 268d0cfb801d60df8cab9d1d83ebf1a7c1837e6f Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sat, 6 Jul 2024 11:01:12 +0200 Subject: [PATCH 56/88] cleaned up plotting function --- mne/stats/cluster_level.py | 61 ++++++++++--------- .../76_new_cluster_test_api.py | 6 +- 2 files changed, 36 insertions(+), 31 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index e50991254fe..bd1c2c90970 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1749,14 +1749,14 @@ def cluster_test( buffer_size: int = None, # block size for chunking the data ): """ - Run the cluster test using the new API. + Run a cluster permutation test based on formulaic input. 
- # currently supports paired t-test + # currently only supports paired t-test on evokeds or epochs Parameters ---------- dataframe : pd.DataFrame - Dataframe with evoked data, conditions and subject IDs. + Dataframe with evoked/epoched data, conditions and subject IDs. formula : str, optional Wilkinson notation formula for design matrix. Default is None. n_permutations : int, optional @@ -1788,6 +1788,7 @@ def cluster_test( Returns ------- + TODO: turn this into a class for further plotting T_obs : array The observed test statistic. clusters : list @@ -1808,7 +1809,7 @@ def cluster_test( # convert wide format to long format for formulaic df_long = unpack_time_and_channels(df) - # Pivot the DataFrame + # pivot the DataFrame pivot_df = df_long.pivot_table( index=["subject_index", "channel", "timepoint"], columns="condition", @@ -1819,7 +1820,7 @@ def cluster_test( if len(pd.unique(df.condition)) != 2: raise ValueError("Condition list needs to contain 2 unique values") - # Get the unique conditions + # get the unique conditions conditions = np.unique(df.condition) # Compute the difference (assuming there are only 2 conditions) @@ -1843,9 +1844,8 @@ def cluster_test( "Formula is required and needs to be a string in Wilkinson notation." ) - # now prep design matrix outcome variable for input into MNE cluster function - # we initially had first channels, then timepoints, - # now we need first timepoints, then channels + # now prep design matrix for input into MNE cluster function + # cluster functions expects channels as list dimension y_for_cluster = y.values.reshape(-1, n_channels, n_timepoints).transpose(0, 2, 1) adjacency, _ = find_ch_adjacency(df["evoked"][0].info, ch_type="eeg") @@ -1858,7 +1858,7 @@ def cluster_test( # Run the cluster-based permutation test T_obs, clusters, cluster_p_values, H0 = _permutation_cluster_test( [y_for_cluster], - n_permutations=10000, + n_permutations=n_permutations, threshold=threshold, stat_fun=stat_fun, tail=tail, @@ -1874,19 +1874,24 @@ def cluster_test( seed=seed, ) - print(min(cluster_p_values)) + print(f"smallest cluster p-value: {min(cluster_p_values)}") return T_obs, clusters, cluster_p_values, H0 -def unpack_time_and_channels(df): +def unpack_time_and_channels(df: pd.DataFrame = None) -> pd.DataFrame: """ - Extract the time and channel data from the DataFrame. + Extract timepoints and channels and convert to long. Parameters ---------- df : pd.DataFrame DataFrame in wide format. + + Returns + ------- + df_long : pd.DataFrame + DataFrame in long format. """ # Extracting all necessary data using list comprehensions for better performance long_format_data = [ @@ -1908,20 +1913,18 @@ def unpack_time_and_channels(df): return df_long -def plot_cluster( - contrast, target_only, non_target_only, T_obs, clusters, cluster_p_values -): +def plot_cluster(cond_dict, T_obs, clusters, cluster_p_values): """ Plot the cluster with the lowest p-value. + 2D cluster plotted with topoplot on the left and evoked signals on the right. + Timepoints that are part of the cluster are + highlighted in green on the evoked signals. + Parameters ---------- - contrast : list - List of contrast evoked objects. - target_only : list - List of target evoked objects. - non_target_only : list - List of non-target evoked objects. + cond_dict : dict + Dictionary with conditions as keys and evoked data as values. T_obs : array The observed test statistic. 
clusters : list @@ -1934,11 +1937,13 @@ def plot_cluster( None """ - # configure variables for visualization - colors = {"target": "crimson", "non-target": "steelblue"} + # extract condition labels from the dictionary + cond_keys = list(cond_dict.keys()) + # extract the evokeds from the dictionary + cond_values = list(cond_dict.values()) - # organize data for plotting - evokeds = {"target": target_only, "non-target": non_target_only} + # configure variables for visualization + colors = {cond_keys[0]: "crimson", cond_keys[1]: "steelblue"} lowest_p_cluster = np.argmin(cluster_p_values) @@ -1951,7 +1956,7 @@ def plot_cluster( t_map = T_obs[time_inds, ...].mean(axis=0) # get signals at the sensors contributing to the cluster - sig_times = contrast[0].times[time_inds] + sig_times = cond_values[0][0].times[time_inds] # create spatial mask mask = np.zeros((t_map.shape[0], 1), dtype=bool) @@ -1961,7 +1966,7 @@ def plot_cluster( fig, ax_topo = plt.subplots(1, 1, figsize=(10, 3), layout="constrained") # plot average test statistic and mark significant sensors - t_evoked = EvokedArray(t_map[:, np.newaxis], contrast[0].info, tmin=0) + t_evoked = EvokedArray(t_map[:, np.newaxis], cond_values[0][0].info, tmin=0) t_evoked.plot_topomap( times=0, mask=mask, @@ -1999,7 +2004,7 @@ def plot_cluster( if len(ch_inds) > 1: title += "s (mean)" plot_compare_evokeds( - evokeds, + cond_dict, title=title, picks=ch_inds, axes=ax_signals, diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index ec8bd8275a1..a88904a5b5b 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -125,6 +125,8 @@ T_obs, clusters, cluster_p_values, H0 = mne.stats.cluster_level.cluster_test( df=df, formula=formula ) +# set up conditions dictionary for cluster plots +conditions_dict = {"target": target_only, "non-target": non_target_only} # finally let's plot the results # we plot the cluster with the lowest p-value @@ -133,6 +135,4 @@ # in the visual channels # however the cluster is not significant which is not surprising # given the small sample size (only 5 subjects) -mne.stats.cluster_level.plot_cluster( - diff_evoked, target_only, non_target_only, T_obs, clusters, cluster_p_values -) +mne.stats.cluster_level.plot_cluster(conditions_dict, T_obs, clusters, cluster_p_values) From 2f722bdac329e30911aad847e401871ed4f23dd8 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sat, 6 Jul 2024 11:53:49 +0200 Subject: [PATCH 57/88] implemented cluser results class --- mne/stats/cluster_level.py | 213 +++++++++--------- .../76_new_cluster_test_api.py | 23 +- 2 files changed, 124 insertions(+), 112 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index bd1c2c90970..f82fe8d7dec 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1788,15 +1788,8 @@ def cluster_test( Returns ------- - TODO: turn this into a class for further plotting - T_obs : array - The observed test statistic. - clusters : list - List of clusters. - cluster_p_values : array - Array of cluster p-values. - H0 : array - The permuted test statistics. + ClusterResult + Object containing the results of the cluster permutation test. 
""" # for now this assumes a dataframe with a column for evoked data or epochs # add a data column to the dataframe (numpy array) @@ -1876,7 +1869,7 @@ def cluster_test( print(f"smallest cluster p-value: {min(cluster_p_values)}") - return T_obs, clusters, cluster_p_values, H0 + return ClusterResult(T_obs, clusters, cluster_p_values, H0) def unpack_time_and_channels(df: pd.DataFrame = None) -> pd.DataFrame: @@ -1913,111 +1906,127 @@ def unpack_time_and_channels(df: pd.DataFrame = None) -> pd.DataFrame: return df_long -def plot_cluster(cond_dict, T_obs, clusters, cluster_p_values): +class ClusterResult: """ - Plot the cluster with the lowest p-value. - - 2D cluster plotted with topoplot on the left and evoked signals on the right. - Timepoints that are part of the cluster are - highlighted in green on the evoked signals. + Object containing the results of the cluster permutation test. Parameters ---------- - cond_dict : dict - Dictionary with conditions as keys and evoked data as values. - T_obs : array + T_obs : np.ndarray The observed test statistic. clusters : list List of clusters. - cluster_p_values : array - Array of cluster p-values. - - Returns - ------- - None - + cluster_p_values : np.ndarray + P-values for each cluster. + H0 : np.ndarray + Max cluster level stats observed under permutation. """ - # extract condition labels from the dictionary - cond_keys = list(cond_dict.keys()) - # extract the evokeds from the dictionary - cond_values = list(cond_dict.values()) - - # configure variables for visualization - colors = {cond_keys[0]: "crimson", cond_keys[1]: "steelblue"} - - lowest_p_cluster = np.argmin(cluster_p_values) - - # plot the cluster with the lowest p-value - time_inds, space_inds = np.squeeze(clusters[lowest_p_cluster]) - ch_inds = np.unique(space_inds) - time_inds = np.unique(time_inds) - - # get topography for F stat - t_map = T_obs[time_inds, ...].mean(axis=0) - - # get signals at the sensors contributing to the cluster - sig_times = cond_values[0][0].times[time_inds] - - # create spatial mask - mask = np.zeros((t_map.shape[0], 1), dtype=bool) - mask[ch_inds, :] = True - - # initialize figure - fig, ax_topo = plt.subplots(1, 1, figsize=(10, 3), layout="constrained") - - # plot average test statistic and mark significant sensors - t_evoked = EvokedArray(t_map[:, np.newaxis], cond_values[0][0].info, tmin=0) - t_evoked.plot_topomap( - times=0, - mask=mask, - axes=ax_topo, - cmap="Reds", - vlim=(np.min, np.max), - show=False, - colorbar=False, - mask_params=dict(markersize=10), - ) - image = ax_topo.images[0] - # remove the title that would otherwise say "0.000 s" - ax_topo.set_title("") + def __init__(self, T_obs, clusters, cluster_p_values, H0): + self.T_obs = T_obs + self.clusters = clusters + self.cluster_p_values = cluster_p_values + self.H0 = H0 + + def plot_cluster(self, cond_dict: dict = None): + """ + Plot the cluster with the lowest p-value. + + 2D cluster plotted with topoplot on the left and evoked signals on the right. + Timepoints that are part of the cluster are + highlighted in green on the evoked signals. + + Parameters + ---------- + cond_dict : dict + Dictionary with condition labels as keys and evoked objects as values. 
+ + Returns + ------- + None + + """ + # extract condition labels from the dictionary + cond_keys = list(cond_dict.keys()) + # extract the evokeds from the dictionary + cond_values = list(cond_dict.values()) + + # configure variables for visualization + colors = {cond_keys[0]: "crimson", cond_keys[1]: "steelblue"} + + lowest_p_cluster = np.argmin(self.cluster_p_values) + + # plot the cluster with the lowest p-value + time_inds, space_inds = np.squeeze(self.clusters[lowest_p_cluster]) + ch_inds = np.unique(space_inds) + time_inds = np.unique(time_inds) + + # get topography for F stat + t_map = self.T_obs[time_inds, ...].mean(axis=0) + + # get signals at the sensors contributing to the cluster + sig_times = cond_values[0][0].times[time_inds] + + # create spatial mask + mask = np.zeros((t_map.shape[0], 1), dtype=bool) + mask[ch_inds, :] = True + + # initialize figure + fig, ax_topo = plt.subplots(1, 1, figsize=(10, 3), layout="constrained") + + # plot average test statistic and mark significant sensors + t_evoked = EvokedArray(t_map[:, np.newaxis], cond_values[0][0].info, tmin=0) + t_evoked.plot_topomap( + times=0, + mask=mask, + axes=ax_topo, + cmap="Reds", + vlim=(np.min, np.max), + show=False, + colorbar=False, + mask_params=dict(markersize=10), + ) + image = ax_topo.images[0] - # soft import? - # make_axes_locatable = _soft_import( - # "mpl_toolkits.axes_grid1.make_axes_locatable", - # purpose="plot cluster results" - # ) # soft import (not a dependency for MNE) + # remove the title that would otherwise say "0.000 s" + ax_topo.set_title("") - # create additional axes (for ERF and colorbar) - divider = make_axes_locatable(ax_topo) + # soft import? + # make_axes_locatable = _soft_import( + # "mpl_toolkits.axes_grid1.make_axes_locatable", + # purpose="plot cluster results" + # ) # soft import (not a dependency for MNE) - # add axes for colorbar - ax_colorbar = divider.append_axes("right", size="5%", pad=0.05) - plt.colorbar(image, cax=ax_colorbar) - ax_topo.set_xlabel( - "Averaged t-map ({:0.3f} - {:0.3f} s)".format(*sig_times[[0, -1]]) - ) + # create additional axes (for ERF and colorbar) + divider = make_axes_locatable(ax_topo) - # add new axis for time courses and plot time courses - ax_signals = divider.append_axes("right", size="300%", pad=1.2) - title = f"Cluster #1, {len(ch_inds)} sensor" - if len(ch_inds) > 1: - title += "s (mean)" - plot_compare_evokeds( - cond_dict, - title=title, - picks=ch_inds, - axes=ax_signals, - colors=colors, - show=False, - split_legend=True, - truncate_yaxis="auto", + # add axes for colorbar + ax_colorbar = divider.append_axes("right", size="5%", pad=0.05) + plt.colorbar(image, cax=ax_colorbar) + ax_topo.set_xlabel( + "Averaged t-map ({:0.3f} - {:0.3f} s)".format(*sig_times[[0, -1]]) ) - # plot temporal cluster extent - ymin, ymax = ax_signals.get_ylim() - ax_signals.fill_betweenx( - (ymin, ymax), sig_times[0], sig_times[-1], color="green", alpha=0.3 - ) + # add new axis for time courses and plot time courses + ax_signals = divider.append_axes("right", size="300%", pad=1.2) + title = f"Cluster #1, {len(ch_inds)} sensor" + if len(ch_inds) > 1: + title += "s (mean)" + plot_compare_evokeds( + cond_dict, + title=title, + picks=ch_inds, + axes=ax_signals, + colors=colors, + show=False, + split_legend=True, + truncate_yaxis="auto", + ) + + # plot temporal cluster extent + ymin, ymax = ax_signals.get_ylim() + ax_signals.fill_betweenx( + (ymin, ymax), sig_times[0], sig_times[-1], color="green", alpha=0.3 + ) - plt.show() + plt.show() diff --git 
a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index a88904a5b5b..3acfd21f7f0 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -15,7 +15,8 @@ - loading evoked data from multiple subjects - construct a dataframe that contains the difference between conditions - - run the new cluster test function + - run the new cluster test function with formula in Wilkinson notation + - plot the results with the ClusterResults Class Here, the unit of observation are evokeds from multiple subjects (2nd level analysis). @@ -121,18 +122,20 @@ # let's first define the formula based on Wilkinson notation formula = "evoked ~ 1 + C(subject_index)" -# run the cluster test -T_obs, clusters, cluster_p_values, H0 = mne.stats.cluster_level.cluster_test( - df=df, formula=formula -) +# run the cluster test and return the cluster_result object +cluster_result = mne.stats.cluster_level.cluster_test(df=df, formula=formula) + +# note that we ran an exact test due to the small sample size (only 15 permutations) + # set up conditions dictionary for cluster plots conditions_dict = {"target": target_only, "non-target": non_target_only} -# finally let's plot the results +# finally let's plot the results using the ClusterResults class + # we plot the cluster with the lowest p-value -# and the topomap of the significant cluster + # we can see that there is something going on around 400 ms -# in the visual channels -# however the cluster is not significant which is not surprising +# in the visual channels (topomap on the left) +# however the cluster is not significant which is unsurprising # given the small sample size (only 5 subjects) -mne.stats.cluster_level.plot_cluster(conditions_dict, T_obs, clusters, cluster_p_values) +cluster_result.plot_cluster(cond_dict=conditions_dict) From fb75cfd66f2fb8e2fbdc55a9907931441f2dd13f Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Thu, 18 Jul 2024 14:35:02 +0200 Subject: [PATCH 58/88] fixed codespell --- mne/stats/cluster_level.py | 55 +++++++++++-------- .../76_new_cluster_test_api.py | 34 +++++++----- 2 files changed, 50 insertions(+), 39 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index f82fe8d7dec..bb1f31ef6fd 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1813,8 +1813,14 @@ def cluster_test( if len(pd.unique(df.condition)) != 2: raise ValueError("Condition list needs to contain 2 unique values") - # get the unique conditions - conditions = np.unique(df.condition) + # Get unique elements and the indices of their first occurrences + unique_elements, indices = np.unique(df.condition, return_index=True) + + # Sort unique elements by the indices of their first occurrences + conditions = unique_elements[np.argsort(indices)] + + # print the contrast used for the paired t-test + print(f"Contrast used for paired t-test: {conditions[0]} - {conditions[1]}") # Compute the difference (assuming there are only 2 conditions) pivot_df["evoked"] = pivot_df[conditions[0]] - pivot_df[conditions[1]] @@ -1961,8 +1967,8 @@ def plot_cluster(self, cond_dict: dict = None): ch_inds = np.unique(space_inds) time_inds = np.unique(time_inds) - # get topography for F stat - t_map = self.T_obs[time_inds, ...].mean(axis=0) + # get topography for t stat + t_map = self.T_obs[time_inds, ...].mean(axis=0).astype(int) # get signals at the sensors contributing to the cluster sig_times = 
cond_values[0][0].times[time_inds] @@ -1980,11 +1986,11 @@ def plot_cluster(self, cond_dict: dict = None): times=0, mask=mask, axes=ax_topo, - cmap="Reds", - vlim=(np.min, np.max), + cmap="RdBu_r", show=False, colorbar=False, mask_params=dict(markersize=10), + scalings=1.00, ) image = ax_topo.images[0] @@ -2001,32 +2007,33 @@ def plot_cluster(self, cond_dict: dict = None): divider = make_axes_locatable(ax_topo) # add axes for colorbar - ax_colorbar = divider.append_axes("right", size="5%", pad=0.05) - plt.colorbar(image, cax=ax_colorbar) + ax_colorbar = divider.append_axes("right", size="5%", pad=0.1) + cbar = plt.colorbar(image, cax=ax_colorbar) + cbar.set_label("t-value") ax_topo.set_xlabel( - "Averaged t-map ({:0.3f} - {:0.3f} s)".format(*sig_times[[0, -1]]) + "average from {:0.3f} to {:0.3f} s".format(*sig_times[[0, -1]]) ) # add new axis for time courses and plot time courses - ax_signals = divider.append_axes("right", size="300%", pad=1.2) - title = f"Cluster #1, {len(ch_inds)} sensor" - if len(ch_inds) > 1: - title += "s (mean)" - plot_compare_evokeds( - cond_dict, - title=title, - picks=ch_inds, - axes=ax_signals, - colors=colors, - show=False, - split_legend=True, - truncate_yaxis="auto", - ) + ax_signals = divider.append_axes("right", size="300%", pad=1.3) + title = f"Signal averaged over {len(ch_inds)} sensor(s)" + plot_compare_evokeds( + cond_dict, + title=title, + picks=ch_inds, + axes=ax_signals, + colors=colors, + show=False, + split_legend=True, + truncate_yaxis="auto", + truncate_xaxis=False, + ) + plt.legend(frameon=False, loc="upper left") # plot temporal cluster extent ymin, ymax = ax_signals.get_ylim() ax_signals.fill_betweenx( - (ymin, ymax), sig_times[0], sig_times[-1], color="green", alpha=0.3 + (ymin, ymax), sig_times[0], sig_times[-1], color="grey", alpha=0.3 ) plt.show() diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 3acfd21f7f0..842e0543b0b 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -6,8 +6,9 @@ =============================================================== This tutorial shows how to use the new API for cluster testing. -This script shows how to estimate significant clusters in -evoked contrast data of multiple subjects. +The new API allows for Wilkinson style formulas and allows for more flexibility in +the design of the test. Here we will demonstrate how to use the new API for +a standard paired t-test on evoked data from multiple subjects. It uses a non-parametric statistical procedure based on permutations and cluster level statistics. @@ -16,7 +17,7 @@ - loading evoked data from multiple subjects - construct a dataframe that contains the difference between conditions - run the new cluster test function with formula in Wilkinson notation - - plot the results with the ClusterResults Class + - plot the results with the new ClusterResults API Here, the unit of observation are evokeds from multiple subjects (2nd level analysis). 
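(Editorial aside, as background for the "exact test" remark that follows: for a paired contrast the permutation null is built by flipping the sign of each subject's difference, and with 5 subjects all 2**5 = 32 sign patterns can be enumerated; half of them are mirror images under a two-tailed test, which is presumably where the "15 permutations" note below comes from. A self-contained toy sketch of that procedure, without the cluster step — plain NumPy, not the MNE implementation:

    import numpy as np

    rng = np.random.default_rng(42)

    # toy paired differences: one value per "subject" (e.g. a mean amplitude)
    diffs = rng.normal(loc=0.5, scale=1.0, size=5)
    n = len(diffs)

    def t_stat(d):
        # one-sample t statistic against zero
        return d.mean() / (d.std(ddof=1) / np.sqrt(len(d)))

    observed_t = t_stat(diffs)

    # enumerate all 2**n sign flips (exact test, feasible for small n)
    perm_t = []
    for flip in range(2**n):
        signs = np.array([1 if (flip >> i) & 1 else -1 for i in range(n)])
        perm_t.append(t_stat(diffs * signs))
    perm_t = np.array(perm_t)

    # two-tailed p-value: proportion of permutations at least as extreme
    p = np.mean(np.abs(perm_t) >= np.abs(observed_t))
    print(f"observed t = {observed_t:.2f}, exact permutation p = {p:.3f}")
)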
@@ -41,13 +42,14 @@ # Define the path to the P3 dataset path_to_p3 = mne.datasets.misc.data_path() / "ERP_CORE" / "P3" -# Define the range of participant IDs -participant_ids = range(15, 20) # This will cover 015 to 019 +# Define the range of participant IDs (we only have 5 participants in the dataset) +participant_ids = range(15, 20) # This will cover participant 15 to 19 # store the evoked data of all subjects evokeds_allsubs = [] # Loop over each participant ID and generate the corresponding filename +# to load the evoked data for pid in participant_ids: # Create the filename using an f-string, ID is zero-padded to 3 digits filename_p3 = f"sub-{pid:03d}_ses-P3_task-P3_ave.fif" @@ -58,21 +60,22 @@ # load the evoked data evokeds = mne.read_evokeds(p3_file_path) - # add subjects evoked data to list + # add single subjects evoked data to a list evokeds_allsubs.append(evokeds) # the P3b dataset is part of the freely available ERP CORE dataset # participants were presented with a visual oddball task # and the P3b component was analyzed # the conditions of interest are the target (rare visual stimuli) -# and non-target stimuli (frequency visual stimuli) +# and non-target stimuli (frequent visual stimuli) # let's extract the target and non-target evokeds target_only = [evoked[0] for evoked in evokeds_allsubs] non_target_only = [evoked[1] for evoked in evokeds_allsubs] # let's first have a look at the data -# create contrast from target and non-target evokeds + +# create contrast target - non-target diff_evoked = [ mne.combine_evoked([evokeds_a, evokeds_b], weights=[1, -1]) for evokeds_a, evokeds_b in zip(target_only, non_target_only) @@ -84,7 +87,7 @@ mne.grand_average(diff_evoked).plot_topomap() # we can see that the strongest difference is around 400 ms in -# visual channels (occipital region) +# central-parietal channels with a stronger evoked signal for target stimuli # Next we prepare a dataframe for the cluster test function # the dataframe should contain the contrast evoked data and the subject index @@ -93,7 +96,7 @@ # save the evoked data for both conditions in one list evokeds_conditions = target_only + non_target_only -# set up a list that defines the condition for each evoked data +# create a list that defines the condition for each evoked data # this will be used to create the conditions column in the dataframe conditions = ["target"] * len(target_only) + ["non-target"] * len(non_target_only) @@ -102,7 +105,7 @@ # we multiply the participant_ids by 2 to account for the two conditions subject_index = list(participant_ids) * 2 -# create the dataframe +# create the dataframe containing the evoked data, the condition and the subject index df = pd.DataFrame( { "evoked": evokeds_conditions, @@ -122,20 +125,21 @@ # let's first define the formula based on Wilkinson notation formula = "evoked ~ 1 + C(subject_index)" -# run the cluster test and return the cluster_result object +# run the new cluster test API and return the new cluster_result object cluster_result = mne.stats.cluster_level.cluster_test(df=df, formula=formula) # note that we ran an exact test due to the small sample size (only 15 permutations) # set up conditions dictionary for cluster plots +# this is necessary for plotting the evoked data and the cluster result on top conditions_dict = {"target": target_only, "non-target": non_target_only} # finally let's plot the results using the ClusterResults class # we plot the cluster with the lowest p-value - +cluster_result.plot_cluster(cond_dict=conditions_dict) # we can see that 
there is something going on around 400 ms
-# in the visual channels (topomap on the left)
+# with a stronger signal for target trials in right central-parietal channels
+
 # however the cluster is not significant which is unsurprising
 # given the small sample size (only 5 subjects)
-cluster_result.plot_cluster(cond_dict=conditions_dict)

From a87ffed0dedb2e3e27f5b6b99be0db8ae6a32d55 Mon Sep 17 00:00:00 2001
From: CarinaFo
Date: Mon, 22 Jul 2024 20:22:20 +0200
Subject: [PATCH 59/88] first review

---
 mne/stats/cluster_level.py | 384 ++++++++++++++++++++++++++-----------
 1 file changed, 272 insertions(+), 112 deletions(-)

diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py
index bb1f31ef6fd..847c464259c 100644
--- a/mne/stats/cluster_level.py
+++ b/mne/stats/cluster_level.py
@@ -4,6 +4,10 @@
 # License: BSD-3-Clause
 # Copyright the MNE-Python contributors.
 
+from __future__ import annotations
+
+from typing import Literal
+
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
@@ -13,12 +17,13 @@
 from scipy.stats import f as fstat
 from scipy.stats import t as tstat
 
-from .. import EvokedArray
-from ..channels import find_ch_adjacency
+from .. import Epochs, Evoked
+from ..epochs import EpochsArray, EvokedArray
 from ..fixes import has_numba, jit
 from ..parallel import parallel_func
 from ..source_estimate import MixedSourceEstimate, SourceEstimate, VolSourceEstimate
 from ..source_space import SourceSpaces
+from ..time_frequency import AverageTFR, AverageTFRArray, EpochsTFR, EpochsTFRArray
 from ..utils import (
     ProgressBar,
     _check_option,
@@ -938,7 +943,7 @@ def _permutation_cluster_test(
     sample_shape = X[0].shape[1:]
     for x in X:
         if x.shape[1:] != sample_shape:
-            raise ValueError("All samples mush have the same size")
+            raise ValueError("All samples must have the same size")
 
     # flatten the last dimensions in case the data is high dimensional
     X = [np.reshape(x, (x.shape[0], -1)) for x in X]
@@ -1732,21 +1737,186 @@ def summarize_clusters_stc(
     return klass(data_summary, vertices, tmin, tstep, subject)
 
 
+def validate_input_dataframe(df: pd.DataFrame, formula: str):
+    """
+    Validate the input dataframe for the cluster permutation test.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Dataframe with 3 columns (subject_index, condition, data).
+    formula : str
+        Wilkinson style formula for the design matrix.
+
+    Returns
+    -------
+    dv_name : str
+        Name of the dependent variable.
+    """
+    # extract dependent variable name from formula
+    formulaic = _soft_import(
+        "formulaic", purpose="set up Design Matrix"
+    )  # soft import (not a dependency for MNE)
+    formula = formulaic.Formula(formula)
+    dv_name = str(formula.lhs)
+
+    # check if all necessary columns are present
+    if dv_name not in df.columns:
+        raise ValueError(
+            "DataFrame needs to contain a column with the dependent "
+            "variable name as defined in the formula"
+        )
+    if "condition" not in df.columns:
+        raise ValueError("DataFrame needs to contain a condition column")
+    if "subject_index" not in df.columns:
+        raise ValueError("DataFrame needs to contain a subject_index column")
+
+    # check if the data column contains only valid types
+    check_column_types(df[dv_name])
+
+    # check if the shape of the data is consistent
+    if not all(data.data.shape == df[dv_name][0].data.shape for data in df[dv_name]):
+        raise ValueError("Data objects need to have the same shape")
+
+    # check if the condition column contains only 2 unique values
+    if len(pd.unique(df.condition)) != 2:
+        raise ValueError("Currently only 2 conditions are supported.")
+
+    return dv_name
+
+
+def check_column_types(input_data: pd.Series):
+    """
+    Check if the column types are valid for the cluster permutation test.
+
+    Parameters
+    ----------
+    input_data : pd.Series
+        Column of MNE data objects to be checked for the cluster permutation test.
+    """
+    # Get the type of the first element
+    first_type = type(input_data.iloc[0])
+
+    # Define the possible valid types
+    valid_types = (
+        Evoked,
+        EvokedArray,
+        Epochs,
+        EpochsArray,
+        AverageTFR,
+        EpochsTFR,
+        EpochsTFRArray,
+        AverageTFRArray,
+    )
+
+    # Check if the type of the first element is a valid type
+    # (exact type check; subclasses other than those listed are rejected)
+    if first_type not in valid_types:
+        raise ValueError(f"Object type '{first_type}' is not a valid type.")
+
+    # Check if all elements are of the same type as the first one
+    if not all(isinstance(data, first_type) for data in input_data):
+        raise ValueError("Data column must contain objects of the same type.")
+
+
+def prepare_data_for_cluster_test(input_df: pd.DataFrame, dv_name: str):
+    """
+    Prepare the data for the cluster permutation test.
+
+    Parameters
+    ----------
+    input_df : pd.DataFrame
+        Dataframe with subject_index, condition and data columns.
+    dv_name : str
+        Name of the dependent variable column.
+
+    Returns
+    -------
+    tuple
+        The long-format dataframe, the dimensionality of the data arrays
+        and the data dimensions (channels, [frequencies,] timepoints).
+ """ + # extract data and add to dataframe + input_df["data"] = [data.data for data in input_df[dv_name]] + + # extract dimensions from time series or time-frequency data + first_data_obj = input_df["data"].iloc[0] + if isinstance(first_data_obj, (Epochs, Evoked, EpochsArray, EvokedArray)): + n_channels, n_timepoints = first_data_obj.get_data().shape + if isinstance( + first_data_obj, (AverageTFR, EpochsTFR, AverageTFRArray, EpochsTFRArray) + ): + n_channels, n_freqs, n_timepoints = first_data_obj.get_data().shape + + reshaped_data = [] + + for idx, row in input_df.iterrows(): + subject_index = row["subject_index"] + condition = row["condition"] + data_array = row["data"] + + if data_array.ndim == 2: + n_channels, n_timepoints = data_array.shape + # timepoints are the columns + df_temp = pd.DataFrame( + data_array, columns=[f"timepoint_{i}" for i in range(n_timepoints)] + ) + df_temp["channel"] = range(n_channels) + df_temp["subject_index"] = subject_index + df_temp["condition"] = condition + + reshaped_data.append(df_temp) + + elif data_array.ndim == 3: + n_channels, n_freqs, n_timepoints = data_array.shape + # timepoints are the columns + df_temp = pd.DataFrame( + data_array.reshape(-1, n_timepoints), + columns=[f"timepoint_{i}" for i in range(n_timepoints)], + ) + df_temp["frequency"] = np.repeat(range(n_freqs), n_channels) + df_temp["channel"] = np.tile(range(n_channels), n_freqs) + df_temp["subject_index"] = subject_index + df_temp["condition"] = condition + + reshaped_data.append(df_temp) + + else: + raise ValueError(f"Unsupported data array dimensions: {data_array.ndim}") + # combine the reshaped data + combined_df = pd.concat(reshaped_data, ignore_index=True) + # Convert the dataframe to long format + id_vars = ["subject_index", "condition", "channel"] + if "frequency" in combined_df.columns: + id_vars.append("frequency") + + reshaped_df = pd.melt( + combined_df, id_vars=id_vars, var_name="timepoint", value_name="value" + ) + + # rename column and convert to integer + reshaped_df["timepoint"] = ( + reshaped_df["timepoint"].str.replace("timepoint_", "").astype(int) + ) + + # return the reshaped dataframe and dimensions + if data_array.ndim == 2: + return reshaped_df, data_array.ndim, n_channels, n_timepoints + elif data_array.ndim == 3: + return reshaped_df, data_array.ndim, n_channels, n_freqs, n_timepoints + + def cluster_test( df: pd.DataFrame, - formula: str = None, # Wilkinson notation formula for design matrix - n_permutations: int = 10000, + formula: str, # Wilkinson notation formula for design matrix + paired_test: bool, # whether to run a paired t-test or unpaired test + n_permutations: int = 1024, # same default as in old API seed: None | int | np.random.RandomState = None, - tail: int = 0, # 0 for two-tailed, 1 for greater, -1 for less + tail: Literal[-1, 0, 1] = 0, # 0 for two-tailed, 1 for greater, -1 for less n_jobs: int = 1, # how many cores to use - adjacency: tuple = None, + adjacency: tuple | None = None, max_step: int = 1, # maximum distance between samples (time points) - exclude: list = None, # exclude no time points or channels + exclude: list | None = None, # exclude no time points or channels step_down_p: int = 0, # step down in jumps test t_power: int = 1, # weigh each location by its stats score - out_type: str = "indices", + out_type: Literal["indices", "mask"] = "indices", check_disjoint: bool = False, - buffer_size: int = None, # block size for chunking the data + buffer_size: int | None = None, # block size for chunking the data ): """ Run a cluster 
permutation test based on formulaic input. @@ -1755,12 +1925,14 @@ def cluster_test( Parameters ---------- - dataframe : pd.DataFrame - Dataframe with evoked/epoched data, conditions and subject IDs. - formula : str, optional - Wilkinson notation formula for design matrix. Default is None. + df : pd.DataFrame + Dataframe with 3 columns (subject_index, condition, evoked). + formula : str + Wilkinson notation formula for design matrix. + paired_test: bool + Whether to run a paired t-test. n_permutations : int, optional - Number of permutations. Default is 10000. + Number of permutations. Default is 1024. seed : None | int | np.random.RandomState, optional Seed for the random number generator. Default is None. tail : int, optional @@ -1768,7 +1940,7 @@ def cluster_test( n_jobs : int, optional How many cores to use. Default is 1. adjacency : None, optional - Adjacency matrix. Default is None. + Provide a adjacency matrix. Default is None. max_step : int, optional Maximum distance between samples (time points). Default is 1. exclude : np.Array, optional @@ -1791,27 +1963,38 @@ def cluster_test( ClusterResult Object containing the results of the cluster permutation test. """ - # for now this assumes a dataframe with a column for evoked data or epochs - # add a data column to the dataframe (numpy array) - df["data"] = [evoked.data for evoked in df.evoked] - - # extract number of channels and timepoints - # (eventually should also allow for frequency) - n_channels, n_timepoints = df["data"][0].shape - - # convert wide format to long format for formulaic - df_long = unpack_time_and_channels(df) - - # pivot the DataFrame - pivot_df = df_long.pivot_table( - index=["subject_index", "channel", "timepoint"], - columns="condition", - values="value", - ).reset_index() - - # if not 2 unique conditions raise error - if len(pd.unique(df.condition)) != 2: - raise ValueError("Condition list needs to contain 2 unique values") + # check if formula is present + if formula is None: + raise ValueError("Wilkinson style formula is required.") + + # validate the input dataframe and return name of dependent variable + dv_name = validate_input_dataframe(df, formula) + + # prepare the data for the cluster permutation test + prep_result = prepare_data_for_cluster_test(df, dv_name) + + if prep_result[1] == 2: + # pivot the dataframe based on condition for later subtraction + pivot_df = ( + prep_result[0] + .pivot_table( + index=["subject_index", "channel", "timepoint"], + columns="condition", + values="value", + ) + .reset_index() + ) + elif prep_result[1] == 3: + # pivot the dataframe based on condition for later subtraction + pivot_df = ( + prep_result[0] + .pivot_table( + index=["subject_index", "channel", "frequency", "timepoint"], + columns="condition", + values="value", + ) + .reset_index() + ) # Get unique elements and the indices of their first occurrences unique_elements, indices = np.unique(df.condition, return_index=True) @@ -1819,41 +2002,51 @@ def cluster_test( # Sort unique elements by the indices of their first occurrences conditions = unique_elements[np.argsort(indices)] - # print the contrast used for the paired t-test - print(f"Contrast used for paired t-test: {conditions[0]} - {conditions[1]}") + # store the contrast for the clusterResults object + contrast = f"{conditions[0]} - {conditions[1]}" - # Compute the difference (assuming there are only 2 conditions) - pivot_df["evoked"] = pivot_df[conditions[0]] - pivot_df[conditions[1]] + # print the contrast used for the paired t-test so the user knows + # 
what is subtracted from what + logger.info(f"Contrast used for paired t-test: {contrast}") - # Optional: Clean up the DataFrame - pivot_df = pivot_df[["subject_index", "channel", "timepoint", "evoked"]] + # Compute the difference (assuming there are only 2 conditions) + pivot_df[dv_name] = pivot_df[conditions[0]] - pivot_df[conditions[1]] + + # for the paired t-test y is the difference between conditions + # X is the design matrix with a column with 1s and 0s for each participant + # Create the design matrix using formulaic + formulaic = _soft_import( + "formulaic", purpose="set up Design Matrix" + ) # soft import (not a dependency for MNE) + y, X = formulaic.model_matrix(formula, pivot_df) + + # Prepare design matrix for input into MNE cluster function + # MNE cluster functions expect channels as the last dimension + + if prep_result[1] == 2: + # Reshape y.values into a 3D array: (participants, n_channels, n_timepoints) + y_reshaped = y.values.reshape(-1, prep_result[2], prep_result[3]) + # Transpose the array to have channels as the last dimension + y_for_cluster = y_reshaped.transpose(0, 2, 1) + elif prep_result[1] == 3: + # Reshape y.values into a 4D array: + # (participants, n_channels, n_freqs, n_timepoints) + y_reshaped = y.values.reshape( + -1, prep_result[2], prep_result[3], prep_result[4] + ) + # Transpose the array to have channels as the last dimension + y_for_cluster = y_reshaped.transpose(0, 3, 2, 1) - # check if formula is present - if formula is not None: - formulaic = _soft_import( - "formulaic", purpose="set up Design Matrix" - ) # soft import (not a dependency for MNE) - - # for the paired t-test y is the difference between conditions - # X is the design matrix with a column with 1s and 0s for each participant - # Create the design matrix using formulaic - y, X = formulaic.model_matrix(formula, pivot_df) + if paired_test: + # define stat function and threshold + stat_fun, threshold = _check_fun( + X=y_for_cluster, stat_fun=None, threshold=None, tail=0, kind="within" + ) else: - raise ValueError( - "Formula is required and needs to be a string in Wilkinson notation." + # define stat function and threshold + stat_fun, threshold = _check_fun( + X=y_for_cluster, stat_fun=None, threshold=None, tail=0, kind="between" ) - - # now prep design matrix for input into MNE cluster function - # cluster functions expects channels as list dimension - y_for_cluster = y.values.reshape(-1, n_channels, n_timepoints).transpose(0, 2, 1) - - adjacency, _ = find_ch_adjacency(df["evoked"][0].info, ch_type="eeg") - - # define stat function and threshold - stat_fun, threshold = _check_fun( - X=y_for_cluster, stat_fun=None, threshold=None, tail=0, kind="within" - ) - # Run the cluster-based permutation test T_obs, clusters, cluster_p_values, H0 = _permutation_cluster_test( [y_for_cluster], @@ -1878,40 +2071,6 @@ def cluster_test( return ClusterResult(T_obs, clusters, cluster_p_values, H0) -def unpack_time_and_channels(df: pd.DataFrame = None) -> pd.DataFrame: - """ - Extract timepoints and channels and convert to long. - - Parameters - ---------- - df : pd.DataFrame - DataFrame in wide format. - - Returns - ------- - df_long : pd.DataFrame - DataFrame in long format. 
- """ - # Extracting all necessary data using list comprehensions for better performance - long_format_data = [ - { - "condition": row["condition"], - "subject_index": row["subject_index"], - "channel": channel, - "timepoint": timepoint, - "value": row["data"][channel, timepoint], - } - for idx, row in df.iterrows() - for channel in range(row["data"].shape[0]) - for timepoint in range(row["data"].shape[1]) - ] - - # Creating the long format DataFrame - df_long = pd.DataFrame(long_format_data) - - return df_long - - class ClusterResult: """ Object containing the results of the cluster permutation test. @@ -1928,13 +2087,19 @@ class ClusterResult: Max cluster level stats observed under permutation. """ - def __init__(self, T_obs, clusters, cluster_p_values, H0): + def __init__( + self, + T_obs: np.typing.NDArray, + clusters: list, + cluster_p_values: np.typing.NDArray, + H0: np.typing.NDArray, + ): self.T_obs = T_obs self.clusters = clusters self.cluster_p_values = cluster_p_values self.H0 = H0 - def plot_cluster(self, cond_dict: dict = None): + def plot_cluster(self, condition_labels: dict): """ Plot the cluster with the lowest p-value. @@ -1944,18 +2109,13 @@ def plot_cluster(self, cond_dict: dict = None): Parameters ---------- - cond_dict : dict + condition_labels : dict Dictionary with condition labels as keys and evoked objects as values. - - Returns - ------- - None - """ # extract condition labels from the dictionary - cond_keys = list(cond_dict.keys()) + cond_keys = list(condition_labels.keys()) # extract the evokeds from the dictionary - cond_values = list(cond_dict.values()) + cond_values = list(condition_labels.values()) # configure variables for visualization colors = {cond_keys[0]: "crimson", cond_keys[1]: "steelblue"} @@ -2018,7 +2178,7 @@ def plot_cluster(self, cond_dict: dict = None): ax_signals = divider.append_axes("right", size="300%", pad=1.3) title = f"Signal averaged over {len(ch_inds)} sensor(s)" plot_compare_evokeds( - cond_dict, + condition_labels, title=title, picks=ch_inds, axes=ax_signals, From 1f857ad2d97e684b45378c38a943e3763519fd86 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Mon, 22 Jul 2024 20:44:43 +0200 Subject: [PATCH 60/88] quick clean up --- .../76_new_cluster_test_api.py | 27 ++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 842e0543b0b..efbc6d5e3f0 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -29,7 +29,7 @@ # License: BSD-3-Clause # Copyright the MNE-Python contributors. 
-# %% +# %% Load the required packages from pathlib import Path @@ -37,6 +37,8 @@ import mne +# %% Load the P3 dataset + # Set parameters # -------------- # Define the path to the P3 dataset @@ -69,6 +71,8 @@ # the conditions of interest are the target (rare visual stimuli) # and non-target stimuli (frequent visual stimuli) +# %% visually inspect the evoked data for each condition + # let's extract the target and non-target evokeds target_only = [evoked[0] for evoked in evokeds_allsubs] non_target_only = [evoked[1] for evoked in evokeds_allsubs] @@ -89,7 +93,8 @@ # we can see that the strongest difference is around 400 ms in # central-parietal channels with a stronger evoked signal for target stimuli -# Next we prepare a dataframe for the cluster test function +# %% Prepare the dataframe for the new cluster test API + # the dataframe should contain the contrast evoked data and the subject index # each row in the dataframe should represent one observation (evoked data) @@ -114,7 +119,8 @@ } ) -# now we can run the cluster test function +# %% run the cluster test function with formulaic input + # we will use the new API that allows for Wilkinson style formulas # the formula should be a string in Wilkinson notation @@ -123,12 +129,21 @@ # we will use a cluster-based permutation paired t-test for this # let's first define the formula based on Wilkinson notation +# we want to predict the evoked difference signal based on the subject +# the cluster test randomly permutes the subject label +# the 1 in the formula represents the intercept which is always included +# C is a categorical variable that will be dummy coded formula = "evoked ~ 1 + C(subject_index)" # run the new cluster test API and return the new cluster_result object -cluster_result = mne.stats.cluster_level.cluster_test(df=df, formula=formula) +cluster_result = mne.stats.cluster_level.cluster_test( + df=df, formula=formula, paired_test=True, adjacency=None +) + +# note that we ran an exact test due to the small sample size +# (only 15 permutations) -# note that we ran an exact test due to the small sample size (only 15 permutations) +# %% plot the results # set up conditions dictionary for cluster plots # this is necessary for plotting the evoked data and the cluster result on top @@ -137,7 +152,7 @@ # finally let's plot the results using the ClusterResults class # we plot the cluster with the lowest p-value -cluster_result.plot_cluster(cond_dict=conditions_dict) +cluster_result.plot_cluster(condition_labels=conditions_dict) # we can see that there is something going on around 400 ms # with a stronger signal for target trials in right central-parietal channels From 450738bcbdfa472297c2fd9f02c3c5ee454bae1b Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Mon, 22 Jul 2024 21:52:00 +0200 Subject: [PATCH 61/88] test compare_old_vs_new_cluster_API --- mne/stats/tests/test_cluster_level.py | 150 +++++++++++++++++++++++++- 1 file changed, 149 insertions(+), 1 deletion(-) diff --git a/mne/stats/tests/test_cluster_level.py b/mne/stats/tests/test_cluster_level.py index e319d018328..097754f097b 100644 --- a/mne/stats/tests/test_cluster_level.py +++ b/mne/stats/tests/test_cluster_level.py @@ -6,6 +6,7 @@ from functools import partial import numpy as np +import pandas as pd import pytest from numpy.testing import ( assert_allclose, @@ -15,10 +16,20 @@ ) from scipy import linalg, sparse, stats -from mne import MixedSourceEstimate, SourceEstimate, SourceSpaces, VolSourceEstimate +from mne import ( + EvokedArray, + MixedSourceEstimate, + 
SourceEstimate, + SourceSpaces, + VolSourceEstimate, + create_info, +) from mne.fixes import _eye_array from mne.stats import combine_adjacency, ttest_ind_no_p from mne.stats.cluster_level import ( + _check_fun, + _permutation_cluster_test, + cluster_test, f_oneway, permutation_cluster_1samp_test, permutation_cluster_test, @@ -27,6 +38,7 @@ summarize_clusters_stc, ttest_1samp_no_p, ) +from mne.time_frequency import AverageTFRArray from mne.utils import _record_warnings, catch_logging n_space = 50 @@ -867,3 +879,139 @@ def test_output_equiv(shape, out_type, adjacency, threshold): assert out_type == "indices" got_mask[np.ix_(*clu)] = n assert_array_equal(got_mask, want_mask) + + +def create_sample_data_cluster_test(): + """Create sample data to test new cluster API.""" + # Prepare some dummy data + n_subjects = 20 + n_conditions = 2 + n_channels = 5 + n_timepoints = 8 + n_freqs = 3 + + # Create dummy data + dummy_data_2d = [ + np.random.rand(n_channels, n_timepoints) + for _ in range(n_subjects * n_conditions) + ] + dummy_data_3d = [ + np.random.rand(n_channels, n_freqs, n_timepoints) + for _ in range(n_subjects * n_conditions) + ] + + # Create a DataFrame with dummy data + df_2d = pd.DataFrame( + { + "subject_index": np.repeat(range(n_subjects), n_conditions), + "condition": np.tile(["cond1", "cond2"], n_subjects), + "data": dummy_data_2d, + } + ) + + df_3d = pd.DataFrame( + { + "subject_index": np.repeat(range(n_subjects), n_conditions), + "condition": np.tile(["cond1", "cond2"], n_subjects), + "data": dummy_data_3d, + } + ) + + return df_2d, df_3d + + +def compare_old_and_new_cluster_api(): + """Make sure old and new cluster API results are the same.""" + # load sample data + df_2d, df_3d = create_sample_data_cluster_test() + + # mandatory parameters for new cluster API + formula = "evoked ~ 1 + C(subject_index)" + + data_to_test = [df_2d, df_3d] + + # save 2D and 3D data results for both old and new API + result_old_api_all = [] + result_new_api_all = [] + d_all = [] + + for df in data_to_test: + # Pivot the DataFrame to have conditions as columns for old API + pivot_df = df.pivot(index="subject_index", columns="condition", values="data") + + # Subtract condition 2 data from condition 1 data for each subject + pivot_df["cond_diff"] = pivot_df.apply( + lambda row: row["cond1"] - row["cond1"], axis=1 + ) + + # Extract the 'cond_diff' column as a numpy array + cond_diff_array = np.stack(pivot_df["cond_diff"].values) + + # extract data and reshape for old API + if pivot_df.cond_diff[0].ndim == 2: + # reshape to channels as last dimension + d = cond_diff_array.transpose(0, 2, 1) + else: + # reshape 3D data to channels as last dimension + d = cond_diff_array.transpose(0, 3, 2, 1) + + # define test statistic + stat_fun, threshold = _check_fun( + X=d, stat_fun=None, threshold=None, tail=0, kind="within" + ) + + # Run old cluster api + result_old_api = _permutation_cluster_test( + [d], + threshold=threshold, + stat_fun=stat_fun, + n_jobs=-1, # takes all CPU cores + max_step=1, # maximum distance between samples (time points) + exclude=None, # exclude no time points or channels + step_down_p=0, # step down in jumps test + t_power=1, # weigh each location by its stats score + out_type="indices", + check_disjoint=False, + buffer_size=None, # block size for chunking the data + n_permutations=1024, + tail=0, + adjacency=None, + seed=42, + ) + result_old_api_all.append(result_old_api) + d_all.append(d) + + if df.data[0].ndim == 2: + # convert each row in data column into evoked object + df["evoked"] = 
df["data"].apply( + lambda x: EvokedArray( + x, create_info(df.data[0].shape[0], 1000.0, "eeg") + ) + ) + else: + # convert each row in data column into evoked object + df["evoked"] = df["data"].apply( + lambda x: AverageTFRArray( + create_info(df.data[0].shape[0], 1000.0, "eeg"), + x, + times=np.arange(df.data[0].shape[2]), + freqs=np.arange(df.data[0].shape[1]), + ) + ) + + # run the new cluster test API and return the new cluster_result object + cluster_result = cluster_test( + df=df, formula=formula, paired_test=True, adjacency=None, seed=42 + ) + result_new_api_all.append(cluster_result) + + # compare old and new API results both for 2D and 3D data + for result_old_api, result_new_api in zip(result_old_api_all, result_new_api_all): + # compare the cluster statistics + assert_array_equal(result_old_api[0], result_new_api.T_obs) + + # compare the cluster indices + assert_array_equal(result_old_api[1], result_new_api.clusters) + + # compare the cluster p-values + assert_array_equal(result_old_api[2], result_new_api.cluster_p_values) From d41efbe73f5b5998679b6de910789d61bcbd9bd0 Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Thu, 25 Jul 2024 12:17:36 -0500 Subject: [PATCH 62/88] simplify tests Co-authored-by: Carina Forster --- mne/stats/tests/test_cluster_level.py | 136 ++++++++------------------ 1 file changed, 41 insertions(+), 95 deletions(-) diff --git a/mne/stats/tests/test_cluster_level.py b/mne/stats/tests/test_cluster_level.py index 097754f097b..f9be3693441 100644 --- a/mne/stats/tests/test_cluster_level.py +++ b/mne/stats/tests/test_cluster_level.py @@ -17,6 +17,7 @@ from scipy import linalg, sparse, stats from mne import ( + EpochsArray, EvokedArray, MixedSourceEstimate, SourceEstimate, @@ -27,8 +28,6 @@ from mne.fixes import _eye_array from mne.stats import combine_adjacency, ttest_ind_no_p from mne.stats.cluster_level import ( - _check_fun, - _permutation_cluster_test, cluster_test, f_oneway, permutation_cluster_1samp_test, @@ -38,7 +37,7 @@ summarize_clusters_stc, ttest_1samp_no_p, ) -from mne.time_frequency import AverageTFRArray +from mne.time_frequency import AverageTFRArray, EpochsTFRArray from mne.utils import _record_warnings, catch_logging n_space = 50 @@ -920,98 +919,45 @@ def create_sample_data_cluster_test(): return df_2d, df_3d -def compare_old_and_new_cluster_api(): - """Make sure old and new cluster API results are the same.""" - # load sample data - df_2d, df_3d = create_sample_data_cluster_test() - - # mandatory parameters for new cluster API - formula = "evoked ~ 1 + C(subject_index)" - - data_to_test = [df_2d, df_3d] - - # save 2D and 3D data results for both old and new API - result_old_api_all = [] - result_new_api_all = [] - d_all = [] - - for df in data_to_test: - # Pivot the DataFrame to have conditions as columns for old API - pivot_df = df.pivot(index="subject_index", columns="condition", values="data") - - # Subtract condition 2 data from condition 1 data for each subject - pivot_df["cond_diff"] = pivot_df.apply( - lambda row: row["cond1"] - row["cond1"], axis=1 - ) - - # Extract the 'cond_diff' column as a numpy array - cond_diff_array = np.stack(pivot_df["cond_diff"].values) - - # extract data and reshape for old API - if pivot_df.cond_diff[0].ndim == 2: - # reshape to channels as last dimension - d = cond_diff_array.transpose(0, 2, 1) - else: - # reshape 3D data to channels as last dimension - d = cond_diff_array.transpose(0, 3, 2, 1) - - # define test statistic - stat_fun, threshold = _check_fun( - X=d, stat_fun=None, threshold=None, tail=0, 
kind="within" - ) - - # Run old cluster api - result_old_api = _permutation_cluster_test( - [d], - threshold=threshold, - stat_fun=stat_fun, - n_jobs=-1, # takes all CPU cores - max_step=1, # maximum distance between samples (time points) - exclude=None, # exclude no time points or channels - step_down_p=0, # step down in jumps test - t_power=1, # weigh each location by its stats score - out_type="indices", - check_disjoint=False, - buffer_size=None, # block size for chunking the data - n_permutations=1024, - tail=0, - adjacency=None, - seed=42, - ) - result_old_api_all.append(result_old_api) - d_all.append(d) - - if df.data[0].ndim == 2: - # convert each row in data column into evoked object - df["evoked"] = df["data"].apply( - lambda x: EvokedArray( - x, create_info(df.data[0].shape[0], 1000.0, "eeg") - ) - ) - else: - # convert each row in data column into evoked object - df["evoked"] = df["data"].apply( - lambda x: AverageTFRArray( - create_info(df.data[0].shape[0], 1000.0, "eeg"), - x, - times=np.arange(df.data[0].shape[2]), - freqs=np.arange(df.data[0].shape[1]), - ) - ) - - # run the new cluster test API and return the new cluster_result object - cluster_result = cluster_test( - df=df, formula=formula, paired_test=True, adjacency=None, seed=42 +def test_compare_old_and_new_cluster_api(): + """Test for same results from old and new APIs.""" + condition1_1d, condition2_1d, condition1_2d, condition2_2d = _get_conditions() + df_1d = pd.DataFrame( + dict( + data=[condition1_1d, condition2_1d], + condition=["a", "b"], ) - result_new_api_all.append(cluster_result) - - # compare old and new API results both for 2D and 3D data - for result_old_api, result_new_api in zip(result_old_api_all, result_new_api_all): - # compare the cluster statistics - assert_array_equal(result_old_api[0], result_new_api.T_obs) + ) + kwargs = dict(n_permutations=100, tail=1, seed=1, buffer_size=None, out_type="mask") + F_obs, clusters, cluster_pvals, H0 = permutation_cluster_test( + [condition1_1d, condition2_1d], **kwargs + ) + formula = "data ~ condition" + cluster_result = cluster_test(df_1d, formula, **kwargs) + assert_array_equal(cluster_result.H0, H0) + assert_array_equal(cluster_result.stat_obs, F_obs) + assert_array_equal(cluster_result.cluster_p_values, cluster_pvals) + assert cluster_result.clusters == clusters - # compare the cluster indices - assert_array_equal(result_old_api[1], result_new_api.clusters) - # compare the cluster p-values - assert_array_equal(result_old_api[2], result_new_api.cluster_p_values) +@pytest.mark.parametrize( + "Inst", (EpochsArray, EvokedArray, EpochsTFRArray, AverageTFRArray) +) +def test_new_cluster_api(Inst): + """Test handling different MNE objects in the cluster API.""" + pd = pytest.importorskip("pandas") + + n_epo, n_chan, n_freq, n_times = 2, 3, 5, 7 + shape = (n_chan, n_times) + if Inst in (EpochsArray, EpochsTFRArray): + shape = (n_epo,) + shape + if Inst in (EpochsTFRArray, AverageTFRArray): + shape = shape[:-1] + (n_freq, shape[-1]) + + info = create_info(...) + inst1 = Inst(np.random.normal(shape, ...), info=info) + inst2 = Inst(np.random.normal(shape, ...), info=info) + + df = pd.DataFrame(dict(data=[inst1, inst2], condition=["a", "b"])) + result = cluster_test(df, "data~condition", ...) 
+ assert result # TODO do something more interesting here From 9523fae7996f0006cd5c379767d858e6cc2694ef Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Thu, 25 Jul 2024 12:23:49 -0500 Subject: [PATCH 63/88] refactor cluster_test Co-authored-by: Eric Larson Co-authored-by: Carina Forster --- mne/stats/cluster_level.py | 419 ++++++++++++------------------------- 1 file changed, 139 insertions(+), 280 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index 847c464259c..20b54f1f592 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -17,13 +17,12 @@ from scipy.stats import f as fstat from scipy.stats import t as tstat -from .. import Epochs, Evoked -from ..epochs import EpochsArray, EvokedArray +from .. import BaseEpochs, Evoked, EvokedArray from ..fixes import has_numba, jit from ..parallel import parallel_func from ..source_estimate import MixedSourceEstimate, SourceEstimate, VolSourceEstimate from ..source_space import SourceSpaces -from ..time_frequency import AverageTFR, AverageTFRArray, EpochsTFR, EpochsTFRArray +from ..time_frequency import BaseTFR from ..utils import ( ProgressBar, _check_option, @@ -1737,191 +1736,65 @@ def summarize_clusters_stc( return klass(data_summary, vertices, tmin, tstep, subject) -def validate_input_dataframe(df: pd.DataFrame, formula: str): - """ - Validate the input dataframe for the cluster permutation test. - - Parameters - ---------- - df : pd.DataFrame - Dataframe with 3 columns (subject_index, condition, data). - formula : formulaic.ModelSpec - Wilkinson style Formula for the design matrix. - - Returns - ------- - dv_name : str - Name of the dependent variable. - """ - # extract dependent variable name from formula - formulaic = _soft_import( - "formulaic", purpose="set up Design Matrix" - ) # soft import (not a dependency for MNE) - formula = formulaic.Formula(formula) - dv_name = str(formula.lhs) - +def _validate_cluster_df(df: pd.DataFrame, dv_name: str, iv_name: str): # check if all necessary columns are present - if dv_name not in df.columns: - raise ValueError("""DataFrame needs to contain a column - with the dependent variable name - as defined in the formula""") - if "condition" not in df.columns: - raise ValueError("DataFrame needs to contain a condition column") - if "subject_index" not in df.columns: - raise ValueError("DataFrame needs to contain a subject_index column") - - # check if the data column contains only valid types - check_column_types(df[dv_name]) - + missing = ({dv_name} | {iv_name}) - set(df.columns) + sep = '", "' + if missing: + raise ValueError( + f"DataFrame must contain a column named for each term in `formula`. " + f"Column{_pl(missing)} missing for term{_pl(missing)} " + f'"{sep.join(missing)}".' + ) + # check if the data column contains valid (and consistent) instance types + inst = df[dv_name].iloc[0] + valid_types = (Evoked, BaseEpochs, BaseTFR, np.ndarray) + _validate_type(inst, valid_types, f"Data in dependent variable column '{dv_name}'") + all_types = set(df[dv_name].map(type)) + all_type_names = ", ".join([type(x).__name__ for x in all_types]) + prologue = f"Data in dependent variable column '{dv_name}' must all have " + if len(all_types) > 1: + raise ValueError( + f"{prologue} the same type, but found types {{{all_type_names}}}." 
+ ) # check if the shape of the data is consistent - if not all(data.data.shape == df[dv_name][0].data.shape for data in df[dv_name]): - raise ValueError("Data objects need to have the same shape") - - # check if the condition column contains only 2 unique values - if len(pd.unique(df.condition)) != 2: - raise ValueError("currently only supports 2 conditions.") - - return dv_name - - -def check_column_types(input_data: np.ndarray): - """ - Check if the column types are valid for the cluster permutation test. - - Parameters - ---------- - input_data : np.Array - Data to be checked for the cluster permutation test. - """ - # Get the type of the first element - first_type = type(input_data.iloc[0]) - - # Define the possible valid types - valid_types = ( - Evoked, - EvokedArray, - Epochs, - EpochsArray, - AverageTFR, - EpochsTFR, - EpochsTFRArray, - AverageTFRArray, - ) - - # Check if the type of the first element is a valid type - if first_type not in valid_types: - raise ValueError(f"Object type '{first_type}' is not a valid type.") - - # Check if all elements are of the same type as the first one - if not all(isinstance(data, first_type) for data in input_data): - raise ValueError("Data column must contain objects of the same type.") - - -def prepare_data_for_cluster_test(input_df: pd.DataFrame, dv_name: str): - """ - Prepare the data for the cluster permutation test. - - Parameters - ---------- - input_data : np.ndarray - Data to be prepared for the cluster permutation test. - - Returns - ------- - data : np.Array - Data prepared for the cluster permutation test. - """ - # extract data and add to dataframe - input_df["data"] = [data.data for data in input_df[dv_name]] - - # extract dimensions from time series or time-frequency data - first_data_obj = input_df["data"].iloc[0] - if isinstance(first_data_obj, (Epochs, Evoked, EpochsArray, EvokedArray)): - n_channels, n_timepoints = first_data_obj.get_data().shape - if isinstance( - first_data_obj, (AverageTFR, EpochsTFR, AverageTFRArray, EpochsTFRArray) - ): - n_channels, n_freqs, n_timepoints = first_data_obj.get_data().shape - - reshaped_data = [] - - for idx, row in input_df.iterrows(): - subject_index = row["subject_index"] - condition = row["condition"] - data_array = row["data"] - - if data_array.ndim == 2: - n_channels, n_timepoints = data_array.shape - # timepoints are the columns - df_temp = pd.DataFrame( - data_array, columns=[f"timepoint_{i}" for i in range(n_timepoints)] - ) - df_temp["channel"] = range(n_channels) - df_temp["subject_index"] = subject_index - df_temp["condition"] = condition - - reshaped_data.append(df_temp) - - elif data_array.ndim == 3: - n_channels, n_freqs, n_timepoints = data_array.shape - # timepoints are the columns - df_temp = pd.DataFrame( - data_array.reshape(-1, n_timepoints), - columns=[f"timepoint_{i}" for i in range(n_timepoints)], - ) - df_temp["frequency"] = np.repeat(range(n_freqs), n_channels) - df_temp["channel"] = np.tile(range(n_channels), n_freqs) - df_temp["subject_index"] = subject_index - df_temp["condition"] = condition - - reshaped_data.append(df_temp) - - else: - raise ValueError(f"Unsupported data array dimensions: {data_array.ndim}") - # combine the reshaped data - combined_df = pd.concat(reshaped_data, ignore_index=True) - # Convert the dataframe to long format - id_vars = ["subject_index", "condition", "channel"] - if "frequency" in combined_df.columns: - id_vars.append("frequency") - - reshaped_df = pd.melt( - combined_df, id_vars=id_vars, var_name="timepoint", value_name="value" - 
) - - # rename column and convert to integer - reshaped_df["timepoint"] = ( - reshaped_df["timepoint"].str.replace("timepoint_", "").astype(int) - ) - - # return the reshaped dataframe and dimensions - if data_array.ndim == 2: - return reshaped_df, data_array.ndim, n_channels, n_timepoints - elif data_array.ndim == 3: - return reshaped_df, data_array.ndim, n_channels, n_freqs, n_timepoints + if isinstance(inst, np.ndarray): + all_shapes = set(df[dv_name].map(lambda x: x.shape[1:])) # first dim may vary + elif isinstance(inst, BaseEpochs): + all_shapes = set(df[dv_name].map(lambda x: x.get_data().shape[1:])) + else: + all_shapes = set(df[dv_name].map(lambda x: x.get_data().shape)) + if len(all_shapes) > 1: + raise ValueError( + f"{prologue} consistent shape, but {len(all_shapes)} different " + f"shapes were found: {'; '.join(all_shapes)}." + ) + return all_types.pop() +@verbose def cluster_test( df: pd.DataFrame, - formula: str, # Wilkinson notation formula for design matrix - paired_test: bool, # whether to run a paired t-test or unpaired test - n_permutations: int = 1024, # same default as in old API - seed: None | int | np.random.RandomState = None, - tail: Literal[-1, 0, 1] = 0, # 0 for two-tailed, 1 for greater, -1 for less - n_jobs: int = 1, # how many cores to use + formula: str, + *, + within_id: str | None = None, + stat_fun: callable | None = None, + tail: Literal[-1, 0, 1] = 0, + threshold=None, + n_permutations: int = 1024, adjacency: tuple | None = None, - max_step: int = 1, # maximum distance between samples (time points) - exclude: list | None = None, # exclude no time points or channels - step_down_p: int = 0, # step down in jumps test - t_power: int = 1, # weigh each location by its stats score - out_type: Literal["indices", "mask"] = "indices", + max_step: int = 1, + exclude: list | None = None, + step_down_p: int = 0, + t_power: int = 1, check_disjoint: bool = False, - buffer_size: int | None = None, # block size for chunking the data + out_type: Literal["indices", "mask"] = "indices", + seed: None | int | np.random.RandomState = None, + buffer_size: int | None = None, + n_jobs: int = 1, + verbose=None, ): - """ - Run a cluster permutation test based on formulaic input. - - # currently only supports paired t-test on evokeds or epochs + """Run a cluster permutation test from a DataFrame and a formula. Parameters ---------- @@ -1929,16 +1802,14 @@ def cluster_test( Dataframe with 3 columns (subject_index, condition, evoked). formula : str Wilkinson notation formula for design matrix. - paired_test: bool - Whether to run a paired t-test. - n_permutations : int, optional - Number of permutations. Default is 1024. - seed : None | int | np.random.RandomState, optional - Seed for the random number generator. Default is None. + within_id : None | str + Name of column in ``df`` to use in identifying within-group contrasts. + stat_fun : None | callable + Statistical function to use. tail : int, optional 0 for two-tailed, 1 for greater, -1 for less. Default is 0. - n_jobs : int, optional - How many cores to use. Default is 1. + n_permutations : int, optional + Number of permutations. Default is 1024. adjacency : None, optional Provide a adjacency matrix. Default is None. max_step : int, optional @@ -1949,107 +1820,86 @@ def cluster_test( Step down in jumps test. Default is 0. t_power : int, optional Weigh each location by its stats score. Default is 1. - out_type : str, optional - Output type. Default is "indices". check_disjoint : bool, optional Check if clusters are disjoint. 
Default is False. + out_type : str, optional + Output type. Default is "indices". + seed : None | int | np.random.RandomState, optional + Seed for the random number generator. Default is None. buffer_size : int, optional Block size for chunking the data. Default is None. - seed : int, optional - Seed for the random number generator. Default is None. + n_jobs : int, optional + How many cores to use. Default is 1. + %(verbose)s Returns ------- ClusterResult Object containing the results of the cluster permutation test. """ - # check if formula is present - if formula is None: - raise ValueError("Wilkinson style formula is required.") - - # validate the input dataframe and return name of dependent variable - dv_name = validate_input_dataframe(df, formula) - - # prepare the data for the cluster permutation test - prep_result = prepare_data_for_cluster_test(df, dv_name) - - if prep_result[1] == 2: - # pivot the dataframe based on condition for later subtraction - pivot_df = ( - prep_result[0] - .pivot_table( - index=["subject_index", "channel", "timepoint"], - columns="condition", - values="value", - ) - .reset_index() - ) - elif prep_result[1] == 3: - # pivot the dataframe based on condition for later subtraction - pivot_df = ( - prep_result[0] - .pivot_table( - index=["subject_index", "channel", "frequency", "timepoint"], - columns="condition", - values="value", - ) - .reset_index() + # parse formula + formulaic = _soft_import("formulaic", purpose="parse formula for clustering") + parser = formulaic.parser.DefaultFormulaParser(include_intercept=False) + formula = formulaic.Formula(formula, _parser=parser) + dv_name = str(np.array(formula.lhs.root).item()) + iv_name = str(np.array(formula.rhs.root).item()) + # validate the input dataframe and return the type of the data column entries + _dtype = _validate_cluster_df(df, dv_name, iv_name) + + # for within_subject + _validate_type(within_id, (str, None), "within_id") + if within_id: + df = df.copy(deep=False) # Don't mutate input dataframe row order! 
+ df.sort_values([iv_name, within_id], inplace=True) + counts = df[within_id].value_counts() + if any(counts != 2): + raise ValueError("Badness 10000") + + # extract the data + + def _extract_data_array(series): + return np.concatenate(series.values) + + def _extract_data_mne(series): + return np.array( + series.map(lambda inst: inst.get_data().swapaxes(-2, -1)).to_list() ) - # Get unique elements and the indices of their first occurrences - unique_elements, indices = np.unique(df.condition, return_index=True) - - # Sort unique elements by the indices of their first occurrences - conditions = unique_elements[np.argsort(indices)] - - # store the contrast for the clusterResults object - contrast = f"{conditions[0]} - {conditions[1]}" - - # print the contrast used for the paired t-test so the user knows - # what is subtracted from what - logger.info(f"Contrast used for paired t-test: {contrast}") - - # Compute the difference (assuming there are only 2 conditions) - pivot_df[dv_name] = pivot_df[conditions[0]] - pivot_df[conditions[1]] - - # for the paired t-test y is the difference between conditions - # X is the design matrix with a column with 1s and 0s for each participant - # Create the design matrix using formulaic - formulaic = _soft_import( - "formulaic", purpose="set up Design Matrix" - ) # soft import (not a dependency for MNE) - y, X = formulaic.model_matrix(formula, pivot_df) - - # Prepare design matrix for input into MNE cluster function - # MNE cluster functions expect channels as the last dimension - - if prep_result[1] == 2: - # Reshape y.values into a 3D array: (participants, n_channels, n_timepoints) - y_reshaped = y.values.reshape(-1, prep_result[2], prep_result[3]) - # Transpose the array to have channels as the last dimension - y_for_cluster = y_reshaped.transpose(0, 2, 1) - elif prep_result[1] == 3: - # Reshape y.values into a 4D array: - # (participants, n_channels, n_freqs, n_timepoints) - y_reshaped = y.values.reshape( - -1, prep_result[2], prep_result[3], prep_result[4] - ) - # Transpose the array to have channels as the last dimension - y_for_cluster = y_reshaped.transpose(0, 3, 2, 1) + def _extract_data_tfr(series): + return series.map(lambda inst: inst.get_data().swapaxes(-3, -1)).to_list() - if paired_test: - # define stat function and threshold - stat_fun, threshold = _check_fun( - X=y_for_cluster, stat_fun=None, threshold=None, tail=0, kind="within" - ) + if _dtype is np.ndarray: + func = _extract_data_array + elif _dtype is BaseTFR: + func = _extract_data_tfr else: - # define stat function and threshold - stat_fun, threshold = _check_fun( - X=y_for_cluster, stat_fun=None, threshold=None, tail=0, kind="between" - ) + func = _extract_data_mne + # convert to a list-like X for clustering + X = df.groupby(iv_name).agg({dv_name: func})[dv_name].to_list() + + # determine test type + if len(X) == 1: + kind = "within" + elif len(X) > 2: + kind = "between" + elif len(set(x.shape for x in X)) > 1: + kind = "between" + # by now we know there are exactly 2 elements in X, and their shapes match + elif within_id in df: + kind = "within" + X = X[0] - X[1] + else: + kind = "between" + + # define stat function and threshold + stat_fun, threshold = _check_fun( + X=X, stat_fun=stat_fun, threshold=threshold, tail=tail, kind=kind + ) + if kind == "within": + X = [X] # Run the cluster-based permutation test - T_obs, clusters, cluster_p_values, H0 = _permutation_cluster_test( - [y_for_cluster], + stat_obs, clusters, cluster_p_values, H0 = _permutation_cluster_test( + X, 
n_permutations=n_permutations, threshold=threshold, stat_fun=stat_fun, @@ -2066,9 +1916,9 @@ def cluster_test( seed=seed, ) - print(f"smallest cluster p-value: {min(cluster_p_values)}") + # print(f"smallest cluster p-value: {min(cluster_p_values)}") - return ClusterResult(T_obs, clusters, cluster_p_values, H0) + return ClusterResult(stat_obs, clusters, cluster_p_values, H0, stat_fun) class ClusterResult: @@ -2077,7 +1927,7 @@ class ClusterResult: Parameters ---------- - T_obs : np.ndarray + stat_obs : np.ndarray The observed test statistic. clusters : list List of clusters. @@ -2089,15 +1939,24 @@ class ClusterResult: def __init__( self, - T_obs: np.typing.NDArray, + stat_obs: np.typing.NDArray, clusters: list, cluster_p_values: np.typing.NDArray, H0: np.typing.NDArray, + stat_fun: callable, ): - self.T_obs = T_obs + self.stat_obs = stat_obs self.clusters = clusters self.cluster_p_values = cluster_p_values self.H0 = H0 + self.stat_fun = stat_fun + # TODO improve detection of stat name (e.g. unpaired T)? + if stat_fun is f_oneway: + self.stat_name = "F-statistic" + elif stat_fun is ttest_1samp_no_p: + self.stat_name = "paired T-statistic" + else: + self.stat_name = "test statistic" def plot_cluster(self, condition_labels: dict): """ @@ -2128,7 +1987,7 @@ def plot_cluster(self, condition_labels: dict): time_inds = np.unique(time_inds) # get topography for t stat - t_map = self.T_obs[time_inds, ...].mean(axis=0).astype(int) + t_map = self.stat_obs[time_inds, ...].mean(axis=0).astype(int) # get signals at the sensors contributing to the cluster sig_times = cond_values[0][0].times[time_inds] @@ -2169,7 +2028,7 @@ def plot_cluster(self, condition_labels: dict): # add axes for colorbar ax_colorbar = divider.append_axes("right", size="5%", pad=0.1) cbar = plt.colorbar(image, cax=ax_colorbar) - cbar.set_label("t-value") + cbar.set_label(self.stat_name) ax_topo.set_xlabel( "average from {:0.3f} to {:0.3f} s".format(*sig_times[[0, -1]]) ) From 9661492e5fcac3962cc04f926a0cfac60c0745c0 Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Thu, 25 Jul 2024 12:25:12 -0500 Subject: [PATCH 64/88] make tutorial match modified API Co-authored-by: Carina Forster --- tutorials/stats-sensor-space/76_new_cluster_test_api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index efbc6d5e3f0..83b4f019b6f 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -133,11 +133,11 @@ # the cluster test randomly permutes the subject label # the 1 in the formula represents the intercept which is always included # C is a categorical variable that will be dummy coded -formula = "evoked ~ 1 + C(subject_index)" +formula = "evoked ~ condition" # run the new cluster test API and return the new cluster_result object cluster_result = mne.stats.cluster_level.cluster_test( - df=df, formula=formula, paired_test=True, adjacency=None + df=df, formula=formula, within_id="subject_index" ) # note that we ran an exact test due to the small sample size From cac05598c1de8c565fc5b823929472a3b2c9535c Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Thu, 25 Jul 2024 12:30:30 -0500 Subject: [PATCH 65/88] remove unused test helper func --- mne/stats/tests/test_cluster_level.py | 39 --------------------------- 1 file changed, 39 deletions(-) diff --git a/mne/stats/tests/test_cluster_level.py b/mne/stats/tests/test_cluster_level.py index 
f9be3693441..00989e3e00c 100644 --- a/mne/stats/tests/test_cluster_level.py +++ b/mne/stats/tests/test_cluster_level.py @@ -880,45 +880,6 @@ def test_output_equiv(shape, out_type, adjacency, threshold): assert_array_equal(got_mask, want_mask) -def create_sample_data_cluster_test(): - """Create sample data to test new cluster API.""" - # Prepare some dummy data - n_subjects = 20 - n_conditions = 2 - n_channels = 5 - n_timepoints = 8 - n_freqs = 3 - - # Create dummy data - dummy_data_2d = [ - np.random.rand(n_channels, n_timepoints) - for _ in range(n_subjects * n_conditions) - ] - dummy_data_3d = [ - np.random.rand(n_channels, n_freqs, n_timepoints) - for _ in range(n_subjects * n_conditions) - ] - - # Create a DataFrame with dummy data - df_2d = pd.DataFrame( - { - "subject_index": np.repeat(range(n_subjects), n_conditions), - "condition": np.tile(["cond1", "cond2"], n_subjects), - "data": dummy_data_2d, - } - ) - - df_3d = pd.DataFrame( - { - "subject_index": np.repeat(range(n_subjects), n_conditions), - "condition": np.tile(["cond1", "cond2"], n_subjects), - "data": dummy_data_3d, - } - ) - - return df_2d, df_3d - - def test_compare_old_and_new_cluster_api(): """Test for same results from old and new APIs.""" condition1_1d, condition2_1d, condition1_2d, condition2_2d = _get_conditions() From 47ac8380ea2c1efe046b87e1475cc7bf930962e9 Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Thu, 25 Jul 2024 12:33:11 -0500 Subject: [PATCH 66/88] vulture allowlist update --- tools/vulture_allowlist.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/vulture_allowlist.py b/tools/vulture_allowlist.py index d612d0ec5ed..f030b4d4346 100644 --- a/tools/vulture_allowlist.py +++ b/tools/vulture_allowlist.py @@ -146,3 +146,6 @@ _qt_raise_window _qt_disable_paint _qt_get_stylesheet + +# used in tutorial, not sure why shows up +plot_cluster From 033c1585783ee524f8743d7b82ba2b2077b7e6d4 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sun, 28 Jul 2024 13:00:41 +0200 Subject: [PATCH 67/88] included BaseTFR in validate_cluster_df --- mne/stats/cluster_level.py | 43 +++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index 20b54f1f592..001979461bc 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1737,18 +1737,24 @@ def summarize_clusters_stc( def _validate_cluster_df(df: pd.DataFrame, dv_name: str, iv_name: str): + """Validate the input DataFrame for cluster tests.""" # check if all necessary columns are present - missing = ({dv_name} | {iv_name}) - set(df.columns) + missing = ({dv_name} | {iv_name}) - set(df.columns) # should be empty sep = '", "' - if missing: + if missing: # if not empty, there are missing columns raise ValueError( f"DataFrame must contain a column named for each term in `formula`. " - f"Column{_pl(missing)} missing for term{_pl(missing)} " + f"Column{_pl(missing)} missing for term{_pl(missing)} " # _pl = pluralize f'"{sep.join(missing)}".' 
) # check if the data column contains valid (and consistent) instance types inst = df[dv_name].iloc[0] - valid_types = (Evoked, BaseEpochs, BaseTFR, np.ndarray) + valid_types = ( + Evoked, + BaseEpochs, + BaseTFR, + np.ndarray, + ) # Base covers all Epochs and TFRs _validate_type(inst, valid_types, f"Data in dependent variable column '{dv_name}'") all_types = set(df[dv_name].map(type)) all_type_names = ", ".join([type(x).__name__ for x in all_types]) @@ -1759,8 +1765,10 @@ def _validate_cluster_df(df: pd.DataFrame, dv_name: str, iv_name: str): ) # check if the shape of the data is consistent if isinstance(inst, np.ndarray): - all_shapes = set(df[dv_name].map(lambda x: x.shape[1:])) # first dim may vary - elif isinstance(inst, BaseEpochs): + all_shapes = set( + df[dv_name].map(lambda x: x.shape[1:]) + ) # first dim may vary (participants or epochs) + elif isinstance(inst, (BaseEpochs | BaseTFR)): # should include BaseTFR? all_shapes = set(df[dv_name].map(lambda x: x.get_data().shape[1:])) else: all_shapes = set(df[dv_name].map(lambda x: x.get_data().shape)) @@ -1769,14 +1777,14 @@ def _validate_cluster_df(df: pd.DataFrame, dv_name: str, iv_name: str): f"{prologue} consistent shape, but {len(all_shapes)} different " f"shapes were found: {'; '.join(all_shapes)}." ) - return all_types.pop() + return all_types.pop() # return the type of the data column entries @verbose def cluster_test( df: pd.DataFrame, formula: str, - *, + *, # end of positional-only parameters within_id: str | None = None, stat_fun: callable | None = None, tail: Literal[-1, 0, 1] = 0, @@ -1799,9 +1807,10 @@ def cluster_test( Parameters ---------- df : pd.DataFrame - Dataframe with 3 columns (subject_index, condition, evoked). + Dataframe containing the data, dependent and independent variables. formula : str - Wilkinson notation formula for design matrix. + Wilkinson notation formula for design matrix. The names of the dependent + and independent variable should match the columns in the dataframe. within_id : None | str Name of column in ``df`` to use in identifying within-group contrasts. 
stat_fun : None | callable @@ -1841,8 +1850,10 @@ def cluster_test( formulaic = _soft_import("formulaic", purpose="parse formula for clustering") parser = formulaic.parser.DefaultFormulaParser(include_intercept=False) formula = formulaic.Formula(formula, _parser=parser) + # extract the dependent and independent variable names dv_name = str(np.array(formula.lhs.root).item()) iv_name = str(np.array(formula.rhs.root).item()) + # validate the input dataframe and return the type of the data column entries _dtype = _validate_cluster_df(df, dv_name, iv_name) @@ -1853,10 +1864,9 @@ def cluster_test( df.sort_values([iv_name, within_id], inplace=True) counts = df[within_id].value_counts() if any(counts != 2): - raise ValueError("Badness 10000") - - # extract the data + raise ValueError("for paired tttest, each subject must have 2 observations") + # extract the data from the dataframe def _extract_data_array(series): return np.concatenate(series.values) @@ -1874,15 +1884,16 @@ def _extract_data_tfr(series): func = _extract_data_tfr else: func = _extract_data_mne + # convert to a list-like X for clustering X = df.groupby(iv_name).agg({dv_name: func})[dv_name].to_list() # determine test type if len(X) == 1: - kind = "within" + kind = "within" # data already subtracted elif len(X) > 2: kind = "between" - elif len(set(x.shape for x in X)) > 1: + elif len(set(x.shape for x in X)) > 1: # check if shapes match kind = "between" # by now we know there are exactly 2 elements in X, and their shapes match elif within_id in df: @@ -1916,8 +1927,6 @@ def _extract_data_tfr(series): seed=seed, ) - # print(f"smallest cluster p-value: {min(cluster_p_values)}") - return ClusterResult(stat_obs, clusters, cluster_p_values, H0, stat_fun) From 2c2f341707cf99525508b934dcb040b94aed26c5 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sun, 28 Jul 2024 13:35:40 +0200 Subject: [PATCH 68/88] comments on cluster_test function --- mne/stats/cluster_level.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index 001979461bc..8b4c9f15d10 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1857,7 +1857,7 @@ def cluster_test( # validate the input dataframe and return the type of the data column entries _dtype = _validate_cluster_df(df, dv_name, iv_name) - # for within_subject + # for within_subject designs, check if each subject has 2 observations _validate_type(within_id, (str, None), "within_id") if within_id: df = df.copy(deep=False) # Don't mutate input dataframe row order! 
@@ -1870,7 +1870,7 @@ def cluster_test( def _extract_data_array(series): return np.concatenate(series.values) - def _extract_data_mne(series): + def _extract_data_mne(series): # 2D data return np.array( series.map(lambda inst: inst.get_data().swapaxes(-2, -1)).to_list() ) @@ -1893,21 +1893,26 @@ def _extract_data_tfr(series): kind = "within" # data already subtracted elif len(X) > 2: kind = "between" - elif len(set(x.shape for x in X)) > 1: # check if shapes match + elif ( + len(set(x.shape for x in X)) > 1 + ): # check if there are unequal observations in each group kind = "between" # by now we know there are exactly 2 elements in X, and their shapes match elif within_id in df: kind = "within" X = X[0] - X[1] - else: + else: # what would be another else cas kind = "between" # define stat function and threshold stat_fun, threshold = _check_fun( X=X, stat_fun=stat_fun, threshold=threshold, tail=tail, kind=kind ) - if kind == "within": + + # check_fun doesn't work with list input` + if kind == "within": # will this create an issue for already subtracted data? X = [X] + # Run the cluster-based permutation test stat_obs, clusters, cluster_p_values, H0 = _permutation_cluster_test( X, From e9b5fa29522ddcc7f4ddab1a3533e4014dc06fcd Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sun, 28 Jul 2024 14:11:54 +0200 Subject: [PATCH 69/88] updated clusterResult class and plot function --- mne/stats/cluster_level.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index 8b4c9f15d10..eebca26d35c 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1964,7 +1964,8 @@ def __init__( self.cluster_p_values = cluster_p_values self.H0 = H0 self.stat_fun = stat_fun - # TODO improve detection of stat name (e.g. unpaired T)? + + # unpaired t-test is f_oneway if stat_fun is f_oneway: self.stat_name = "F-statistic" elif stat_fun is ttest_1samp_no_p: @@ -1972,7 +1973,7 @@ def __init__( else: self.stat_name = "test statistic" - def plot_cluster(self, condition_labels: dict): + def plot_cluster_time_sensor(self, condition_labels: dict): """ Plot the cluster with the lowest p-value. @@ -1985,13 +1986,20 @@ def plot_cluster(self, condition_labels: dict): condition_labels : dict Dictionary with condition labels as keys and evoked objects as values. 
""" + # define colorblind friendly colors + colorblind_palette = ["#4daf4a", "#f781bf"] + # extract condition labels from the dictionary cond_keys = list(condition_labels.keys()) # extract the evokeds from the dictionary cond_values = list(condition_labels.values()) # configure variables for visualization - colors = {cond_keys[0]: "crimson", cond_keys[1]: "steelblue"} + colors = { + cond_keys[0]: colorblind_palette[0], + cond_keys[1]: colorblind_palette[1], + } + line_styles = {cond_keys[0]: "-", cond_keys[1]: "--"} lowest_p_cluster = np.argmin(self.cluster_p_values) @@ -2044,18 +2052,23 @@ def plot_cluster(self, condition_labels: dict): cbar = plt.colorbar(image, cax=ax_colorbar) cbar.set_label(self.stat_name) ax_topo.set_xlabel( - "average from {:0.3f} to {:0.3f} s".format(*sig_times[[0, -1]]) + "Spatial cluster extent:\n averaged from {:0.3f} to {:0.3f} s".format( + *sig_times[[0, -1]] + ) ) # add new axis for time courses and plot time courses ax_signals = divider.append_axes("right", size="300%", pad=1.3) - title = f"Signal averaged over {len(ch_inds)} sensor(s)" + title = ( + f"Temporal cluster extent:\nSignal averaged over {len(ch_inds)} sensor(s)" + ) plot_compare_evokeds( condition_labels, title=title, picks=ch_inds, axes=ax_signals, colors=colors, + linestyles=line_styles, show=False, split_legend=True, truncate_yaxis="auto", From 2fd17d338313a7ab25f80c6f547b154d112e692a Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sun, 28 Jul 2024 14:12:12 +0200 Subject: [PATCH 70/88] updated function call for plotting --- tutorials/stats-sensor-space/76_new_cluster_test_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 83b4f019b6f..b7f933d127b 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -152,7 +152,7 @@ # finally let's plot the results using the ClusterResults class # we plot the cluster with the lowest p-value -cluster_result.plot_cluster(condition_labels=conditions_dict) +cluster_result.plot_cluster_time_sensor(condition_labels=conditions_dict) # we can see that there is something going on around 400 ms # with a stronger signal for target trials in right central-parietal channels From 150c530817bc691c546798d77c43bb68f35f032c Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sun, 28 Jul 2024 14:14:18 +0200 Subject: [PATCH 71/88] changed color --- mne/stats/cluster_level.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index eebca26d35c..9b65807bd38 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1987,7 +1987,7 @@ def plot_cluster_time_sensor(self, condition_labels: dict): Dictionary with condition labels as keys and evoked objects as values. 
""" # define colorblind friendly colors - colorblind_palette = ["#4daf4a", "#f781bf"] + colorblind_palette = ["#4daf4a", "#984ea3"] # extract condition labels from the dictionary cond_keys = list(condition_labels.keys()) From 3cc9e2c7f1159851141bbe8d45d77af807e5d429 Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Thu, 1 Aug 2024 12:30:11 -0500 Subject: [PATCH 72/88] docstring/docdict cleanups and fixes --- mne/stats/cluster_level.py | 68 +++++++++--------- mne/utils/docs.py | 138 ++++++++++++++++++++++--------------- 2 files changed, 118 insertions(+), 88 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index 9b65807bd38..cd86a40e22a 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1789,12 +1789,12 @@ def cluster_test( stat_fun: callable | None = None, tail: Literal[-1, 0, 1] = 0, threshold=None, - n_permutations: int = 1024, - adjacency: tuple | None = None, - max_step: int = 1, - exclude: list | None = None, - step_down_p: int = 0, - t_power: int = 1, + n_permutations: str | int = 1024, + adjacency: sparse.spmatrix | False = False, + max_step: int = 1, # TODO may need to provide `max_step_time` and `max_step_freq` + exclude: list | None = None, # TODO needs rethink because user passes MNE objects + step_down_p: float = 0.0, + t_power: float = 1.0, check_disjoint: bool = False, out_type: Literal["indices", "mask"] = "indices", seed: None | int | np.random.RandomState = None, @@ -1812,35 +1812,41 @@ def cluster_test( Wilkinson notation formula for design matrix. The names of the dependent and independent variable should match the columns in the dataframe. within_id : None | str - Name of column in ``df`` to use in identifying within-group contrasts. - stat_fun : None | callable - Statistical function to use. - tail : int, optional - 0 for two-tailed, 1 for greater, -1 for less. Default is 0. - n_permutations : int, optional - Number of permutations. Default is 1024. - adjacency : None, optional - Provide a adjacency matrix. Default is None. + Name of column in ``df`` to use in identifying within-group contrasts. If + ``None``, will perform a between-group test. Ignored if the number of groups + (unique values in the independent variable column of ``df``) is greater than 2. + %(stat_fun_clust_both)s + %(tail_clust)s + %(threshold_clust_both)s + %(n_permutations_clust_all)s + %(adjacency_clust_both)s max_step : int, optional Maximum distance between samples (time points). Default is 1. - exclude : np.Array, optional - Exclude no time points or channels. Default is None. - step_down_p : int, optional - Step down in jumps test. Default is 0. - t_power : int, optional - Weigh each location by its stats score. Default is 1. - check_disjoint : bool, optional - Check if clusters are disjoint. Default is False. - out_type : str, optional - Output type. Default is "indices". - seed : None | int | np.random.RandomState, optional - Seed for the random number generator. Default is None. - buffer_size : int, optional - Block size for chunking the data. Default is None. - n_jobs : int, optional - How many cores to use. Default is 1. + exclude : array-like of bool | None + Mask to apply to the data to exclude certain points from clustering + (e.g., medial wall vertices). Should be the same shape as the channels/vertices + dimension of the data objects. If ``None``, no points are excluded. 
+    %(step_down_p_clust)s
+    %(t_power_clust)s
+    check_disjoint : bool
+        Whether to check if the ``adjacency`` matrix can be separated into disjoint
+        sets before clustering. This may lead to faster clustering, especially if
+        the "time" and/or "frequency" dimensions are large.
+    %(out_type_clust)s
+    %(seed)s
+    buffer_size : int | None
+        Block size to use when computing test statistics. This can significantly
+        reduce memory usage when ``n_jobs > 1`` and memory sharing between
+        processes is enabled (see :func:`mne.set_cache_dir`), because the data will be
+        shared between processes and each process only needs to allocate space for
+        a small block of locations at a time.
+    %(n_jobs)s
     %(verbose)s

+    Notes
+    -----
+    %(threshold_clust_t_or_f_notes)s
+
     Returns
     -------
     ClusterResult
diff --git a/mne/utils/docs.py b/mne/utils/docs.py
index 0fa9288bec2..624b2e309e8 100644
--- a/mne/utils/docs.py
+++ b/mne/utils/docs.py
@@ -144,61 +144,54 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75):
     formatting. This can add overhead so is meant only for debugging.
 """

-docdict["adjacency_clust"] = """
-adjacency : scipy.sparse.spmatrix | None | False
+_adjacency_clust_template = """
+adjacency : scipy.sparse.spmatrix | {param_none}False
     Defines adjacency between locations in the data, where "locations" can be
     spatial vertices, frequency bins, time points, etc. For spatial vertices
     (i.e. sensor space data), see :func:`mne.channels.find_ch_adjacency` or
     :func:`mne.spatial_inter_hemi_adjacency`. For source space data, see
-    :func:`mne.spatial_src_adjacency` or
-    :func:`mne.spatio_temporal_src_adjacency`. If ``False``, assumes
-    no adjacency (each location is treated as independent and unconnected).
-    If ``None``, a regular lattice adjacency is assumed, connecting
-    each {sp} location to its neighbor(s) along the last dimension
-    of {{eachgrp}} ``{{x}}``{lastdim}.
+    :func:`mne.spatial_src_adjacency` or :func:`mne.spatio_temporal_src_adjacency`.
+    If ``False``, assumes no adjacency (each location is treated as independent and
+    unconnected).{if_none}
     If ``adjacency`` is a matrix, it is assumed to be symmetric (only the
     upper triangular half is used) and must be square with dimension equal to
-    ``{{x}}.shape[-1]`` {parone} or ``{{x}}.shape[-1] * {{x}}.shape[-2]``
-    {partwo} or (optionally)
-    ``{{x}}.shape[-1] * {{x}}.shape[-2] * {{x}}.shape[-3]``
-    {parthree}.{memory}
+    the product of the last 1, 2, or 3 data dimensions (e.g., for time-frequency data:
+    n_channels, n_channels * n_freqs, or n_channels * n_freqs * n_times).{memory}
+"""
+_if_none = """ If ``None``, a regular lattice adjacency is assumed, connecting
+    each {spatial}location to its neighbor(s) along the last dimension
+    of {the_data}.
 """

-mem = (
-    " If spatial adjacency is uniform in time, it is recommended to use "
-    "a square matrix with dimension ``{x}.shape[-1]`` (n_vertices) to save "
-    "memory and computation, and to use ``max_step`` to define the extent "
-    "of temporal adjacency to consider when clustering."
-)
-comb = " The function `mne.stats.combine_adjacency` may be useful for 4D data."
 st = dict(
-    sp="spatial",
-    lastdim="",
-    parone="(n_vertices)",
-    partwo="(n_times * n_vertices)",
-    parthree="(n_times * n_freqs * n_vertices)",
-    memory=mem,
+    param_none="None | ",
+    if_none=_if_none.format(spatial="spatial ", the_data="{eachgrp} ``{x}``"),
+    memory="""
+    If spatial adjacency is uniform in time, it is recommended to use a square matrix
+    with dimension ``{x}.shape[-1]`` (n_vertices) to save memory and computation,
+    and to use ``max_step`` to define the extent of temporal adjacency to consider when
+    clustering.
+""",
 )
 tf = dict(
-    sp="",
-    lastdim=" (or the last two dimensions if ``{x}`` is 2D)",
-    parone="(for 2D data)",
-    partwo="(for 3D data)",
-    parthree="(for 4D data)",
-    memory=comb,
+    param_none="None | ",
+    if_none=_if_none.format(
+        spatial="",
+        the_data="{eachgrp} ``{x}`` (or the last two dimensions if ``{x}`` is 2D)",
+    ),
+    memory="""
+    The function `mne.stats.combine_adjacency` may be useful for 4D data.
+""",
 )
-nogroups = dict(eachgrp="", x="X")
+nogrps = dict(eachgrp="", x="X")
 groups = dict(eachgrp="each group ", x="X[k]")
-docdict["adjacency_clust_1"] = (
-    docdict["adjacency_clust"].format(**tf).format(**nogroups)
-)
-docdict["adjacency_clust_n"] = docdict["adjacency_clust"].format(**tf).format(**groups)
-docdict["adjacency_clust_st1"] = (
-    docdict["adjacency_clust"].format(**st).format(**nogroups)
-)
-docdict["adjacency_clust_stn"] = (
-    docdict["adjacency_clust"].format(**st).format(**groups)
+
+docdict["adjacency_clust_1"] = _adjacency_clust_template.format(**tf).format(**nogrps)
+docdict["adjacency_clust_both"] = _adjacency_clust_template.format(
+    param_none="", if_none="", memory=""
 )
+docdict["adjacency_clust_n"] = _adjacency_clust_template.format(**tf).format(**groups)
+docdict["adjacency_clust_st1"] = _adjacency_clust_template.format(**st).format(**nogrps)
+docdict["adjacency_clust_stn"] = _adjacency_clust_template.format(**st).format(**groups)

 docdict["adjust_dig_chpi"] = """
 adjust_dig : bool
@@ -708,7 +701,7 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75):
 docdict["check_disjoint_clust"] = """
 check_disjoint : bool
-    Whether to check if the connectivity matrix can be separated into disjoint
+    Whether to check if the ``adjacency`` matrix can be separated into disjoint
     sets before clustering. This may lead to faster clustering, especially if
     the second dimension of ``X`` (usually the "time" dimension) is large.
 """
@@ -1416,7 +1409,7 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75):
 """

 docdict["exclude_clust"] = """
-exclude : bool array or None
+exclude : array-like of bool | None
     Mask to apply to the data to exclude certain points from clustering
     (e.g., medial wall vertices). Should be the same shape as ``X``. If
     ``None``, no points are excluded.
@@ -3962,7 +3955,7 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75):
 seed : None | int | instance of ~numpy.random.RandomState
     A seed for the NumPy random number generator (RNG). If ``None`` (default),
     the seed will be obtained from the operating system
-    (see :class:`~numpy.random.RandomState` for details), meaning it will most 
+    (see :class:`~numpy.random.RandomState` for details), meaning it will most
     likely produce different output every time this function or method is run.
     To achieve reproducible results, pass a value here to explicitly initialize
     the RNG with a defined state.
@@ -4253,16 +4246,23 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75):
     channel names in the file will be used when possible.
 """

-_stat_fun_clust_base = """
+_stat_fun_template = """
 stat_fun : callable | None
     Function called to calculate the test statistic. Must accept 1D-array as
-    input and return a 1D array. If ``None`` (the default), uses
-    `mne.stats.{}`.
+    input and return a 1D array. If ``None`` (the default), uses {}.
 """

-docdict["stat_fun_clust_f"] = _stat_fun_clust_base.format("f_oneway")
+docdict["stat_fun_clust_both"] = _stat_fun_template.format(
+    """:func:`mne.stats.ttest_1samp_no_p`
+    for paired tests and :func:`mne.stats.f_oneway` for unpaired tests or tests of
+    more than 2 groups."""
+)
+
+docdict["stat_fun_clust_f"] = _stat_fun_template.format(":func:`mne.stats.f_oneway`")

-docdict["stat_fun_clust_t"] = _stat_fun_clust_base.format("ttest_1samp_no_p")
+docdict["stat_fun_clust_t"] = _stat_fun_template.format(
+    ":func:`mne.stats.ttest_1samp_no_p`"
+)

 docdict["static"] = """
 static : instance of SpatialImage
@@ -4473,10 +4473,10 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75):
 threshold : float | dict | None
     The so-called "cluster forming threshold" in the form of a test statistic
     (note: this is not an alpha level / "p-value").
-    If numeric, vertices with data values more extreme than ``threshold`` will
-    be used to form clusters. If ``None``, {} will be chosen
+    If numeric, vertices with stat values more extreme than ``threshold`` will
+    be used to form clusters. If ``None``, {which_thresh} will be chosen
     automatically that corresponds to a p-value of 0.05 for the given number of
-    observations (only valid when using {}). If ``threshold`` is a
+    observations (only valid when using {which_stat}). If ``threshold`` is a
     :class:`dict` (with keys ``'start'`` and ``'step'``) then threshold-free
     cluster enhancement (TFCE) will be used (see the
     :ref:`TFCE example <tfce_example>` and :footcite:`SmithNichols2009`).
@@ -4484,8 +4484,14 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75):
     a particular p-value for one-tailed or two-tailed tests.
 """

-f_test = ("an F-threshold", "an F-statistic")
-docdict["threshold_clust_f"] = _threshold_clust_base.format(*f_test)
+docdict["threshold_clust_both"] = _threshold_clust_base.format(
+    which_thresh="a t- or F-threshold",
+    which_stat="``stat_fun=None``, i.e., a paired t-test or one-way F-test",
+)
+
+docdict["threshold_clust_f"] = _threshold_clust_base.format(
+    which_thresh="an F-threshold", which_stat="an F-statistic"
+)

 docdict["threshold_clust_f_notes"] = """
 For computing a ``threshold`` based on a p-value, use the conversion
 from :meth:`scipy.stats.rv_continuous.ppf`::

     pval = 0.001  # arbitrary
@@ -4497,8 +4503,9 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75):
     thresh = scipy.stats.f.ppf(1 - pval, dfn=dfn, dfd=dfd)  # F distribution
 """

-t_test = ("a t-threshold", "a t-statistic")
-docdict["threshold_clust_t"] = _threshold_clust_base.format(*t_test)
+docdict["threshold_clust_t"] = _threshold_clust_base.format(
+    which_thresh="a t-threshold", which_stat="a t-statistic"
+)

 docdict["threshold_clust_t_notes"] = """
 For computing a ``threshold`` based on a p-value, use the conversion
@@ -4512,6 +4519,23 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75):
 For testing the lower tail (``tail=-1``), don't subtract ``pval`` from 1.
""" +docdict["threshold_clust_t_or_f_notes"] = """ +For computing a ``threshold`` based on a p-value, use the conversion +from :meth:`scipy.stats.rv_continuous.ppf`:: + + pval = 0.001 # arbitrary + # for t-statistic + df = n_observations - 1 # degrees of freedom for the t-test + thresh = scipy.stats.t.ppf(1 - pval / 2, df) # two-tailed, t distribution + # for f-statistic + dfn = n_conditions - 1 # degrees of freedom numerator + dfd = n_observations - n_conditions # degrees of freedom denominator + thresh = scipy.stats.f.ppf(1 - pval, dfn=dfn, dfd=dfd) # F distribution + +For a one-tailed test (``tail=1``), don't divide the p-value by 2. +For testing the lower tail (``tail=-1``), don't subtract ``pval`` from 1. +""" + docdict["time_bandwidth_tfr"] = """ time_bandwidth : float ``≥ 2.0`` Product between the temporal window length (in seconds) and the *full* From 2c27a6989e5b304966ac085d134f7ee14d3a562a Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Mon, 5 Aug 2024 13:15:37 +0200 Subject: [PATCH 73/88] implemented Dan's comments --- mne/stats/cluster_level.py | 65 ++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 30 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index cd86a40e22a..0b5b35889f6 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1768,7 +1768,7 @@ def _validate_cluster_df(df: pd.DataFrame, dv_name: str, iv_name: str): all_shapes = set( df[dv_name].map(lambda x: x.shape[1:]) ) # first dim may vary (participants or epochs) - elif isinstance(inst, (BaseEpochs | BaseTFR)): # should include BaseTFR? + elif isinstance(inst, (BaseEpochs | BaseTFR)): all_shapes = set(df[dv_name].map(lambda x: x.get_data().shape[1:])) else: all_shapes = set(df[dv_name].map(lambda x: x.get_data().shape)) @@ -1790,7 +1790,7 @@ def cluster_test( tail: Literal[-1, 0, 1] = 0, threshold=None, n_permutations: str | int = 1024, - adjacency: sparse.spmatrix | False = False, + adjacency: sparse.spmatrix | None | False = None, # should be None (default) max_step: int = 1, # TODO may need to provide `max_step_time` and `max_step_freq` exclude: list | None = None, # TODO needs rethink because user passes MNE objects step_down_p: float = 0.0, @@ -1810,7 +1810,7 @@ def cluster_test( Dataframe containing the data, dependent and independent variables. formula : str Wilkinson notation formula for design matrix. The names of the dependent - and independent variable should match the columns in the dataframe. + and independent variable should match the columns in ``df``. within_id : None | str Name of column in ``df`` to use in identifying within-group contrasts. If ``None``, will perform a between-group test. 
@@ -1870,7 +1870,7 @@
     df.sort_values([iv_name, within_id], inplace=True)
     counts = df[within_id].value_counts()
     if any(counts != 2):
-        raise ValueError("for paired tttest, each subject must have 2 observations")
+        raise ValueError("for paired t-test, each subject must have 2 observations")

     # extract the data from the dataframe
     def _extract_data_array(series):
         return np.concatenate(series.values)
@@ -1907,7 +1907,7 @@ def _extract_data_tfr(series):
     elif within_id in df:
         kind = "within"
         X = X[0] - X[1]
-    else:  # what would be another else case?
+    else:  # 2 elements in X but no within_id provided → unpaired test
        kind = "between"

     # define stat function and threshold
@@ -1971,7 +1971,7 @@ def __init__(
         self.H0 = H0
         self.stat_fun = stat_fun

-        # unpaired t-test is f_oneway
+        # unpaired t-test equivalent to f_oneway w/ 2 groups
         if stat_fun is f_oneway:
             self.stat_name = "F-statistic"
         elif stat_fun is ttest_1samp_no_p:
@@ -1979,7 +1979,15 @@
         else:
             self.stat_name = "test statistic"

-    def plot_cluster_time_sensor(self, condition_labels: dict):
+    def plot_cluster_time_sensor(
+        self,
+        condition_labels: dict,
+        colors: list | dict | None = None,
+        linestyles: list | dict | None = None,
+        cmap_evokeds: None | str | tuple = None,
+        cmap_topo: None | str | tuple = None,
+        ci: float | bool | callable() | None = None,
+    ):
         """
         Plot the cluster with the lowest p-value.

@@ -1991,21 +1999,23 @@
         ----------
         condition_labels : dict
             Dictionary with condition labels as keys and evoked objects as values.
+        colors : list|dict|None
+            Colors to use when plotting the ERP lines and confidence bands.
+        linestyles : list|dict|None
+            Styles to use when plotting the ERP lines.
+        cmap_evokeds : None|str|tuple
+            Colormap from which to draw color values when plotting the ERP lines.
+        cmap_topo : matplotlib colormap
+            Colormap to use for the topomap.
+        ci : float|bool|callable()|None
+            Confidence band around each ERP time series.
         """
-        # define colorblind friendly colors
-        colorblind_palette = ["#4daf4a", "#984ea3"]
-
         # extract condition labels from the dictionary
         cond_keys = list(condition_labels.keys())

         # extract the evokeds from the dictionary
         cond_values = list(condition_labels.values())

-        # configure variables for visualization
-        colors = {
-            cond_keys[0]: colorblind_palette[0],
-            cond_keys[1]: colorblind_palette[1],
-        }
-
+        linestyles = {cond_keys[0]: "-", cond_keys[1]: "--"}

         lowest_p_cluster = np.argmin(self.cluster_p_values)
@@ -2033,7 +2043,7 @@ def plot_cluster_time_sensor(
             times=0,
             mask=mask,
             axes=ax_topo,
-            cmap="RdBu_r",
+            cmap=cmap_topo,
             show=False,
             colorbar=False,
             mask_params=dict(markersize=10),
@@ -2042,13 +2052,11 @@
         image = ax_topo.images[0]

         # remove the title that would otherwise say "0.000 s"
-        ax_topo.set_title("")
-
-        # soft import?
-        # make_axes_locatable = _soft_import(
-        #     "mpl_toolkits.axes_grid1.make_axes_locatable",
-        #     purpose="plot cluster results"
-        # )  # soft import (not a dependency for MNE)
+        ax_topo.set_title(
+            "Spatial cluster extent:\n averaged from {:0.3f} to {:0.3f} s".format(
+                *sig_times[[0, -1]]
+            )
+        )

         # create additional axes (for ERF and colorbar)
         divider = make_axes_locatable(ax_topo)
@@ -2057,11 +2065,6 @@
         ax_colorbar = divider.append_axes("right", size="5%", pad=0.1)
         cbar = plt.colorbar(image, cax=ax_colorbar)
         cbar.set_label(self.stat_name)
-        ax_topo.set_xlabel(
-            "Spatial cluster extent:\n averaged from {:0.3f} to {:0.3f} s".format(
-                *sig_times[[0, -1]]
-            )
-        )

         # add new axis for time courses and plot time courses
         ax_signals = divider.append_axes("right", size="300%", pad=1.3)
@@ -2074,11 +2077,13 @@
             picks=ch_inds,
             axes=ax_signals,
             colors=colors,
-            linestyles=line_styles,
+            linestyles=linestyles,
+            cmap=cmap_evokeds,
             show=False,
             split_legend=True,
             truncate_yaxis="auto",
             truncate_xaxis=False,
+            ci=ci,
         )
         plt.legend(frameon=False, loc="upper left")

From 2664ee218a73a8526726df0ccdb3d24fa912329c Mon Sep 17 00:00:00 2001
From: CarinaFo
Date: Mon, 5 Aug 2024 13:22:41 +0200
Subject: [PATCH 74/88] implemented Dan's comments

---
 tutorials/stats-sensor-space/76_new_cluster_test_api.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py
index b7f933d127b..fb928f89d0a 100644
--- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py
+++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py
@@ -152,7 +152,7 @@

 # finally let's plot the results using the ClusterResults class
 # we plot the cluster with the lowest p-value
-cluster_result.plot_cluster_time_sensor(condition_labels=conditions_dict)
+cluster_result.plot_cluster_time_sensor(condition_labels=conditions_dict, ci=True)

 # we can see that there is something going on around 400 ms
 # with a stronger signal for target trials in right central-parietal channels

From 492754436fe8fa8e69f1419f2971e1c63a0a2b58 Mon Sep 17 00:00:00 2001
From: CarinaFo
Date: Mon, 5 Aug 2024 16:11:23 +0200
Subject: [PATCH 75/88] test for handling different MNE objects - test is failing

---
 mne/stats/tests/test_cluster_level.py | 101 +++++++++++++++++++++++---
 1 file changed, 90 insertions(+), 11 deletions(-)

diff --git a/mne/stats/tests/test_cluster_level.py b/mne/stats/tests/test_cluster_level.py
index 00989e3e00c..4391c51f238 100644
--- a/mne/stats/tests/test_cluster_level.py
+++ b/mne/stats/tests/test_cluster_level.py
@@ -904,21 +904,100 @@ def test_compare_old_and_new_cluster_api():
 @pytest.mark.parametrize(
     "Inst", (EpochsArray, EvokedArray, EpochsTFRArray, AverageTFRArray)
 )
+@pytest.mark.filterwarnings('ignore:Ignoring argument "tail":RuntimeWarning')
 def test_new_cluster_api(Inst):
     """Test handling different MNE objects in the cluster API."""
     pd = pytest.importorskip("pandas")

-    n_epo, n_chan, n_freq, n_times = 2, 3, 5, 7
-    shape = (n_chan, n_times)
-    if Inst in (EpochsArray, EpochsTFRArray):
-        shape = (n_epo,) + shape
-    if Inst in (EpochsTFRArray, AverageTFRArray):
-        shape = shape[:-1] + (n_freq, shape[-1])
+    n_epo, n_chan, n_freq, n_times = 2, 3, 4, 5
+    info = create_info(ch_names=n_chan, sfreq=1000, ch_types="eeg")
+    # Introduce a significant difference in a specific region, time, and frequency
+    region_start = 1
+    region_end = 2
+    time_start = 2
+    time_end = 4
+    freq_start = 2
+    freq_end = 4
+
+    if Inst == EpochsArray:
+        # Create random data for EpochsArray
+        inst1 = Inst(np.random.randn(n_epo, n_chan, n_times), info=info)
+        # Adding a constant to create a difference
+        data_copy = inst1.get_data().copy()  # no data attribute for EpochsArray
+        data_copy[:, region_start:region_end, time_start:time_end] += (
+            2  # Modify the copy
+        )
+        inst2 = Inst(
+            data=data_copy, info=info
+        )  # Use the modified copy as a new instance
+
+    elif Inst == EvokedArray:
+        # Create random data for EvokedArray
+        inst1 = Inst(np.random.randn(n_chan, n_times), info=info)
+        data_copy = inst1.data.copy()
+        data_copy[region_start:region_end, time_start:time_end] += 2
+        inst2 = Inst(data=data_copy, info=info)
+
+    elif Inst == EpochsTFRArray:
+        # Create random data for EpochsTFRArray
+        data_tfr1 = np.random.randn(n_epo, n_chan, n_freq, n_times)
+        data_tfr2 = np.random.randn(n_epo, n_chan, n_freq, n_times)
+        inst1 = Inst(
+            data=data_tfr1, info=info, times=np.arange(n_times), freqs=np.arange(n_freq)
+        )
+        inst2 = Inst(
+            data=data_tfr2, info=info, times=np.arange(n_times), freqs=np.arange(n_freq)
+        )
+        data_tfr2 = inst2.data.copy()
+        data_tfr2[
+            :, region_start:region_end, freq_start:freq_end, time_start:time_end
+        ] += 2
+        inst2 = Inst(
+            data=data_tfr2, info=info, times=np.arange(n_times), freqs=np.arange(n_freq)
+        )
+
+    elif Inst == AverageTFRArray:
+        # Create random data for AverageTFRArray
+        data_tfr1 = np.random.randn(n_chan, n_freq, n_times)
+        data_tfr2 = np.random.randn(n_chan, n_freq, n_times)
+        inst1 = Inst(
+            data=data_tfr1, info=info, times=np.arange(n_times), freqs=np.arange(n_freq)
+        )
+        inst2 = Inst(
+            data=data_tfr2, info=info, times=np.arange(n_times), freqs=np.arange(n_freq)
+        )
+        data_tfr2 = inst2.data.copy()
+        data_tfr2[
+            region_start:region_end, freq_start:freq_end, time_start:time_end
+        ] += 2
+        inst2 = Inst(
+            data=data_tfr2, info=info, times=np.arange(n_times), freqs=np.arange(n_freq)
+        )

-    info = create_info(...)
-    inst1 = Inst(np.random.normal(shape, ...), info=info)
-    inst2 = Inst(np.random.normal(shape, ...), info=info)
+    # test old and new API with sample data
     df = pd.DataFrame(dict(data=[inst1, inst2], condition=["a", "b"]))
-    result = cluster_test(df, "data~condition", ...)
-    assert result  # TODO do something more interesting here
+    kwargs = dict(n_permutations=100, seed=1, tail=1, buffer_size=None, out_type="mask")
+
+    result_new_api = cluster_test(df, "data~condition", **kwargs)
+
+    # make sure channels are last dimension for old API
+    if Inst == EpochsArray:
+        inst1 = inst1.get_data().transpose(0, 2, 1)
+        inst2 = inst2.get_data().transpose(0, 2, 1)
+    elif Inst == EpochsTFRArray:
+        inst1 = inst1.data.transpose(0, 3, 2, 1)
+        inst2 = inst2.data.transpose(0, 3, 2, 1)
+    elif Inst == AverageTFRArray:
+        inst1 = inst1.data.transpose(2, 1, 0)
+        inst2 = inst2.data.transpose(2, 1, 0)
+    else:
+        inst1 = inst1.data.transpose(1, 0)
+        inst2 = inst2.data.transpose(1, 0)
+
+    F_obs, clusters, cluster_pvals, H0 = permutation_cluster_test(
+        [inst1, inst2], **kwargs
+    )
+    assert_array_equal(result_new_api.H0, H0)
+    assert_array_equal(result_new_api.stat_obs, F_obs)
+    assert_array_equal(result_new_api.cluster_p_values, cluster_pvals)
+    assert result_new_api.clusters == clusters

From 006acdf9d87f21d180fa6993540fcccefa281829 Mon Sep 17 00:00:00 2001
From: CarinaFo
Date: Tue, 6 Aug 2024 16:54:31 +0200
Subject: [PATCH 76/88] adjusted test to account for multiple subjects

---
 mne/stats/tests/test_cluster_level.py | 40 ++++++++++++++++++++++-----
 1 file changed, 33 insertions(+), 7 deletions(-)

diff --git a/mne/stats/tests/test_cluster_level.py b/mne/stats/tests/test_cluster_level.py
index 4391c51f238..9b10aacabf5 100644
--- a/mne/stats/tests/test_cluster_level.py
+++ b/mne/stats/tests/test_cluster_level.py
@@ -909,7 +909,7 @@ def test_new_cluster_api(Inst):
     """Test handling different MNE objects in the cluster API."""
     pd = pytest.importorskip("pandas")

-    n_epo, n_chan, n_freq, n_times = 2, 3, 4, 5
+    n_subs, n_epo, n_chan, n_freq, n_times = 2, 2, 3, 4, 5
     info = create_info(ch_names=n_chan, sfreq=1000, ch_types="eeg")
     # Introduce a significant difference in a specific region, time, and frequency
     region_start = 1
@@ -974,9 +974,25 @@
             data=data_tfr2, info=info, times=np.arange(n_times), freqs=np.arange(n_freq)
         )

-    # test old and new API with sample data
-    df = pd.DataFrame(dict(data=[inst1, inst2], condition=["a", "b"]))
-    kwargs = dict(n_permutations=100, seed=1, tail=1, buffer_size=None, out_type="mask")
+    if Inst == EvokedArray or Inst == AverageTFRArray:
+        # Generate random noise
+        noise = np.random.normal(loc=0, scale=0.1, size=inst1.data.shape)
+        # add noise to the data of the second subject
+        inst1_n = inst1.copy()
+        inst1_n.data = inst1.data + noise
+        inst2_n = inst2.copy()
+        inst2_n.data = inst2.data + noise
+        data = [inst1, inst2, inst1_n, inst2_n]
+        conds = ["a", "b"] * n_subs
+    else:
+        data = [inst1, inst2]
+        conds = ["a", "b"]
+
+    df = pd.DataFrame(dict(data=data, condition=conds))
+
+    kwargs = dict(
+        n_permutations=100, seed=42, tail=1, buffer_size=None, out_type="mask"
+    )

     result_new_api = cluster_test(df, "data~condition", **kwargs)

     # make sure channels are last dimension for old API
     if Inst == EpochsArray:
         inst1 = inst1.get_data().transpose(0, 2, 1)
         inst2 = inst2.get_data().transpose(0, 2, 1)
     elif Inst == EpochsTFRArray:
         inst1 = inst1.data.transpose(0, 3, 2, 1)
         inst2 = inst2.data.transpose(0, 3, 2, 1)
     elif Inst == AverageTFRArray:
         inst1 = inst1.data.transpose(2, 1, 0)
         inst2 = inst2.data.transpose(2, 1, 0)
+        inst1_n = inst1_n.data.transpose(2, 1, 0)
+        inst2_n = inst2_n.data.transpose(2, 1, 0)
+        # combine the data of the two subjects
+        inst1 = np.concatenate([inst1[np.newaxis, :], inst1_n[np.newaxis, :]], axis=0)
+        inst2 = np.concatenate([inst2[np.newaxis, :], inst2_n[np.newaxis, :]], axis=0)
     else:
         inst1 = inst1.data.transpose(1, 0)
         inst2 = inst2.data.transpose(1, 0)
+        inst1_n = inst1_n.data.transpose(1, 0)
+        inst2_n = inst2_n.data.transpose(1, 0)
+        # combine the data of the two subjects
+        inst1 = np.concatenate([inst1[np.newaxis, :], inst1_n[np.newaxis, :]], axis=0)
+        inst2 = np.concatenate([inst2[np.newaxis, :], inst2_n[np.newaxis, :]], axis=0)

     F_obs, clusters, cluster_pvals, H0 = permutation_cluster_test(
         [inst1, inst2], **kwargs
     )
-    assert_array_equal(result_new_api.H0, H0)
-    assert_array_equal(result_new_api.stat_obs, F_obs)
-    assert_array_equal(result_new_api.cluster_p_values, cluster_pvals)
+    assert_array_almost_equal(result_new_api.H0, H0)
+    assert_array_almost_equal(result_new_api.stat_obs, F_obs)
+    assert_array_almost_equal(result_new_api.cluster_p_values, cluster_pvals)
     assert result_new_api.clusters == clusters

From f0f4cba540e8d51f93d46b32b8f98340f7d9044c Mon Sep 17 00:00:00 2001
From: Daniel McCloy
Date: Sat, 10 Aug 2024 17:39:40 -0500
Subject: [PATCH 77/88] refactor df validation to return bools

---
 mne/stats/cluster_level.py | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py
index 0b5b35889f6..7dff9a41a0a 100644
--- a/mne/stats/cluster_level.py
+++ b/mne/stats/cluster_level.py
@@ -35,6 +35,7 @@
     verbose,
     warn,
 )
+from ..utils.mixin import GetEpochsMixin
 from ..viz import plot_compare_evokeds
 from .parametric import f_oneway, ttest_1samp_no_p

@@ -1777,7 +1778,11 @@ def _validate_cluster_df(df: pd.DataFrame, dv_name: str, iv_name: str):
             f"{prologue} consistent shape, but {len(all_shapes)} different "
             f"shapes were found: {'; '.join(all_shapes)}."
         )
-    return all_types.pop()  # return the type of the data column entries
+    obj_type = all_types.pop()
+    is_epo = GetEpochsMixin in obj_type.__mro__
+    is_tfr = BaseTFR in obj_type.__mro__
+    is_arr = np.ndarray in obj_type.__mro__
+    return is_epo, is_tfr, is_arr

 @verbose
@@ -1861,7 +1866,7 @@
     iv_name = str(np.array(formula.rhs.root).item())

     # validate the input dataframe and return the type of the data column entries
-    _dtype = _validate_cluster_df(df, dv_name, iv_name)
+    is_epo, is_tfr, is_arr = _validate_cluster_df(df, dv_name, iv_name)

     # for within_subject designs, check if each subject has 2 observations
     _validate_type(within_id, (str, None), "within_id")
@@ -1873,23 +1878,18 @@
     # extract the data from the dataframe
-    def _extract_data_array(series):
-        return np.concatenate(series.values)
+    outer_func = np.concatenate if is_epo or is_arr else np.array
+    axes = (-3, -1) if is_tfr else (-2, -1)

-    def _extract_data_mne(series):  # 2D data
-        return np.array(
-            series.map(lambda inst: inst.get_data().swapaxes(-2, -1)).to_list()
+    def func_mne(series):
+        return outer_func(
+            series.map(lambda inst: inst.get_data().swapaxes(*axes)).to_list()
         )

-    def _extract_data_tfr(series):
-        return series.map(lambda inst: inst.get_data().swapaxes(-3, -1)).to_list()
+    def func_array(series):
+        return outer_func(series.values)

-    if _dtype is np.ndarray:
-        func = _extract_data_array
-    elif _dtype is BaseTFR:
-        func = _extract_data_tfr
-    else:
-        func = _extract_data_mne
+    func = func_array if is_arr else func_mne

     # convert to a list-like X for clustering
     X = df.groupby(iv_name).agg({dv_name: func})[dv_name].to_list()

From 346e3ce1270bcffa19edeeb8de3949b078729862 Mon Sep 17 00:00:00 2001
From: Daniel McCloy
Date: Sat, 10 Aug 2024 17:40:14 -0500
Subject: [PATCH 78/88] unrelated typing fix

---
 mne/stats/cluster_level.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py
index 7dff9a41a0a..dcaf3e615b0 100644
--- a/mne/stats/cluster_level.py
+++ b/mne/stats/cluster_level.py
@@ -1986,7 +1986,7 @@ def plot_cluster_time_sensor(
         linestyles: list | dict | None = None,
         cmap_evokeds: None | str | tuple = None,
         cmap_topo: None | str | tuple = None,
-        ci: float | bool | callable() | None = None,
+        ci: float | bool | callable | None = None,
     ):
         """
         Plot the cluster with the lowest p-value.

From a49d2cd8e888786b61e7c7ce29ad3d56c3b7be2e Mon Sep 17 00:00:00 2001
From: Daniel McCloy
Date: Sat, 10 Aug 2024 17:41:19 -0500
Subject: [PATCH 79/88] rework test

---
 mne/stats/tests/test_cluster_level.py | 164 ++++++++++----------------
 1 file changed, 60 insertions(+), 104 deletions(-)

diff --git a/mne/stats/tests/test_cluster_level.py b/mne/stats/tests/test_cluster_level.py
index 9b10aacabf5..ed0d830bdfd 100644
--- a/mne/stats/tests/test_cluster_level.py
+++ b/mne/stats/tests/test_cluster_level.py
@@ -909,121 +909,77 @@ def test_new_cluster_api(Inst):
     """Test handling different MNE objects in the cluster API."""
     pd = pytest.importorskip("pandas")

-    n_subs, n_epo, n_chan, n_freq, n_times = 2, 2, 3, 4, 5
+    rng = np.random.default_rng(seed=8675309)
+    is_epo = Inst in (EpochsTFRArray, EpochsArray)
+    is_tfr = Inst in (EpochsTFRArray, AverageTFRArray)
+
+    n_epo, n_chan, n_freq, n_times = 6, 3, 4, 5
+
+    # prepare the dimensions of the simulated data, then simulate
+    size = (n_chan,)
+    if is_epo:
+        size = (n_epo, *size)
+    if is_tfr:
+        size = (*size, n_freq)
+    size = (*size, n_times)
+    data = rng.normal(size=size)
+
+    # construct the instance
     info = create_info(ch_names=n_chan, sfreq=1000, ch_types="eeg")
-    # Introduce a significant difference in a specific region, time, and frequency
-    region_start = 1
-    region_end = 2
-    time_start = 2
-    time_end = 4
-    freq_start = 2
-    freq_end = 4
-
-    if Inst == EpochsArray:
-        # Create random data for EpochsArray
-        inst1 = Inst(np.random.randn(n_epo, n_chan, n_times), info=info)
-        # Adding a constant to create a difference
-        data_copy = inst1.get_data().copy()  # no data attribute for EpochsArray
-        data_copy[:, region_start:region_end, time_start:time_end] += (
-            2  # Modify the copy
-        )
-        inst2 = Inst(
-            data=data_copy, info=info
-        )  # Use the modified copy as a new instance
-
-    elif Inst == EvokedArray:
-        # Create random data for EvokedArray
-        inst1 = Inst(np.random.randn(n_chan, n_times), info=info)
-        data_copy = inst1.data.copy()
-        data_copy[region_start:region_end, time_start:time_end] += 2
-        inst2 = Inst(data=data_copy, info=info)
-
-    elif Inst == EpochsTFRArray:
-        # Create random data for EpochsTFRArray
-        data_tfr1 = np.random.randn(n_epo, n_chan, n_freq, n_times)
-        data_tfr2 = np.random.randn(n_epo, n_chan, n_freq, n_times)
-        inst1 = Inst(
-            data=data_tfr1, info=info, times=np.arange(n_times), freqs=np.arange(n_freq)
-        )
-        inst2 = Inst(
-            data=data_tfr2, info=info, times=np.arange(n_times), freqs=np.arange(n_freq)
-        )
-        data_tfr2 = inst2.data.copy()
-        data_tfr2[
-            region_start:region_end, freq_start:freq_end, time_start:time_end
-        ] += 2
-        inst2 = Inst(
-            data=data_tfr2, info=info, times=np.arange(n_times), freqs=np.arange(n_freq)
-        )
-
-    if Inst == EvokedArray or Inst == AverageTFRArray:
-        # Generate random noise
-        noise = np.random.normal(loc=0, scale=0.1, size=inst1.data.shape)
-        # add noise to the data of the second subject
-        inst1_n = inst1.copy()
-        inst1_n.data = inst1.data + noise
-        inst2_n = inst2.copy()
-        inst2_n.data = inst2.data + noise
-        data = [inst1, inst2, inst1_n, inst2_n]
-        conds = ["a", "b"] * n_subs
+    kw = dict(times=np.arange(n_times), freqs=np.arange(n_freq)) if is_tfr else dict()
+    cond_a = Inst(data=data, info=info, **kw)
+    cond_b = cond_a.copy()
+    # introduce a significant difference in a specific region, time, and frequency
+    ch_start, ch_end = 0, 2  # 2 channels
+    t_start, t_end = 2, 4  # 2 times
+    f_start, f_end = 2, 4  # 2 freqs
+    if is_tfr:
+        cond_b._data[..., ch_start:ch_end, f_start:f_end, t_start:t_end] += 2
+    else:
+        cond_b._data[..., ch_start:ch_end, t_start:t_end] += 2
+    # for Evokeds/AverageTFRs, we create fake "subjects" as our observations within each
+    # condition. We add a bit of noise while we do so.
+    if not is_epo:
+        insts = list()
+        for cond in cond_a, cond_b:
+            for _n in range(n_epo):
+                if not _n:
+                    insts.append(cond)
+                    continue
+                _cond = cond.copy()
+                _cond.data += rng.normal(scale=0.1, size=_cond.data.shape)
+                insts.append(_cond)
+        conds = np.repeat(["a", "b"], n_epo).tolist()
     else:
-        data = [inst1, inst2]
-        conds = ["a", "b"]
+        # For Epochs(TFR)Array, each epoch is an observation and they're already
+        # noisy/non-identical, so no duplication / noise-addition necessary.
+        insts = [cond_a, cond_b]
+        conds = ["a", "b"]

-    df = pd.DataFrame(dict(data=data, condition=conds))
+    # run new clustering API
+    df = pd.DataFrame(dict(data=insts, condition=conds))
     kwargs = dict(
         n_permutations=100, seed=42, tail=1, buffer_size=None, out_type="mask"
     )
     result_new_api = cluster_test(df, "data~condition", **kwargs)

     # make sure channels are last dimension for old API
-    if Inst == EpochsArray:
-        inst1 = inst1.get_data().transpose(0, 2, 1)
-        inst2 = inst2.get_data().transpose(0, 2, 1)
-    elif Inst == EpochsTFRArray:
-        inst1 = inst1.data.transpose(0, 3, 2, 1)
-        inst2 = inst2.data.transpose(0, 3, 2, 1)
-    elif Inst == AverageTFRArray:
-        inst1 = inst1.data.transpose(2, 1, 0)
-        inst2 = inst2.data.transpose(2, 1, 0)
-        inst1_n = inst1_n.data.transpose(2, 1, 0)
-        inst2_n = inst2_n.data.transpose(2, 1, 0)
-        # combine the data of the two subjects
-        inst1 = np.concatenate([inst1[np.newaxis, :], inst1_n[np.newaxis, :]], axis=0)
-        inst2 = np.concatenate([inst2[np.newaxis, :], inst2_n[np.newaxis, :]], axis=0)
+    if is_epo:
+        axes = (0, 3, 2, 1) if is_tfr else (0, 2, 1)
+        X = [cond_a.get_data().transpose(*axes), cond_b.get_data().transpose(*axes)]
     else:
-        inst1 = inst1.data.transpose(1, 0)
-        inst2 = inst2.data.transpose(1, 0)
-        inst1_n = inst1_n.data.transpose(1, 0)
-        inst2_n = inst2_n.data.transpose(1, 0)
-        # combine the data of the two subjects
-        inst1 = np.concatenate([inst1[np.newaxis, :], inst1_n[np.newaxis, :]], axis=0)
-        inst2 = np.concatenate([inst2[np.newaxis, :], inst2_n[np.newaxis, :]], axis=0)
+        axes = (2, 1, 0) if is_tfr else (1, 0)
+        Xa = list()
+        Xb = list()
+        for inst, cond in zip(insts, conds):
+            container = Xa if cond == "a" else Xb
+            container.append(inst.get_data().transpose(*axes))
+        X = [np.stack(Xa), np.stack(Xb)]

-    F_obs, clusters, cluster_pvals, H0 = permutation_cluster_test(
-        [inst1, inst2], **kwargs
-    )
+    F_obs, clusters, cluster_pvals, H0 = permutation_cluster_test(X, **kwargs)
     assert_array_almost_equal(result_new_api.H0, H0)
     assert_array_almost_equal(result_new_api.stat_obs, F_obs)
     assert_array_almost_equal(result_new_api.cluster_p_values, cluster_pvals)
-    assert result_new_api.clusters == clusters
+    assert len(result_new_api.clusters) == len(clusters)
+    for clu1, clu2 in zip(result_new_api.clusters, clusters):
+        assert_array_equal(clu1, clu2)

From a01182b56ea7c932db51405748f2d731c45f6d92 Mon Sep 17 00:00:00 2001
From: Daniel McCloy
Date: Mon, 12 Aug 2024 09:08:27 -0500
Subject: [PATCH 80/88] minor cleanup

---
 mne/stats/cluster_level.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py
index dcaf3e615b0..f640ba9634f 100644
--- a/mne/stats/cluster_level.py
+++ b/mne/stats/cluster_level.py
@@ -1878,18 +1878,18 @@ def cluster_test(
         raise ValueError("for paired t-test, each subject must have 2 observations")

     # extract the data from the dataframe
-    outer_func = np.concatenate if is_epo or is_arr else np.array
+    outer_func = np.concatenate if is_epo else np.array
     axes = (-3, -1) if is_tfr else (-2, -1)

+    def func_arr(series):
+        return np.concatenate(series.values)
+
     def func_mne(series):
         return outer_func(
             series.map(lambda inst: inst.get_data().swapaxes(*axes)).to_list()
         )

-    def func_array(series):
-        return outer_func(series.values)
-
-    func = func_array if is_arr else func_mne
+    func = func_arr if is_arr else func_mne

     # convert to a list-like X for clustering
     X = df.groupby(iv_name).agg({dv_name: func})[dv_name].to_list()

From 0984b61313bd76d14279e80f5e8db190d8c0e62d Mon Sep 17 00:00:00 2001
From: Daniel McCloy
Date: Mon, 12 Aug 2024 09:16:24 -0500
Subject: [PATCH 81/88] fix imports

---
 mne/stats/cluster_level.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py
index f640ba9634f..2999a73c07c 100644
--- a/mne/stats/cluster_level.py
+++ b/mne/stats/cluster_level.py
@@ -17,13 +17,15 @@
 from scipy.stats import f as fstat
 from scipy.stats import t as tstat

-from .. import BaseEpochs, Evoked, EvokedArray
+from ..epochs import BaseEpochs, EvokedArray
+from ..evoked import Evoked
 from ..fixes import has_numba, jit
 from ..parallel import parallel_func
 from ..source_estimate import MixedSourceEstimate, SourceEstimate, VolSourceEstimate
 from ..source_space import SourceSpaces
 from ..time_frequency import BaseTFR
 from ..utils import (
+    GetEpochsMixin,
     ProgressBar,
     _check_option,
     _pl,
@@ -35,7 +37,6 @@
     verbose,
     warn,
 )
-from ..utils.mixin import GetEpochsMixin
 from ..viz import plot_compare_evokeds
 from .parametric import f_oneway, ttest_1samp_no_p

From 6322499b88e414a97cc6d6642de4529594d22ec5 Mon Sep 17 00:00:00 2001
From: Daniel McCloy
Date: Mon, 12 Aug 2024 09:16:35 -0500
Subject: [PATCH 82/88] use MRO in test too

---
 mne/stats/tests/test_cluster_level.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mne/stats/tests/test_cluster_level.py b/mne/stats/tests/test_cluster_level.py
index ed0d830bdfd..fc2af127a96 100644
--- a/mne/stats/tests/test_cluster_level.py
+++ b/mne/stats/tests/test_cluster_level.py
@@ -37,8 +37,8 @@
     summarize_clusters_stc,
     ttest_1samp_no_p,
 )
-from mne.time_frequency import AverageTFRArray, EpochsTFRArray
-from mne.utils import _record_warnings, catch_logging
+from mne.time_frequency import AverageTFRArray, BaseTFR, EpochsTFRArray
+from mne.utils import GetEpochsMixin, _record_warnings, catch_logging

 n_space = 50
@@ -910,8 +910,8 @@ def test_new_cluster_api(Inst):
     pd = pytest.importorskip("pandas")

     rng = np.random.default_rng(seed=8675309)
-    is_epo = Inst in (EpochsTFRArray, EpochsArray)
-    is_tfr = Inst in (EpochsTFRArray, AverageTFRArray)
+    is_epo = GetEpochsMixin in Inst.__mro__
+    is_tfr = BaseTFR in Inst.__mro__

     n_epo, n_chan, n_freq, n_times = 6, 3, 4, 5

From a04b8a3e1031890b32ef817eebf01966e49620fd Mon Sep 17 00:00:00 2001
From: Daniel McCloy
Date: Thu, 22 Aug 2024 15:23:39 -0500
Subject: [PATCH 83/88] fix vulture allowlist

---
 tools/vulture_allowlist.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/vulture_allowlist.py b/tools/vulture_allowlist.py
index f030b4d4346..edc3bdf9811 100644
--- a/tools/vulture_allowlist.py
+++ b/tools/vulture_allowlist.py
@@ -148,4 +148,4 @@
 _qt_get_stylesheet

 # used in tutorial, not sure why shows up
-plot_cluster
+plot_cluster_time_sensor

From f1d39bf6dccf2186e5d9198e73e8d20068425acf Mon Sep 17 00:00:00 2001
From: Daniel McCloy
Date: Thu, 22 Aug 2024 15:40:08 -0500
Subject: [PATCH 84/88] fix nesting and type hints

---
 mne/stats/cluster_level.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py
index 2999a73c07c..a86a6dfafe4 100644
--- a/mne/stats/cluster_level.py
+++ b/mne/stats/cluster_level.py
@@ -10,7 +10,6 @@

 import matplotlib.pyplot as plt
 import numpy as np
-import pandas as pd
 from mpl_toolkits.axes_grid1 import make_axes_locatable
 from scipy import ndimage, sparse
 from scipy.sparse.csgraph import connected_components
@@ -40,6 +39,10 @@
 from ..viz import plot_compare_evokeds
 from .parametric import f_oneway, ttest_1samp_no_p

+# need this at top-level of file due to type hints
+pd = _soft_import("pandas", purpose="DataFrame integration")
+DataFrame = getattr(pd, "DataFrame", None)
+

 def _get_buddies_fallback(r, s, neighbors, indices=None):
     if indices is None:
@@ -1738,7 +1741,7 @@
-def _validate_cluster_df(df: pd.DataFrame, dv_name: str, iv_name: str):
+def _validate_cluster_df(df: DataFrame, dv_name: str, iv_name: str):

From 987ea433c14d7bab513b28c4463357a500db7a57 Mon Sep 17 00:00:00 2001
From: Daniel McCloy
Date: Thu, 22 Aug 2024 16:42:55 -0500
Subject: [PATCH 85/88] strict=False

---
 mne/stats/cluster_level.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py
index a86a6dfafe4..8f24c0c4a0c 100644
--- a/mne/stats/cluster_level.py
+++ b/mne/stats/cluster_level.py
@@ -40,7 +40,7 @@
 from .parametric import f_oneway, ttest_1samp_no_p

 # need this at top-level of file due to type hints
-pd = _soft_import("pandas", purpose="DataFrame integration")
+pd = _soft_import("pandas", purpose="DataFrame integration", strict=False)
 DataFrame = getattr(pd, "DataFrame", None)

From 78829b43e14b3447ff0bec06d14c338838de90a4 Mon Sep 17 00:00:00 2001
From: Daniel McCloy
Date: Thu, 22 Aug 2024 16:58:16 -0500
Subject: [PATCH 86/88] nest import in test file too

---
 mne/stats/tests/test_cluster_level.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mne/stats/tests/test_cluster_level.py b/mne/stats/tests/test_cluster_level.py
index fc2af127a96..06a87a07477 100644
--- a/mne/stats/tests/test_cluster_level.py
+++ b/mne/stats/tests/test_cluster_level.py
@@ -6,7 +6,6 @@
 from functools import partial

 import numpy as np
-import pandas as pd
 import pytest
 from numpy.testing import (
     assert_allclose,
@@ -882,6 +881,7 @@ def test_output_equiv(shape, out_type, adjacency, threshold):

 def test_compare_old_and_new_cluster_api():
     """Test for same results from old and new APIs."""
+    pd = pytest.importorskip("pandas")
     condition1_1d, condition2_1d, condition1_2d, condition2_2d = _get_conditions()
     df_1d = pd.DataFrame(
         dict(

From 372bccacaa9522586fb811b00ac2f85d7adeb3d1 Mon Sep 17 00:00:00 2001
From: CarinaFo
Date: Wed, 2 Oct 2024 13:51:31 +0200
Subject: [PATCH 87/88] clean up pyproject mess

---
 pyproject.toml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 83d9479dccc..0c36566021d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -146,8 +146,6 @@ test_extra = [
   "snirf",
   "neo",
   "mne-bids",
-  "
-  ",
 ]

 # Dependencies for building the documentation

From 4da84634d1b265355f9e75c60d324ef6f1b29dd2 Mon Sep 17 00:00:00 2001
From: CarinaFo
Date: Wed, 2 Oct 2024 13:53:08 +0200
Subject: [PATCH 88/88] add n_permutations, plotting, added min_cluster_p_value

---
 tutorials/stats-sensor-space/76_new_cluster_test_api.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py
index fb928f89d0a..0e5aee91432 100644
--- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py
+++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py
@@ -139,6 +139,10 @@
 cluster_result = mne.stats.cluster_level.cluster_test(
     df=df, formula=formula, within_id="subject_index"
 )
+# TODO: add n_permutations to cluster_result
+
+# print the lowest cluster p-value
+print(f"The lowest cluster p-value is: {cluster_result.cluster_p_values.min()}")

 # note that we ran an exact test due to the small sample size
 # (only 15 permutations)
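
For readers trying out this branch, here is a minimal, self-contained sketch of the
API the series introduces. It is illustrative only: it simulates EvokedArray
"subjects" instead of loading the ERP CORE data used in the tutorial, and it assumes
the names appearing in the patches above (the provisional
``mne.stats.cluster_level.cluster_test`` location, the ``data``/``condition``/
``subject_index`` column names, and the ``formula``/``within_id``/``seed`` arguments);
none of this is final, released MNE API.

    import numpy as np
    import pandas as pd
    import mne

    rng = np.random.default_rng(42)
    info = mne.create_info(ch_names=4, sfreq=250.0, ch_types="eeg")
    n_subjects, n_times = 10, 50

    # one EvokedArray per subject and condition; condition "b" gets an offset
    # so that a cluster can emerge
    evokeds_a = [
        mne.EvokedArray(rng.normal(size=(4, n_times)), info)
        for _ in range(n_subjects)
    ]
    evokeds_b = [
        mne.EvokedArray(rng.normal(size=(4, n_times)) + 0.5, info)
        for _ in range(n_subjects)
    ]

    # long-format DataFrame: one row per observation
    df = pd.DataFrame(
        dict(
            data=evokeds_a + evokeds_b,
            condition=["a"] * n_subjects + ["b"] * n_subjects,
            subject_index=list(range(n_subjects)) * 2,
        )
    )

    # "data~condition" is Wilkinson notation (dependent ~ independent);
    # passing within_id requests a paired (within-subject) test
    result = mne.stats.cluster_level.cluster_test(
        df=df,
        formula="data~condition",
        within_id="subject_index",
        n_permutations=1024,
        seed=42,
    )
    print(f"lowest cluster p-value: {result.cluster_p_values.min()}")

Passing ``within_id`` triggers the ``kind == "within"`` branch shown earlier in the
series (observations are subtracted per subject before clustering, using
``ttest_1samp_no_p``); omitting it falls back to an unpaired test via ``f_oneway``.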