From 62daaf00192275820c91d3c28af8f7a93db7c99b Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Fri, 14 Jun 2024 14:22:51 +0200 Subject: [PATCH 01/88] added cluster test api, first commit --- .../76_new_cluster_test_api.py | 467 ++++++++++++++++++ 1 file changed, 467 insertions(+) create mode 100644 tutorials/stats-sensor-space/76_new_cluster_test_api.py diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py new file mode 100644 index 00000000000..4e2b3af8f6d --- /dev/null +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -0,0 +1,467 @@ +from pathlib import Path +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from mpl_toolkits.axes_grid1 import make_axes_locatable +import mne + +# eventually we want to use the _permutation_cluster_test function + +# import and load dataset +path_to_p3 = Path("C:/Users/Carina/mne_data/ERP_CORE_P3") + +def prep_sample_data(plot_evokeds: bool = False): + """ + Load the P3 dataset and extract the target, non-target and contrast evokeds. + """ + # Define the range of participant IDs + participant_ids = range(15, 20) # This will cover 015 to 019 + + evokeds_allsubs = [] + + # Loop over each participant ID and generate the corresponding filename + for pid in participant_ids: + # Create the filename using an f-string, ensuring the participant ID is zero-padded to 3 digits + filename_p3 = f"sub-{pid:03d}_ses-P3_task-P3_ave.fif" + + # Print the filename (or perform your desired operations on it) + print(filename_p3) + + p3_file_path = Path(path_to_p3) / filename_p3 + + evokeds = mne.read_evokeds(p3_file_path) + + # add to list + evokeds_allsubs.append(evokeds) + + target_only = [evoked[0] for evoked in evokeds_allsubs] + non_target_only = [evoked[1] for evoked in evokeds_allsubs] + contrast = [evoked[2] for evoked in evokeds_allsubs] + + if plot_evokeds: + # plot the grand average + mne.grand_average(target_only).plot() + mne.grand_average(non_target_only).plot() + mne.grand_average(contrast).plot() + + # create contrast from evokeds target and non-target + diff_evoked = [ + mne.combine_evoked([evokeds_a, evokeds_b], weights=[1, -1]) + for evokeds_a, evokeds_b in zip(target_only, non_target_only) + ] + + if plot_evokeds: + mne.grand_average(diff_evoked).plot() + + # crop the evokeds in the post stimulus window + contrast = [evokeds.crop(tmin=-0.1, tmax=0.6) for evokeds in contrast] + target_only = [evokeds.crop(tmin=-0.1, tmax=0.6) for evokeds in target_only] + non_target_only = [evokeds.crop(tmin=-0.1, tmax=0.6) for evokeds in non_target_only] + + return contrast, target_only, non_target_only + + +def old_api_cluster(n_permutations: int = 10000, seed: int = 1234): + """ + Run the cluster test using the old API to get a bechmark result for the new API. + Currently implementing a paired t-test with contrast between participants. 
+ """ + contrast, target_only, non_target_only = prep_sample_data() + + # extract the data for each evoked and store in numpy array + data = np.array([evoked.data for evoked in contrast]) + + # shape should be (n_subjects, n_channels, n_times) + data.shape + + # reshape to channels as last dimension + data = data.transpose(0, 2, 1) + + data.shape + + adjacency, _ = mne.channels.find_ch_adjacency(contrast[0].info, ch_type="eeg") + + stat_fun, threshold = mne.stats.cluster_level._check_fun( + X=data, stat_fun=None, threshold=None, tail=0, kind="within" + ) + + # adjacency = mne.channels.find_ch_adjacency(contrast[0].info, ch_type='eeg') + # Run the analysis + T_obs, clusters, cluster_p_values, H0 = ( + mne.stats.cluster_level._permutation_cluster_test( + [data], + threshold=threshold, + stat_fun=stat_fun, + n_jobs=-1, # takes all CPU cores + max_step=1, # maximum distance between samples (time points) + exclude=None, # exclude no time points or channels + step_down_p=0, # step down in jumps test + t_power=1, # weigh each location by its stats score + out_type="indices", + check_disjoint=False, + buffer_size=None, # block size for chunking the data + n_permutations=n_permutations, + tail=0, + adjacency=adjacency, + seed=seed, + ) + ) + + print(min(cluster_p_values)) + + plot_cluster( + contrast, target_only, non_target_only, T_obs, clusters, cluster_p_values + ) + + return T_obs, clusters, cluster_p_values, H0 + + +# fit cluster test with dataframe as input +# create condition list that repeats 5times 1 and then 5 times 0 +# 1 = target, 0 = non-target +# condition = 5 * [1] + 5 * [0] + +# 1 = target, 0 = non-target +# contrast, target_only, non_target_only = prep_sample_data() + +# evokeds_list = target_only + non_target_only + + +def create_random_evokeds_id_condition_list(evoked_data_a: list, evoked_data_b: list): + """ + Create a list of shuffled participant IDs, conditions, and evoked data. + # Keep the participant IDs and conditions paired but shuffle the order of the evoked data. + """ + import random + + # Example participant IDs + participant_ids = ["p1", "p2", "p3", "p4", "p5"] * 2 + + # Combine the evoked data into a single list + all_evoked_data = evoked_data_a + evoked_data_b + + # Create a corresponding list of conditions + conditions = [1] * len(evoked_data_a) + [0] * len(evoked_data_b) + + # Combine the participant IDs, conditions, and evoked data into a list of tuples + combined_list = list(zip(participant_ids, conditions, all_evoked_data)) + + # Shuffle the combined list + random.shuffle(combined_list) + + # Separate the shuffled list back into participant IDs, conditions, and evoked data + shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data = zip( + *combined_list + ) + + # Convert the tuples back to lists + shuffled_participant_ids = list(shuffled_participant_ids) + shuffled_conditions = list(shuffled_conditions) + shuffled_evoked_data = list(shuffled_evoked_data) + + return shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data + + +def create_random_paired_evokeds_list(evoked_data_a: list, evoked_data_b: list): + """ + Create a list of shuffled evoked data where each pair of target and non-target evoked data is shuffled together. 
+ """ + import random + + # Create a list of tuples where each tuple contains an evoked data and its corresponding label + evoked_pairs = [(evoked, 1) for evoked in evoked_data_a] + [ + (evoked, 0) for evoked in evoked_data_b + ] + + # Shuffle the list of tuples + random.shuffle(evoked_pairs) + + # Separate the shuffled list back into evoked data and labels + shuffled_evoked_data, shuffled_labels = zip(*evoked_pairs) + + # Convert the tuples back to lists + shuffled_evoked_data = list(shuffled_evoked_data) + + return shuffled_evoked_data + + +# shuffle order of pairs +shuffled_evokeds_list = create_random_paired_evokeds_list(target_only, non_target_only) +# shouldn't change the results (p-value is different though?) + +shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data = ( + create_random_evokeds_id_condition_list( + evoked_data_a=target_only, evoked_data_b=non_target_only + ) +) + + +def prepare_dataframe_for_cluster_function( + contrast: bool = False, + evokeds: list = None, + condition: list = None, + subject_index: list = None, +): + """ + Prepare a dataframe for the cluster test function. + + Parameters + ---------- + contrast : bool, optional + If True, a contrast is calculated. Default is False. + evokeds : list, optional + List of evoked objects. Default is None. + condition : list, optional + List of conditions for each evoked object. Default is None. + subject_index : list, optional + List of subject IDs. Default is None. + + """ + # create an empty dataframe + df = pd.DataFrame() + + if contrast == True: + # check if evoked list is dividable by 2 + if len(evokeds) % 2 != 0: + raise ValueError("evokeds list needs to be dividable by 2") + if condition is not None: + # Convert lists to DataFrame for easier manipulation + df = pd.DataFrame( + { + "evoked": evokeds, + "condition": condition, + "subject_index": subject_index, + } + ) + + return df + + +def cluster_test( + df: pd.DataFrame, + n_permutations: int = 10000, + seed: int = 1234, + contrast_weights: list = [1, -1], +): + """ + Run the cluster test using the new API. + # currently supports paired t-test with contrast or with list of conditions + + Parameters + ---------- + dataframe : pd.DataFrame + Dataframe with evoked data, conditions and subject IDs. + n_permutations : int, optional + Number of permutations. Default is 10000. + seed : int, optional + Random seed. Default is 1234. + + Returns + ------- + T_obs : array + The observed test statistic. + clusters : list + List of clusters. + cluster_p_values : array + Array of cluster p-values. + H0 : array + The permuted test statistics. 
+ """ + if df.condition is not None: + # Extract unique conditions + unique_conditions = np.unique(df.condition) + if len(unique_conditions) != 2: + raise ValueError("Condition list needs to contain 2 unique values") + if df.subject_index is not None: + # Initialize a list to hold the combined evoked data + evokeds_data = [] + + # Process each subject's evoked data + for sub_id in df.subject_index.unique(): + sub_df = df[df.subject_index == sub_id] + + # Split evokeds list based on condition list for this subject + evokeds_a = sub_df[sub_df.condition == unique_conditions[0]][ + "evoked" + ].tolist() + evokeds_b = sub_df[sub_df.condition == unique_conditions[1]][ + "evoked" + ].tolist() + + if len(evokeds_a) != 1 or len(evokeds_b) != 1: + raise ValueError( + f"Subject {sub_id}: Each subject must have exactly one evoked for each condition" + ) + + # Calculate contrast based on condition list + diff_evoked = mne.combine_evoked( + [evokeds_a[0], evokeds_b[0]], weights=contrast_weights + ) + evokeds_data.append(diff_evoked) + else: + # calculate length of evokeds list + n_evokeds = len(df.evokeds) + # now split evokeds list in two lists + evokeds_a = df.evokeds[: n_evokeds // 2] + evokeds_b = df.evokeds[n_evokeds // 2 :] + # create contrast from evokeds_a and evokeds_b + diff_evoked = [ + mne.combine_evoked([evo_a, evo_b], weights=contrast_weights) + for evo_a, evo_b in zip(evokeds_a, evokeds_b) + ] + evokeds_data = diff_evoked + else: + evokeds_data = df.evokeds + + # extract number of channels + n_channels = evokeds_data[0].info["nchan"] + + # loop over rows and extract data from evokeds + data_array = np.array([evoked.data for evoked in evokeds_data]) + + # find the dimension that is equal to n_channels + if data_array.shape[1] == n_channels: + # reshape to channels as last dimension + data = data_array.transpose(0, 2, 1) + + adjacency, _ = mne.channels.find_ch_adjacency(evokeds_data[0].info, ch_type="eeg") + + stat_fun, threshold = mne.stats.cluster_level._check_fun( + X=data, stat_fun=None, threshold=None, tail=0, kind="within" + ) + + T_obs, clusters, cluster_p_values, H0 = ( + mne.stats.cluster_level._permutation_cluster_test( + [data], + threshold=threshold, + stat_fun=stat_fun, + n_jobs=-1, + max_step=1, + exclude=None, + step_down_p=0.05, + t_power=1, + out_type="indices", + check_disjoint=True, + buffer_size=None, + n_permutations=n_permutations, + tail=0, + adjacency=adjacency, + seed=seed, + ) + ) + + print(min(cluster_p_values)) + + # need to adjust plotting function for contrast only data + contrast, evokeds_a, evokeds_b = prep_sample_data() + + # plot cluster + plot_cluster(contrast, evokeds_a, evokeds_b, T_obs, clusters, cluster_p_values) + + return T_obs, clusters, cluster_p_values, H0 + + +def plot_cluster( + contrast, target_only, non_target_only, T_obs, clusters, cluster_p_values +): + """ + Plot the cluster with the lowest p-value. + + Parameters + ---------- + contrast : list + List of contrast evoked objects. + target_only : list + List of target evoked objects. + non_target_only : list + List of non-target evoked objects. + T_obs : array + The observed test statistic. + clusters : list + List of clusters. + cluster_p_values : array + Array of cluster p-values. 
+ + Returns + ------- + None + + """ + # configure variables for visualization + colors = {"target": "crimson", "non-target": "steelblue"} + + # organize data for plotting + evokeds = {"target": target_only, "non-target": non_target_only} + + lowest_p_cluster = np.argmin(cluster_p_values) + + # plot the cluster with the lowest p-value + time_inds, space_inds = np.squeeze(clusters[lowest_p_cluster]) + ch_inds = np.unique(space_inds) + time_inds = np.unique(time_inds) + + # get topography for F stat + t_map = T_obs[time_inds, ...].mean(axis=0) + + # get signals at the sensors contributing to the cluster + sig_times = contrast[0].times[time_inds] + + # create spatial mask + mask = np.zeros((t_map.shape[0], 1), dtype=bool) + mask[ch_inds, :] = True + + # initialize figure + fig, ax_topo = plt.subplots(1, 1, figsize=(10, 3), layout="constrained") + + # plot average test statistic and mark significant sensors + t_evoked = mne.EvokedArray(t_map[:, np.newaxis], contrast[0].info, tmin=0) + t_evoked.plot_topomap( + times=0, + mask=mask, + axes=ax_topo, + cmap="Reds", + vlim=(np.min, np.max), + show=False, + colorbar=False, + mask_params=dict(markersize=10), + ) + image = ax_topo.images[0] + + # remove the title that would otherwise say "0.000 s" + ax_topo.set_title("") + + # create additional axes (for ERF and colorbar) + divider = make_axes_locatable(ax_topo) + + # add axes for colorbar + ax_colorbar = divider.append_axes("right", size="5%", pad=0.05) + plt.colorbar(image, cax=ax_colorbar) + ax_topo.set_xlabel( + "Averaged t-map ({:0.3f} - {:0.3f} s)".format(*sig_times[[0, -1]]) + ) + + # add new axis for time courses and plot time courses + ax_signals = divider.append_axes("right", size="300%", pad=1.2) + title = f"Cluster #1, {len(ch_inds)} sensor" + if len(ch_inds) > 1: + title += "s (mean)" + mne.viz.plot_compare_evokeds( + evokeds, + title=title, + picks=ch_inds, + axes=ax_signals, + colors=colors, + show=False, + split_legend=True, + truncate_yaxis="auto", + ) + + # plot temporal cluster extent + ymin, ymax = ax_signals.get_ylim() + ax_signals.fill_betweenx( + (ymin, ymax), sig_times[0], sig_times[-1], color="green", alpha=0.3 + ) + + plt.show() + + return None From d59978f575842ef148e814679bae7c746c1e2b4a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 14 Jun 2024 12:24:30 +0000 Subject: [PATCH 02/88] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tutorials/stats-sensor-space/76_new_cluster_test_api.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 4e2b3af8f6d..2bdae528448 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -1,8 +1,10 @@ from pathlib import Path + import matplotlib.pyplot as plt import numpy as np import pandas as pd from mpl_toolkits.axes_grid1 import make_axes_locatable + import mne # eventually we want to use the _permutation_cluster_test function @@ -10,6 +12,7 @@ # import and load dataset path_to_p3 = Path("C:/Users/Carina/mne_data/ERP_CORE_P3") + def prep_sample_data(plot_evokeds: bool = False): """ Load the P3 dataset and extract the target, non-target and contrast evokeds. 
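
At this point the intended entry point of the prototype is already usable end to end: build a dataframe of per-subject evokeds plus condition and subject labels, then hand it to cluster_test(). The driver below is a minimal sketch, not part of any commit in this series; it assumes the PATCH 01 helpers (prep_sample_data, prepare_dataframe_for_cluster_function, cluster_test) are in scope and that the ERP CORE P3 evokeds live at the path hard-coded above. The integer subject labels are an arbitrary choice for illustration.

# Minimal driver for the PATCH 01 prototype (illustrative sketch, not committed code)
_, target_only, non_target_only = prep_sample_data()

# 1 = target, 0 = non-target; each subject contributes one evoked per condition
evokeds = target_only + non_target_only
conditions = [1] * len(target_only) + [0] * len(non_target_only)
subjects = list(range(len(target_only))) * 2

df = prepare_dataframe_for_cluster_function(
    contrast=True, evokeds=evokeds, condition=conditions, subject_index=subjects
)

# Same permutation count and seed as the old-API benchmark above,
# so the cluster p-values can be compared directly
T_obs, clusters, cluster_p_values, H0 = cluster_test(
    df, n_permutations=10000, seed=1234
)
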
From 2843905c57bf2fe841607c88adb054fbc6ec322a Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Fri, 14 Jun 2024 19:02:45 +0200 Subject: [PATCH 03/88] tested dataframe function and results, cleaned up --- .../76_new_cluster_test_api.py | 187 +++++++++--------- 1 file changed, 95 insertions(+), 92 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 4e2b3af8f6d..3f001251ba5 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -5,7 +5,6 @@ from mpl_toolkits.axes_grid1 import make_axes_locatable import mne -# eventually we want to use the _permutation_cluster_test function # import and load dataset path_to_p3 = Path("C:/Users/Carina/mne_data/ERP_CORE_P3") @@ -85,7 +84,6 @@ def old_api_cluster(n_permutations: int = 10000, seed: int = 1234): X=data, stat_fun=None, threshold=None, tail=0, kind="within" ) - # adjacency = mne.channels.find_ch_adjacency(contrast[0].info, ch_type='eeg') # Run the analysis T_obs, clusters, cluster_p_values, H0 = ( mne.stats.cluster_level._permutation_cluster_test( @@ -115,25 +113,15 @@ def old_api_cluster(n_permutations: int = 10000, seed: int = 1234): return T_obs, clusters, cluster_p_values, H0 - -# fit cluster test with dataframe as input -# create condition list that repeats 5times 1 and then 5 times 0 -# 1 = target, 0 = non-target -# condition = 5 * [1] + 5 * [0] - -# 1 = target, 0 = non-target -# contrast, target_only, non_target_only = prep_sample_data() - -# evokeds_list = target_only + non_target_only - - -def create_random_evokeds_id_condition_list(evoked_data_a: list, evoked_data_b: list): +def create_random_evokeds_id_condition_list(): """ Create a list of shuffled participant IDs, conditions, and evoked data. # Keep the participant IDs and conditions paired but shuffle the order of the evoked data. """ import random + _ , evoked_data_a, evoked_data_b = prep_sample_data() + # Example participant IDs participant_ids = ["p1", "p2", "p3", "p4", "p5"] * 2 @@ -162,42 +150,42 @@ def create_random_evokeds_id_condition_list(evoked_data_a: list, evoked_data_b: return shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data -def create_random_paired_evokeds_list(evoked_data_a: list, evoked_data_b: list): +def create_random_paired_evokeds_list(): """ Create a list of shuffled evoked data where each pair of target and non-target evoked data is shuffled together. 
""" import random + _, evoked_data_a, evoked_data_b = prep_sample_data() - # Create a list of tuples where each tuple contains an evoked data and its corresponding label - evoked_pairs = [(evoked, 1) for evoked in evoked_data_a] + [ - (evoked, 0) for evoked in evoked_data_b - ] + # Ensure evoked_data_a and evoked_data_b are of the same length + assert len(evoked_data_a) == len(evoked_data_b), "evoked_data_a and evoked_data_b must have the same length" + + # Create a list of participant indices + participant_indices = list(range(len(evoked_data_a))) - # Shuffle the list of tuples - random.shuffle(evoked_pairs) + # Shuffle the list of participant indices + random.shuffle(participant_indices) - # Separate the shuffled list back into evoked data and labels - shuffled_evoked_data, shuffled_labels = zip(*evoked_pairs) + # Reorder evoked data according to the shuffled participant indices + shuffled_evoked_data_a = [evoked_data_a[i] for i in participant_indices] + shuffled_evoked_data_b = [evoked_data_b[i] for i in participant_indices] - # Convert the tuples back to lists - shuffled_evoked_data = list(shuffled_evoked_data) + # Combine the shuffled evoked data into a single list + shuffled_evoked_data = shuffled_evoked_data_a + shuffled_evoked_data_b + + # Combine the original evoked data into a single list + original_evoked_data = evoked_data_a + evoked_data_b - return shuffled_evoked_data + return original_evoked_data, shuffled_evoked_data # shuffle order of pairs -shuffled_evokeds_list = create_random_paired_evokeds_list(target_only, non_target_only) +original_evoked_data, shuffled_evoked_data = create_random_paired_evokeds_list() # shouldn't change the results (p-value is different though?) -shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data = ( - create_random_evokeds_id_condition_list( - evoked_data_a=target_only, evoked_data_b=non_target_only - ) -) - +shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data = create_random_evokeds_id_condition_list() def prepare_dataframe_for_cluster_function( - contrast: bool = False, evokeds: list = None, condition: list = None, subject_index: list = None, @@ -216,29 +204,39 @@ def prepare_dataframe_for_cluster_function( subject_index : list, optional List of subject IDs. Default is None. + Returns + ------- + df : DataFrame + The prepared DataFrame for the cluster test function. 
""" - # create an empty dataframe - df = pd.DataFrame() - - if contrast == True: - # check if evoked list is dividable by 2 - if len(evokeds) % 2 != 0: - raise ValueError("evokeds list needs to be dividable by 2") - if condition is not None: - # Convert lists to DataFrame for easier manipulation - df = pd.DataFrame( - { - "evoked": evokeds, - "condition": condition, - "subject_index": subject_index, - } - ) - - return df + # Initialize the DataFrame with evoked data + df = pd.DataFrame({ + "evoked": evokeds, + "condition": condition if condition is not None else np.nan, + "subject_index": subject_index if subject_index is not None else np.nan + }) + + return df +# run with original data +df = prepare_dataframe_for_cluster_function(evokeds=original_evoked_data, + condition=None, + subject_index=None) + +df = prepare_dataframe_for_cluster_function(evokeds=shuffled_evoked_data, + condition=None, + subject_index=None) + +df = prepare_dataframe_for_cluster_function(evokeds=shuffled_evoked_data, + condition=shuffled_conditions, + subject_index=shuffled_participant_ids) + + +cluster_test(df) def cluster_test( df: pd.DataFrame, + contrast: bool = True, n_permutations: int = 10000, seed: int = 1234, contrast_weights: list = [1, -1], @@ -267,43 +265,47 @@ def cluster_test( H0 : array The permuted test statistics. """ - if df.condition is not None: - # Extract unique conditions - unique_conditions = np.unique(df.condition) - if len(unique_conditions) != 2: - raise ValueError("Condition list needs to contain 2 unique values") - if df.subject_index is not None: + # Check if conditions and subject_index are present and valid + conditions_present = pd.notna(df['condition']).all() + subject_index_present = pd.notna(df['subject_index']).all() + + if contrast == 1: + if conditions_present: + # Extract unique conditions + unique_conditions = np.unique(df.condition) + if len(unique_conditions) != 2: + raise ValueError("Condition list needs to contain 2 unique values") # Initialize a list to hold the combined evoked data evokeds_data = [] - - # Process each subject's evoked data - for sub_id in df.subject_index.unique(): - sub_df = df[df.subject_index == sub_id] - - # Split evokeds list based on condition list for this subject - evokeds_a = sub_df[sub_df.condition == unique_conditions[0]][ - "evoked" - ].tolist() - evokeds_b = sub_df[sub_df.condition == unique_conditions[1]][ - "evoked" - ].tolist() - - if len(evokeds_a) != 1 or len(evokeds_b) != 1: - raise ValueError( - f"Subject {sub_id}: Each subject must have exactly one evoked for each condition" + if subject_index_present: + # Process each subject's evoked data + for sub_id in df.subject_index.unique(): + sub_df = df[df.subject_index == sub_id] + + # Split evokeds list based on condition list for this subject + evokeds_a = sub_df[sub_df.condition == unique_conditions[0]][ + "evoked" + ].tolist() + evokeds_b = sub_df[sub_df.condition == unique_conditions[1]][ + "evoked" + ].tolist() + + if len(evokeds_a) != 1 or len(evokeds_b) != 1: + raise ValueError( + f"Subject {sub_id}: Each subject must have exactly one evoked for each condition" + ) + + # Calculate contrast based on condition list + diff_evoked = mne.combine_evoked( + [evokeds_a[0], evokeds_b[0]], weights=contrast_weights ) - - # Calculate contrast based on condition list - diff_evoked = mne.combine_evoked( - [evokeds_a[0], evokeds_b[0]], weights=contrast_weights - ) - evokeds_data.append(diff_evoked) + evokeds_data.append(diff_evoked) else: # calculate length of evokeds list - n_evokeds = 
len(df.evokeds) + n_evokeds = len(df.evoked) # now split evokeds list in two lists - evokeds_a = df.evokeds[: n_evokeds // 2] - evokeds_b = df.evokeds[n_evokeds // 2 :] + evokeds_a = df.evoked[: n_evokeds // 2] + evokeds_b = df.evoked[n_evokeds // 2 :] # create contrast from evokeds_a and evokeds_b diff_evoked = [ mne.combine_evoked([evo_a, evo_b], weights=contrast_weights) @@ -311,7 +313,7 @@ def cluster_test( ] evokeds_data = diff_evoked else: - evokeds_data = df.evokeds + evokeds_data = df.evoked.tolist() # extract number of channels n_channels = evokeds_data[0].info["nchan"] @@ -330,19 +332,20 @@ def cluster_test( X=data, stat_fun=None, threshold=None, tail=0, kind="within" ) + # Run the analysis T_obs, clusters, cluster_p_values, H0 = ( mne.stats.cluster_level._permutation_cluster_test( [data], threshold=threshold, stat_fun=stat_fun, - n_jobs=-1, - max_step=1, - exclude=None, - step_down_p=0.05, - t_power=1, + n_jobs=-1, # takes all CPU cores + max_step=1, # maximum distance between samples (time points) + exclude=None, # exclude no time points or channels + step_down_p=0, # step down in jumps test + t_power=1, # weigh each location by its stats score out_type="indices", - check_disjoint=True, - buffer_size=None, + check_disjoint=False, + buffer_size=None, # block size for chunking the data n_permutations=n_permutations, tail=0, adjacency=adjacency, From fa5b215ded34da56ef72bccd8dd3fd6290c8fe2f Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Fri, 14 Jun 2024 19:04:48 +0200 Subject: [PATCH 04/88] added ToDos --- tutorials/stats-sensor-space/76_new_cluster_test_api.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 5d943985aa2..51ad611aa58 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -7,6 +7,8 @@ import mne +# TODO: implement formulaic design matrix for paired t-test +# TODO: @erik: add dataset to mne-data # import and load dataset path_to_p3 = Path("C:/Users/Carina/mne_data/ERP_CORE_P3") @@ -235,8 +237,6 @@ def prepare_dataframe_for_cluster_function( subject_index=shuffled_participant_ids) -cluster_test(df) - def cluster_test( df: pd.DataFrame, contrast: bool = True, @@ -471,3 +471,5 @@ def plot_cluster( plt.show() return None + +cluster_test(df) \ No newline at end of file From 1a1511ddec91aeea543b2e3c671b077d2711ef7e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 14 Jun 2024 17:04:48 +0000 Subject: [PATCH 05/88] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../76_new_cluster_test_api.py | 54 +++++++++++-------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 5d943985aa2..3396e3137ff 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -7,7 +7,6 @@ import mne - # import and load dataset path_to_p3 = Path("C:/Users/Carina/mne_data/ERP_CORE_P3") @@ -116,6 +115,7 @@ def old_api_cluster(n_permutations: int = 10000, seed: int = 1234): return T_obs, clusters, cluster_p_values, H0 + def create_random_evokeds_id_condition_list(): """ Create a list of shuffled participant IDs, conditions, and evoked data. 
@@ -123,7 +123,7 @@ def create_random_evokeds_id_condition_list(): """ import random - _ , evoked_data_a, evoked_data_b = prep_sample_data() + _, evoked_data_a, evoked_data_b = prep_sample_data() # Example participant IDs participant_ids = ["p1", "p2", "p3", "p4", "p5"] * 2 @@ -158,11 +158,14 @@ def create_random_paired_evokeds_list(): Create a list of shuffled evoked data where each pair of target and non-target evoked data is shuffled together. """ import random + _, evoked_data_a, evoked_data_b = prep_sample_data() # Ensure evoked_data_a and evoked_data_b are of the same length - assert len(evoked_data_a) == len(evoked_data_b), "evoked_data_a and evoked_data_b must have the same length" - + assert len(evoked_data_a) == len( + evoked_data_b + ), "evoked_data_a and evoked_data_b must have the same length" + # Create a list of participant indices participant_indices = list(range(len(evoked_data_a))) @@ -186,7 +189,10 @@ def create_random_paired_evokeds_list(): original_evoked_data, shuffled_evoked_data = create_random_paired_evokeds_list() # shouldn't change the results (p-value is different though?) -shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data = create_random_evokeds_id_condition_list() +shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data = ( + create_random_evokeds_id_condition_list() +) + def prepare_dataframe_for_cluster_function( evokeds: list = None, @@ -213,30 +219,36 @@ def prepare_dataframe_for_cluster_function( The prepared DataFrame for the cluster test function. """ # Initialize the DataFrame with evoked data - df = pd.DataFrame({ - "evoked": evokeds, - "condition": condition if condition is not None else np.nan, - "subject_index": subject_index if subject_index is not None else np.nan - }) + df = pd.DataFrame( + { + "evoked": evokeds, + "condition": condition if condition is not None else np.nan, + "subject_index": subject_index if subject_index is not None else np.nan, + } + ) return df + # run with original data -df = prepare_dataframe_for_cluster_function(evokeds=original_evoked_data, - condition=None, - subject_index=None) +df = prepare_dataframe_for_cluster_function( + evokeds=original_evoked_data, condition=None, subject_index=None +) -df = prepare_dataframe_for_cluster_function(evokeds=shuffled_evoked_data, - condition=None, - subject_index=None) +df = prepare_dataframe_for_cluster_function( + evokeds=shuffled_evoked_data, condition=None, subject_index=None +) -df = prepare_dataframe_for_cluster_function(evokeds=shuffled_evoked_data, - condition=shuffled_conditions, - subject_index=shuffled_participant_ids) +df = prepare_dataframe_for_cluster_function( + evokeds=shuffled_evoked_data, + condition=shuffled_conditions, + subject_index=shuffled_participant_ids, +) cluster_test(df) + def cluster_test( df: pd.DataFrame, contrast: bool = True, @@ -269,8 +281,8 @@ def cluster_test( The permuted test statistics. 
""" # Check if conditions and subject_index are present and valid - conditions_present = pd.notna(df['condition']).all() - subject_index_present = pd.notna(df['subject_index']).all() + conditions_present = pd.notna(df["condition"]).all() + subject_index_present = pd.notna(df["subject_index"]).all() if contrast == 1: if conditions_present: From a12cf951fda22bed14ffd18dd9f67e30627f5b00 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 14 Jun 2024 20:51:19 +0000 Subject: [PATCH 06/88] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tutorials/stats-sensor-space/76_new_cluster_test_api.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index a1630dcd8ee..011c2f69d7f 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -251,6 +251,7 @@ def prepare_dataframe_for_cluster_function( cluster_test(df) + def cluster_test( df: pd.DataFrame, contrast: bool = True, @@ -486,4 +487,5 @@ def plot_cluster( return None -cluster_test(df) \ No newline at end of file + +cluster_test(df) From 45ce63a75a1fbc5a3676a924fb181bb0e7e7e3f7 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Wed, 19 Jun 2024 19:28:07 +0200 Subject: [PATCH 07/88] added formula support and implemented suggestions --- .../76_new_cluster_test_api.py | 54 +++++++++++++++---- 1 file changed, 45 insertions(+), 9 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 011c2f69d7f..eef90a2612b 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -225,8 +225,8 @@ def prepare_dataframe_for_cluster_function( df = pd.DataFrame( { "evoked": evokeds, - "condition": condition if condition is not None else np.nan, - "subject_index": subject_index if subject_index is not None else np.nan, + "condition": condition if condition is not None else pd.NA, + "subject_index": subject_index if subject_index is not None else pd.NA, } ) @@ -249,15 +249,13 @@ def prepare_dataframe_for_cluster_function( ) -cluster_test(df) - - def cluster_test( df: pd.DataFrame, - contrast: bool = True, + formula: str = None, # Wilkinson notation formula for design matrix + contrast: bool = True, # will be replaced by formulaic design matrix n_permutations: int = 10000, - seed: int = 1234, - contrast_weights: list = [1, -1], + seed: None | int | np.random.RandomState = None, + contrast_weights: list = [1, -1], # will be replaced by formulaic design matrix ): """ Run the cluster test using the new API. @@ -287,6 +285,22 @@ def cluster_test( conditions_present = pd.notna(df["condition"]).all() subject_index_present = pd.notna(df["subject_index"]).all() + # add a data column to the dataframe (numpy array) + df["data"] = [evoked.data for evoked in df.evoked] + + # convert wide format to long format + df_long = convert_wide_to_long(df) + + # check if formula is present + if formula is not None: + import formulaic + + # create design matrix based on formula + # Create the design matrix using formulaic + y, X = formulaic.model_matrix(formula, df_long) + + # what to do with the design matrix? 
+ if contrast == 1: if conditions_present: # Extract unique conditions @@ -381,6 +395,29 @@ def cluster_test( return T_obs, clusters, cluster_p_values, H0 +# Convert wide format to long format +def convert_wide_to_long(df): + long_format_data = [] + for idx, row in df.iterrows(): + condition = row['condition'] + subject_index = row['subject_index'] + data_2d = row['data'] + + for channel in range(data_2d.shape[0]): + for timepoint in range(data_2d.shape[1]): + long_format_data.append({ + 'condition': condition, + 'subject_index': subject_index, + 'channel': channel, + 'timepoint': timepoint, + 'value': data_2d[channel, timepoint] + }) + + df_long = pd.DataFrame(long_format_data) + return df_long + +df_long = convert_wide_to_long(df) + def plot_cluster( contrast, target_only, non_target_only, T_obs, clusters, cluster_p_values @@ -485,7 +522,6 @@ def plot_cluster( plt.show() - return None cluster_test(df) From 2b7bae8cae58d9ee370edd48a1642e7f28a73aa8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 19 Jun 2024 17:28:23 +0000 Subject: [PATCH 08/88] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../76_new_cluster_test_api.py | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index eef90a2612b..7c0abc95fae 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -251,11 +251,11 @@ def prepare_dataframe_for_cluster_function( def cluster_test( df: pd.DataFrame, - formula: str = None, # Wilkinson notation formula for design matrix - contrast: bool = True, # will be replaced by formulaic design matrix + formula: str = None, # Wilkinson notation formula for design matrix + contrast: bool = True, # will be replaced by formulaic design matrix n_permutations: int = 10000, seed: None | int | np.random.RandomState = None, - contrast_weights: list = [1, -1], # will be replaced by formulaic design matrix + contrast_weights: list = [1, -1], # will be replaced by formulaic design matrix ): """ Run the cluster test using the new API. 
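
The formula argument surfaced in the signature above is a Wilkinson-notation string that PATCH 07 passes directly to formulaic.model_matrix together with the long-format frame from convert_wide_to_long. A toy illustration of that one step follows; it is a sketch only: the numbers are invented, the column names mirror convert_wide_to_long, and formulaic must be installed.

import formulaic
import pandas as pd

# Two subjects x two conditions at a single channel/timepoint, long format;
# the real frame also carries "channel" and "timepoint" columns
df_long = pd.DataFrame(
    {
        "value": [1.2, 0.4, 0.9, 0.1],
        "condition": [1, 0, 1, 0],
        "subject_index": ["p1", "p1", "p2", "p2"],
    }
)

# A two-sided formula splits into outcome (lhs) and design matrix (rhs):
# X gets an Intercept column plus the numeric condition regressor,
# while y holds the "value" column
y, X = formulaic.model_matrix("value ~ condition", df_long)
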
@@ -395,27 +395,31 @@ def cluster_test( return T_obs, clusters, cluster_p_values, H0 + # Convert wide format to long format def convert_wide_to_long(df): long_format_data = [] for idx, row in df.iterrows(): - condition = row['condition'] - subject_index = row['subject_index'] - data_2d = row['data'] - + condition = row["condition"] + subject_index = row["subject_index"] + data_2d = row["data"] + for channel in range(data_2d.shape[0]): for timepoint in range(data_2d.shape[1]): - long_format_data.append({ - 'condition': condition, - 'subject_index': subject_index, - 'channel': channel, - 'timepoint': timepoint, - 'value': data_2d[channel, timepoint] - }) - + long_format_data.append( + { + "condition": condition, + "subject_index": subject_index, + "channel": channel, + "timepoint": timepoint, + "value": data_2d[channel, timepoint], + } + ) + df_long = pd.DataFrame(long_format_data) return df_long + df_long = convert_wide_to_long(df) @@ -523,5 +527,4 @@ def plot_cluster( plt.show() - cluster_test(df) From 38834baeb64460c885988f85f5175f0ff8cdd84b Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sat, 22 Jun 2024 11:10:13 +0200 Subject: [PATCH 09/88] fixed linting errors --- .../76_new_cluster_test_api.py | 35 +++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 7c0abc95fae..2f1d55383d2 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -6,6 +6,7 @@ from mpl_toolkits.axes_grid1 import make_axes_locatable import mne +from mne.utils import _soft_import_ # TODO: implement formulaic design matrix for paired t-test # TODO: @erik: add dataset to mne-data @@ -15,9 +16,7 @@ def prep_sample_data(plot_evokeds: bool = False): - """ - Load the P3 dataset and extract the target, non-target and contrast evokeds. - """ + """Load the P3 dataset.""" # Define the range of participant IDs participant_ids = range(15, 20) # This will cover 015 to 019 @@ -25,7 +24,7 @@ def prep_sample_data(plot_evokeds: bool = False): # Loop over each participant ID and generate the corresponding filename for pid in participant_ids: - # Create the filename using an f-string, ensuring the participant ID is zero-padded to 3 digits + # Create the filename using an f-string, ID is zero-padded to 3 digits filename_p3 = f"sub-{pid:03d}_ses-P3_task-P3_ave.fif" # Print the filename (or perform your desired operations on it) @@ -67,7 +66,8 @@ def prep_sample_data(plot_evokeds: bool = False): def old_api_cluster(n_permutations: int = 10000, seed: int = 1234): """ - Run the cluster test using the old API to get a bechmark result for the new API. + Run the cluster test using the old API to get a benchmark result for the new API. + Currently implementing a paired t-test with contrast between participants. """ contrast, target_only, non_target_only = prep_sample_data() @@ -122,7 +122,8 @@ def old_api_cluster(n_permutations: int = 10000, seed: int = 1234): def create_random_evokeds_id_condition_list(): """ Create a list of shuffled participant IDs, conditions, and evoked data. - # Keep the participant IDs and conditions paired but shuffle the order of the evoked data. + + # Keep the participant IDs and conditions paired but shuffle the order of the data. 
""" import random @@ -158,7 +159,10 @@ def create_random_evokeds_id_condition_list(): def create_random_paired_evokeds_list(): """ - Create a list of shuffled evoked data where each pair of target and non-target evoked data is shuffled together. + Create shuffled paired evoked data. + + Create a list of shuffled evoked data where each pair of target + and non-target evoked data is shuffled together. """ import random @@ -255,10 +259,11 @@ def cluster_test( contrast: bool = True, # will be replaced by formulaic design matrix n_permutations: int = 10000, seed: None | int | np.random.RandomState = None, - contrast_weights: list = [1, -1], # will be replaced by formulaic design matrix + contrast_weights: list = (1, -1), # will be replaced by formulaic design matrix ): """ Run the cluster test using the new API. + # currently supports paired t-test with contrast or with list of conditions Parameters @@ -293,12 +298,14 @@ def cluster_test( # check if formula is present if formula is not None: - import formulaic + formulaic = _soft_import_("formulaic") # soft import # create design matrix based on formula # Create the design matrix using formulaic y, X = formulaic.model_matrix(formula, df_long) + # sign flip for paired t-test + # what to do with the design matrix? if contrast == 1: @@ -324,7 +331,7 @@ def cluster_test( if len(evokeds_a) != 1 or len(evokeds_b) != 1: raise ValueError( - f"Subject {sub_id}: Each subject must have exactly one evoked for each condition" + f"Subject {sub_id}: subject must have one evoked per cond" ) # Calculate contrast based on condition list @@ -398,6 +405,14 @@ def cluster_test( # Convert wide format to long format def convert_wide_to_long(df): + """ + Convert a DataFrame from wide to long. + + Parameters + ---------- + df : pd.DataFrame + DataFrame in wide format. 
+ """ long_format_data = [] for idx, row in df.iterrows(): condition = row["condition"] From c00859f79ecfcc889fb84a13bc0c1632a59b92c7 Mon Sep 17 00:00:00 2001 From: Eric Larson Date: Tue, 25 Jun 2024 12:25:21 -0400 Subject: [PATCH 10/88] ENH: Add dataset [skip azp] [skip actions] --- mne/datasets/config.py | 4 ++-- pyproject.toml | 3 +++ tutorials/stats-sensor-space/76_new_cluster_test_api.py | 7 +++---- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/mne/datasets/config.py b/mne/datasets/config.py index a2f2d7781b7..be6c4c49f70 100644 --- a/mne/datasets/config.py +++ b/mne/datasets/config.py @@ -90,7 +90,7 @@ # here: ↓↓↓↓↓↓↓↓ RELEASES = dict( testing="0.152", - misc="0.27", + misc="0.30", phantom_kit="0.2", ucl_opm_auditory="0.2", ) @@ -131,7 +131,7 @@ ) MNE_DATASETS["misc"] = dict( archive_name=f"{MISC_VERSIONED}.tar.gz", # 'mne-misc-data', - hash="md5:e343d3a00cb49f8a2f719d14f4758afe", + hash="md5:201d35531d3c03701cf50e38bb73481f", url=( "https://codeload.github.com/mne-tools/mne-misc-data/tar.gz/" f'{RELEASES["misc"]}' diff --git a/pyproject.toml b/pyproject.toml index 93bfb4abead..6c909263bcb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -107,6 +107,7 @@ full-no-qt = [ "snirf", "defusedxml", "neo", + "formulaic", ] full = ["mne[full-no-qt]", "PyQt6!=6.6.0", "PyQt6-Qt6!=6.6.0,!=6.7.0"] full-pyqt6 = ["mne[full]"] @@ -145,6 +146,7 @@ test_extra = [ "snirf", "neo", "mne-bids", + "formulaic", ] # Dependencies for building the documentation @@ -157,6 +159,7 @@ doc = [ "sphinxcontrib-towncrier", "memory_profiler", "neo", + "formulaic", "seaborn!=0.11.2", "sphinx_copybutton", "sphinx-design", diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 2f1d55383d2..8eb7637df53 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -6,13 +6,12 @@ from mpl_toolkits.axes_grid1 import make_axes_locatable import mne -from mne.utils import _soft_import_ +from mne.utils import _soft_import # TODO: implement formulaic design matrix for paired t-test -# TODO: @erik: add dataset to mne-data # import and load dataset -path_to_p3 = Path("C:/Users/Carina/mne_data/ERP_CORE_P3") +path_to_p3 = mne.datasets.misc.data_path() / "ERP_CORE" / "P3" def prep_sample_data(plot_evokeds: bool = False): @@ -298,7 +297,7 @@ def cluster_test( # check if formula is present if formula is not None: - formulaic = _soft_import_("formulaic") # soft import + formulaic = _soft_import("formulaic") # soft import # create design matrix based on formula # Create the design matrix using formulaic From 9c8ec900cf1ac02b13e1b8fdedeca05a8897a882 Mon Sep 17 00:00:00 2001 From: Eric Larson Date: Tue, 25 Jun 2024 12:26:57 -0400 Subject: [PATCH 11/88] FIX: One more [skip azp] [skip actions] --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index cc2f8e752d5..71cd307cca0 100644 --- a/environment.yml +++ b/environment.yml @@ -64,3 +64,4 @@ dependencies: - lazy_loader - defusedxml - python-neo + - formulaic From 47363b539250b63a654e05a3a3aebc0e89ac8b4d Mon Sep 17 00:00:00 2001 From: Eric Larson Date: Tue, 25 Jun 2024 12:39:41 -0400 Subject: [PATCH 12/88] FIX: Title [skip azp] [skip actions] --- .../stats-sensor-space/76_new_cluster_test_api.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 
8eb7637df53..f9c4f61ad5f 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -1,3 +1,15 @@ +""" +.. _tut-new-cluster-test-api: + +==================== +New cluster test API +==================== + +This tutorial shows how to use the new API for cluster testing. +""" +# License: BSD-3-Clause +# Copyright the MNE-Python contributors. + from pathlib import Path import matplotlib.pyplot as plt From 1f6221dccedc679b8af651420dbbb2068037eb26 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sun, 30 Jun 2024 20:11:28 +0200 Subject: [PATCH 13/88] first draft of formulaic paired t-test --- .../76_new_cluster_test_api.py | 342 ++++++++++++------ 1 file changed, 224 insertions(+), 118 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index f9c4f61ad5f..6a3a966bbcc 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -15,12 +15,13 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd +import scipy from mpl_toolkits.axes_grid1 import make_axes_locatable import mne from mne.utils import _soft_import -# TODO: implement formulaic design matrix for paired t-test +# TODO: test function and update docstrings # import and load dataset path_to_p3 = mne.datasets.misc.data_path() / "ERP_CORE" / "P3" @@ -248,15 +249,6 @@ def prepare_dataframe_for_cluster_function( return df -# run with original data -df = prepare_dataframe_for_cluster_function( - evokeds=original_evoked_data, condition=None, subject_index=None -) - -df = prepare_dataframe_for_cluster_function( - evokeds=shuffled_evoked_data, condition=None, subject_index=None -) - df = prepare_dataframe_for_cluster_function( evokeds=shuffled_evoked_data, condition=shuffled_conditions, @@ -267,24 +259,56 @@ def prepare_dataframe_for_cluster_function( def cluster_test( df: pd.DataFrame, formula: str = None, # Wilkinson notation formula for design matrix - contrast: bool = True, # will be replaced by formulaic design matrix n_permutations: int = 10000, seed: None | int | np.random.RandomState = None, - contrast_weights: list = (1, -1), # will be replaced by formulaic design matrix + tail: int = 0, # 0 for two-tailed, 1 for greater, -1 for less + n_jobs: int = 1, # how many cores to use + adjacency: tuple = None, + max_step: int = 1, # maximum distance between samples (time points) + exclude: list = None, # exclude no time points or channels + step_down_p: int = 0, # step down in jumps test + t_power: int = 1, # weigh each location by its stats score + out_type: str = "indices", + check_disjoint: bool = False, + buffer_size: int = None, # block size for chunking the data ): """ Run the cluster test using the new API. - # currently supports paired t-test with contrast or with list of conditions + # currently supports paired t-test Parameters ---------- dataframe : pd.DataFrame Dataframe with evoked data, conditions and subject IDs. + formula : str, optional + Wilkinson notation formula for design matrix. Default is None. n_permutations : int, optional Number of permutations. Default is 10000. + seed : None | int | np.random.RandomState, optional + Seed for the random number generator. Default is None. + tail : int, optional + 0 for two-tailed, 1 for greater, -1 for less. Default is 0. + n_jobs : int, optional + How many cores to use. Default is 1. + adjacency : None, optional + Adjacency matrix. 
Default is None. + max_step : int, optional + Maximum distance between samples (time points). Default is 1. + exclude : np.Array, optional + Exclude no time points or channels. Default is None. + step_down_p : int, optional + Step down in jumps test. Default is 0. + t_power : int, optional + Weigh each location by its stats score. Default is 1. + out_type : str, optional + Output type. Default is "indices". + check_disjoint : bool, optional + Check if clusters are disjoint. Default is False. + buffer_size : int, optional + Block size for chunking the data. Default is None. seed : int, optional - Random seed. Default is 1234. + Seed for the random number generator. Default is None. Returns ------- @@ -297,108 +321,78 @@ def cluster_test( H0 : array The permuted test statistics. """ - # Check if conditions and subject_index are present and valid - conditions_present = pd.notna(df["condition"]).all() - subject_index_present = pd.notna(df["subject_index"]).all() - + # for now this assumes a dataframe with a column for evoked data # add a data column to the dataframe (numpy array) df["data"] = [evoked.data for evoked in df.evoked] - # convert wide format to long format - df_long = convert_wide_to_long(df) + # extract number of channels and timepoints + # (eventually should also allow for frequency) + n_channels, n_timepoints = df["data"][0].shape + + # convert wide format to long format for formulaic + df_long = unpack_time_and_channels(df) + + # Pivot the DataFrame + pivot_df = df_long.pivot_table( + index=["subject_index", "channel", "timepoint"], + columns="condition", + values="value", + ).reset_index() + + # if not 2 unique conditions raise error + if len(pd.unique(df.condition)) != 2: + raise ValueError("Condition list needs to contain 2 unique values") + + # Compute the difference (assuming there are only 2 conditions) + pivot_df["y"] = pivot_df[0] - pivot_df[1] + + # Optional: Clean up the DataFrame + pivot_df = pivot_df[["subject_index", "channel", "timepoint", "y"]] # check if formula is present if formula is not None: - formulaic = _soft_import("formulaic") # soft import + formulaic = _soft_import( + "formulaic", purpose="set up Design Matrix" + ) # soft import (not a dependency for MNE) - # create design matrix based on formula + # for the paired t-test y is the difference between conditions + # X is the design matrix with a column with 1s and 0s for each participant # Create the design matrix using formulaic - y, X = formulaic.model_matrix(formula, df_long) - - # sign flip for paired t-test - - # what to do with the design matrix? 
- - if contrast == 1: - if conditions_present: - # Extract unique conditions - unique_conditions = np.unique(df.condition) - if len(unique_conditions) != 2: - raise ValueError("Condition list needs to contain 2 unique values") - # Initialize a list to hold the combined evoked data - evokeds_data = [] - if subject_index_present: - # Process each subject's evoked data - for sub_id in df.subject_index.unique(): - sub_df = df[df.subject_index == sub_id] - - # Split evokeds list based on condition list for this subject - evokeds_a = sub_df[sub_df.condition == unique_conditions[0]][ - "evoked" - ].tolist() - evokeds_b = sub_df[sub_df.condition == unique_conditions[1]][ - "evoked" - ].tolist() - - if len(evokeds_a) != 1 or len(evokeds_b) != 1: - raise ValueError( - f"Subject {sub_id}: subject must have one evoked per cond" - ) - - # Calculate contrast based on condition list - diff_evoked = mne.combine_evoked( - [evokeds_a[0], evokeds_b[0]], weights=contrast_weights - ) - evokeds_data.append(diff_evoked) - else: - # calculate length of evokeds list - n_evokeds = len(df.evoked) - # now split evokeds list in two lists - evokeds_a = df.evoked[: n_evokeds // 2] - evokeds_b = df.evoked[n_evokeds // 2 :] - # create contrast from evokeds_a and evokeds_b - diff_evoked = [ - mne.combine_evoked([evo_a, evo_b], weights=contrast_weights) - for evo_a, evo_b in zip(evokeds_a, evokeds_b) - ] - evokeds_data = diff_evoked + y, X = formulaic.model_matrix(formula, pivot_df) else: - evokeds_data = df.evoked.tolist() - - # extract number of channels - n_channels = evokeds_data[0].info["nchan"] - - # loop over rows and extract data from evokeds - data_array = np.array([evoked.data for evoked in evokeds_data]) + raise ValueError( + "Formula is required and needs to be a string in Wilkinson notation." 
+ ) - # find the dimension that is equal to n_channels - if data_array.shape[1] == n_channels: - # reshape to channels as last dimension - data = data_array.transpose(0, 2, 1) + # now prep design matrix outcome variable for input into MNE cluster function + # we initially had first channels, then timepoints, + # now we need first timepoints, then channels + y_for_cluster = y.values.reshape(-1, n_channels, n_timepoints).transpose(0, 2, 1) - adjacency, _ = mne.channels.find_ch_adjacency(evokeds_data[0].info, ch_type="eeg") + adjacency, _ = mne.channels.find_ch_adjacency(df["evoked"][0].info, ch_type="eeg") + # define stat function and threshold stat_fun, threshold = mne.stats.cluster_level._check_fun( - X=data, stat_fun=None, threshold=None, tail=0, kind="within" + X=y_for_cluster, stat_fun=None, threshold=None, tail=0, kind="within" ) - # Run the analysis + # Run the cluster-based permutation test T_obs, clusters, cluster_p_values, H0 = ( mne.stats.cluster_level._permutation_cluster_test( - [data], + [y_for_cluster], + n_permutations=10000, threshold=threshold, stat_fun=stat_fun, - n_jobs=-1, # takes all CPU cores - max_step=1, # maximum distance between samples (time points) - exclude=None, # exclude no time points or channels - step_down_p=0, # step down in jumps test - t_power=1, # weigh each location by its stats score - out_type="indices", - check_disjoint=False, - buffer_size=None, # block size for chunking the data - n_permutations=n_permutations, - tail=0, + tail=tail, + n_jobs=n_jobs, adjacency=adjacency, + max_step=max_step, # maximum distance between samples (time points) + exclude=exclude, # exclude no time points or channels + step_down_p=step_down_p, # step down in jumps test + t_power=t_power, # weigh each location by its stats score + out_type=out_type, + check_disjoint=check_disjoint, + buffer_size=buffer_size, # block size for chunking the data seed=seed, ) ) @@ -414,39 +408,44 @@ def cluster_test( return T_obs, clusters, cluster_p_values, H0 -# Convert wide format to long format -def convert_wide_to_long(df): +def unpack_time_and_channels(df): """ - Convert a DataFrame from wide to long. + Extract the time and channel data from the DataFrame. Parameters ---------- df : pd.DataFrame DataFrame in wide format. 
""" - long_format_data = [] - for idx, row in df.iterrows(): - condition = row["condition"] - subject_index = row["subject_index"] - data_2d = row["data"] - - for channel in range(data_2d.shape[0]): - for timepoint in range(data_2d.shape[1]): - long_format_data.append( - { - "condition": condition, - "subject_index": subject_index, - "channel": channel, - "timepoint": timepoint, - "value": data_2d[channel, timepoint], - } - ) + # Extracting all necessary data using list comprehensions for better performance + long_format_data = [ + { + "condition": row["condition"], + "subject_index": row["subject_index"], + "channel": channel, + "timepoint": timepoint, + "value": row["data"][channel, timepoint], + } + for idx, row in df.iterrows() + for channel in range(row["data"].shape[0]) + for timepoint in range(row["data"].shape[1]) + ] + # Creating the long format DataFrame df_long = pd.DataFrame(long_format_data) + return df_long -df_long = convert_wide_to_long(df) +# Example usage +# Sample wide format DataFrame +df_wide = pd.DataFrame( + { + "condition": ["A", "B"], + "subject_index": [1, 2], + "data": [np.array([[1, 2, 3], [4, 5, 6]]), np.array([[7, 8, 9], [10, 11, 12]])], + } +) def plot_cluster( @@ -553,4 +552,111 @@ def plot_cluster( plt.show() -cluster_test(df) +# translated the limo permutation ttest from matlab to python +def limo_ttest_permute(Data, n_perm=None): + """ + Pseudo one-sample t-test using sign-test with permutations. + + Parameters + ---------- + Data (numpy.ndarray): A matrix of data for the one-sample t-test. + Shape can be (n_channels, n_var, n_obs) or + (n_var, n_obs). + n_perm (int, optional): Number of permutations to perform. + If None, it defaults based on the number of observations. + + Returns + ------- + t_vals (numpy.ndarray): t-values under H0. + p_vals (numpy.ndarray): p-values under H0. + dfe (int): Degrees of freedom. + """ + # Check inputs and reshape if necessary + if Data.ndim == 3: + n_channels, n_var, n_obs = Data.shape + else: + n_channels = 1 + n_var, n_obs = Data.shape + Data = Data[np.newaxis, ...] + + # Warn if the number of observations is very small + if n_obs < 7: + n_psbl_prms = 2**n_obs + print( + f"Due to the very limited number of observations, " + f"the total number of possible permutations is small ({n_psbl_prms}). " + "Thus, only a limited number of p-values are possible " + "and the test might be overly conservative." + ) + + # Set up permutation test + if n_obs <= 12: + n_perm = 2**n_obs # total number of possible permutations + exact = True + print( + "Due to the limited number of observations, all possible permutations " + "of the data will be computed instead of random permutations." 
+ ) + else: + exact = False + if n_perm is None: + n_perm = 1000 + + print(f"Executing permutation test with {n_perm} permutations...") + + # Initialize variables + t_vals = np.full( + (n_channels, n_var, n_perm), np.nan + ) # Array to store t-values for each permutation + sqrt_nXnM1 = np.sqrt( + n_obs * (n_obs - 1) + ) # Precompute constant for t-value calculation + dfe = n_obs - 1 # Degrees of freedom + + if exact: + # Use all possible permutations + for perm in range(n_perm): + # Set sign of each trial / participant's data + temp = np.array( + [int(x) for x in bin(perm)[2:].zfill(n_obs)] + ) # Convert perm index to binary array + sn = np.where(temp == 0, -1, 1) # Map 0 to -1 and 1 to 1 + sn_mtrx = np.tile(sn, (n_var, 1)).T # Repeat sn for each variable + + for c in range(n_channels): + data = Data[c, :, :] + d_perm = data * sn_mtrx # Apply sign flip to data + + # Compute t-score of permuted data + sm = np.sum(d_perm, axis=1) # Sum of permuted data + mn = sm / n_obs # Mean of permuted data + sm_sqrs = ( + np.sum(d_perm**2, axis=1) - (sm**2) / n_obs + ) # Sum of squares for standard error + stder = np.sqrt(sm_sqrs) / sqrt_nXnM1 # Standard error + t_vals[c, :, perm] = mn / stder # Compute t-values + + else: + # Use random permutations + for perm in range(n_perm): + # Randomly set sign of each trial / participant's data + sn = (np.random.rand(n_obs) > 0.5) * 2 - 1 # Generate random sign flips + sn_mtrx = np.tile(sn, (n_var, 1)) # Repeat sn for each variable + + for c in range(n_channels): + data = Data[c, :, :] + d_perm = data * sn_mtrx # Apply sign flip to data + + # Compute t-score of permuted data + sm = np.sum(d_perm, axis=1) # Sum of permuted data + mn = sm / n_obs # Mean of permuted data + sm_sqrs = ( + np.sum(d_perm**2, axis=1) - (sm**2) / n_obs + ) # Sum of squares for standard error + stder = np.sqrt(sm_sqrs) / sqrt_nXnM1 # Standard error + t_vals[c, :, perm] = mn / stder # Compute t-values + + # Compute p-values from t-values + p_vals = 2 * scipy.stats.cdf(-np.abs(t_vals), dfe) + + return t_vals, p_vals, dfe From 37616e53d32b9bbf0371109ec1bc8aadd5de4e8a Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sat, 6 Jul 2024 10:36:55 +0200 Subject: [PATCH 14/88] first draft without cluster plotting class implemented --- mne/stats/cluster_level.py | 293 +++++++ .../76_new_cluster_test_api.py | 722 +++--------------- 2 files changed, 392 insertions(+), 623 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index 76ae11bab7c..d3813c57817 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -10,12 +10,17 @@ # License: BSD-3-Clause # Copyright the MNE-Python contributors. +import matplotlib.pyplot as plt import numpy as np +import pandas as pd +from mpl_toolkits.axes_grid1 import make_axes_locatable from scipy import ndimage, sparse from scipy.sparse.csgraph import connected_components from scipy.stats import f as fstat from scipy.stats import t as tstat +from .. 
import EvokedArray
+from ..channels import find_ch_adjacency
 from ..fixes import has_numba, jit
 from ..parallel import parallel_func
 from ..source_estimate import MixedSourceEstimate, SourceEstimate, VolSourceEstimate
@@ -24,6 +29,7 @@
     ProgressBar,
     _check_option,
     _pl,
+    _soft_import,
     _validate_type,
     check_random_state,
     logger,
@@ -31,6 +37,7 @@
     verbose,
     warn,
 )
+from ..viz import plot_compare_evokeds
 from .parametric import f_oneway, ttest_1samp_no_p


@@ -1729,3 +1736,289 @@ def summarize_clusters_stc(
     data_summary[:, 0] = np.sum(data_summary, axis=1)

     return klass(data_summary, vertices, tmin, tstep, subject)
+
+
+def cluster_test(
+    df: pd.DataFrame,
+    formula: str = None,  # Wilkinson notation formula for design matrix
+    n_permutations: int = 10000,
+    seed: None | int | np.random.RandomState = None,
+    tail: int = 0,  # 0 for two-tailed, 1 for greater, -1 for less
+    n_jobs: int = 1,  # how many cores to use
+    adjacency: tuple = None,
+    max_step: int = 1,  # maximum distance between samples (time points)
+    exclude: list = None,  # exclude no time points or channels
+    step_down_p: int = 0,  # step down in jumps test
+    t_power: int = 1,  # weigh each location by its stats score
+    out_type: str = "indices",
+    check_disjoint: bool = False,
+    buffer_size: int = None,  # block size for chunking the data
+):
+    """
+    Run the cluster test using the new API.
+
+    # currently supports paired t-test
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Dataframe with evoked data, conditions and subject IDs.
+    formula : str, optional
+        Wilkinson notation formula for design matrix. Default is None.
+    n_permutations : int, optional
+        Number of permutations. Default is 10000.
+    seed : None | int | np.random.RandomState, optional
+        Seed for the random number generator. Default is None.
+    tail : int, optional
+        0 for two-tailed, 1 for greater, -1 for less. Default is 0.
+    n_jobs : int, optional
+        How many cores to use. Default is 1.
+    adjacency : tuple | None, optional
+        Adjacency matrix. Default is None.
+    max_step : int, optional
+        Maximum distance between samples (time points). Default is 1.
+    exclude : list | None, optional
+        Time points or channels to exclude from the test. Default is None.
+    step_down_p : int, optional
+        Step down in jumps test. Default is 0.
+    t_power : int, optional
+        Weigh each location by its stats score. Default is 1.
+    out_type : str, optional
+        Output type. Default is "indices".
+    check_disjoint : bool, optional
+        Check if clusters are disjoint. Default is False.
+    buffer_size : int, optional
+        Block size for chunking the data. Default is None.
+
+    Returns
+    -------
+    T_obs : array
+        The observed test statistic.
+    clusters : list
+        List of clusters.
+    cluster_p_values : array
+        Array of cluster p-values.
+    H0 : array
+        The permuted test statistics.
+ """ + # for now this assumes a dataframe with a column for evoked data or epochs + # add a data column to the dataframe (numpy array) + df["data"] = [evoked.data for evoked in df.evoked] + + # extract number of channels and timepoints + # (eventually should also allow for frequency) + n_channels, n_timepoints = df["data"][0].shape + + # convert wide format to long format for formulaic + df_long = unpack_time_and_channels(df) + + # Pivot the DataFrame + pivot_df = df_long.pivot_table( + index=["subject_index", "channel", "timepoint"], + columns="condition", + values="value", + ).reset_index() + + # if not 2 unique conditions raise error + if len(pd.unique(df.condition)) != 2: + raise ValueError("Condition list needs to contain 2 unique values") + + # Get the unique conditions + conditions = np.unique(df.condition) + + # Compute the difference (assuming there are only 2 conditions) + pivot_df["evoked"] = pivot_df[conditions[0]] - pivot_df[conditions[1]] + + # Optional: Clean up the DataFrame + pivot_df = pivot_df[["subject_index", "channel", "timepoint", "evoked"]] + + # check if formula is present + if formula is not None: + formulaic = _soft_import( + "formulaic", purpose="set up Design Matrix" + ) # soft import (not a dependency for MNE) + + # for the paired t-test y is the difference between conditions + # X is the design matrix with a column with 1s and 0s for each participant + # Create the design matrix using formulaic + y, X = formulaic.model_matrix(formula, pivot_df) + else: + raise ValueError( + "Formula is required and needs to be a string in Wilkinson notation." + ) + + # now prep design matrix outcome variable for input into MNE cluster function + # we initially had first channels, then timepoints, + # now we need first timepoints, then channels + y_for_cluster = y.values.reshape(-1, n_channels, n_timepoints).transpose(0, 2, 1) + + adjacency, _ = find_ch_adjacency(df["evoked"][0].info, ch_type="eeg") + + # define stat function and threshold + stat_fun, threshold = _check_fun( + X=y_for_cluster, stat_fun=None, threshold=None, tail=0, kind="within" + ) + + # Run the cluster-based permutation test + T_obs, clusters, cluster_p_values, H0 = _permutation_cluster_test( + [y_for_cluster], + n_permutations=10000, + threshold=threshold, + stat_fun=stat_fun, + tail=tail, + n_jobs=n_jobs, + adjacency=adjacency, + max_step=max_step, # maximum distance between samples (time points) + exclude=exclude, # exclude no time points or channels + step_down_p=step_down_p, # step down in jumps test + t_power=t_power, # weigh each location by its stats score + out_type=out_type, + check_disjoint=check_disjoint, + buffer_size=buffer_size, # block size for chunking the data + seed=seed, + ) + + print(min(cluster_p_values)) + + return T_obs, clusters, cluster_p_values, H0 + + +def unpack_time_and_channels(df): + """ + Extract the time and channel data from the DataFrame. + + Parameters + ---------- + df : pd.DataFrame + DataFrame in wide format. 
+ """ + # Extracting all necessary data using list comprehensions for better performance + long_format_data = [ + { + "condition": row["condition"], + "subject_index": row["subject_index"], + "channel": channel, + "timepoint": timepoint, + "value": row["data"][channel, timepoint], + } + for idx, row in df.iterrows() + for channel in range(row["data"].shape[0]) + for timepoint in range(row["data"].shape[1]) + ] + + # Creating the long format DataFrame + df_long = pd.DataFrame(long_format_data) + + return df_long + + +def plot_cluster( + contrast, target_only, non_target_only, T_obs, clusters, cluster_p_values +): + """ + Plot the cluster with the lowest p-value. + + Parameters + ---------- + contrast : list + List of contrast evoked objects. + target_only : list + List of target evoked objects. + non_target_only : list + List of non-target evoked objects. + T_obs : array + The observed test statistic. + clusters : list + List of clusters. + cluster_p_values : array + Array of cluster p-values. + + Returns + ------- + None + + """ + # configure variables for visualization + colors = {"target": "crimson", "non-target": "steelblue"} + + # organize data for plotting + evokeds = {"target": target_only, "non-target": non_target_only} + + lowest_p_cluster = np.argmin(cluster_p_values) + + # plot the cluster with the lowest p-value + time_inds, space_inds = np.squeeze(clusters[lowest_p_cluster]) + ch_inds = np.unique(space_inds) + time_inds = np.unique(time_inds) + + # get topography for F stat + t_map = T_obs[time_inds, ...].mean(axis=0) + + # get signals at the sensors contributing to the cluster + sig_times = contrast[0].times[time_inds] + + # create spatial mask + mask = np.zeros((t_map.shape[0], 1), dtype=bool) + mask[ch_inds, :] = True + + # initialize figure + fig, ax_topo = plt.subplots(1, 1, figsize=(10, 3), layout="constrained") + + # plot average test statistic and mark significant sensors + t_evoked = EvokedArray(t_map[:, np.newaxis], contrast[0].info, tmin=0) + t_evoked.plot_topomap( + times=0, + mask=mask, + axes=ax_topo, + cmap="Reds", + vlim=(np.min, np.max), + show=False, + colorbar=False, + mask_params=dict(markersize=10), + ) + image = ax_topo.images[0] + + # remove the title that would otherwise say "0.000 s" + ax_topo.set_title("") + + # soft import? 
+ # make_axes_locatable = _soft_import( + # "mpl_toolkits.axes_grid1.make_axes_locatable", + # purpose="plot cluster results" + # ) # soft import (not a dependency for MNE) + + # create additional axes (for ERF and colorbar) + divider = make_axes_locatable(ax_topo) + + # add axes for colorbar + ax_colorbar = divider.append_axes("right", size="5%", pad=0.05) + plt.colorbar(image, cax=ax_colorbar) + ax_topo.set_xlabel( + "Averaged t-map ({:0.3f} - {:0.3f} s)".format(*sig_times[[0, -1]]) + ) + + # add new axis for time courses and plot time courses + ax_signals = divider.append_axes("right", size="300%", pad=1.2) + title = f"Cluster #1, {len(ch_inds)} sensor" + if len(ch_inds) > 1: + title += "s (mean)" + plot_compare_evokeds( + evokeds, + title=title, + picks=ch_inds, + axes=ax_signals, + colors=colors, + show=False, + split_legend=True, + truncate_yaxis="auto", + ) + + # plot temporal cluster extent + ymin, ymax = ax_signals.get_ylim() + ax_signals.fill_betweenx( + (ymin, ymax), sig_times[0], sig_times[-1], color="green", alpha=0.3 + ) + + plt.show() diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 6a3a966bbcc..ec8bd8275a1 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -1,662 +1,138 @@ """ .. _tut-new-cluster-test-api: -==================== -New cluster test API -==================== +=============================================================== +New cluster test API that allows for Wilkinson style formulas +=============================================================== This tutorial shows how to use the new API for cluster testing. +This script shows how to estimate significant clusters in +evoked contrast data of multiple subjects. +It uses a non-parametric statistical procedure based on permutations and +cluster level statistics. + +The procedure consists of: + + - loading evoked data from multiple subjects + - construct a dataframe that contains the difference between conditions + - run the new cluster test function + +Here, the unit of observation are evokeds from multiple subjects (2nd level analysis). + +For more information on cluster-based permutation testing in MNE-Python, +see also: :ref:`tut-cluster-one-samp-tfr`. """ +# Authors: Carina Forster +# # License: BSD-3-Clause # Copyright the MNE-Python contributors. 
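+# Editorial aside (not part of the original tutorial): the design-matrix step
+# used later in this tutorial relies on the optional ``formulaic`` package.
+# A minimal sketch of what a Wilkinson formula produces, kept commented out
+# here because ``formulaic`` is an optional dependency (the toy data is made
+# up purely for illustration):
+#
+#     import formulaic
+#     import pandas as pd
+#
+#     toy = pd.DataFrame(
+#         {"y": [0.1, -0.2, 0.3, 0.0], "subject": ["s1", "s1", "s2", "s2"]}
+#     )
+#     # unpacks into the outcome vector (left-hand side) and the design
+#     # matrix (an intercept plus a dummy-coded subject column)
+#     y, X = formulaic.model_matrix("y ~ 1 + C(subject)", toy)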
+# %% + from pathlib import Path -import matplotlib.pyplot as plt -import numpy as np import pandas as pd -import scipy -from mpl_toolkits.axes_grid1 import make_axes_locatable import mne -from mne.utils import _soft_import - -# TODO: test function and update docstrings -# import and load dataset +# Set parameters +# -------------- +# Define the path to the P3 dataset path_to_p3 = mne.datasets.misc.data_path() / "ERP_CORE" / "P3" +# Define the range of participant IDs +participant_ids = range(15, 20) # This will cover 015 to 019 -def prep_sample_data(plot_evokeds: bool = False): - """Load the P3 dataset.""" - # Define the range of participant IDs - participant_ids = range(15, 20) # This will cover 015 to 019 - - evokeds_allsubs = [] - - # Loop over each participant ID and generate the corresponding filename - for pid in participant_ids: - # Create the filename using an f-string, ID is zero-padded to 3 digits - filename_p3 = f"sub-{pid:03d}_ses-P3_task-P3_ave.fif" - - # Print the filename (or perform your desired operations on it) - print(filename_p3) - - p3_file_path = Path(path_to_p3) / filename_p3 - - evokeds = mne.read_evokeds(p3_file_path) - - # add to list - evokeds_allsubs.append(evokeds) - - target_only = [evoked[0] for evoked in evokeds_allsubs] - non_target_only = [evoked[1] for evoked in evokeds_allsubs] - contrast = [evoked[2] for evoked in evokeds_allsubs] - - if plot_evokeds: - # plot the grand average - mne.grand_average(target_only).plot() - mne.grand_average(non_target_only).plot() - mne.grand_average(contrast).plot() - - # create contrast from evokeds target and non-target - diff_evoked = [ - mne.combine_evoked([evokeds_a, evokeds_b], weights=[1, -1]) - for evokeds_a, evokeds_b in zip(target_only, non_target_only) - ] - - if plot_evokeds: - mne.grand_average(diff_evoked).plot() - - # crop the evokeds in the post stimulus window - contrast = [evokeds.crop(tmin=-0.1, tmax=0.6) for evokeds in contrast] - target_only = [evokeds.crop(tmin=-0.1, tmax=0.6) for evokeds in target_only] - non_target_only = [evokeds.crop(tmin=-0.1, tmax=0.6) for evokeds in non_target_only] - - return contrast, target_only, non_target_only - - -def old_api_cluster(n_permutations: int = 10000, seed: int = 1234): - """ - Run the cluster test using the old API to get a benchmark result for the new API. - - Currently implementing a paired t-test with contrast between participants. 
- """ - contrast, target_only, non_target_only = prep_sample_data() - - # extract the data for each evoked and store in numpy array - data = np.array([evoked.data for evoked in contrast]) - - # shape should be (n_subjects, n_channels, n_times) - data.shape - - # reshape to channels as last dimension - data = data.transpose(0, 2, 1) - - data.shape +# store the evoked data of all subjects +evokeds_allsubs = [] - adjacency, _ = mne.channels.find_ch_adjacency(contrast[0].info, ch_type="eeg") +# Loop over each participant ID and generate the corresponding filename +for pid in participant_ids: + # Create the filename using an f-string, ID is zero-padded to 3 digits + filename_p3 = f"sub-{pid:03d}_ses-P3_task-P3_ave.fif" - stat_fun, threshold = mne.stats.cluster_level._check_fun( - X=data, stat_fun=None, threshold=None, tail=0, kind="within" - ) + # Create the full path to the file + p3_file_path = Path(path_to_p3) / filename_p3 - # Run the analysis - T_obs, clusters, cluster_p_values, H0 = ( - mne.stats.cluster_level._permutation_cluster_test( - [data], - threshold=threshold, - stat_fun=stat_fun, - n_jobs=-1, # takes all CPU cores - max_step=1, # maximum distance between samples (time points) - exclude=None, # exclude no time points or channels - step_down_p=0, # step down in jumps test - t_power=1, # weigh each location by its stats score - out_type="indices", - check_disjoint=False, - buffer_size=None, # block size for chunking the data - n_permutations=n_permutations, - tail=0, - adjacency=adjacency, - seed=seed, - ) - ) + # load the evoked data + evokeds = mne.read_evokeds(p3_file_path) - print(min(cluster_p_values)) + # add subjects evoked data to list + evokeds_allsubs.append(evokeds) - plot_cluster( - contrast, target_only, non_target_only, T_obs, clusters, cluster_p_values - ) +# the P3b dataset is part of the freely available ERP CORE dataset +# participants were presented with a visual oddball task +# and the P3b component was analyzed +# the conditions of interest are the target (rare visual stimuli) +# and non-target stimuli (frequency visual stimuli) - return T_obs, clusters, cluster_p_values, H0 +# let's extract the target and non-target evokeds +target_only = [evoked[0] for evoked in evokeds_allsubs] +non_target_only = [evoked[1] for evoked in evokeds_allsubs] +# let's first have a look at the data +# create contrast from target and non-target evokeds +diff_evoked = [ + mne.combine_evoked([evokeds_a, evokeds_b], weights=[1, -1]) + for evokeds_a, evokeds_b in zip(target_only, non_target_only) +] -def create_random_evokeds_id_condition_list(): - """ - Create a list of shuffled participant IDs, conditions, and evoked data. +# plot the grand average of the difference signal +mne.grand_average(diff_evoked).plot() +# plot the topography of the difference signal +mne.grand_average(diff_evoked).plot_topomap() - # Keep the participant IDs and conditions paired but shuffle the order of the data. 
- """ - import random +# we can see that the strongest difference is around 400 ms in +# visual channels (occipital region) - _, evoked_data_a, evoked_data_b = prep_sample_data() +# Next we prepare a dataframe for the cluster test function +# the dataframe should contain the contrast evoked data and the subject index +# each row in the dataframe should represent one observation (evoked data) - # Example participant IDs - participant_ids = ["p1", "p2", "p3", "p4", "p5"] * 2 +# save the evoked data for both conditions in one list +evokeds_conditions = target_only + non_target_only - # Combine the evoked data into a single list - all_evoked_data = evoked_data_a + evoked_data_b +# set up a list that defines the condition for each evoked data +# this will be used to create the conditions column in the dataframe +conditions = ["target"] * len(target_only) + ["non-target"] * len(non_target_only) - # Create a corresponding list of conditions - conditions = [1] * len(evoked_data_a) + [0] * len(evoked_data_b) +# finally add a column that defines the subject index +# this will be used to create the subject_index column in the dataframe +# we multiply the participant_ids by 2 to account for the two conditions +subject_index = list(participant_ids) * 2 - # Combine the participant IDs, conditions, and evoked data into a list of tuples - combined_list = list(zip(participant_ids, conditions, all_evoked_data)) - - # Shuffle the combined list - random.shuffle(combined_list) - - # Separate the shuffled list back into participant IDs, conditions, and evoked data - shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data = zip( - *combined_list - ) - - # Convert the tuples back to lists - shuffled_participant_ids = list(shuffled_participant_ids) - shuffled_conditions = list(shuffled_conditions) - shuffled_evoked_data = list(shuffled_evoked_data) - - return shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data - - -def create_random_paired_evokeds_list(): - """ - Create shuffled paired evoked data. - - Create a list of shuffled evoked data where each pair of target - and non-target evoked data is shuffled together. - """ - import random - - _, evoked_data_a, evoked_data_b = prep_sample_data() - - # Ensure evoked_data_a and evoked_data_b are of the same length - assert len(evoked_data_a) == len( - evoked_data_b - ), "evoked_data_a and evoked_data_b must have the same length" - - # Create a list of participant indices - participant_indices = list(range(len(evoked_data_a))) - - # Shuffle the list of participant indices - random.shuffle(participant_indices) - - # Reorder evoked data according to the shuffled participant indices - shuffled_evoked_data_a = [evoked_data_a[i] for i in participant_indices] - shuffled_evoked_data_b = [evoked_data_b[i] for i in participant_indices] - - # Combine the shuffled evoked data into a single list - shuffled_evoked_data = shuffled_evoked_data_a + shuffled_evoked_data_b - - # Combine the original evoked data into a single list - original_evoked_data = evoked_data_a + evoked_data_b - - return original_evoked_data, shuffled_evoked_data - - -# shuffle order of pairs -original_evoked_data, shuffled_evoked_data = create_random_paired_evokeds_list() -# shouldn't change the results (p-value is different though?) 
- -shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data = ( - create_random_evokeds_id_condition_list() +# create the dataframe +df = pd.DataFrame( + { + "evoked": evokeds_conditions, + "condition": conditions, + "subject_index": subject_index, + } ) +# now we can run the cluster test function +# we will use the new API that allows for Wilkinson style formulas +# the formula should be a string in Wilkinson notation -def prepare_dataframe_for_cluster_function( - evokeds: list = None, - condition: list = None, - subject_index: list = None, -): - """ - Prepare a dataframe for the cluster test function. - - Parameters - ---------- - contrast : bool, optional - If True, a contrast is calculated. Default is False. - evokeds : list, optional - List of evoked objects. Default is None. - condition : list, optional - List of conditions for each evoked object. Default is None. - subject_index : list, optional - List of subject IDs. Default is None. - - Returns - ------- - df : DataFrame - The prepared DataFrame for the cluster test function. - """ - # Initialize the DataFrame with evoked data - df = pd.DataFrame( - { - "evoked": evokeds, - "condition": condition if condition is not None else pd.NA, - "subject_index": subject_index if subject_index is not None else pd.NA, - } - ) - - return df - - -df = prepare_dataframe_for_cluster_function( - evokeds=shuffled_evoked_data, - condition=shuffled_conditions, - subject_index=shuffled_participant_ids, -) +# we want to test whether there is a significant difference between +# target and non-target stimuli in the post-stimulus window +# we will use a cluster-based permutation paired t-test for this +# let's first define the formula based on Wilkinson notation +formula = "evoked ~ 1 + C(subject_index)" -def cluster_test( - df: pd.DataFrame, - formula: str = None, # Wilkinson notation formula for design matrix - n_permutations: int = 10000, - seed: None | int | np.random.RandomState = None, - tail: int = 0, # 0 for two-tailed, 1 for greater, -1 for less - n_jobs: int = 1, # how many cores to use - adjacency: tuple = None, - max_step: int = 1, # maximum distance between samples (time points) - exclude: list = None, # exclude no time points or channels - step_down_p: int = 0, # step down in jumps test - t_power: int = 1, # weigh each location by its stats score - out_type: str = "indices", - check_disjoint: bool = False, - buffer_size: int = None, # block size for chunking the data -): - """ - Run the cluster test using the new API. - - # currently supports paired t-test - - Parameters - ---------- - dataframe : pd.DataFrame - Dataframe with evoked data, conditions and subject IDs. - formula : str, optional - Wilkinson notation formula for design matrix. Default is None. - n_permutations : int, optional - Number of permutations. Default is 10000. - seed : None | int | np.random.RandomState, optional - Seed for the random number generator. Default is None. - tail : int, optional - 0 for two-tailed, 1 for greater, -1 for less. Default is 0. - n_jobs : int, optional - How many cores to use. Default is 1. - adjacency : None, optional - Adjacency matrix. Default is None. - max_step : int, optional - Maximum distance between samples (time points). Default is 1. - exclude : np.Array, optional - Exclude no time points or channels. Default is None. - step_down_p : int, optional - Step down in jumps test. Default is 0. - t_power : int, optional - Weigh each location by its stats score. Default is 1. - out_type : str, optional - Output type. 
Default is "indices". - check_disjoint : bool, optional - Check if clusters are disjoint. Default is False. - buffer_size : int, optional - Block size for chunking the data. Default is None. - seed : int, optional - Seed for the random number generator. Default is None. - - Returns - ------- - T_obs : array - The observed test statistic. - clusters : list - List of clusters. - cluster_p_values : array - Array of cluster p-values. - H0 : array - The permuted test statistics. - """ - # for now this assumes a dataframe with a column for evoked data - # add a data column to the dataframe (numpy array) - df["data"] = [evoked.data for evoked in df.evoked] - - # extract number of channels and timepoints - # (eventually should also allow for frequency) - n_channels, n_timepoints = df["data"][0].shape - - # convert wide format to long format for formulaic - df_long = unpack_time_and_channels(df) - - # Pivot the DataFrame - pivot_df = df_long.pivot_table( - index=["subject_index", "channel", "timepoint"], - columns="condition", - values="value", - ).reset_index() - - # if not 2 unique conditions raise error - if len(pd.unique(df.condition)) != 2: - raise ValueError("Condition list needs to contain 2 unique values") - - # Compute the difference (assuming there are only 2 conditions) - pivot_df["y"] = pivot_df[0] - pivot_df[1] - - # Optional: Clean up the DataFrame - pivot_df = pivot_df[["subject_index", "channel", "timepoint", "y"]] - - # check if formula is present - if formula is not None: - formulaic = _soft_import( - "formulaic", purpose="set up Design Matrix" - ) # soft import (not a dependency for MNE) - - # for the paired t-test y is the difference between conditions - # X is the design matrix with a column with 1s and 0s for each participant - # Create the design matrix using formulaic - y, X = formulaic.model_matrix(formula, pivot_df) - else: - raise ValueError( - "Formula is required and needs to be a string in Wilkinson notation." 
- ) - - # now prep design matrix outcome variable for input into MNE cluster function - # we initially had first channels, then timepoints, - # now we need first timepoints, then channels - y_for_cluster = y.values.reshape(-1, n_channels, n_timepoints).transpose(0, 2, 1) - - adjacency, _ = mne.channels.find_ch_adjacency(df["evoked"][0].info, ch_type="eeg") - - # define stat function and threshold - stat_fun, threshold = mne.stats.cluster_level._check_fun( - X=y_for_cluster, stat_fun=None, threshold=None, tail=0, kind="within" - ) - - # Run the cluster-based permutation test - T_obs, clusters, cluster_p_values, H0 = ( - mne.stats.cluster_level._permutation_cluster_test( - [y_for_cluster], - n_permutations=10000, - threshold=threshold, - stat_fun=stat_fun, - tail=tail, - n_jobs=n_jobs, - adjacency=adjacency, - max_step=max_step, # maximum distance between samples (time points) - exclude=exclude, # exclude no time points or channels - step_down_p=step_down_p, # step down in jumps test - t_power=t_power, # weigh each location by its stats score - out_type=out_type, - check_disjoint=check_disjoint, - buffer_size=buffer_size, # block size for chunking the data - seed=seed, - ) - ) - - print(min(cluster_p_values)) - - # need to adjust plotting function for contrast only data - contrast, evokeds_a, evokeds_b = prep_sample_data() - - # plot cluster - plot_cluster(contrast, evokeds_a, evokeds_b, T_obs, clusters, cluster_p_values) - - return T_obs, clusters, cluster_p_values, H0 - - -def unpack_time_and_channels(df): - """ - Extract the time and channel data from the DataFrame. - - Parameters - ---------- - df : pd.DataFrame - DataFrame in wide format. - """ - # Extracting all necessary data using list comprehensions for better performance - long_format_data = [ - { - "condition": row["condition"], - "subject_index": row["subject_index"], - "channel": channel, - "timepoint": timepoint, - "value": row["data"][channel, timepoint], - } - for idx, row in df.iterrows() - for channel in range(row["data"].shape[0]) - for timepoint in range(row["data"].shape[1]) - ] - - # Creating the long format DataFrame - df_long = pd.DataFrame(long_format_data) - - return df_long - - -# Example usage -# Sample wide format DataFrame -df_wide = pd.DataFrame( - { - "condition": ["A", "B"], - "subject_index": [1, 2], - "data": [np.array([[1, 2, 3], [4, 5, 6]]), np.array([[7, 8, 9], [10, 11, 12]])], - } +# run the cluster test +T_obs, clusters, cluster_p_values, H0 = mne.stats.cluster_level.cluster_test( + df=df, formula=formula ) - -def plot_cluster( - contrast, target_only, non_target_only, T_obs, clusters, cluster_p_values -): - """ - Plot the cluster with the lowest p-value. - - Parameters - ---------- - contrast : list - List of contrast evoked objects. - target_only : list - List of target evoked objects. - non_target_only : list - List of non-target evoked objects. - T_obs : array - The observed test statistic. - clusters : list - List of clusters. - cluster_p_values : array - Array of cluster p-values. 
- - Returns - ------- - None - - """ - # configure variables for visualization - colors = {"target": "crimson", "non-target": "steelblue"} - - # organize data for plotting - evokeds = {"target": target_only, "non-target": non_target_only} - - lowest_p_cluster = np.argmin(cluster_p_values) - - # plot the cluster with the lowest p-value - time_inds, space_inds = np.squeeze(clusters[lowest_p_cluster]) - ch_inds = np.unique(space_inds) - time_inds = np.unique(time_inds) - - # get topography for F stat - t_map = T_obs[time_inds, ...].mean(axis=0) - - # get signals at the sensors contributing to the cluster - sig_times = contrast[0].times[time_inds] - - # create spatial mask - mask = np.zeros((t_map.shape[0], 1), dtype=bool) - mask[ch_inds, :] = True - - # initialize figure - fig, ax_topo = plt.subplots(1, 1, figsize=(10, 3), layout="constrained") - - # plot average test statistic and mark significant sensors - t_evoked = mne.EvokedArray(t_map[:, np.newaxis], contrast[0].info, tmin=0) - t_evoked.plot_topomap( - times=0, - mask=mask, - axes=ax_topo, - cmap="Reds", - vlim=(np.min, np.max), - show=False, - colorbar=False, - mask_params=dict(markersize=10), - ) - image = ax_topo.images[0] - - # remove the title that would otherwise say "0.000 s" - ax_topo.set_title("") - - # create additional axes (for ERF and colorbar) - divider = make_axes_locatable(ax_topo) - - # add axes for colorbar - ax_colorbar = divider.append_axes("right", size="5%", pad=0.05) - plt.colorbar(image, cax=ax_colorbar) - ax_topo.set_xlabel( - "Averaged t-map ({:0.3f} - {:0.3f} s)".format(*sig_times[[0, -1]]) - ) - - # add new axis for time courses and plot time courses - ax_signals = divider.append_axes("right", size="300%", pad=1.2) - title = f"Cluster #1, {len(ch_inds)} sensor" - if len(ch_inds) > 1: - title += "s (mean)" - mne.viz.plot_compare_evokeds( - evokeds, - title=title, - picks=ch_inds, - axes=ax_signals, - colors=colors, - show=False, - split_legend=True, - truncate_yaxis="auto", - ) - - # plot temporal cluster extent - ymin, ymax = ax_signals.get_ylim() - ax_signals.fill_betweenx( - (ymin, ymax), sig_times[0], sig_times[-1], color="green", alpha=0.3 - ) - - plt.show() - - -# translated the limo permutation ttest from matlab to python -def limo_ttest_permute(Data, n_perm=None): - """ - Pseudo one-sample t-test using sign-test with permutations. - - Parameters - ---------- - Data (numpy.ndarray): A matrix of data for the one-sample t-test. - Shape can be (n_channels, n_var, n_obs) or - (n_var, n_obs). - n_perm (int, optional): Number of permutations to perform. - If None, it defaults based on the number of observations. - - Returns - ------- - t_vals (numpy.ndarray): t-values under H0. - p_vals (numpy.ndarray): p-values under H0. - dfe (int): Degrees of freedom. - """ - # Check inputs and reshape if necessary - if Data.ndim == 3: - n_channels, n_var, n_obs = Data.shape - else: - n_channels = 1 - n_var, n_obs = Data.shape - Data = Data[np.newaxis, ...] - - # Warn if the number of observations is very small - if n_obs < 7: - n_psbl_prms = 2**n_obs - print( - f"Due to the very limited number of observations, " - f"the total number of possible permutations is small ({n_psbl_prms}). " - "Thus, only a limited number of p-values are possible " - "and the test might be overly conservative." 
- ) - - # Set up permutation test - if n_obs <= 12: - n_perm = 2**n_obs # total number of possible permutations - exact = True - print( - "Due to the limited number of observations, all possible permutations " - "of the data will be computed instead of random permutations." - ) - else: - exact = False - if n_perm is None: - n_perm = 1000 - - print(f"Executing permutation test with {n_perm} permutations...") - - # Initialize variables - t_vals = np.full( - (n_channels, n_var, n_perm), np.nan - ) # Array to store t-values for each permutation - sqrt_nXnM1 = np.sqrt( - n_obs * (n_obs - 1) - ) # Precompute constant for t-value calculation - dfe = n_obs - 1 # Degrees of freedom - - if exact: - # Use all possible permutations - for perm in range(n_perm): - # Set sign of each trial / participant's data - temp = np.array( - [int(x) for x in bin(perm)[2:].zfill(n_obs)] - ) # Convert perm index to binary array - sn = np.where(temp == 0, -1, 1) # Map 0 to -1 and 1 to 1 - sn_mtrx = np.tile(sn, (n_var, 1)).T # Repeat sn for each variable - - for c in range(n_channels): - data = Data[c, :, :] - d_perm = data * sn_mtrx # Apply sign flip to data - - # Compute t-score of permuted data - sm = np.sum(d_perm, axis=1) # Sum of permuted data - mn = sm / n_obs # Mean of permuted data - sm_sqrs = ( - np.sum(d_perm**2, axis=1) - (sm**2) / n_obs - ) # Sum of squares for standard error - stder = np.sqrt(sm_sqrs) / sqrt_nXnM1 # Standard error - t_vals[c, :, perm] = mn / stder # Compute t-values - - else: - # Use random permutations - for perm in range(n_perm): - # Randomly set sign of each trial / participant's data - sn = (np.random.rand(n_obs) > 0.5) * 2 - 1 # Generate random sign flips - sn_mtrx = np.tile(sn, (n_var, 1)) # Repeat sn for each variable - - for c in range(n_channels): - data = Data[c, :, :] - d_perm = data * sn_mtrx # Apply sign flip to data - - # Compute t-score of permuted data - sm = np.sum(d_perm, axis=1) # Sum of permuted data - mn = sm / n_obs # Mean of permuted data - sm_sqrs = ( - np.sum(d_perm**2, axis=1) - (sm**2) / n_obs - ) # Sum of squares for standard error - stder = np.sqrt(sm_sqrs) / sqrt_nXnM1 # Standard error - t_vals[c, :, perm] = mn / stder # Compute t-values - - # Compute p-values from t-values - p_vals = 2 * scipy.stats.cdf(-np.abs(t_vals), dfe) - - return t_vals, p_vals, dfe +# finally let's plot the results +# we plot the cluster with the lowest p-value +# and the topomap of the significant cluster +# we can see that there is something going on around 400 ms +# in the visual channels +# however the cluster is not significant which is not surprising +# given the small sample size (only 5 subjects) +mne.stats.cluster_level.plot_cluster( + diff_evoked, target_only, non_target_only, T_obs, clusters, cluster_p_values +) From 6aaef9a7acd47e44f897c8759a346a215c59ae72 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sat, 6 Jul 2024 11:01:12 +0200 Subject: [PATCH 15/88] cleaned up plotting function --- mne/stats/cluster_level.py | 61 ++++++++++--------- .../76_new_cluster_test_api.py | 6 +- 2 files changed, 36 insertions(+), 31 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index d3813c57817..686f1097063 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1755,14 +1755,14 @@ def cluster_test( buffer_size: int = None, # block size for chunking the data ): """ - Run the cluster test using the new API. + Run a cluster permutation test based on formulaic input. 
-    # currently supports paired t-test
+    # currently only supports paired t-test on evokeds or epochs

     Parameters
     ----------
     df : pd.DataFrame
-        Dataframe with evoked data, conditions and subject IDs.
+        Dataframe with evoked/epoched data, conditions and subject IDs.
     formula : str, optional
         Wilkinson notation formula for design matrix. Default is None.
     n_permutations : int, optional
@@ -1794,6 +1794,7 @@

     Returns
     -------
+    TODO: turn this into a class for further plotting
     T_obs : array
         The observed test statistic.
     clusters : list
@@ -1814,7 +1815,7 @@
     # convert wide format to long format for formulaic
     df_long = unpack_time_and_channels(df)

-    # Pivot the DataFrame
+    # pivot the DataFrame
     pivot_df = df_long.pivot_table(
         index=["subject_index", "channel", "timepoint"],
         columns="condition",
@@ -1825,7 +1826,7 @@
     if len(pd.unique(df.condition)) != 2:
         raise ValueError("Condition list needs to contain 2 unique values")

-    # Get the unique conditions
+    # get the unique conditions
     conditions = np.unique(df.condition)

     # Compute the difference (assuming there are only 2 conditions)
@@ -1849,9 +1850,8 @@
             "Formula is required and needs to be a string in Wilkinson notation."
         )

-    # now prep design matrix outcome variable for input into MNE cluster function
-    # we initially had first channels, then timepoints,
-    # now we need first timepoints, then channels
+    # now prep design matrix for input into MNE cluster function
+    # the cluster function expects channels as the last dimension
     y_for_cluster = y.values.reshape(-1, n_channels, n_timepoints).transpose(0, 2, 1)

     adjacency, _ = find_ch_adjacency(df["evoked"][0].info, ch_type="eeg")
@@ -1864,7 +1864,7 @@
     # Run the cluster-based permutation test
     T_obs, clusters, cluster_p_values, H0 = _permutation_cluster_test(
         [y_for_cluster],
-        n_permutations=10000,
+        n_permutations=n_permutations,
         threshold=threshold,
         stat_fun=stat_fun,
         tail=tail,
@@ -1880,19 +1880,24 @@
         seed=seed,
     )

-    print(min(cluster_p_values))
+    print(f"smallest cluster p-value: {min(cluster_p_values)}")

     return T_obs, clusters, cluster_p_values, H0


-def unpack_time_and_channels(df):
+def unpack_time_and_channels(df: pd.DataFrame = None) -> pd.DataFrame:
     """
-    Extract the time and channel data from the DataFrame.
+    Extract timepoints and channels and convert to long.

     Parameters
     ----------
     df : pd.DataFrame
         DataFrame in wide format.
+
+    Returns
+    -------
+    df_long : pd.DataFrame
+        DataFrame in long format.
     """
     # Extracting all necessary data using list comprehensions for better performance
     long_format_data = [
@@ -1914,20 +1919,18 @@
     return df_long


-def plot_cluster(
-    contrast, target_only, non_target_only, T_obs, clusters, cluster_p_values
-):
+def plot_cluster(cond_dict, T_obs, clusters, cluster_p_values):
     """
     Plot the cluster with the lowest p-value.

+    2D cluster plotted with topoplot on the left and evoked signals on the right.
+    Timepoints that are part of the cluster are
+    highlighted in green on the evoked signals.
+
     Parameters
     ----------
-    contrast : list
-        List of contrast evoked objects.
-    target_only : list
-        List of target evoked objects.
-    non_target_only : list
-        List of non-target evoked objects.
+    cond_dict : dict
+        Dictionary with conditions as keys and evoked data as values.
     T_obs : array
         The observed test statistic.
    clusters : list
@@ -1940,11 +1943,13 @@
-    # configure variables for visualization
-    colors = {"target": "crimson", "non-target": "steelblue"}
+    # extract condition labels from the dictionary
+    cond_keys = list(cond_dict.keys())
+    # extract the evokeds from the dictionary
+    cond_values = list(cond_dict.values())

-    # organize data for plotting
-    evokeds = {"target": target_only, "non-target": non_target_only}
+    # configure variables for visualization
+    colors = {cond_keys[0]: "crimson", cond_keys[1]: "steelblue"}

     lowest_p_cluster = np.argmin(cluster_p_values)
@@ -1957,7 +1962,7 @@
     t_map = T_obs[time_inds, ...].mean(axis=0)

     # get signals at the sensors contributing to the cluster
-    sig_times = contrast[0].times[time_inds]
+    sig_times = cond_values[0][0].times[time_inds]
@@ -1967,7 +1972,7 @@
     fig, ax_topo = plt.subplots(1, 1, figsize=(10, 3), layout="constrained")

     # plot average test statistic and mark significant sensors
-    t_evoked = EvokedArray(t_map[:, np.newaxis], contrast[0].info, tmin=0)
+    t_evoked = EvokedArray(t_map[:, np.newaxis], cond_values[0][0].info, tmin=0)
     t_evoked.plot_topomap(
         times=0,
         mask=mask,
@@ -2005,7 +2010,7 @@
     if len(ch_inds) > 1:
         title += "s (mean)"
     plot_compare_evokeds(
-        evokeds,
+        cond_dict,
         title=title,
         picks=ch_inds,
         axes=ax_signals,
diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py
index ec8bd8275a1..a88904a5b5b 100644
--- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py
+++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py
@@ -125,6 +125,8 @@
 T_obs, clusters, cluster_p_values, H0 = mne.stats.cluster_level.cluster_test(
     df=df, formula=formula
 )
+# set up conditions dictionary for cluster plots
+conditions_dict = {"target": target_only, "non-target": non_target_only}

 # finally let's plot the results
 # we plot the cluster with the lowest p-value
 # and the topomap of the significant cluster
 # we can see that there is something going on around 400 ms
 # in the visual channels
 # however the cluster is not significant which is not surprising
 # given the small sample size (only 5 subjects)
-mne.stats.cluster_level.plot_cluster(
-    diff_evoked, target_only, non_target_only, T_obs, clusters, cluster_p_values
-)
+mne.stats.cluster_level.plot_cluster(conditions_dict, T_obs, clusters, cluster_p_values)

From 0f99c709e16207c1bc6e5a9a3e664030cb171d27 Mon Sep 17 00:00:00 2001
From: CarinaFo
Date: Sat, 6 Jul 2024 11:53:49 +0200
Subject: [PATCH 16/88] implemented cluster results class

---
 mne/stats/cluster_level.py                    | 213 +++++++++---------
 .../76_new_cluster_test_api.py                |  23 +-
 2 files changed, 124 insertions(+), 112 deletions(-)

diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py
index 686f1097063..146a6cd7c5f 100644
--- a/mne/stats/cluster_level.py
+++ b/mne/stats/cluster_level.py
@@ -1794,15 +1794,8 @@ def cluster_test(

     Returns
     -------
-    TODO: turn this into a class for further plotting
-    T_obs : array
-        The observed test statistic.
-    clusters : list
-        List of clusters.
-    cluster_p_values : array
-        Array of cluster p-values.
-    H0 : array
-        The permuted test statistics.
+    ClusterResult
+        Object containing the results of the cluster permutation test.
""" # for now this assumes a dataframe with a column for evoked data or epochs # add a data column to the dataframe (numpy array) @@ -1882,7 +1875,7 @@ def cluster_test( print(f"smallest cluster p-value: {min(cluster_p_values)}") - return T_obs, clusters, cluster_p_values, H0 + return ClusterResult(T_obs, clusters, cluster_p_values, H0) def unpack_time_and_channels(df: pd.DataFrame = None) -> pd.DataFrame: @@ -1919,111 +1912,127 @@ def unpack_time_and_channels(df: pd.DataFrame = None) -> pd.DataFrame: return df_long -def plot_cluster(cond_dict, T_obs, clusters, cluster_p_values): +class ClusterResult: """ - Plot the cluster with the lowest p-value. - - 2D cluster plotted with topoplot on the left and evoked signals on the right. - Timepoints that are part of the cluster are - highlighted in green on the evoked signals. + Object containing the results of the cluster permutation test. Parameters ---------- - cond_dict : dict - Dictionary with conditions as keys and evoked data as values. - T_obs : array + T_obs : np.ndarray The observed test statistic. clusters : list List of clusters. - cluster_p_values : array - Array of cluster p-values. - - Returns - ------- - None - + cluster_p_values : np.ndarray + P-values for each cluster. + H0 : np.ndarray + Max cluster level stats observed under permutation. """ - # extract condition labels from the dictionary - cond_keys = list(cond_dict.keys()) - # extract the evokeds from the dictionary - cond_values = list(cond_dict.values()) - - # configure variables for visualization - colors = {cond_keys[0]: "crimson", cond_keys[1]: "steelblue"} - - lowest_p_cluster = np.argmin(cluster_p_values) - - # plot the cluster with the lowest p-value - time_inds, space_inds = np.squeeze(clusters[lowest_p_cluster]) - ch_inds = np.unique(space_inds) - time_inds = np.unique(time_inds) - - # get topography for F stat - t_map = T_obs[time_inds, ...].mean(axis=0) - - # get signals at the sensors contributing to the cluster - sig_times = cond_values[0][0].times[time_inds] - - # create spatial mask - mask = np.zeros((t_map.shape[0], 1), dtype=bool) - mask[ch_inds, :] = True - - # initialize figure - fig, ax_topo = plt.subplots(1, 1, figsize=(10, 3), layout="constrained") - - # plot average test statistic and mark significant sensors - t_evoked = EvokedArray(t_map[:, np.newaxis], cond_values[0][0].info, tmin=0) - t_evoked.plot_topomap( - times=0, - mask=mask, - axes=ax_topo, - cmap="Reds", - vlim=(np.min, np.max), - show=False, - colorbar=False, - mask_params=dict(markersize=10), - ) - image = ax_topo.images[0] - # remove the title that would otherwise say "0.000 s" - ax_topo.set_title("") + def __init__(self, T_obs, clusters, cluster_p_values, H0): + self.T_obs = T_obs + self.clusters = clusters + self.cluster_p_values = cluster_p_values + self.H0 = H0 + + def plot_cluster(self, cond_dict: dict = None): + """ + Plot the cluster with the lowest p-value. + + 2D cluster plotted with topoplot on the left and evoked signals on the right. + Timepoints that are part of the cluster are + highlighted in green on the evoked signals. + + Parameters + ---------- + cond_dict : dict + Dictionary with condition labels as keys and evoked objects as values. 
+ + Returns + ------- + None + + """ + # extract condition labels from the dictionary + cond_keys = list(cond_dict.keys()) + # extract the evokeds from the dictionary + cond_values = list(cond_dict.values()) + + # configure variables for visualization + colors = {cond_keys[0]: "crimson", cond_keys[1]: "steelblue"} + + lowest_p_cluster = np.argmin(self.cluster_p_values) + + # plot the cluster with the lowest p-value + time_inds, space_inds = np.squeeze(self.clusters[lowest_p_cluster]) + ch_inds = np.unique(space_inds) + time_inds = np.unique(time_inds) + + # get topography for F stat + t_map = self.T_obs[time_inds, ...].mean(axis=0) + + # get signals at the sensors contributing to the cluster + sig_times = cond_values[0][0].times[time_inds] + + # create spatial mask + mask = np.zeros((t_map.shape[0], 1), dtype=bool) + mask[ch_inds, :] = True + + # initialize figure + fig, ax_topo = plt.subplots(1, 1, figsize=(10, 3), layout="constrained") + + # plot average test statistic and mark significant sensors + t_evoked = EvokedArray(t_map[:, np.newaxis], cond_values[0][0].info, tmin=0) + t_evoked.plot_topomap( + times=0, + mask=mask, + axes=ax_topo, + cmap="Reds", + vlim=(np.min, np.max), + show=False, + colorbar=False, + mask_params=dict(markersize=10), + ) + image = ax_topo.images[0] - # soft import? - # make_axes_locatable = _soft_import( - # "mpl_toolkits.axes_grid1.make_axes_locatable", - # purpose="plot cluster results" - # ) # soft import (not a dependency for MNE) + # remove the title that would otherwise say "0.000 s" + ax_topo.set_title("") - # create additional axes (for ERF and colorbar) - divider = make_axes_locatable(ax_topo) + # soft import? + # make_axes_locatable = _soft_import( + # "mpl_toolkits.axes_grid1.make_axes_locatable", + # purpose="plot cluster results" + # ) # soft import (not a dependency for MNE) - # add axes for colorbar - ax_colorbar = divider.append_axes("right", size="5%", pad=0.05) - plt.colorbar(image, cax=ax_colorbar) - ax_topo.set_xlabel( - "Averaged t-map ({:0.3f} - {:0.3f} s)".format(*sig_times[[0, -1]]) - ) + # create additional axes (for ERF and colorbar) + divider = make_axes_locatable(ax_topo) - # add new axis for time courses and plot time courses - ax_signals = divider.append_axes("right", size="300%", pad=1.2) - title = f"Cluster #1, {len(ch_inds)} sensor" - if len(ch_inds) > 1: - title += "s (mean)" - plot_compare_evokeds( - cond_dict, - title=title, - picks=ch_inds, - axes=ax_signals, - colors=colors, - show=False, - split_legend=True, - truncate_yaxis="auto", + # add axes for colorbar + ax_colorbar = divider.append_axes("right", size="5%", pad=0.05) + plt.colorbar(image, cax=ax_colorbar) + ax_topo.set_xlabel( + "Averaged t-map ({:0.3f} - {:0.3f} s)".format(*sig_times[[0, -1]]) ) - # plot temporal cluster extent - ymin, ymax = ax_signals.get_ylim() - ax_signals.fill_betweenx( - (ymin, ymax), sig_times[0], sig_times[-1], color="green", alpha=0.3 - ) + # add new axis for time courses and plot time courses + ax_signals = divider.append_axes("right", size="300%", pad=1.2) + title = f"Cluster #1, {len(ch_inds)} sensor" + if len(ch_inds) > 1: + title += "s (mean)" + plot_compare_evokeds( + cond_dict, + title=title, + picks=ch_inds, + axes=ax_signals, + colors=colors, + show=False, + split_legend=True, + truncate_yaxis="auto", + ) + + # plot temporal cluster extent + ymin, ymax = ax_signals.get_ylim() + ax_signals.fill_betweenx( + (ymin, ymax), sig_times[0], sig_times[-1], color="green", alpha=0.3 + ) - plt.show() + plt.show() diff --git 
a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index a88904a5b5b..3acfd21f7f0 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -15,7 +15,8 @@ - loading evoked data from multiple subjects - construct a dataframe that contains the difference between conditions - - run the new cluster test function + - run the new cluster test function with formula in Wilkinson notation + - plot the results with the ClusterResults Class Here, the unit of observation are evokeds from multiple subjects (2nd level analysis). @@ -121,18 +122,20 @@ # let's first define the formula based on Wilkinson notation formula = "evoked ~ 1 + C(subject_index)" -# run the cluster test -T_obs, clusters, cluster_p_values, H0 = mne.stats.cluster_level.cluster_test( - df=df, formula=formula -) +# run the cluster test and return the cluster_result object +cluster_result = mne.stats.cluster_level.cluster_test(df=df, formula=formula) + +# note that we ran an exact test due to the small sample size (only 15 permutations) + # set up conditions dictionary for cluster plots conditions_dict = {"target": target_only, "non-target": non_target_only} -# finally let's plot the results +# finally let's plot the results using the ClusterResults class + # we plot the cluster with the lowest p-value -# and the topomap of the significant cluster + # we can see that there is something going on around 400 ms -# in the visual channels -# however the cluster is not significant which is not surprising +# in the visual channels (topomap on the left) +# however the cluster is not significant which is unsurprising # given the small sample size (only 5 subjects) -mne.stats.cluster_level.plot_cluster(conditions_dict, T_obs, clusters, cluster_p_values) +cluster_result.plot_cluster(cond_dict=conditions_dict) From 4083691f928cc932213fc1a0610c769c53476e15 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sat, 6 Jul 2024 11:55:34 +0200 Subject: [PATCH 17/88] added contribution --- mne/stats/cluster_level.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index 146a6cd7c5f..9a223d715ae 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -6,6 +6,7 @@ # Eric Larson # Denis Engemann # Fernando Perez (bin_perm_rep function) +# Carina Forster # # License: BSD-3-Clause # Copyright the MNE-Python contributors. 
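Editor's note (recap, not part of the patch series): as of PATCH 16/88 the
intended call pattern is that ``cluster_test`` returns a ``ClusterResult``
object whose ``plot_cluster`` method replaces the old module-level plotting
call. A minimal usage sketch, reusing the variables built in the tutorial
above (``df``, ``formula``, ``target_only``, ``non_target_only``):

    cluster_result = mne.stats.cluster_level.cluster_test(df=df, formula=formula)
    print(cluster_result.cluster_p_values)  # one p-value per candidate cluster
    cluster_result.plot_cluster(
        cond_dict={"target": target_only, "non-target": non_target_only}
    )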
From 7e9b2e5d3b844bab1454ca16076a1dcf747992d9 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Thu, 18 Jul 2024 14:35:02 +0200 Subject: [PATCH 18/88] fixed codespell --- mne/stats/cluster_level.py | 55 +++++++++++-------- .../76_new_cluster_test_api.py | 34 +++++++----- 2 files changed, 50 insertions(+), 39 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index 9a223d715ae..0d3ecec2e58 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1820,8 +1820,14 @@ def cluster_test( if len(pd.unique(df.condition)) != 2: raise ValueError("Condition list needs to contain 2 unique values") - # get the unique conditions - conditions = np.unique(df.condition) + # Get unique elements and the indices of their first occurrences + unique_elements, indices = np.unique(df.condition, return_index=True) + + # Sort unique elements by the indices of their first occurrences + conditions = unique_elements[np.argsort(indices)] + + # print the contrast used for the paired t-test + print(f"Contrast used for paired t-test: {conditions[0]} - {conditions[1]}") # Compute the difference (assuming there are only 2 conditions) pivot_df["evoked"] = pivot_df[conditions[0]] - pivot_df[conditions[1]] @@ -1968,8 +1974,8 @@ def plot_cluster(self, cond_dict: dict = None): ch_inds = np.unique(space_inds) time_inds = np.unique(time_inds) - # get topography for F stat - t_map = self.T_obs[time_inds, ...].mean(axis=0) + # get topography for t stat + t_map = self.T_obs[time_inds, ...].mean(axis=0).astype(int) # get signals at the sensors contributing to the cluster sig_times = cond_values[0][0].times[time_inds] @@ -1987,11 +1993,11 @@ def plot_cluster(self, cond_dict: dict = None): times=0, mask=mask, axes=ax_topo, - cmap="Reds", - vlim=(np.min, np.max), + cmap="RdBu_r", show=False, colorbar=False, mask_params=dict(markersize=10), + scalings=1.00, ) image = ax_topo.images[0] @@ -2008,32 +2014,33 @@ def plot_cluster(self, cond_dict: dict = None): divider = make_axes_locatable(ax_topo) # add axes for colorbar - ax_colorbar = divider.append_axes("right", size="5%", pad=0.05) - plt.colorbar(image, cax=ax_colorbar) + ax_colorbar = divider.append_axes("right", size="5%", pad=0.1) + cbar = plt.colorbar(image, cax=ax_colorbar) + cbar.set_label("t-value") ax_topo.set_xlabel( - "Averaged t-map ({:0.3f} - {:0.3f} s)".format(*sig_times[[0, -1]]) + "average from {:0.3f} to {:0.3f} s".format(*sig_times[[0, -1]]) ) # add new axis for time courses and plot time courses - ax_signals = divider.append_axes("right", size="300%", pad=1.2) - title = f"Cluster #1, {len(ch_inds)} sensor" - if len(ch_inds) > 1: - title += "s (mean)" - plot_compare_evokeds( - cond_dict, - title=title, - picks=ch_inds, - axes=ax_signals, - colors=colors, - show=False, - split_legend=True, - truncate_yaxis="auto", - ) + ax_signals = divider.append_axes("right", size="300%", pad=1.3) + title = f"Signal averaged over {len(ch_inds)} sensor(s)" + plot_compare_evokeds( + cond_dict, + title=title, + picks=ch_inds, + axes=ax_signals, + colors=colors, + show=False, + split_legend=True, + truncate_yaxis="auto", + truncate_xaxis=False, + ) + plt.legend(frameon=False, loc="upper left") # plot temporal cluster extent ymin, ymax = ax_signals.get_ylim() ax_signals.fill_betweenx( - (ymin, ymax), sig_times[0], sig_times[-1], color="green", alpha=0.3 + (ymin, ymax), sig_times[0], sig_times[-1], color="grey", alpha=0.3 ) plt.show() diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py 
b/tutorials/stats-sensor-space/76_new_cluster_test_api.py
index 3acfd21f7f0..842e0543b0b 100644
--- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py
+++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py
@@ -6,8 +6,9 @@
 ===============================================================
 
 This tutorial shows how to use the new API for cluster testing.
-This script shows how to estimate significant clusters in
-evoked contrast data of multiple subjects.
+The new API supports Wilkinson-style formulas and allows for more flexibility in
+the design of the test. Here we will demonstrate how to use the new API for
+a standard paired t-test on evoked data from multiple subjects.
 It uses a non-parametric statistical procedure based on permutations and
 cluster level statistics.
 
@@ -16,7 +17,7 @@
   - loading evoked data from multiple subjects
   - construct a dataframe that contains the difference between conditions
   - run the new cluster test function with formula in Wilkinson notation
-  - plot the results with the ClusterResults Class
+  - plot the results with the new ClusterResults API
 
 Here, the unit of observation are evokeds
 from multiple subjects (2nd level analysis).
@@ -41,13 +42,14 @@
 # Define the path to the P3 dataset
 path_to_p3 = mne.datasets.misc.data_path() / "ERP_CORE" / "P3"
 
-# Define the range of participant IDs
-participant_ids = range(15, 20)  # This will cover 015 to 019
+# Define the range of participant IDs (we only have 5 participants in the dataset)
+participant_ids = range(15, 20)  # This will cover participants 15 to 19
 
 # store the evoked data of all subjects
 evokeds_allsubs = []
 
 # Loop over each participant ID and generate the corresponding filename
+# to load the evoked data
 for pid in participant_ids:
     # Create the filename using an f-string, ID is zero-padded to 3 digits
     filename_p3 = f"sub-{pid:03d}_ses-P3_task-P3_ave.fif"
@@ -58,21 +60,22 @@
     # load the evoked data
     evokeds = mne.read_evokeds(p3_file_path)
 
-    # add subjects evoked data to list
+    # add each single subject's evoked data to a list
     evokeds_allsubs.append(evokeds)
 
 # the P3b dataset is part of the freely available ERP CORE dataset
 # participants were presented with a visual oddball task
 # and the P3b component was analyzed
 # the conditions of interest are the target (rare visual stimuli)
-# and non-target stimuli (frequency visual stimuli)
+# and non-target stimuli (frequent visual stimuli)
 
 # let's extract the target and non-target evokeds
 target_only = [evoked[0] for evoked in evokeds_allsubs]
 non_target_only = [evoked[1] for evoked in evokeds_allsubs]
 
 # let's first have a look at the data
-# create contrast from target and non-target evokeds
+
+# create contrast target - non-target
 diff_evoked = [
     mne.combine_evoked([evokeds_a, evokeds_b], weights=[1, -1])
     for evokeds_a, evokeds_b in zip(target_only, non_target_only)
@@ -84,7 +87,7 @@
 mne.grand_average(diff_evoked).plot_topomap()
 
 # we can see that the strongest difference is around 400 ms in
-# visual channels (occipital region)
+# central-parietal channels with a stronger evoked signal for target stimuli
 
 # Next we prepare a dataframe for the cluster test function
 # the dataframe should contain the contrast evoked data and the subject index
@@ -93,7 +96,7 @@
 # save the evoked data for both conditions in one list
 evokeds_conditions = target_only + non_target_only
 
-# set up a list that defines the condition for each evoked data
+# create a list that defines the condition for each evoked data
 # this will be used to create the conditions column in the dataframe
conditions = ["target"] * len(target_only) + ["non-target"] * len(non_target_only) @@ -102,7 +105,7 @@ # we multiply the participant_ids by 2 to account for the two conditions subject_index = list(participant_ids) * 2 -# create the dataframe +# create the dataframe containing the evoked data, the condition and the subject index df = pd.DataFrame( { "evoked": evokeds_conditions, @@ -122,20 +125,21 @@ # let's first define the formula based on Wilkinson notation formula = "evoked ~ 1 + C(subject_index)" -# run the cluster test and return the cluster_result object +# run the new cluster test API and return the new cluster_result object cluster_result = mne.stats.cluster_level.cluster_test(df=df, formula=formula) # note that we ran an exact test due to the small sample size (only 15 permutations) # set up conditions dictionary for cluster plots +# this is necessary for plotting the evoked data and the cluster result on top conditions_dict = {"target": target_only, "non-target": non_target_only} # finally let's plot the results using the ClusterResults class # we plot the cluster with the lowest p-value - +cluster_result.plot_cluster(cond_dict=conditions_dict) # we can see that there is something going on around 400 ms -# in the visual channels (topomap on the left) +# with a stronger signal for target trials in right central-parietal channels + # however the cluster is not significant which is unsurprising # given the small sample size (only 5 subjects) -cluster_result.plot_cluster(cond_dict=conditions_dict) From 8f510a9fdca0ba6411795d79a871b3cfd9c21a6a Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Mon, 22 Jul 2024 20:22:20 +0200 Subject: [PATCH 19/88] first review --- mne/stats/cluster_level.py | 384 ++++++++++++++++++++++++++----------- 1 file changed, 272 insertions(+), 112 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index 0d3ecec2e58..ab0fd0daf69 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -11,6 +11,10 @@ # License: BSD-3-Clause # Copyright the MNE-Python contributors. +from __future__ import annotations + +from typing import Literal + import matplotlib.pyplot as plt import numpy as np import pandas as pd @@ -20,12 +24,13 @@ from scipy.stats import f as fstat from scipy.stats import t as tstat -from .. import EvokedArray -from ..channels import find_ch_adjacency +from .. import Epochs, Evoked +from ..epochs import EpochsArray, EvokedArray from ..fixes import has_numba, jit from ..parallel import parallel_func from ..source_estimate import MixedSourceEstimate, SourceEstimate, VolSourceEstimate from ..source_space import SourceSpaces +from ..time_frequency import AverageTFR, AverageTFRArray, EpochsTFR, EpochsTFRArray from ..utils import ( ProgressBar, _check_option, @@ -945,7 +950,7 @@ def _permutation_cluster_test( sample_shape = X[0].shape[1:] for x in X: if x.shape[1:] != sample_shape: - raise ValueError("All samples mush have the same size") + raise ValueError("All samples must have the same size") # flatten the last dimensions in case the data is high dimensional X = [np.reshape(x, (x.shape[0], -1)) for x in X] @@ -1739,21 +1744,186 @@ def summarize_clusters_stc( return klass(data_summary, vertices, tmin, tstep, subject) +def validate_input_dataframe(df: pd.DataFrame, formula: str): + """ + Validate the input dataframe for the cluster permutation test. + + Parameters + ---------- + df : pd.DataFrame + Dataframe with 3 columns (subject_index, condition, data). 
+ formula : formulaic.ModelSpec + Wilkinson style Formula for the design matrix. + + Returns + ------- + dv_name : str + Name of the dependent variable. + """ + # extract dependent variable name from formula + formulaic = _soft_import( + "formulaic", purpose="set up Design Matrix" + ) # soft import (not a dependency for MNE) + formula = formulaic.Formula(formula) + dv_name = str(formula.lhs) + + # check if all necessary columns are present + if dv_name not in df.columns: + raise ValueError("""DataFrame needs to contain a column + with the dependent variable name + as defined in the formula""") + if "condition" not in df.columns: + raise ValueError("DataFrame needs to contain a condition column") + if "subject_index" not in df.columns: + raise ValueError("DataFrame needs to contain a subject_index column") + + # check if the data column contains only valid types + check_column_types(df[dv_name]) + + # check if the shape of the data is consistent + if not all(data.data.shape == df[dv_name][0].data.shape for data in df[dv_name]): + raise ValueError("Data objects need to have the same shape") + + # check if the condition column contains only 2 unique values + if len(pd.unique(df.condition)) != 2: + raise ValueError("currently only supports 2 conditions.") + + return dv_name + + +def check_column_types(input_data: np.ndarray): + """ + Check if the column types are valid for the cluster permutation test. + + Parameters + ---------- + input_data : np.Array + Data to be checked for the cluster permutation test. + """ + # Get the type of the first element + first_type = type(input_data.iloc[0]) + + # Define the possible valid types + valid_types = ( + Evoked, + EvokedArray, + Epochs, + EpochsArray, + AverageTFR, + EpochsTFR, + EpochsTFRArray, + AverageTFRArray, + ) + + # Check if the type of the first element is a valid type + if first_type not in valid_types: + raise ValueError(f"Object type '{first_type}' is not a valid type.") + + # Check if all elements are of the same type as the first one + if not all(isinstance(data, first_type) for data in input_data): + raise ValueError("Data column must contain objects of the same type.") + + +def prepare_data_for_cluster_test(input_df: pd.DataFrame, dv_name: str): + """ + Prepare the data for the cluster permutation test. + + Parameters + ---------- + input_data : np.ndarray + Data to be prepared for the cluster permutation test. + + Returns + ------- + data : np.Array + Data prepared for the cluster permutation test. 
+ """ + # extract data and add to dataframe + input_df["data"] = [data.data for data in input_df[dv_name]] + + # extract dimensions from time series or time-frequency data + first_data_obj = input_df["data"].iloc[0] + if isinstance(first_data_obj, (Epochs, Evoked, EpochsArray, EvokedArray)): + n_channels, n_timepoints = first_data_obj.get_data().shape + if isinstance( + first_data_obj, (AverageTFR, EpochsTFR, AverageTFRArray, EpochsTFRArray) + ): + n_channels, n_freqs, n_timepoints = first_data_obj.get_data().shape + + reshaped_data = [] + + for idx, row in input_df.iterrows(): + subject_index = row["subject_index"] + condition = row["condition"] + data_array = row["data"] + + if data_array.ndim == 2: + n_channels, n_timepoints = data_array.shape + # timepoints are the columns + df_temp = pd.DataFrame( + data_array, columns=[f"timepoint_{i}" for i in range(n_timepoints)] + ) + df_temp["channel"] = range(n_channels) + df_temp["subject_index"] = subject_index + df_temp["condition"] = condition + + reshaped_data.append(df_temp) + + elif data_array.ndim == 3: + n_channels, n_freqs, n_timepoints = data_array.shape + # timepoints are the columns + df_temp = pd.DataFrame( + data_array.reshape(-1, n_timepoints), + columns=[f"timepoint_{i}" for i in range(n_timepoints)], + ) + df_temp["frequency"] = np.repeat(range(n_freqs), n_channels) + df_temp["channel"] = np.tile(range(n_channels), n_freqs) + df_temp["subject_index"] = subject_index + df_temp["condition"] = condition + + reshaped_data.append(df_temp) + + else: + raise ValueError(f"Unsupported data array dimensions: {data_array.ndim}") + # combine the reshaped data + combined_df = pd.concat(reshaped_data, ignore_index=True) + # Convert the dataframe to long format + id_vars = ["subject_index", "condition", "channel"] + if "frequency" in combined_df.columns: + id_vars.append("frequency") + + reshaped_df = pd.melt( + combined_df, id_vars=id_vars, var_name="timepoint", value_name="value" + ) + + # rename column and convert to integer + reshaped_df["timepoint"] = ( + reshaped_df["timepoint"].str.replace("timepoint_", "").astype(int) + ) + + # return the reshaped dataframe and dimensions + if data_array.ndim == 2: + return reshaped_df, data_array.ndim, n_channels, n_timepoints + elif data_array.ndim == 3: + return reshaped_df, data_array.ndim, n_channels, n_freqs, n_timepoints + + def cluster_test( df: pd.DataFrame, - formula: str = None, # Wilkinson notation formula for design matrix - n_permutations: int = 10000, + formula: str, # Wilkinson notation formula for design matrix + paired_test: bool, # whether to run a paired t-test or unpaired test + n_permutations: int = 1024, # same default as in old API seed: None | int | np.random.RandomState = None, - tail: int = 0, # 0 for two-tailed, 1 for greater, -1 for less + tail: Literal[-1, 0, 1] = 0, # 0 for two-tailed, 1 for greater, -1 for less n_jobs: int = 1, # how many cores to use - adjacency: tuple = None, + adjacency: tuple | None = None, max_step: int = 1, # maximum distance between samples (time points) - exclude: list = None, # exclude no time points or channels + exclude: list | None = None, # exclude no time points or channels step_down_p: int = 0, # step down in jumps test t_power: int = 1, # weigh each location by its stats score - out_type: str = "indices", + out_type: Literal["indices", "mask"] = "indices", check_disjoint: bool = False, - buffer_size: int = None, # block size for chunking the data + buffer_size: int | None = None, # block size for chunking the data ): """ Run a cluster 
permutation test based on formulaic input.
@@ -1762,12 +1932,14 @@
 
     Parameters
     ----------
-    dataframe : pd.DataFrame
-        Dataframe with evoked/epoched data, conditions and subject IDs.
-    formula : str, optional
-        Wilkinson notation formula for design matrix. Default is None.
+    df : pd.DataFrame
+        Dataframe with 3 columns (subject_index, condition, evoked).
+    formula : str
+        Wilkinson notation formula for design matrix.
+    paired_test : bool
+        Whether to run a paired t-test.
     n_permutations : int, optional
-        Number of permutations. Default is 10000.
+        Number of permutations. Default is 1024.
     seed : None | int | np.random.RandomState, optional
        Seed for the random number generator. Default is None.
     tail : int, optional
@@ -1775,7 +1947,7 @@
     n_jobs : int, optional
         How many cores to use. Default is 1.
     adjacency : None, optional
-        Adjacency matrix. Default is None.
+        Provide an adjacency matrix. Default is None.
     max_step : int, optional
         Maximum distance between samples (time points). Default is 1.
     exclude : np.Array, optional
@@ -1798,27 +1970,38 @@
     ClusterResult
         Object containing the results of the cluster permutation test.
     """
-    # for now this assumes a dataframe with a column for evoked data or epochs
-    # add a data column to the dataframe (numpy array)
-    df["data"] = [evoked.data for evoked in df.evoked]
-
-    # extract number of channels and timepoints
-    # (eventually should also allow for frequency)
-    n_channels, n_timepoints = df["data"][0].shape
-
-    # convert wide format to long format for formulaic
-    df_long = unpack_time_and_channels(df)
-
-    # pivot the DataFrame
-    pivot_df = df_long.pivot_table(
-        index=["subject_index", "channel", "timepoint"],
-        columns="condition",
-        values="value",
-    ).reset_index()
-
-    # if not 2 unique conditions raise error
-    if len(pd.unique(df.condition)) != 2:
-        raise ValueError("Condition list needs to contain 2 unique values")
+    # check if formula is present
+    if formula is None:
+        raise ValueError("Wilkinson style formula is required.")
+
+    # validate the input dataframe and return name of dependent variable
+    dv_name = validate_input_dataframe(df, formula)
+
+    # prepare the data for the cluster permutation test
+    prep_result = prepare_data_for_cluster_test(df, dv_name)
+
+    if prep_result[1] == 2:
+        # pivot the dataframe based on condition for later subtraction
+        pivot_df = (
+            prep_result[0]
+            .pivot_table(
+                index=["subject_index", "channel", "timepoint"],
+                columns="condition",
+                values="value",
+            )
+            .reset_index()
+        )
+    elif prep_result[1] == 3:
+        # pivot the dataframe based on condition for later subtraction
+        pivot_df = (
+            prep_result[0]
+            .pivot_table(
+                index=["subject_index", "channel", "frequency", "timepoint"],
+                columns="condition",
+                values="value",
+            )
+            .reset_index()
+        )
@@ -1826,41 +2009,51 @@
     # Get unique elements and the indices of their first occurrences
     unique_elements, indices = np.unique(df.condition, return_index=True)
 
     # Sort unique elements by the indices of their first occurrences
     conditions = unique_elements[np.argsort(indices)]
 
-    # print the contrast used for the paired t-test
-    print(f"Contrast used for paired t-test: {conditions[0]} - {conditions[1]}")
+    # store the contrast for the ClusterResult object
+    contrast = f"{conditions[0]} - {conditions[1]}"
 
-    # Compute the difference (assuming there are only 2 conditions)
-    pivot_df["evoked"] = pivot_df[conditions[0]] - pivot_df[conditions[1]]
+    # print the contrast used for the paired t-test so the user knows
+    # 
what is subtracted from what + logger.info(f"Contrast used for paired t-test: {contrast}") - # Optional: Clean up the DataFrame - pivot_df = pivot_df[["subject_index", "channel", "timepoint", "evoked"]] + # Compute the difference (assuming there are only 2 conditions) + pivot_df[dv_name] = pivot_df[conditions[0]] - pivot_df[conditions[1]] + + # for the paired t-test y is the difference between conditions + # X is the design matrix with a column with 1s and 0s for each participant + # Create the design matrix using formulaic + formulaic = _soft_import( + "formulaic", purpose="set up Design Matrix" + ) # soft import (not a dependency for MNE) + y, X = formulaic.model_matrix(formula, pivot_df) + + # Prepare design matrix for input into MNE cluster function + # MNE cluster functions expect channels as the last dimension + + if prep_result[1] == 2: + # Reshape y.values into a 3D array: (participants, n_channels, n_timepoints) + y_reshaped = y.values.reshape(-1, prep_result[2], prep_result[3]) + # Transpose the array to have channels as the last dimension + y_for_cluster = y_reshaped.transpose(0, 2, 1) + elif prep_result[1] == 3: + # Reshape y.values into a 4D array: + # (participants, n_channels, n_freqs, n_timepoints) + y_reshaped = y.values.reshape( + -1, prep_result[2], prep_result[3], prep_result[4] + ) + # Transpose the array to have channels as the last dimension + y_for_cluster = y_reshaped.transpose(0, 3, 2, 1) - # check if formula is present - if formula is not None: - formulaic = _soft_import( - "formulaic", purpose="set up Design Matrix" - ) # soft import (not a dependency for MNE) - - # for the paired t-test y is the difference between conditions - # X is the design matrix with a column with 1s and 0s for each participant - # Create the design matrix using formulaic - y, X = formulaic.model_matrix(formula, pivot_df) + if paired_test: + # define stat function and threshold + stat_fun, threshold = _check_fun( + X=y_for_cluster, stat_fun=None, threshold=None, tail=0, kind="within" + ) else: - raise ValueError( - "Formula is required and needs to be a string in Wilkinson notation." + # define stat function and threshold + stat_fun, threshold = _check_fun( + X=y_for_cluster, stat_fun=None, threshold=None, tail=0, kind="between" ) - - # now prep design matrix for input into MNE cluster function - # cluster functions expects channels as list dimension - y_for_cluster = y.values.reshape(-1, n_channels, n_timepoints).transpose(0, 2, 1) - - adjacency, _ = find_ch_adjacency(df["evoked"][0].info, ch_type="eeg") - - # define stat function and threshold - stat_fun, threshold = _check_fun( - X=y_for_cluster, stat_fun=None, threshold=None, tail=0, kind="within" - ) - # Run the cluster-based permutation test T_obs, clusters, cluster_p_values, H0 = _permutation_cluster_test( [y_for_cluster], @@ -1885,40 +2078,6 @@ def cluster_test( return ClusterResult(T_obs, clusters, cluster_p_values, H0) -def unpack_time_and_channels(df: pd.DataFrame = None) -> pd.DataFrame: - """ - Extract timepoints and channels and convert to long. - - Parameters - ---------- - df : pd.DataFrame - DataFrame in wide format. - - Returns - ------- - df_long : pd.DataFrame - DataFrame in long format. 
- """ - # Extracting all necessary data using list comprehensions for better performance - long_format_data = [ - { - "condition": row["condition"], - "subject_index": row["subject_index"], - "channel": channel, - "timepoint": timepoint, - "value": row["data"][channel, timepoint], - } - for idx, row in df.iterrows() - for channel in range(row["data"].shape[0]) - for timepoint in range(row["data"].shape[1]) - ] - - # Creating the long format DataFrame - df_long = pd.DataFrame(long_format_data) - - return df_long - - class ClusterResult: """ Object containing the results of the cluster permutation test. @@ -1935,13 +2094,19 @@ class ClusterResult: Max cluster level stats observed under permutation. """ - def __init__(self, T_obs, clusters, cluster_p_values, H0): + def __init__( + self, + T_obs: np.typing.NDArray, + clusters: list, + cluster_p_values: np.typing.NDArray, + H0: np.typing.NDArray, + ): self.T_obs = T_obs self.clusters = clusters self.cluster_p_values = cluster_p_values self.H0 = H0 - def plot_cluster(self, cond_dict: dict = None): + def plot_cluster(self, condition_labels: dict): """ Plot the cluster with the lowest p-value. @@ -1951,18 +2116,13 @@ def plot_cluster(self, cond_dict: dict = None): Parameters ---------- - cond_dict : dict + condition_labels : dict Dictionary with condition labels as keys and evoked objects as values. - - Returns - ------- - None - """ # extract condition labels from the dictionary - cond_keys = list(cond_dict.keys()) + cond_keys = list(condition_labels.keys()) # extract the evokeds from the dictionary - cond_values = list(cond_dict.values()) + cond_values = list(condition_labels.values()) # configure variables for visualization colors = {cond_keys[0]: "crimson", cond_keys[1]: "steelblue"} @@ -2025,7 +2185,7 @@ def plot_cluster(self, cond_dict: dict = None): ax_signals = divider.append_axes("right", size="300%", pad=1.3) title = f"Signal averaged over {len(ch_inds)} sensor(s)" plot_compare_evokeds( - cond_dict, + condition_labels, title=title, picks=ch_inds, axes=ax_signals, From d6c0c4c299b3aeaa9089f923aa9017de0b12e42d Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Mon, 22 Jul 2024 20:44:43 +0200 Subject: [PATCH 20/88] quick clean up --- .../76_new_cluster_test_api.py | 27 ++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 842e0543b0b..efbc6d5e3f0 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -29,7 +29,7 @@ # License: BSD-3-Clause # Copyright the MNE-Python contributors. 
-# %% +# %% Load the required packages from pathlib import Path @@ -37,6 +37,8 @@ import mne +# %% Load the P3 dataset + # Set parameters # -------------- # Define the path to the P3 dataset @@ -69,6 +71,8 @@ # the conditions of interest are the target (rare visual stimuli) # and non-target stimuli (frequent visual stimuli) +# %% visually inspect the evoked data for each condition + # let's extract the target and non-target evokeds target_only = [evoked[0] for evoked in evokeds_allsubs] non_target_only = [evoked[1] for evoked in evokeds_allsubs] @@ -89,7 +93,8 @@ # we can see that the strongest difference is around 400 ms in # central-parietal channels with a stronger evoked signal for target stimuli -# Next we prepare a dataframe for the cluster test function +# %% Prepare the dataframe for the new cluster test API + # the dataframe should contain the contrast evoked data and the subject index # each row in the dataframe should represent one observation (evoked data) @@ -114,7 +119,8 @@ } ) -# now we can run the cluster test function +# %% run the cluster test function with formulaic input + # we will use the new API that allows for Wilkinson style formulas # the formula should be a string in Wilkinson notation @@ -123,12 +129,21 @@ # we will use a cluster-based permutation paired t-test for this # let's first define the formula based on Wilkinson notation +# we want to predict the evoked difference signal based on the subject +# the cluster test randomly permutes the subject label +# the 1 in the formula represents the intercept which is always included +# C is a categorical variable that will be dummy coded formula = "evoked ~ 1 + C(subject_index)" # run the new cluster test API and return the new cluster_result object -cluster_result = mne.stats.cluster_level.cluster_test(df=df, formula=formula) +cluster_result = mne.stats.cluster_level.cluster_test( + df=df, formula=formula, paired_test=True, adjacency=None +) + +# note that we ran an exact test due to the small sample size +# (only 15 permutations) -# note that we ran an exact test due to the small sample size (only 15 permutations) +# %% plot the results # set up conditions dictionary for cluster plots # this is necessary for plotting the evoked data and the cluster result on top @@ -137,7 +152,7 @@ # finally let's plot the results using the ClusterResults class # we plot the cluster with the lowest p-value -cluster_result.plot_cluster(cond_dict=conditions_dict) +cluster_result.plot_cluster(condition_labels=conditions_dict) # we can see that there is something going on around 400 ms # with a stronger signal for target trials in right central-parietal channels From f17f38fffc5b7c8ae0b19dec319df68a9d75df0f Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Mon, 22 Jul 2024 21:52:00 +0200 Subject: [PATCH 21/88] test compare_old_vs_new_cluster_API --- mne/stats/tests/test_cluster_level.py | 150 +++++++++++++++++++++++++- 1 file changed, 149 insertions(+), 1 deletion(-) diff --git a/mne/stats/tests/test_cluster_level.py b/mne/stats/tests/test_cluster_level.py index 439045b8d08..2307f793dad 100644 --- a/mne/stats/tests/test_cluster_level.py +++ b/mne/stats/tests/test_cluster_level.py @@ -8,6 +8,7 @@ from functools import partial import numpy as np +import pandas as pd import pytest from numpy.testing import ( assert_allclose, @@ -17,10 +18,20 @@ ) from scipy import linalg, sparse, stats -from mne import MixedSourceEstimate, SourceEstimate, SourceSpaces, VolSourceEstimate +from mne import ( + EvokedArray, + MixedSourceEstimate, + 
SourceEstimate,
+    SourceSpaces,
+    VolSourceEstimate,
+    create_info,
+)
 from mne.fixes import _eye_array
 from mne.stats import combine_adjacency, ttest_ind_no_p
 from mne.stats.cluster_level import (
+    _check_fun,
+    _permutation_cluster_test,
+    cluster_test,
     f_oneway,
     permutation_cluster_1samp_test,
     permutation_cluster_test,
@@ -29,6 +40,7 @@
     summarize_clusters_stc,
     ttest_1samp_no_p,
 )
+from mne.time_frequency import AverageTFRArray
 from mne.utils import _record_warnings, catch_logging
 
 n_space = 50
@@ -869,3 +881,139 @@ def test_output_equiv(shape, out_type, adjacency, threshold):
     assert out_type == "indices"
     got_mask[np.ix_(*clu)] = n
     assert_array_equal(got_mask, want_mask)
+
+
+def create_sample_data_cluster_test():
+    """Create sample data to test new cluster API."""
+    # Prepare some dummy data
+    n_subjects = 20
+    n_conditions = 2
+    n_channels = 5
+    n_timepoints = 8
+    n_freqs = 3
+
+    # Create dummy data
+    dummy_data_2d = [
+        np.random.rand(n_channels, n_timepoints)
+        for _ in range(n_subjects * n_conditions)
+    ]
+    dummy_data_3d = [
+        np.random.rand(n_channels, n_freqs, n_timepoints)
+        for _ in range(n_subjects * n_conditions)
+    ]
+
+    # Create a DataFrame with dummy data
+    df_2d = pd.DataFrame(
+        {
+            "subject_index": np.repeat(range(n_subjects), n_conditions),
+            "condition": np.tile(["cond1", "cond2"], n_subjects),
+            "data": dummy_data_2d,
+        }
+    )
+
+    df_3d = pd.DataFrame(
+        {
+            "subject_index": np.repeat(range(n_subjects), n_conditions),
+            "condition": np.tile(["cond1", "cond2"], n_subjects),
+            "data": dummy_data_3d,
+        }
+    )
+
+    return df_2d, df_3d
+
+
+def compare_old_and_new_cluster_api():
+    """Make sure old and new cluster API results are the same."""
+    # load sample data
+    df_2d, df_3d = create_sample_data_cluster_test()
+
+    # mandatory parameters for new cluster API
+    formula = "evoked ~ 1 + C(subject_index)"
+
+    data_to_test = [df_2d, df_3d]
+
+    # save 2D and 3D data results for both old and new API
+    result_old_api_all = []
+    result_new_api_all = []
+    d_all = []
+
+    for df in data_to_test:
+        # Pivot the DataFrame to have conditions as columns for old API
+        pivot_df = df.pivot(index="subject_index", columns="condition", values="data")
+
+        # Subtract condition 2 data from condition 1 data for each subject
+        pivot_df["cond_diff"] = pivot_df.apply(
+            lambda row: row["cond1"] - row["cond2"], axis=1
+        )
+
+        # Extract the 'cond_diff' column as a numpy array
+        cond_diff_array = np.stack(pivot_df["cond_diff"].values)
+
+        # extract data and reshape for old API
+        if pivot_df.cond_diff[0].ndim == 2:
+            # reshape to channels as last dimension
+            d = cond_diff_array.transpose(0, 2, 1)
+        else:
+            # reshape 3D data to channels as last dimension
+            d = cond_diff_array.transpose(0, 3, 2, 1)
+
+        # define test statistic
+        stat_fun, threshold = _check_fun(
+            X=d, stat_fun=None, threshold=None, tail=0, kind="within"
+        )
+
+        # Run old cluster api
+        result_old_api = _permutation_cluster_test(
+            [d],
+            threshold=threshold,
+            stat_fun=stat_fun,
+            n_jobs=-1,  # takes all CPU cores
+            max_step=1,  # maximum distance between samples (time points)
+            exclude=None,  # exclude no time points or channels
+            step_down_p=0,  # step down in jumps test
+            t_power=1,  # weigh each location by its stats score
+            out_type="indices",
+            check_disjoint=False,
+            buffer_size=None,  # block size for chunking the data
+            n_permutations=1024,
+            tail=0,
+            adjacency=None,
+            seed=42,
+        )
+        result_old_api_all.append(result_old_api)
+        d_all.append(d)
+
+        if df.data[0].ndim == 2:
+            # convert each row in data column into evoked object
+            df["evoked"] = 
df["data"].apply( + lambda x: EvokedArray( + x, create_info(df.data[0].shape[0], 1000.0, "eeg") + ) + ) + else: + # convert each row in data column into evoked object + df["evoked"] = df["data"].apply( + lambda x: AverageTFRArray( + create_info(df.data[0].shape[0], 1000.0, "eeg"), + x, + times=np.arange(df.data[0].shape[2]), + freqs=np.arange(df.data[0].shape[1]), + ) + ) + + # run the new cluster test API and return the new cluster_result object + cluster_result = cluster_test( + df=df, formula=formula, paired_test=True, adjacency=None, seed=42 + ) + result_new_api_all.append(cluster_result) + + # compare old and new API results both for 2D and 3D data + for result_old_api, result_new_api in zip(result_old_api_all, result_new_api_all): + # compare the cluster statistics + assert_array_equal(result_old_api[0], result_new_api.T_obs) + + # compare the cluster indices + assert_array_equal(result_old_api[1], result_new_api.clusters) + + # compare the cluster p-values + assert_array_equal(result_old_api[2], result_new_api.cluster_p_values) From 9d592de86e56412dfc69433e0c9116589f4fde1f Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Thu, 25 Jul 2024 12:17:36 -0500 Subject: [PATCH 22/88] simplify tests Co-authored-by: Carina Forster --- mne/stats/tests/test_cluster_level.py | 136 ++++++++------------------ 1 file changed, 41 insertions(+), 95 deletions(-) diff --git a/mne/stats/tests/test_cluster_level.py b/mne/stats/tests/test_cluster_level.py index 2307f793dad..01fcd5adba6 100644 --- a/mne/stats/tests/test_cluster_level.py +++ b/mne/stats/tests/test_cluster_level.py @@ -19,6 +19,7 @@ from scipy import linalg, sparse, stats from mne import ( + EpochsArray, EvokedArray, MixedSourceEstimate, SourceEstimate, @@ -29,8 +30,6 @@ from mne.fixes import _eye_array from mne.stats import combine_adjacency, ttest_ind_no_p from mne.stats.cluster_level import ( - _check_fun, - _permutation_cluster_test, cluster_test, f_oneway, permutation_cluster_1samp_test, @@ -40,7 +39,7 @@ summarize_clusters_stc, ttest_1samp_no_p, ) -from mne.time_frequency import AverageTFRArray +from mne.time_frequency import AverageTFRArray, EpochsTFRArray from mne.utils import _record_warnings, catch_logging n_space = 50 @@ -922,98 +921,45 @@ def create_sample_data_cluster_test(): return df_2d, df_3d -def compare_old_and_new_cluster_api(): - """Make sure old and new cluster API results are the same.""" - # load sample data - df_2d, df_3d = create_sample_data_cluster_test() - - # mandatory parameters for new cluster API - formula = "evoked ~ 1 + C(subject_index)" - - data_to_test = [df_2d, df_3d] - - # save 2D and 3D data results for both old and new API - result_old_api_all = [] - result_new_api_all = [] - d_all = [] - - for df in data_to_test: - # Pivot the DataFrame to have conditions as columns for old API - pivot_df = df.pivot(index="subject_index", columns="condition", values="data") - - # Subtract condition 2 data from condition 1 data for each subject - pivot_df["cond_diff"] = pivot_df.apply( - lambda row: row["cond1"] - row["cond1"], axis=1 - ) - - # Extract the 'cond_diff' column as a numpy array - cond_diff_array = np.stack(pivot_df["cond_diff"].values) - - # extract data and reshape for old API - if pivot_df.cond_diff[0].ndim == 2: - # reshape to channels as last dimension - d = cond_diff_array.transpose(0, 2, 1) - else: - # reshape 3D data to channels as last dimension - d = cond_diff_array.transpose(0, 3, 2, 1) - - # define test statistic - stat_fun, threshold = _check_fun( - X=d, stat_fun=None, threshold=None, tail=0, 
kind="within" - ) - - # Run old cluster api - result_old_api = _permutation_cluster_test( - [d], - threshold=threshold, - stat_fun=stat_fun, - n_jobs=-1, # takes all CPU cores - max_step=1, # maximum distance between samples (time points) - exclude=None, # exclude no time points or channels - step_down_p=0, # step down in jumps test - t_power=1, # weigh each location by its stats score - out_type="indices", - check_disjoint=False, - buffer_size=None, # block size for chunking the data - n_permutations=1024, - tail=0, - adjacency=None, - seed=42, - ) - result_old_api_all.append(result_old_api) - d_all.append(d) - - if df.data[0].ndim == 2: - # convert each row in data column into evoked object - df["evoked"] = df["data"].apply( - lambda x: EvokedArray( - x, create_info(df.data[0].shape[0], 1000.0, "eeg") - ) - ) - else: - # convert each row in data column into evoked object - df["evoked"] = df["data"].apply( - lambda x: AverageTFRArray( - create_info(df.data[0].shape[0], 1000.0, "eeg"), - x, - times=np.arange(df.data[0].shape[2]), - freqs=np.arange(df.data[0].shape[1]), - ) - ) - - # run the new cluster test API and return the new cluster_result object - cluster_result = cluster_test( - df=df, formula=formula, paired_test=True, adjacency=None, seed=42 +def test_compare_old_and_new_cluster_api(): + """Test for same results from old and new APIs.""" + condition1_1d, condition2_1d, condition1_2d, condition2_2d = _get_conditions() + df_1d = pd.DataFrame( + dict( + data=[condition1_1d, condition2_1d], + condition=["a", "b"], ) - result_new_api_all.append(cluster_result) - - # compare old and new API results both for 2D and 3D data - for result_old_api, result_new_api in zip(result_old_api_all, result_new_api_all): - # compare the cluster statistics - assert_array_equal(result_old_api[0], result_new_api.T_obs) + ) + kwargs = dict(n_permutations=100, tail=1, seed=1, buffer_size=None, out_type="mask") + F_obs, clusters, cluster_pvals, H0 = permutation_cluster_test( + [condition1_1d, condition2_1d], **kwargs + ) + formula = "data ~ condition" + cluster_result = cluster_test(df_1d, formula, **kwargs) + assert_array_equal(cluster_result.H0, H0) + assert_array_equal(cluster_result.stat_obs, F_obs) + assert_array_equal(cluster_result.cluster_p_values, cluster_pvals) + assert cluster_result.clusters == clusters - # compare the cluster indices - assert_array_equal(result_old_api[1], result_new_api.clusters) - # compare the cluster p-values - assert_array_equal(result_old_api[2], result_new_api.cluster_p_values) +@pytest.mark.parametrize( + "Inst", (EpochsArray, EvokedArray, EpochsTFRArray, AverageTFRArray) +) +def test_new_cluster_api(Inst): + """Test handling different MNE objects in the cluster API.""" + pd = pytest.importorskip("pandas") + + n_epo, n_chan, n_freq, n_times = 2, 3, 5, 7 + shape = (n_chan, n_times) + if Inst in (EpochsArray, EpochsTFRArray): + shape = (n_epo,) + shape + if Inst in (EpochsTFRArray, AverageTFRArray): + shape = shape[:-1] + (n_freq, shape[-1]) + + info = create_info(...) + inst1 = Inst(np.random.normal(shape, ...), info=info) + inst2 = Inst(np.random.normal(shape, ...), info=info) + + df = pd.DataFrame(dict(data=[inst1, inst2], condition=["a", "b"])) + result = cluster_test(df, "data~condition", ...) 
+ assert result # TODO do something more interesting here From d64ef84d6a1885fe225e19fe99e7cf3087550854 Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Thu, 25 Jul 2024 12:23:49 -0500 Subject: [PATCH 23/88] refactor cluster_test Co-authored-by: Eric Larson Co-authored-by: Carina Forster --- mne/stats/cluster_level.py | 419 ++++++++++++------------------------- 1 file changed, 139 insertions(+), 280 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index ab0fd0daf69..a366b19ecc1 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -24,13 +24,12 @@ from scipy.stats import f as fstat from scipy.stats import t as tstat -from .. import Epochs, Evoked -from ..epochs import EpochsArray, EvokedArray +from .. import BaseEpochs, Evoked, EvokedArray from ..fixes import has_numba, jit from ..parallel import parallel_func from ..source_estimate import MixedSourceEstimate, SourceEstimate, VolSourceEstimate from ..source_space import SourceSpaces -from ..time_frequency import AverageTFR, AverageTFRArray, EpochsTFR, EpochsTFRArray +from ..time_frequency import BaseTFR from ..utils import ( ProgressBar, _check_option, @@ -1744,191 +1743,65 @@ def summarize_clusters_stc( return klass(data_summary, vertices, tmin, tstep, subject) -def validate_input_dataframe(df: pd.DataFrame, formula: str): - """ - Validate the input dataframe for the cluster permutation test. - - Parameters - ---------- - df : pd.DataFrame - Dataframe with 3 columns (subject_index, condition, data). - formula : formulaic.ModelSpec - Wilkinson style Formula for the design matrix. - - Returns - ------- - dv_name : str - Name of the dependent variable. - """ - # extract dependent variable name from formula - formulaic = _soft_import( - "formulaic", purpose="set up Design Matrix" - ) # soft import (not a dependency for MNE) - formula = formulaic.Formula(formula) - dv_name = str(formula.lhs) - +def _validate_cluster_df(df: pd.DataFrame, dv_name: str, iv_name: str): # check if all necessary columns are present - if dv_name not in df.columns: - raise ValueError("""DataFrame needs to contain a column - with the dependent variable name - as defined in the formula""") - if "condition" not in df.columns: - raise ValueError("DataFrame needs to contain a condition column") - if "subject_index" not in df.columns: - raise ValueError("DataFrame needs to contain a subject_index column") - - # check if the data column contains only valid types - check_column_types(df[dv_name]) - + missing = ({dv_name} | {iv_name}) - set(df.columns) + sep = '", "' + if missing: + raise ValueError( + f"DataFrame must contain a column named for each term in `formula`. " + f"Column{_pl(missing)} missing for term{_pl(missing)} " + f'"{sep.join(missing)}".' + ) + # check if the data column contains valid (and consistent) instance types + inst = df[dv_name].iloc[0] + valid_types = (Evoked, BaseEpochs, BaseTFR, np.ndarray) + _validate_type(inst, valid_types, f"Data in dependent variable column '{dv_name}'") + all_types = set(df[dv_name].map(type)) + all_type_names = ", ".join([type(x).__name__ for x in all_types]) + prologue = f"Data in dependent variable column '{dv_name}' must all have " + if len(all_types) > 1: + raise ValueError( + f"{prologue} the same type, but found types {{{all_type_names}}}." 
+ ) # check if the shape of the data is consistent - if not all(data.data.shape == df[dv_name][0].data.shape for data in df[dv_name]): - raise ValueError("Data objects need to have the same shape") - - # check if the condition column contains only 2 unique values - if len(pd.unique(df.condition)) != 2: - raise ValueError("currently only supports 2 conditions.") - - return dv_name - - -def check_column_types(input_data: np.ndarray): - """ - Check if the column types are valid for the cluster permutation test. - - Parameters - ---------- - input_data : np.Array - Data to be checked for the cluster permutation test. - """ - # Get the type of the first element - first_type = type(input_data.iloc[0]) - - # Define the possible valid types - valid_types = ( - Evoked, - EvokedArray, - Epochs, - EpochsArray, - AverageTFR, - EpochsTFR, - EpochsTFRArray, - AverageTFRArray, - ) - - # Check if the type of the first element is a valid type - if first_type not in valid_types: - raise ValueError(f"Object type '{first_type}' is not a valid type.") - - # Check if all elements are of the same type as the first one - if not all(isinstance(data, first_type) for data in input_data): - raise ValueError("Data column must contain objects of the same type.") - - -def prepare_data_for_cluster_test(input_df: pd.DataFrame, dv_name: str): - """ - Prepare the data for the cluster permutation test. - - Parameters - ---------- - input_data : np.ndarray - Data to be prepared for the cluster permutation test. - - Returns - ------- - data : np.Array - Data prepared for the cluster permutation test. - """ - # extract data and add to dataframe - input_df["data"] = [data.data for data in input_df[dv_name]] - - # extract dimensions from time series or time-frequency data - first_data_obj = input_df["data"].iloc[0] - if isinstance(first_data_obj, (Epochs, Evoked, EpochsArray, EvokedArray)): - n_channels, n_timepoints = first_data_obj.get_data().shape - if isinstance( - first_data_obj, (AverageTFR, EpochsTFR, AverageTFRArray, EpochsTFRArray) - ): - n_channels, n_freqs, n_timepoints = first_data_obj.get_data().shape - - reshaped_data = [] - - for idx, row in input_df.iterrows(): - subject_index = row["subject_index"] - condition = row["condition"] - data_array = row["data"] - - if data_array.ndim == 2: - n_channels, n_timepoints = data_array.shape - # timepoints are the columns - df_temp = pd.DataFrame( - data_array, columns=[f"timepoint_{i}" for i in range(n_timepoints)] - ) - df_temp["channel"] = range(n_channels) - df_temp["subject_index"] = subject_index - df_temp["condition"] = condition - - reshaped_data.append(df_temp) - - elif data_array.ndim == 3: - n_channels, n_freqs, n_timepoints = data_array.shape - # timepoints are the columns - df_temp = pd.DataFrame( - data_array.reshape(-1, n_timepoints), - columns=[f"timepoint_{i}" for i in range(n_timepoints)], - ) - df_temp["frequency"] = np.repeat(range(n_freqs), n_channels) - df_temp["channel"] = np.tile(range(n_channels), n_freqs) - df_temp["subject_index"] = subject_index - df_temp["condition"] = condition - - reshaped_data.append(df_temp) - - else: - raise ValueError(f"Unsupported data array dimensions: {data_array.ndim}") - # combine the reshaped data - combined_df = pd.concat(reshaped_data, ignore_index=True) - # Convert the dataframe to long format - id_vars = ["subject_index", "condition", "channel"] - if "frequency" in combined_df.columns: - id_vars.append("frequency") - - reshaped_df = pd.melt( - combined_df, id_vars=id_vars, var_name="timepoint", value_name="value" - 
) - - # rename column and convert to integer - reshaped_df["timepoint"] = ( - reshaped_df["timepoint"].str.replace("timepoint_", "").astype(int) - ) - - # return the reshaped dataframe and dimensions - if data_array.ndim == 2: - return reshaped_df, data_array.ndim, n_channels, n_timepoints - elif data_array.ndim == 3: - return reshaped_df, data_array.ndim, n_channels, n_freqs, n_timepoints + if isinstance(inst, np.ndarray): + all_shapes = set(df[dv_name].map(lambda x: x.shape[1:])) # first dim may vary + elif isinstance(inst, BaseEpochs): + all_shapes = set(df[dv_name].map(lambda x: x.get_data().shape[1:])) + else: + all_shapes = set(df[dv_name].map(lambda x: x.get_data().shape)) + if len(all_shapes) > 1: + raise ValueError( + f"{prologue} consistent shape, but {len(all_shapes)} different " + f"shapes were found: {'; '.join(all_shapes)}." + ) + return all_types.pop() +@verbose def cluster_test( df: pd.DataFrame, - formula: str, # Wilkinson notation formula for design matrix - paired_test: bool, # whether to run a paired t-test or unpaired test - n_permutations: int = 1024, # same default as in old API - seed: None | int | np.random.RandomState = None, - tail: Literal[-1, 0, 1] = 0, # 0 for two-tailed, 1 for greater, -1 for less - n_jobs: int = 1, # how many cores to use + formula: str, + *, + within_id: str | None = None, + stat_fun: callable | None = None, + tail: Literal[-1, 0, 1] = 0, + threshold=None, + n_permutations: int = 1024, adjacency: tuple | None = None, - max_step: int = 1, # maximum distance between samples (time points) - exclude: list | None = None, # exclude no time points or channels - step_down_p: int = 0, # step down in jumps test - t_power: int = 1, # weigh each location by its stats score - out_type: Literal["indices", "mask"] = "indices", + max_step: int = 1, + exclude: list | None = None, + step_down_p: int = 0, + t_power: int = 1, check_disjoint: bool = False, - buffer_size: int | None = None, # block size for chunking the data + out_type: Literal["indices", "mask"] = "indices", + seed: None | int | np.random.RandomState = None, + buffer_size: int | None = None, + n_jobs: int = 1, + verbose=None, ): - """ - Run a cluster permutation test based on formulaic input. - - # currently only supports paired t-test on evokeds or epochs + """Run a cluster permutation test from a DataFrame and a formula. Parameters ---------- @@ -1936,16 +1809,14 @@ def cluster_test( Dataframe with 3 columns (subject_index, condition, evoked). formula : str Wilkinson notation formula for design matrix. - paired_test: bool - Whether to run a paired t-test. - n_permutations : int, optional - Number of permutations. Default is 1024. - seed : None | int | np.random.RandomState, optional - Seed for the random number generator. Default is None. + within_id : None | str + Name of column in ``df`` to use in identifying within-group contrasts. + stat_fun : None | callable + Statistical function to use. tail : int, optional 0 for two-tailed, 1 for greater, -1 for less. Default is 0. - n_jobs : int, optional - How many cores to use. Default is 1. + n_permutations : int, optional + Number of permutations. Default is 1024. adjacency : None, optional Provide a adjacency matrix. Default is None. max_step : int, optional @@ -1956,107 +1827,86 @@ def cluster_test( Step down in jumps test. Default is 0. t_power : int, optional Weigh each location by its stats score. Default is 1. - out_type : str, optional - Output type. Default is "indices". check_disjoint : bool, optional Check if clusters are disjoint. 
Default is False. + out_type : str, optional + Output type. Default is "indices". + seed : None | int | np.random.RandomState, optional + Seed for the random number generator. Default is None. buffer_size : int, optional Block size for chunking the data. Default is None. - seed : int, optional - Seed for the random number generator. Default is None. + n_jobs : int, optional + How many cores to use. Default is 1. + %(verbose)s Returns ------- ClusterResult Object containing the results of the cluster permutation test. """ - # check if formula is present - if formula is None: - raise ValueError("Wilkinson style formula is required.") - - # validate the input dataframe and return name of dependent variable - dv_name = validate_input_dataframe(df, formula) - - # prepare the data for the cluster permutation test - prep_result = prepare_data_for_cluster_test(df, dv_name) - - if prep_result[1] == 2: - # pivot the dataframe based on condition for later subtraction - pivot_df = ( - prep_result[0] - .pivot_table( - index=["subject_index", "channel", "timepoint"], - columns="condition", - values="value", - ) - .reset_index() - ) - elif prep_result[1] == 3: - # pivot the dataframe based on condition for later subtraction - pivot_df = ( - prep_result[0] - .pivot_table( - index=["subject_index", "channel", "frequency", "timepoint"], - columns="condition", - values="value", - ) - .reset_index() + # parse formula + formulaic = _soft_import("formulaic", purpose="parse formula for clustering") + parser = formulaic.parser.DefaultFormulaParser(include_intercept=False) + formula = formulaic.Formula(formula, _parser=parser) + dv_name = str(np.array(formula.lhs.root).item()) + iv_name = str(np.array(formula.rhs.root).item()) + # validate the input dataframe and return the type of the data column entries + _dtype = _validate_cluster_df(df, dv_name, iv_name) + + # for within_subject + _validate_type(within_id, (str, None), "within_id") + if within_id: + df = df.copy(deep=False) # Don't mutate input dataframe row order! 
+ df.sort_values([iv_name, within_id], inplace=True) + counts = df[within_id].value_counts() + if any(counts != 2): + raise ValueError("Badness 10000") + + # extract the data + + def _extract_data_array(series): + return np.concatenate(series.values) + + def _extract_data_mne(series): + return np.array( + series.map(lambda inst: inst.get_data().swapaxes(-2, -1)).to_list() ) - # Get unique elements and the indices of their first occurrences - unique_elements, indices = np.unique(df.condition, return_index=True) - - # Sort unique elements by the indices of their first occurrences - conditions = unique_elements[np.argsort(indices)] - - # store the contrast for the clusterResults object - contrast = f"{conditions[0]} - {conditions[1]}" - - # print the contrast used for the paired t-test so the user knows - # what is subtracted from what - logger.info(f"Contrast used for paired t-test: {contrast}") - - # Compute the difference (assuming there are only 2 conditions) - pivot_df[dv_name] = pivot_df[conditions[0]] - pivot_df[conditions[1]] - - # for the paired t-test y is the difference between conditions - # X is the design matrix with a column with 1s and 0s for each participant - # Create the design matrix using formulaic - formulaic = _soft_import( - "formulaic", purpose="set up Design Matrix" - ) # soft import (not a dependency for MNE) - y, X = formulaic.model_matrix(formula, pivot_df) - - # Prepare design matrix for input into MNE cluster function - # MNE cluster functions expect channels as the last dimension - - if prep_result[1] == 2: - # Reshape y.values into a 3D array: (participants, n_channels, n_timepoints) - y_reshaped = y.values.reshape(-1, prep_result[2], prep_result[3]) - # Transpose the array to have channels as the last dimension - y_for_cluster = y_reshaped.transpose(0, 2, 1) - elif prep_result[1] == 3: - # Reshape y.values into a 4D array: - # (participants, n_channels, n_freqs, n_timepoints) - y_reshaped = y.values.reshape( - -1, prep_result[2], prep_result[3], prep_result[4] - ) - # Transpose the array to have channels as the last dimension - y_for_cluster = y_reshaped.transpose(0, 3, 2, 1) + def _extract_data_tfr(series): + return series.map(lambda inst: inst.get_data().swapaxes(-3, -1)).to_list() - if paired_test: - # define stat function and threshold - stat_fun, threshold = _check_fun( - X=y_for_cluster, stat_fun=None, threshold=None, tail=0, kind="within" - ) + if _dtype is np.ndarray: + func = _extract_data_array + elif _dtype is BaseTFR: + func = _extract_data_tfr else: - # define stat function and threshold - stat_fun, threshold = _check_fun( - X=y_for_cluster, stat_fun=None, threshold=None, tail=0, kind="between" - ) + func = _extract_data_mne + # convert to a list-like X for clustering + X = df.groupby(iv_name).agg({dv_name: func})[dv_name].to_list() + + # determine test type + if len(X) == 1: + kind = "within" + elif len(X) > 2: + kind = "between" + elif len(set(x.shape for x in X)) > 1: + kind = "between" + # by now we know there are exactly 2 elements in X, and their shapes match + elif within_id in df: + kind = "within" + X = X[0] - X[1] + else: + kind = "between" + + # define stat function and threshold + stat_fun, threshold = _check_fun( + X=X, stat_fun=stat_fun, threshold=threshold, tail=tail, kind=kind + ) + if kind == "within": + X = [X] # Run the cluster-based permutation test - T_obs, clusters, cluster_p_values, H0 = _permutation_cluster_test( - [y_for_cluster], + stat_obs, clusters, cluster_p_values, H0 = _permutation_cluster_test( + X, 
n_permutations=n_permutations, threshold=threshold, stat_fun=stat_fun, @@ -2073,9 +1923,9 @@ def cluster_test( seed=seed, ) - print(f"smallest cluster p-value: {min(cluster_p_values)}") + # print(f"smallest cluster p-value: {min(cluster_p_values)}") - return ClusterResult(T_obs, clusters, cluster_p_values, H0) + return ClusterResult(stat_obs, clusters, cluster_p_values, H0, stat_fun) class ClusterResult: @@ -2084,7 +1934,7 @@ class ClusterResult: Parameters ---------- - T_obs : np.ndarray + stat_obs : np.ndarray The observed test statistic. clusters : list List of clusters. @@ -2096,15 +1946,24 @@ class ClusterResult: def __init__( self, - T_obs: np.typing.NDArray, + stat_obs: np.typing.NDArray, clusters: list, cluster_p_values: np.typing.NDArray, H0: np.typing.NDArray, + stat_fun: callable, ): - self.T_obs = T_obs + self.stat_obs = stat_obs self.clusters = clusters self.cluster_p_values = cluster_p_values self.H0 = H0 + self.stat_fun = stat_fun + # TODO improve detection of stat name (e.g. unpaired T)? + if stat_fun is f_oneway: + self.stat_name = "F-statistic" + elif stat_fun is ttest_1samp_no_p: + self.stat_name = "paired T-statistic" + else: + self.stat_name = "test statistic" def plot_cluster(self, condition_labels: dict): """ @@ -2135,7 +1994,7 @@ def plot_cluster(self, condition_labels: dict): time_inds = np.unique(time_inds) # get topography for t stat - t_map = self.T_obs[time_inds, ...].mean(axis=0).astype(int) + t_map = self.stat_obs[time_inds, ...].mean(axis=0).astype(int) # get signals at the sensors contributing to the cluster sig_times = cond_values[0][0].times[time_inds] @@ -2176,7 +2035,7 @@ def plot_cluster(self, condition_labels: dict): # add axes for colorbar ax_colorbar = divider.append_axes("right", size="5%", pad=0.1) cbar = plt.colorbar(image, cax=ax_colorbar) - cbar.set_label("t-value") + cbar.set_label(self.stat_name) ax_topo.set_xlabel( "average from {:0.3f} to {:0.3f} s".format(*sig_times[[0, -1]]) ) From dc8a799b16f5871c6fb0352f414265d26ad20d29 Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Thu, 25 Jul 2024 12:25:12 -0500 Subject: [PATCH 24/88] make tutorial match modified API Co-authored-by: Carina Forster --- tutorials/stats-sensor-space/76_new_cluster_test_api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index efbc6d5e3f0..83b4f019b6f 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -133,11 +133,11 @@ # the cluster test randomly permutes the subject label # the 1 in the formula represents the intercept which is always included # C is a categorical variable that will be dummy coded -formula = "evoked ~ 1 + C(subject_index)" +formula = "evoked ~ condition" # run the new cluster test API and return the new cluster_result object cluster_result = mne.stats.cluster_level.cluster_test( - df=df, formula=formula, paired_test=True, adjacency=None + df=df, formula=formula, within_id="subject_index" ) # note that we ran an exact test due to the small sample size From f12cf6e574eaa08d9dd9f59d93a66b3e3f49bcf2 Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Thu, 25 Jul 2024 12:30:30 -0500 Subject: [PATCH 25/88] remove unused test helper func --- mne/stats/tests/test_cluster_level.py | 39 --------------------------- 1 file changed, 39 deletions(-) diff --git a/mne/stats/tests/test_cluster_level.py b/mne/stats/tests/test_cluster_level.py index 
01fcd5adba6..24f3ee687ca 100644 --- a/mne/stats/tests/test_cluster_level.py +++ b/mne/stats/tests/test_cluster_level.py @@ -882,45 +882,6 @@ def test_output_equiv(shape, out_type, adjacency, threshold): assert_array_equal(got_mask, want_mask) -def create_sample_data_cluster_test(): - """Create sample data to test new cluster API.""" - # Prepare some dummy data - n_subjects = 20 - n_conditions = 2 - n_channels = 5 - n_timepoints = 8 - n_freqs = 3 - - # Create dummy data - dummy_data_2d = [ - np.random.rand(n_channels, n_timepoints) - for _ in range(n_subjects * n_conditions) - ] - dummy_data_3d = [ - np.random.rand(n_channels, n_freqs, n_timepoints) - for _ in range(n_subjects * n_conditions) - ] - - # Create a DataFrame with dummy data - df_2d = pd.DataFrame( - { - "subject_index": np.repeat(range(n_subjects), n_conditions), - "condition": np.tile(["cond1", "cond2"], n_subjects), - "data": dummy_data_2d, - } - ) - - df_3d = pd.DataFrame( - { - "subject_index": np.repeat(range(n_subjects), n_conditions), - "condition": np.tile(["cond1", "cond2"], n_subjects), - "data": dummy_data_3d, - } - ) - - return df_2d, df_3d - - def test_compare_old_and_new_cluster_api(): """Test for same results from old and new APIs.""" condition1_1d, condition2_1d, condition1_2d, condition2_2d = _get_conditions() From 5b97971602d761e7f2b181f7f0914032aacb8e96 Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Thu, 25 Jul 2024 12:33:11 -0500 Subject: [PATCH 26/88] vulture allowlist update --- tools/vulture_allowlist.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/vulture_allowlist.py b/tools/vulture_allowlist.py index 3de48b3b906..c41ea610880 100644 --- a/tools/vulture_allowlist.py +++ b/tools/vulture_allowlist.py @@ -136,3 +136,6 @@ _qt_raise_window _qt_disable_paint _qt_get_stylesheet + +# used in tutorial, not sure why shows up +plot_cluster From 5f5b0fc3262ce29ff19139756f10143df12a4804 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sun, 28 Jul 2024 13:00:41 +0200 Subject: [PATCH 27/88] included BaseTFR in validate_cluster_df --- mne/stats/cluster_level.py | 43 +++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index a366b19ecc1..fc41b3a5506 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1744,18 +1744,24 @@ def summarize_clusters_stc( def _validate_cluster_df(df: pd.DataFrame, dv_name: str, iv_name: str): + """Validate the input DataFrame for cluster tests.""" # check if all necessary columns are present - missing = ({dv_name} | {iv_name}) - set(df.columns) + missing = ({dv_name} | {iv_name}) - set(df.columns) # should be empty sep = '", "' - if missing: + if missing: # if not empty, there are missing columns raise ValueError( f"DataFrame must contain a column named for each term in `formula`. " - f"Column{_pl(missing)} missing for term{_pl(missing)} " + f"Column{_pl(missing)} missing for term{_pl(missing)} " # _pl = pluralize f'"{sep.join(missing)}".' 
)
    # check if the data column contains valid (and consistent) instance types
    inst = df[dv_name].iloc[0]
-    valid_types = (Evoked, BaseEpochs, BaseTFR, np.ndarray)
+    valid_types = (
+        Evoked,
+        BaseEpochs,
+        BaseTFR,
+        np.ndarray,
+    )  # Base covers all Epochs and TFRs
     _validate_type(inst, valid_types, f"Data in dependent variable column '{dv_name}'")
     all_types = set(df[dv_name].map(type))
     all_type_names = ", ".join([type(x).__name__ for x in all_types])
@@ -1766,8 +1772,10 @@ def _validate_cluster_df(df: pd.DataFrame, dv_name: str, iv_name: str):
         )
     # check if the shape of the data is consistent
     if isinstance(inst, np.ndarray):
-        all_shapes = set(df[dv_name].map(lambda x: x.shape[1:]))  # first dim may vary
-    elif isinstance(inst, BaseEpochs):
+        all_shapes = set(
+            df[dv_name].map(lambda x: x.shape[1:])
+        )  # first dim may vary (participants or epochs)
+    elif isinstance(inst, (BaseEpochs | BaseTFR)):  # should include BaseTFR?
         all_shapes = set(df[dv_name].map(lambda x: x.get_data().shape[1:]))
     else:
         all_shapes = set(df[dv_name].map(lambda x: x.get_data().shape))
@@ -1776,14 +1784,14 @@ def _validate_cluster_df(df: pd.DataFrame, dv_name: str, iv_name: str):
         f"{prologue} consistent shape, but {len(all_shapes)} different "
         f"shapes were found: {'; '.join(all_shapes)}."
     )
-    return all_types.pop()
+    return all_types.pop()  # return the type of the data column entries


 @verbose
 def cluster_test(
     df: pd.DataFrame,
     formula: str,
-    *,
+    *,  # all parameters after this are keyword-only
     within_id: str | None = None,
     stat_fun: callable | None = None,
     tail: Literal[-1, 0, 1] = 0,
@@ -1806,9 +1814,10 @@ def cluster_test(
     Parameters
     ----------
     df : pd.DataFrame
-        Dataframe with 3 columns (subject_index, condition, evoked).
+        Dataframe containing the data, dependent and independent variables.
     formula : str
-        Wilkinson notation formula for design matrix.
+        Wilkinson notation formula for design matrix. The names of the dependent
+        and independent variable should match the columns in the dataframe.
     within_id : None | str
         Name of column in ``df`` to use in identifying within-group contrasts.
stat_fun : None | callable @@ -1848,8 +1857,10 @@ def cluster_test( formulaic = _soft_import("formulaic", purpose="parse formula for clustering") parser = formulaic.parser.DefaultFormulaParser(include_intercept=False) formula = formulaic.Formula(formula, _parser=parser) + # extract the dependent and independent variable names dv_name = str(np.array(formula.lhs.root).item()) iv_name = str(np.array(formula.rhs.root).item()) + # validate the input dataframe and return the type of the data column entries _dtype = _validate_cluster_df(df, dv_name, iv_name) @@ -1860,10 +1871,9 @@ def cluster_test( df.sort_values([iv_name, within_id], inplace=True) counts = df[within_id].value_counts() if any(counts != 2): - raise ValueError("Badness 10000") - - # extract the data + raise ValueError("for paired tttest, each subject must have 2 observations") + # extract the data from the dataframe def _extract_data_array(series): return np.concatenate(series.values) @@ -1881,15 +1891,16 @@ def _extract_data_tfr(series): func = _extract_data_tfr else: func = _extract_data_mne + # convert to a list-like X for clustering X = df.groupby(iv_name).agg({dv_name: func})[dv_name].to_list() # determine test type if len(X) == 1: - kind = "within" + kind = "within" # data already subtracted elif len(X) > 2: kind = "between" - elif len(set(x.shape for x in X)) > 1: + elif len(set(x.shape for x in X)) > 1: # check if shapes match kind = "between" # by now we know there are exactly 2 elements in X, and their shapes match elif within_id in df: @@ -1923,8 +1934,6 @@ def _extract_data_tfr(series): seed=seed, ) - # print(f"smallest cluster p-value: {min(cluster_p_values)}") - return ClusterResult(stat_obs, clusters, cluster_p_values, H0, stat_fun) From ccccb5bfa19a71be0f95dc243e7fb59dfc4a267d Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sun, 28 Jul 2024 13:35:40 +0200 Subject: [PATCH 28/88] comments on cluster_test function --- mne/stats/cluster_level.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index fc41b3a5506..7b44641a3ec 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1864,7 +1864,7 @@ def cluster_test( # validate the input dataframe and return the type of the data column entries _dtype = _validate_cluster_df(df, dv_name, iv_name) - # for within_subject + # for within_subject designs, check if each subject has 2 observations _validate_type(within_id, (str, None), "within_id") if within_id: df = df.copy(deep=False) # Don't mutate input dataframe row order! 
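The within-subject path above (and the extraction helpers in the next hunk) assume a long-format DataFrame: one row per observation, with the dependent-variable column holding Evoked/Epochs/TFR objects or plain arrays. A minimal sketch of input that satisfies the paired-design check, using NumPy arrays and illustrative column names (`evoked`, `condition`, and `subject_index` are not required by the API; they only have to match the formula terms and `within_id`):

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)
    # two conditions per subject; (n_channels, n_times) arrays stand in for Evokeds
    rows = [
        dict(subject_index=subj, condition=cond, evoked=rng.normal(size=(4, 10)))
        for subj in ("s01", "s02", "s03")
        for cond in ("target", "non-target")
    ]
    df = pd.DataFrame(rows)
    # the invariant enforced by the value_counts() check: exactly 2 rows per subject
    assert (df["subject_index"].value_counts() == 2).all()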
@@ -1877,7 +1877,7 @@ def cluster_test( def _extract_data_array(series): return np.concatenate(series.values) - def _extract_data_mne(series): + def _extract_data_mne(series): # 2D data return np.array( series.map(lambda inst: inst.get_data().swapaxes(-2, -1)).to_list() ) @@ -1900,21 +1900,26 @@ def _extract_data_tfr(series): kind = "within" # data already subtracted elif len(X) > 2: kind = "between" - elif len(set(x.shape for x in X)) > 1: # check if shapes match + elif ( + len(set(x.shape for x in X)) > 1 + ): # check if there are unequal observations in each group kind = "between" # by now we know there are exactly 2 elements in X, and their shapes match elif within_id in df: kind = "within" X = X[0] - X[1] - else: + else: # what would be another else cas kind = "between" # define stat function and threshold stat_fun, threshold = _check_fun( X=X, stat_fun=stat_fun, threshold=threshold, tail=tail, kind=kind ) - if kind == "within": + + # check_fun doesn't work with list input` + if kind == "within": # will this create an issue for already subtracted data? X = [X] + # Run the cluster-based permutation test stat_obs, clusters, cluster_p_values, H0 = _permutation_cluster_test( X, From 59b1a3a7850681df957d3fcb35818f4c9d7b5911 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sun, 28 Jul 2024 14:11:54 +0200 Subject: [PATCH 29/88] updated clusterResult class and plot function --- mne/stats/cluster_level.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index 7b44641a3ec..7f48c999f5f 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1971,7 +1971,8 @@ def __init__( self.cluster_p_values = cluster_p_values self.H0 = H0 self.stat_fun = stat_fun - # TODO improve detection of stat name (e.g. unpaired T)? + + # unpaired t-test is f_oneway if stat_fun is f_oneway: self.stat_name = "F-statistic" elif stat_fun is ttest_1samp_no_p: @@ -1979,7 +1980,7 @@ def __init__( else: self.stat_name = "test statistic" - def plot_cluster(self, condition_labels: dict): + def plot_cluster_time_sensor(self, condition_labels: dict): """ Plot the cluster with the lowest p-value. @@ -1992,13 +1993,20 @@ def plot_cluster(self, condition_labels: dict): condition_labels : dict Dictionary with condition labels as keys and evoked objects as values. 
""" + # define colorblind friendly colors + colorblind_palette = ["#4daf4a", "#f781bf"] + # extract condition labels from the dictionary cond_keys = list(condition_labels.keys()) # extract the evokeds from the dictionary cond_values = list(condition_labels.values()) # configure variables for visualization - colors = {cond_keys[0]: "crimson", cond_keys[1]: "steelblue"} + colors = { + cond_keys[0]: colorblind_palette[0], + cond_keys[1]: colorblind_palette[1], + } + line_styles = {cond_keys[0]: "-", cond_keys[1]: "--"} lowest_p_cluster = np.argmin(self.cluster_p_values) @@ -2051,18 +2059,23 @@ def plot_cluster(self, condition_labels: dict): cbar = plt.colorbar(image, cax=ax_colorbar) cbar.set_label(self.stat_name) ax_topo.set_xlabel( - "average from {:0.3f} to {:0.3f} s".format(*sig_times[[0, -1]]) + "Spatial cluster extent:\n averaged from {:0.3f} to {:0.3f} s".format( + *sig_times[[0, -1]] + ) ) # add new axis for time courses and plot time courses ax_signals = divider.append_axes("right", size="300%", pad=1.3) - title = f"Signal averaged over {len(ch_inds)} sensor(s)" + title = ( + f"Temporal cluster extent:\nSignal averaged over {len(ch_inds)} sensor(s)" + ) plot_compare_evokeds( condition_labels, title=title, picks=ch_inds, axes=ax_signals, colors=colors, + linestyles=line_styles, show=False, split_legend=True, truncate_yaxis="auto", From 98d08797fb3a943878a1b432202effee4db2c796 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sun, 28 Jul 2024 14:12:12 +0200 Subject: [PATCH 30/88] updated function call for plotting --- tutorials/stats-sensor-space/76_new_cluster_test_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 83b4f019b6f..b7f933d127b 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -152,7 +152,7 @@ # finally let's plot the results using the ClusterResults class # we plot the cluster with the lowest p-value -cluster_result.plot_cluster(condition_labels=conditions_dict) +cluster_result.plot_cluster_time_sensor(condition_labels=conditions_dict) # we can see that there is something going on around 400 ms # with a stronger signal for target trials in right central-parietal channels From ec0324207428b720920908e5a25bb7f685d3e676 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sun, 28 Jul 2024 14:14:18 +0200 Subject: [PATCH 31/88] changed color --- mne/stats/cluster_level.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index 7f48c999f5f..8e5b73d6474 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1994,7 +1994,7 @@ def plot_cluster_time_sensor(self, condition_labels: dict): Dictionary with condition labels as keys and evoked objects as values. 
""" # define colorblind friendly colors - colorblind_palette = ["#4daf4a", "#f781bf"] + colorblind_palette = ["#4daf4a", "#984ea3"] # extract condition labels from the dictionary cond_keys = list(condition_labels.keys()) From 5941f61f9b26f06e830cd83c513b9c046c1235d6 Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Thu, 1 Aug 2024 12:30:11 -0500 Subject: [PATCH 32/88] docstring/docdict cleanups and fixes --- mne/stats/cluster_level.py | 68 +++++++++--------- mne/utils/docs.py | 138 ++++++++++++++++++++++--------------- 2 files changed, 118 insertions(+), 88 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index c763f01da91..9f6a3a8d343 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1796,12 +1796,12 @@ def cluster_test( stat_fun: callable | None = None, tail: Literal[-1, 0, 1] = 0, threshold=None, - n_permutations: int = 1024, - adjacency: tuple | None = None, - max_step: int = 1, - exclude: list | None = None, - step_down_p: int = 0, - t_power: int = 1, + n_permutations: str | int = 1024, + adjacency: sparse.spmatrix | False = False, + max_step: int = 1, # TODO may need to provide `max_step_time` and `max_step_freq` + exclude: list | None = None, # TODO needs rethink because user passes MNE objects + step_down_p: float = 0.0, + t_power: float = 1.0, check_disjoint: bool = False, out_type: Literal["indices", "mask"] = "indices", seed: None | int | np.random.RandomState = None, @@ -1819,35 +1819,41 @@ def cluster_test( Wilkinson notation formula for design matrix. The names of the dependent and independent variable should match the columns in the dataframe. within_id : None | str - Name of column in ``df`` to use in identifying within-group contrasts. - stat_fun : None | callable - Statistical function to use. - tail : int, optional - 0 for two-tailed, 1 for greater, -1 for less. Default is 0. - n_permutations : int, optional - Number of permutations. Default is 1024. - adjacency : None, optional - Provide a adjacency matrix. Default is None. + Name of column in ``df`` to use in identifying within-group contrasts. If + ``None``, will perform a between-group test. Ignored if the number of groups + (unique values in the independent variable column of ``df``) is greater than 2. + %(stat_fun_clust_both)s + %(tail_clust)s + %(threshold_clust_both)s + %(n_permutations_clust_all)s + %(adjacency_clust_both)s max_step : int, optional Maximum distance between samples (time points). Default is 1. - exclude : np.Array, optional - Exclude no time points or channels. Default is None. - step_down_p : int, optional - Step down in jumps test. Default is 0. - t_power : int, optional - Weigh each location by its stats score. Default is 1. - check_disjoint : bool, optional - Check if clusters are disjoint. Default is False. - out_type : str, optional - Output type. Default is "indices". - seed : None | int | np.random.RandomState, optional - Seed for the random number generator. Default is None. - buffer_size : int, optional - Block size for chunking the data. Default is None. - n_jobs : int, optional - How many cores to use. Default is 1. + exclude : array-like of bool | None + Mask to apply to the data to exclude certain points from clustering + (e.g., medial wall vertices). Should be the same shape as the channels/vertices + dimension of the data objects. If ``None``, no points are excluded. 
+ %(step_down_p_clust)s + %(t_power_clust)s + check_disjoint : bool + Whether to check if the ``adjacency`` matrix can be separated into disjoint + sets before clustering. This may lead to faster clustering, especially if + the "time" and/or "frequency" dimensions are large. + %(out_type_clust)s + %(seed)s + buffer_size : int | None + Block size to use when computing test statistics. This can significantly + reduce memory usage when ``n_jobs > 1`` and memory sharing between + processes is enabled (see :func:`mne.set_cache_dir`), because the data will be + shared between processes and each process only needs to allocate space for + a small block of locations at a time. + %(n_jobs)s %(verbose)s + Notes + ----- + %(threshold_clust_t_or_f_notes)s + Returns ------- ClusterResult diff --git a/mne/utils/docs.py b/mne/utils/docs.py index ff9e11ee776..464e7e3e84c 100644 --- a/mne/utils/docs.py +++ b/mne/utils/docs.py @@ -144,61 +144,54 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75): formatting. This can add overhead so is meant only for debugging. """ -docdict["adjacency_clust"] = """ -adjacency : scipy.sparse.spmatrix | None | False +_adjacency_clust_template = """ +adjacency : scipy.sparse.spmatrix | {param_none}False Defines adjacency between locations in the data, where "locations" can be spatial vertices, frequency bins, time points, etc. For spatial vertices (i.e. sensor space data), see :func:`mne.channels.find_ch_adjacency` or :func:`mne.spatial_inter_hemi_adjacency`. For source space data, see - :func:`mne.spatial_src_adjacency` or - :func:`mne.spatio_temporal_src_adjacency`. If ``False``, assumes - no adjacency (each location is treated as independent and unconnected). - If ``None``, a regular lattice adjacency is assumed, connecting - each {sp} location to its neighbor(s) along the last dimension - of {{eachgrp}} ``{{x}}``{lastdim}. + :func:`mne.spatial_src_adjacency` or :func:`mne.spatio_temporal_src_adjacency`. + If ``False``, assumes no adjacency (each location is treated as independent and + unconnected).{if_none} If ``adjacency`` is a matrix, it is assumed to be symmetric (only the upper triangular half is used) and must be square with dimension equal to - ``{{x}}.shape[-1]`` {parone} or ``{{x}}.shape[-1] * {{x}}.shape[-2]`` - {partwo} or (optionally) - ``{{x}}.shape[-1] * {{x}}.shape[-2] * {{x}}.shape[-3]`` - {parthree}.{memory} + the product of the last 1, 2, or 3 data dimensions (e.g., for time-frequency data: + n_channels, n_channels * n_freqs, or n_channels * n_freqs * n_times).{memory} +""" +_if_none = """ If ``None``, a regular lattice adjacency is assumed, connecting + each {spatial}location to its neighbor(s) along the last dimension + of {the_data}. """ - -mem = ( - " If spatial adjacency is uniform in time, it is recommended to use " - "a square matrix with dimension ``{x}.shape[-1]`` (n_vertices) to save " - "memory and computation, and to use ``max_step`` to define the extent " - "of temporal adjacency to consider when clustering." -) -comb = " The function `mne.stats.combine_adjacency` may be useful for 4D data." 
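The removed ``comb`` text points at `mne.stats.combine_adjacency`, which assembles the combined matrix whose dimension is the product of the trailing data dimensions. A hedged sketch of that bookkeeping (the sizes are invented, and the fully-connected channel matrix is a toy; in practice it would come from `find_ch_adjacency`):

    import numpy as np
    from scipy import sparse
    from mne.stats import combine_adjacency

    n_times, n_freqs, n_chan = 10, 5, 4
    chan_adj = sparse.csr_matrix(np.ones((n_chan, n_chan)))  # toy: all channels adjacent
    # integer entries get lattice (neighbor-to-neighbor) adjacency along that axis
    adjacency = combine_adjacency(n_times, n_freqs, chan_adj)
    assert adjacency.shape == (n_times * n_freqs * n_chan,) * 2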
st = dict( - sp="spatial", - lastdim="", - parone="(n_vertices)", - partwo="(n_times * n_vertices)", - parthree="(n_times * n_freqs * n_vertices)", - memory=mem, + param_none="None | ", + if_none=_if_none.format(spatial="spatial ", the_data="{eachgrp} ``{x}``"), + memory=""" + If spatial adjacency is uniform in time, it is recommended to use a square matrix + with dimension ``{x}.shape[-1]`` (n_vertices) to save memory and computation, + and to use ``max_step`` to define the extent of temporal adjacency to consider when + clustering. +""", ) tf = dict( - sp="", - lastdim=" (or the last two dimensions if ``{x}`` is 2D)", - parone="(for 2D data)", - partwo="(for 3D data)", - parthree="(for 4D data)", - memory=comb, + param_none="None | ", + if_none=_if_none.format( + spatial="", + the_data="{eachgrp} ``{x}`` (or the last two dimensions if ``{x}`` is 2D)", + ), + memory=""" + The function `mne.stats.combine_adjacency` may be useful for 4D data. +""", ) -nogroups = dict(eachgrp="", x="X") +nogrps = dict(eachgrp="", x="X") groups = dict(eachgrp="each group ", x="X[k]") -docdict["adjacency_clust_1"] = ( - docdict["adjacency_clust"].format(**tf).format(**nogroups) -) -docdict["adjacency_clust_n"] = docdict["adjacency_clust"].format(**tf).format(**groups) -docdict["adjacency_clust_st1"] = ( - docdict["adjacency_clust"].format(**st).format(**nogroups) -) -docdict["adjacency_clust_stn"] = ( - docdict["adjacency_clust"].format(**st).format(**groups) + +docdict["adjacency_clust_1"] = _adjacency_clust_template.format(**tf).format(**nogrps) +docdict["adjacency_clust_both"] = _adjacency_clust_template.format( + param_none="", if_none="", memory="" ) +docdict["adjacency_clust_n"] = _adjacency_clust_template.format(**tf).format(**groups) +docdict["adjacency_clust_st1"] = _adjacency_clust_template.format(**st).format(**nogrps) +docdict["adjacency_clust_stn"] = _adjacency_clust_template.format(**st).format(**groups) docdict["adjust_dig_chpi"] = """ adjust_dig : bool @@ -708,7 +701,7 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75): docdict["check_disjoint_clust"] = """ check_disjoint : bool - Whether to check if the connectivity matrix can be separated into disjoint + Whether to check if the ``adjacency`` matrix can be separated into disjoint sets before clustering. This may lead to faster clustering, especially if the second dimension of ``X`` (usually the "time" dimension) is large. """ @@ -1416,7 +1409,7 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75): """ docdict["exclude_clust"] = """ -exclude : bool array or None +exclude : array-like of bool | None Mask to apply to the data to exclude certain points from clustering (e.g., medial wall vertices). Should be the same shape as ``X``. If ``None``, no points are excluded. @@ -3958,7 +3951,7 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75): seed : None | int | instance of ~numpy.random.RandomState A seed for the NumPy random number generator (RNG). If ``None`` (default), the seed will be obtained from the operating system - (see :class:`~numpy.random.RandomState` for details), meaning it will most + (see :class:`~numpy.random.RandomState` for details), meaning it will most likely produce different output every time this function or method is run. To achieve reproducible results, pass a value here to explicitly initialize the RNG with a defined state. 
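The chained ``.format()`` calls above perform two-stage templating: the first call picks the variant (and deliberately re-inserts the ``{eachgrp}``/``{x}`` placeholders), and the second fills in the signature-specific names. A toy reconstruction of the mechanism, with shortened strings that are not the real docdict text:

    inner = "connecting each {spatial}location to its neighbor(s) along {the_data}"
    # stage 1: choose the variant, re-injecting placeholders for stage 2
    stage1 = inner.format(spatial="spatial ", the_data="{eachgrp}``{x}``")
    # stage 2: fill the per-signature names
    print(stage1.format(eachgrp="each group ", x="X[k]"))
    # connecting each spatial location to its neighbor(s) along each group ``X[k]``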
@@ -4249,16 +4242,23 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75): channel names in the file will be used when possible. """ -_stat_fun_clust_base = """ +_stat_fun_template = """ stat_fun : callable | None Function called to calculate the test statistic. Must accept 1D-array as - input and return a 1D array. If ``None`` (the default), uses - `mne.stats.{}`. + input and return a 1D array. If ``None`` (the default), uses {}. """ -docdict["stat_fun_clust_f"] = _stat_fun_clust_base.format("f_oneway") +docdict["stat_fun_clust_both"] = _stat_fun_template.format( + """:func:`mne.stats.ttest_1samp_no_p` + for paired tests and :func:`mne.stats.f_oneway` for unpaired tests or tests of + more than 2 groups.""" +) + +docdict["stat_fun_clust_f"] = _stat_fun_template.format(":func:`mne.stats.f_oneway`") -docdict["stat_fun_clust_t"] = _stat_fun_clust_base.format("ttest_1samp_no_p") +docdict["stat_fun_clust_t"] = _stat_fun_template.format( + ":func:`mne.stats.ttest_1samp_no_p`" +) docdict["static"] = """ static : instance of SpatialImage @@ -4469,10 +4469,10 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75): threshold : float | dict | None The so-called "cluster forming threshold" in the form of a test statistic (note: this is not an alpha level / "p-value"). - If numeric, vertices with data values more extreme than ``threshold`` will - be used to form clusters. If ``None``, {} will be chosen + If numeric, vertices with stat values more extreme than ``threshold`` will + be used to form clusters. If ``None``, {which_thresh} will be chosen automatically that corresponds to a p-value of 0.05 for the given number of - observations (only valid when using {}). If ``threshold`` is a + observations (only valid when using {which_stat}). If ``threshold`` is a :class:`dict` (with keys ``'start'`` and ``'step'``) then threshold-free cluster enhancement (TFCE) will be used (see the :ref:`TFCE example ` and :footcite:`SmithNichols2009`). @@ -4480,8 +4480,14 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75): a particular p-value for one-tailed or two-tailed tests. """ -f_test = ("an F-threshold", "an F-statistic") -docdict["threshold_clust_f"] = _threshold_clust_base.format(*f_test) +docdict["threshold_clust_both"] = _threshold_clust_base.format( + which_thresh="a t- or F-threshold", + which_stat="``stat_fun=None``, i.e., a paired t-test or one-way F-test", +) + +docdict["threshold_clust_f"] = _threshold_clust_base.format( + which_thresh="an F-threshold", which_stat="an F-statistic" +) docdict["threshold_clust_f_notes"] = """ For computing a ``threshold`` based on a p-value, use the conversion @@ -4493,8 +4499,9 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75): thresh = scipy.stats.f.ppf(1 - pval, dfn=dfn, dfd=dfd) # F distribution """ -t_test = ("a t-threshold", "a t-statistic") -docdict["threshold_clust_t"] = _threshold_clust_base.format(*t_test) +docdict["threshold_clust_t"] = _threshold_clust_base.format( + which_thresh="a t-threshold", which_stat="a t-statistic" +) docdict["threshold_clust_t_notes"] = """ For computing a ``threshold`` based on a p-value, use the conversion @@ -4508,6 +4515,23 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75): For testing the lower tail (``tail=-1``), don't subtract ``pval`` from 1. 
""" +docdict["threshold_clust_t_or_f_notes"] = """ +For computing a ``threshold`` based on a p-value, use the conversion +from :meth:`scipy.stats.rv_continuous.ppf`:: + + pval = 0.001 # arbitrary + # for t-statistic + df = n_observations - 1 # degrees of freedom for the t-test + thresh = scipy.stats.t.ppf(1 - pval / 2, df) # two-tailed, t distribution + # for f-statistic + dfn = n_conditions - 1 # degrees of freedom numerator + dfd = n_observations - n_conditions # degrees of freedom denominator + thresh = scipy.stats.f.ppf(1 - pval, dfn=dfn, dfd=dfd) # F distribution + +For a one-tailed test (``tail=1``), don't divide the p-value by 2. +For testing the lower tail (``tail=-1``), don't subtract ``pval`` from 1. +""" + docdict["time_bandwidth_tfr"] = """ time_bandwidth : float ``≥ 2.0`` Product between the temporal window length (in seconds) and the *full* From 368fa44bc8b0772ba514ecd9532d588446c7a7a3 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Mon, 5 Aug 2024 13:15:37 +0200 Subject: [PATCH 33/88] implemented Dan's comments --- mne/stats/cluster_level.py | 65 ++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 30 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index 9f6a3a8d343..804d035ff51 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1775,7 +1775,7 @@ def _validate_cluster_df(df: pd.DataFrame, dv_name: str, iv_name: str): all_shapes = set( df[dv_name].map(lambda x: x.shape[1:]) ) # first dim may vary (participants or epochs) - elif isinstance(inst, (BaseEpochs | BaseTFR)): # should include BaseTFR? + elif isinstance(inst, (BaseEpochs | BaseTFR)): all_shapes = set(df[dv_name].map(lambda x: x.get_data().shape[1:])) else: all_shapes = set(df[dv_name].map(lambda x: x.get_data().shape)) @@ -1797,7 +1797,7 @@ def cluster_test( tail: Literal[-1, 0, 1] = 0, threshold=None, n_permutations: str | int = 1024, - adjacency: sparse.spmatrix | False = False, + adjacency: sparse.spmatrix | None | False = None, # should be None (default) max_step: int = 1, # TODO may need to provide `max_step_time` and `max_step_freq` exclude: list | None = None, # TODO needs rethink because user passes MNE objects step_down_p: float = 0.0, @@ -1817,7 +1817,7 @@ def cluster_test( Dataframe containing the data, dependent and independent variables. formula : str Wilkinson notation formula for design matrix. The names of the dependent - and independent variable should match the columns in the dataframe. + and independent variable should match the columns in ``df``. within_id : None | str Name of column in ``df`` to use in identifying within-group contrasts. If ``None``, will perform a between-group test. 
Ignored if the number of groups @@ -1877,7 +1877,7 @@ def cluster_test( df.sort_values([iv_name, within_id], inplace=True) counts = df[within_id].value_counts() if any(counts != 2): - raise ValueError("for paired tttest, each subject must have 2 observations") + raise ValueError("for paired t-test, each subject must have 2 observations") # extract the data from the dataframe def _extract_data_array(series): @@ -1914,7 +1914,7 @@ def _extract_data_tfr(series): elif within_id in df: kind = "within" X = X[0] - X[1] - else: # what would be another else cas + else: # 2 elements in X but no within_id provided → unpaired test kind = "between" # define stat function and threshold @@ -1978,7 +1978,7 @@ def __init__( self.H0 = H0 self.stat_fun = stat_fun - # unpaired t-test is f_oneway + # unpaired t-test equivalent to f_oneway w/ 2 groups if stat_fun is f_oneway: self.stat_name = "F-statistic" elif stat_fun is ttest_1samp_no_p: @@ -1986,7 +1986,15 @@ def __init__( else: self.stat_name = "test statistic" - def plot_cluster_time_sensor(self, condition_labels: dict): + def plot_cluster_time_sensor( + self, + condition_labels: dict, + colors: list | dict | None = None, + linestyles: list | dict | None = None, + cmap_evokeds: None | str | tuple = None, + cmap_topo: None | str | tuple = None, + ci: float | bool | callable() | None = None, + ): """ Plot the cluster with the lowest p-value. @@ -1998,21 +2006,23 @@ def plot_cluster_time_sensor(self, condition_labels: dict): ---------- condition_labels : dict Dictionary with condition labels as keys and evoked objects as values. + colors : list|dict|None + Colors to use when plotting the ERP lines and confidence bands. + linestyles : list|dict|None + Styles to use when plotting the ERP lines. + cmap_evokeds : None|str|tuple + Colormap from which to draw color values when plotting the ERP lines. + cmap_topo: matplotlib colormap + Colormap to use for the topomap. + ci : float|bool|callable()|None + Confidence band around each ERP time series. """ - # define colorblind friendly colors - colorblind_palette = ["#4daf4a", "#984ea3"] - # extract condition labels from the dictionary cond_keys = list(condition_labels.keys()) # extract the evokeds from the dictionary cond_values = list(condition_labels.values()) - # configure variables for visualization - colors = { - cond_keys[0]: colorblind_palette[0], - cond_keys[1]: colorblind_palette[1], - } - line_styles = {cond_keys[0]: "-", cond_keys[1]: "--"} + linestyles = {cond_keys[0]: "-", cond_keys[1]: "--"} lowest_p_cluster = np.argmin(self.cluster_p_values) @@ -2040,7 +2050,7 @@ def plot_cluster_time_sensor(self, condition_labels: dict): times=0, mask=mask, axes=ax_topo, - cmap="RdBu_r", + cmap=cmap_topo, show=False, colorbar=False, mask_params=dict(markersize=10), @@ -2049,13 +2059,11 @@ def plot_cluster_time_sensor(self, condition_labels: dict): image = ax_topo.images[0] # remove the title that would otherwise say "0.000 s" - ax_topo.set_title("") - - # soft import? 
- # make_axes_locatable = _soft_import( - # "mpl_toolkits.axes_grid1.make_axes_locatable", - # purpose="plot cluster results" - # ) # soft import (not a dependency for MNE) + ax_topo.set_title( + "Spatial cluster extent:\n averaged from {:0.3f} to {:0.3f} s".format( + *sig_times[[0, -1]] + ) + ) # create additional axes (for ERF and colorbar) divider = make_axes_locatable(ax_topo) @@ -2064,11 +2072,6 @@ def plot_cluster_time_sensor(self, condition_labels: dict): ax_colorbar = divider.append_axes("right", size="5%", pad=0.1) cbar = plt.colorbar(image, cax=ax_colorbar) cbar.set_label(self.stat_name) - ax_topo.set_xlabel( - "Spatial cluster extent:\n averaged from {:0.3f} to {:0.3f} s".format( - *sig_times[[0, -1]] - ) - ) # add new axis for time courses and plot time courses ax_signals = divider.append_axes("right", size="300%", pad=1.3) @@ -2081,11 +2084,13 @@ def plot_cluster_time_sensor(self, condition_labels: dict): picks=ch_inds, axes=ax_signals, colors=colors, - linestyles=line_styles, + linestyles=linestyles, + cmap=cmap_evokeds, show=False, split_legend=True, truncate_yaxis="auto", truncate_xaxis=False, + ci=ci, ) plt.legend(frameon=False, loc="upper left") From 3aa32b699932d188768dfc60f33b4e6d0df2645d Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Mon, 5 Aug 2024 13:22:41 +0200 Subject: [PATCH 34/88] implemented Dan's comments --- tutorials/stats-sensor-space/76_new_cluster_test_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index b7f933d127b..fb928f89d0a 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -152,7 +152,7 @@ # finally let's plot the results using the ClusterResults class # we plot the cluster with the lowest p-value -cluster_result.plot_cluster_time_sensor(condition_labels=conditions_dict) +cluster_result.plot_cluster_time_sensor(condition_labels=conditions_dict, ci=True) # we can see that there is something going on around 400 ms # with a stronger signal for target trials in right central-parietal channels From a76afd31ecdc8a55aec3003ab12eff433dc0616d Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Mon, 5 Aug 2024 16:11:23 +0200 Subject: [PATCH 35/88] test for handling different MNE objects - test is failing --- mne/stats/tests/test_cluster_level.py | 101 +++++++++++++++++++++++--- 1 file changed, 90 insertions(+), 11 deletions(-) diff --git a/mne/stats/tests/test_cluster_level.py b/mne/stats/tests/test_cluster_level.py index 24f3ee687ca..1c126494250 100644 --- a/mne/stats/tests/test_cluster_level.py +++ b/mne/stats/tests/test_cluster_level.py @@ -906,21 +906,100 @@ def test_compare_old_and_new_cluster_api(): @pytest.mark.parametrize( "Inst", (EpochsArray, EvokedArray, EpochsTFRArray, AverageTFRArray) ) +@pytest.mark.filterwarnings('ignore:Ignoring argument "tail":RuntimeWarning') def test_new_cluster_api(Inst): """Test handling different MNE objects in the cluster API.""" pd = pytest.importorskip("pandas") - n_epo, n_chan, n_freq, n_times = 2, 3, 5, 7 - shape = (n_chan, n_times) - if Inst in (EpochsArray, EpochsTFRArray): - shape = (n_epo,) + shape - if Inst in (EpochsTFRArray, AverageTFRArray): - shape = shape[:-1] + (n_freq, shape[-1]) + n_epo, n_chan, n_freq, n_times = 2, 3, 4, 5 + info = create_info(ch_names=n_chan, sfreq=1000, ch_types="eeg") + # Introduce a significant difference in a specific region, time, and frequency + region_start = 1 + 
region_end = 2 + time_start = 2 + time_end = 4 + freq_start = 2 + freq_end = 4 + + if Inst == EpochsArray: + # Create random data for EpochsArray + inst1 = Inst(np.random.randn(n_epo, n_chan, n_times), info=info) + # Adding a constant to create a difference + data_copy = inst1.get_data().copy() # no data attribute for EpochsArray + data_copy[:, region_start:region_end, time_start:time_end] += ( + 2 # Modify the copy + ) + inst2 = Inst( + data=data_copy, info=info + ) # Use the modified copy as a new instance + + elif Inst == EvokedArray: + # Create random data for EvokedArray + inst1 = Inst(np.random.randn(n_chan, n_times), info=info) + data_copy = inst1.data.copy() + data_copy[region_start:region_end, time_start:time_end] += 2 + inst2 = Inst(data=data_copy, info=info) + + elif Inst == EpochsTFRArray: + # Create random data for EpochsTFRArray + data_tfr1 = np.random.randn(n_epo, n_chan, n_freq, n_times) + data_tfr2 = np.random.randn(n_epo, n_chan, n_freq, n_times) + inst1 = Inst( + data=data_tfr1, info=info, times=np.arange(n_times), freqs=np.arange(n_freq) + ) + inst2 = Inst( + data=data_tfr2, info=info, times=np.arange(n_times), freqs=np.arange(n_freq) + ) + data_tfr2 = inst2.data.copy() + data_tfr2[ + :, region_start:region_end, freq_start:freq_end, time_start:time_end + ] += 2 + inst2 = Inst( + data=data_tfr2, info=info, times=np.arange(n_times), freqs=np.arange(n_freq) + ) - info = create_info(...) - inst1 = Inst(np.random.normal(shape, ...), info=info) - inst2 = Inst(np.random.normal(shape, ...), info=info) + elif Inst == AverageTFRArray: + # Create random data for AverageTFRArray + data_tfr1 = np.random.randn(n_chan, n_freq, n_times) + data_tfr2 = np.random.randn(n_chan, n_freq, n_times) + inst1 = Inst( + data=data_tfr1, info=info, times=np.arange(n_times), freqs=np.arange(n_freq) + ) + inst2 = Inst( + data=data_tfr2, info=info, times=np.arange(n_times), freqs=np.arange(n_freq) + ) + data_tfr2 = inst2.data.copy() + data_tfr2[ + region_start:region_end, freq_start:freq_end, time_start:time_end + ] += 2 + inst2 = Inst( + data=data_tfr2, info=info, times=np.arange(n_times), freqs=np.arange(n_freq) + ) + # test old and new API with sample data df = pd.DataFrame(dict(data=[inst1, inst2], condition=["a", "b"])) - result = cluster_test(df, "data~condition", ...) 
- assert result # TODO do something more interesting here + kwargs = dict(n_permutations=100, seed=1, tail=1, buffer_size=None, out_type="mask") + + result_new_api = cluster_test(df, "data~condition", **kwargs) + + # make sure channels are last dimension for old API + if Inst == EpochsArray: + inst1 = inst1.get_data().transpose(0, 2, 1) + inst2 = inst2.get_data().transpose(0, 2, 1) + elif Inst == EpochsTFRArray: + inst1 = inst1.data.transpose(0, 3, 2, 1) + inst2 = inst2.data.transpose(0, 3, 2, 1) + elif Inst == AverageTFRArray: + inst1 = inst1.data.transpose(2, 1, 0) + inst2 = inst2.data.transpose(2, 1, 0) + else: + inst1 = inst1.data.transpose(1, 0) + inst2 = inst2.data.transpose(1, 0) + + F_obs, clusters, cluster_pvals, H0 = permutation_cluster_test( + [inst1, inst2], **kwargs + ) + assert_array_equal(result_new_api.H0, H0) + assert_array_equal(result_new_api.stat_obs, F_obs) + assert_array_equal(result_new_api.cluster_p_values, cluster_pvals) + assert result_new_api.clusters == clusters From b5fce8b7f23d96e79e64cfcac732b7a32dd7aa0a Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Tue, 6 Aug 2024 16:54:31 +0200 Subject: [PATCH 36/88] adjusted test to account for multiple subjects --- mne/stats/tests/test_cluster_level.py | 40 ++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/mne/stats/tests/test_cluster_level.py b/mne/stats/tests/test_cluster_level.py index 1c126494250..e3a701d3691 100644 --- a/mne/stats/tests/test_cluster_level.py +++ b/mne/stats/tests/test_cluster_level.py @@ -911,7 +911,7 @@ def test_new_cluster_api(Inst): """Test handling different MNE objects in the cluster API.""" pd = pytest.importorskip("pandas") - n_epo, n_chan, n_freq, n_times = 2, 3, 4, 5 + n_subs, n_epo, n_chan, n_freq, n_times = 2, 2, 3, 4, 5 info = create_info(ch_names=n_chan, sfreq=1000, ch_types="eeg") # Introduce a significant difference in a specific region, time, and frequency region_start = 1 @@ -976,9 +976,25 @@ def test_new_cluster_api(Inst): data=data_tfr2, info=info, times=np.arange(n_times), freqs=np.arange(n_freq) ) - # test old and new API with sample data - df = pd.DataFrame(dict(data=[inst1, inst2], condition=["a", "b"])) - kwargs = dict(n_permutations=100, seed=1, tail=1, buffer_size=None, out_type="mask") + if Inst == EvokedArray or Inst == AverageTFRArray: + # Generate random noise + noise = np.random.normal(loc=0, scale=0.1, size=inst1.data.shape) + # add noise to the data of the second subject + inst1_n = inst1.copy() + inst1_n.data = inst1.data + noise + inst2_n = inst2.copy() + inst2_n.data = inst2.data + noise + data = [inst1, inst2, inst1_n, inst2_n] + conds = ["a", "b"] * n_subs + else: + data = [inst1, inst2] + conds = ["a", "b"] + + df = pd.DataFrame(dict(data=data, condition=conds)) + + kwargs = dict( + n_permutations=100, seed=42, tail=1, buffer_size=None, out_type="mask" + ) result_new_api = cluster_test(df, "data~condition", **kwargs) @@ -992,14 +1008,24 @@ def test_new_cluster_api(Inst): elif Inst == AverageTFRArray: inst1 = inst1.data.transpose(2, 1, 0) inst2 = inst2.data.transpose(2, 1, 0) + inst1_n = inst1_n.data.transpose(2, 1, 0) + inst2_n = inst2_n.data.transpose(2, 1, 0) + # combine the data of the two subjects + inst1 = np.concatenate([inst1[np.newaxis, :], inst1_n[np.newaxis, :]], axis=0) + inst2 = np.concatenate([inst2[np.newaxis, :], inst2_n[np.newaxis, :]], axis=0) else: inst1 = inst1.data.transpose(1, 0) inst2 = inst2.data.transpose(1, 0) + inst1_n = inst1_n.data.transpose(1, 0) + inst2_n = inst2_n.data.transpose(1, 0) + # 
combine the data of the two subjects + inst1 = np.concatenate([inst1[np.newaxis, :], inst1_n[np.newaxis, :]], axis=0) + inst2 = np.concatenate([inst2[np.newaxis, :], inst2_n[np.newaxis, :]], axis=0) F_obs, clusters, cluster_pvals, H0 = permutation_cluster_test( [inst1, inst2], **kwargs ) - assert_array_equal(result_new_api.H0, H0) - assert_array_equal(result_new_api.stat_obs, F_obs) - assert_array_equal(result_new_api.cluster_p_values, cluster_pvals) + assert_array_almost_equal(result_new_api.H0, H0) + assert_array_almost_equal(result_new_api.stat_obs, F_obs) + assert_array_almost_equal(result_new_api.cluster_p_values, cluster_pvals) assert result_new_api.clusters == clusters From 3ce510c1e5b53c7cdd123b468cf3ef4d6e55e428 Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Sat, 10 Aug 2024 17:39:40 -0500 Subject: [PATCH 37/88] refactor df validation to return bools --- mne/stats/cluster_level.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index 804d035ff51..821d12cfd8f 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -42,6 +42,7 @@ verbose, warn, ) +from ..utils.mixin import GetEpochsMixin from ..viz import plot_compare_evokeds from .parametric import f_oneway, ttest_1samp_no_p @@ -1784,7 +1785,11 @@ def _validate_cluster_df(df: pd.DataFrame, dv_name: str, iv_name: str): f"{prologue} consistent shape, but {len(all_shapes)} different " f"shapes were found: {'; '.join(all_shapes)}." ) - return all_types.pop() # return the type of the data column entries + obj_type = all_types.pop() + is_epo = GetEpochsMixin in obj_type.__mro__ + is_tfr = BaseTFR in obj_type.__mro__ + is_arr = np.ndarray in obj_type.__mro__ + return is_epo, is_tfr, is_arr @verbose @@ -1868,7 +1873,7 @@ def cluster_test( iv_name = str(np.array(formula.rhs.root).item()) # validate the input dataframe and return the type of the data column entries - _dtype = _validate_cluster_df(df, dv_name, iv_name) + is_epo, is_tfr, is_arr = _validate_cluster_df(df, dv_name, iv_name) # for within_subject designs, check if each subject has 2 observations _validate_type(within_id, (str, None), "within_id") @@ -1880,23 +1885,18 @@ def cluster_test( raise ValueError("for paired t-test, each subject must have 2 observations") # extract the data from the dataframe - def _extract_data_array(series): - return np.concatenate(series.values) + outer_func = np.concatenate if is_epo or is_arr else np.array + axes = (-3, -1) if is_tfr else (-2, -1) - def _extract_data_mne(series): # 2D data - return np.array( - series.map(lambda inst: inst.get_data().swapaxes(-2, -1)).to_list() + def func_mne(series): + return outer_func( + series.map(lambda inst: inst.get_data().swapaxes(*axes)).to_list() ) - def _extract_data_tfr(series): - return series.map(lambda inst: inst.get_data().swapaxes(-3, -1)).to_list() + def func_array(series): + return outer_func(series.values) - if _dtype is np.ndarray: - func = _extract_data_array - elif _dtype is BaseTFR: - func = _extract_data_tfr - else: - func = _extract_data_mne + func = func_array if is_arr else func_mne # convert to a list-like X for clustering X = df.groupby(iv_name).agg({dv_name: func})[dv_name].to_list() From feb1911773fa9aeec7528f1e31be44599a6f4c89 Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Sat, 10 Aug 2024 17:40:14 -0500 Subject: [PATCH 38/88] unrelated typing fix --- mne/stats/cluster_level.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index 821d12cfd8f..2ab8917226d 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1993,7 +1993,7 @@ def plot_cluster_time_sensor( linestyles: list | dict | None = None, cmap_evokeds: None | str | tuple = None, cmap_topo: None | str | tuple = None, - ci: float | bool | callable() | None = None, + ci: float | bool | callable | None = None, ): """ Plot the cluster with the lowest p-value. From 6f9781197e29c410d9a2cef41f1e7228da9ab740 Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Sat, 10 Aug 2024 17:41:19 -0500 Subject: [PATCH 39/88] rework test --- mne/stats/tests/test_cluster_level.py | 164 ++++++++++---------------- 1 file changed, 60 insertions(+), 104 deletions(-) diff --git a/mne/stats/tests/test_cluster_level.py b/mne/stats/tests/test_cluster_level.py index e3a701d3691..654f0c552f2 100644 --- a/mne/stats/tests/test_cluster_level.py +++ b/mne/stats/tests/test_cluster_level.py @@ -911,121 +911,77 @@ def test_new_cluster_api(Inst): """Test handling different MNE objects in the cluster API.""" pd = pytest.importorskip("pandas") - n_subs, n_epo, n_chan, n_freq, n_times = 2, 2, 3, 4, 5 + rng = np.random.default_rng(seed=8675309) + is_epo = Inst in (EpochsTFRArray, EpochsArray) + is_tfr = Inst in (EpochsTFRArray, AverageTFRArray) + + n_epo, n_chan, n_freq, n_times = 6, 3, 4, 5 + + # prepare the dimensions of the simulated data, then simulate + size = (n_chan,) + if is_epo: + size = (n_epo, *size) + if is_tfr: + size = (*size, n_freq) + size = (*size, n_times) + data = rng.normal(size=size) + + # construct the instance info = create_info(ch_names=n_chan, sfreq=1000, ch_types="eeg") - # Introduce a significant difference in a specific region, time, and frequency - region_start = 1 - region_end = 2 - time_start = 2 - time_end = 4 - freq_start = 2 - freq_end = 4 - - if Inst == EpochsArray: - # Create random data for EpochsArray - inst1 = Inst(np.random.randn(n_epo, n_chan, n_times), info=info) - # Adding a constant to create a difference - data_copy = inst1.get_data().copy() # no data attribute for EpochsArray - data_copy[:, region_start:region_end, time_start:time_end] += ( - 2 # Modify the copy - ) - inst2 = Inst( - data=data_copy, info=info - ) # Use the modified copy as a new instance - - elif Inst == EvokedArray: - # Create random data for EvokedArray - inst1 = Inst(np.random.randn(n_chan, n_times), info=info) - data_copy = inst1.data.copy() - data_copy[region_start:region_end, time_start:time_end] += 2 - inst2 = Inst(data=data_copy, info=info) - - elif Inst == EpochsTFRArray: - # Create random data for EpochsTFRArray - data_tfr1 = np.random.randn(n_epo, n_chan, n_freq, n_times) - data_tfr2 = np.random.randn(n_epo, n_chan, n_freq, n_times) - inst1 = Inst( - data=data_tfr1, info=info, times=np.arange(n_times), freqs=np.arange(n_freq) - ) - inst2 = Inst( - data=data_tfr2, info=info, times=np.arange(n_times), freqs=np.arange(n_freq) - ) - data_tfr2 = inst2.data.copy() - data_tfr2[ - :, region_start:region_end, freq_start:freq_end, time_start:time_end - ] += 2 - inst2 = Inst( - data=data_tfr2, info=info, times=np.arange(n_times), freqs=np.arange(n_freq) - ) - - elif Inst == AverageTFRArray: - # Create random data for AverageTFRArray - data_tfr1 = np.random.randn(n_chan, n_freq, n_times) - data_tfr2 = np.random.randn(n_chan, n_freq, n_times) - inst1 = Inst( - data=data_tfr1, info=info, times=np.arange(n_times), freqs=np.arange(n_freq) - ) - inst2 = Inst( - data=data_tfr2, info=info, times=np.arange(n_times), 
freqs=np.arange(n_freq) - ) - data_tfr2 = inst2.data.copy() - data_tfr2[ - region_start:region_end, freq_start:freq_end, time_start:time_end - ] += 2 - inst2 = Inst( - data=data_tfr2, info=info, times=np.arange(n_times), freqs=np.arange(n_freq) - ) - - if Inst == EvokedArray or Inst == AverageTFRArray: - # Generate random noise - noise = np.random.normal(loc=0, scale=0.1, size=inst1.data.shape) - # add noise to the data of the second subject - inst1_n = inst1.copy() - inst1_n.data = inst1.data + noise - inst2_n = inst2.copy() - inst2_n.data = inst2.data + noise - data = [inst1, inst2, inst1_n, inst2_n] - conds = ["a", "b"] * n_subs + kw = dict(times=np.arange(n_times), freqs=np.arange(n_freq)) if is_tfr else dict() + cond_a = Inst(data=data, info=info, **kw) + cond_b = cond_a.copy() + # introduce a significant difference in a specific region, time, and frequency + ch_start, ch_end = 0, 2 # 2 channels + t_start, t_end = 2, 4 # 2 times + f_start, f_end = 2, 4 # 2 freqs + if is_tfr: + cond_b._data[..., ch_start:ch_end, f_start:f_end, t_start:t_end] += 2 + else: + cond_b._data[..., ch_start:ch_end, t_start:t_end] += 2 + # for Evokeds/AverageTFRs, we create fake "subjects" as our observations within each + # condition. We add a bit of noise while we do so. + if not is_epo: + insts = list() + for cond in cond_a, cond_b: + for _n in range(n_epo): + if not _n: + insts.append(cond) + continue + _cond = cond.copy() + _cond.data += rng.normal(scale=0.1, size=_cond.data.shape) + insts.append(_cond) + conds = np.repeat(["a", "b"], n_epo).tolist() else: - data = [inst1, inst2] + # For Epochs(TFR)Array, each epoch is an observation and they're already + # noisy/non-identical, so no duplication / noise-addition necessary. + insts = [cond_a, cond_b] conds = ["a", "b"] - df = pd.DataFrame(dict(data=data, condition=conds)) - + # run new clustering API + df = pd.DataFrame(dict(data=insts, condition=conds)) kwargs = dict( n_permutations=100, seed=42, tail=1, buffer_size=None, out_type="mask" ) - result_new_api = cluster_test(df, "data~condition", **kwargs) # make sure channels are last dimension for old API - if Inst == EpochsArray: - inst1 = inst1.get_data().transpose(0, 2, 1) - inst2 = inst2.get_data().transpose(0, 2, 1) - elif Inst == EpochsTFRArray: - inst1 = inst1.data.transpose(0, 3, 2, 1) - inst2 = inst2.data.transpose(0, 3, 2, 1) - elif Inst == AverageTFRArray: - inst1 = inst1.data.transpose(2, 1, 0) - inst2 = inst2.data.transpose(2, 1, 0) - inst1_n = inst1_n.data.transpose(2, 1, 0) - inst2_n = inst2_n.data.transpose(2, 1, 0) - # combine the data of the two subjects - inst1 = np.concatenate([inst1[np.newaxis, :], inst1_n[np.newaxis, :]], axis=0) - inst2 = np.concatenate([inst2[np.newaxis, :], inst2_n[np.newaxis, :]], axis=0) + if is_epo: + axes = (0, 3, 2, 1) if is_tfr else (0, 2, 1) + X = [cond_a.get_data().transpose(*axes), cond_b.get_data().transpose(*axes)] else: - inst1 = inst1.data.transpose(1, 0) - inst2 = inst2.data.transpose(1, 0) - inst1_n = inst1_n.data.transpose(1, 0) - inst2_n = inst2_n.data.transpose(1, 0) - # combine the data of the two subjects - inst1 = np.concatenate([inst1[np.newaxis, :], inst1_n[np.newaxis, :]], axis=0) - inst2 = np.concatenate([inst2[np.newaxis, :], inst2_n[np.newaxis, :]], axis=0) - - F_obs, clusters, cluster_pvals, H0 = permutation_cluster_test( - [inst1, inst2], **kwargs - ) + axes = (2, 1, 0) if is_tfr else (1, 0) + Xa = list() + Xb = list() + for inst, cond in zip(insts, conds): + container = Xa if cond == "a" else Xb + 
container.append(inst.get_data().transpose(*axes)) + X = [np.stack(Xa), np.stack(Xb)] + + F_obs, clusters, cluster_pvals, H0 = permutation_cluster_test(X, **kwargs) assert_array_almost_equal(result_new_api.H0, H0) assert_array_almost_equal(result_new_api.stat_obs, F_obs) assert_array_almost_equal(result_new_api.cluster_p_values, cluster_pvals) - assert result_new_api.clusters == clusters + assert len(result_new_api.clusters) == len(clusters) + for clu1, clu2 in zip(result_new_api.clusters, clusters): + assert_array_equal(clu1, clu2) From b09d20a6759882268c32a5f4f0069620f8bb0a3a Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Mon, 12 Aug 2024 09:08:27 -0500 Subject: [PATCH 40/88] minor cleanup --- mne/stats/cluster_level.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index 2ab8917226d..79efde4be4f 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1885,18 +1885,18 @@ def cluster_test( raise ValueError("for paired t-test, each subject must have 2 observations") # extract the data from the dataframe - outer_func = np.concatenate if is_epo or is_arr else np.array + outer_func = np.concatenate if is_epo else np.array axes = (-3, -1) if is_tfr else (-2, -1) + def func_arr(series): + return np.concatenate(series.values) + def func_mne(series): return outer_func( series.map(lambda inst: inst.get_data().swapaxes(*axes)).to_list() ) - def func_array(series): - return outer_func(series.values) - - func = func_array if is_arr else func_mne + func = func_arr if is_arr else func_mne # convert to a list-like X for clustering X = df.groupby(iv_name).agg({dv_name: func})[dv_name].to_list() From 977e153d6b0948b52c9a0ae25eca9ad20c3e71e9 Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Mon, 12 Aug 2024 09:16:24 -0500 Subject: [PATCH 41/88] fix imports --- mne/stats/cluster_level.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index 79efde4be4f..141f7c299d4 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -24,13 +24,15 @@ from scipy.stats import f as fstat from scipy.stats import t as tstat -from .. 
import BaseEpochs, Evoked, EvokedArray
+from ..epochs import BaseEpochs, EvokedArray
+from ..evoked import Evoked
 from ..fixes import has_numba, jit
 from ..parallel import parallel_func
 from ..source_estimate import MixedSourceEstimate, SourceEstimate, VolSourceEstimate
 from ..source_space import SourceSpaces
 from ..time_frequency import BaseTFR
 from ..utils import (
+    GetEpochsMixin,
     ProgressBar,
     _check_option,
     _pl,
@@ -42,7 +44,6 @@
     verbose,
     warn,
 )
-from ..utils.mixin import GetEpochsMixin
 from ..viz import plot_compare_evokeds
 from .parametric import f_oneway, ttest_1samp_no_p

From a288d8579546adcf20750ff974bedb043d82ca0d Mon Sep 17 00:00:00 2001
From: Daniel McCloy
Date: Mon, 12 Aug 2024 09:16:35 -0500
Subject: [PATCH 42/88] use MRO in test too

---
 mne/stats/tests/test_cluster_level.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mne/stats/tests/test_cluster_level.py b/mne/stats/tests/test_cluster_level.py
index 654f0c552f2..b4d676abe91 100644
--- a/mne/stats/tests/test_cluster_level.py
+++ b/mne/stats/tests/test_cluster_level.py
@@ -39,8 +39,8 @@
     summarize_clusters_stc,
     ttest_1samp_no_p,
 )
-from mne.time_frequency import AverageTFRArray, EpochsTFRArray
-from mne.utils import _record_warnings, catch_logging
+from mne.time_frequency import AverageTFRArray, BaseTFR, EpochsTFRArray
+from mne.utils import GetEpochsMixin, _record_warnings, catch_logging

 n_space = 50

@@ -912,8 +912,8 @@ def test_new_cluster_api(Inst):
     pd = pytest.importorskip("pandas")

     rng = np.random.default_rng(seed=8675309)
-    is_epo = Inst in (EpochsTFRArray, EpochsArray)
-    is_tfr = Inst in (EpochsTFRArray, AverageTFRArray)
+    is_epo = GetEpochsMixin in Inst.__mro__
+    is_tfr = BaseTFR in Inst.__mro__

     n_epo, n_chan, n_freq, n_times = 6, 3, 4, 5
+ + Returns + ------- + None + + """ + # configure variables for visualization + colors = {"target": "crimson", "non-target": "steelblue"} + + # organize data for plotting + evokeds = {"target": target_only, "non-target": non_target_only} + + lowest_p_cluster = np.argmin(cluster_p_values) + + # plot the cluster with the lowest p-value + time_inds, space_inds = np.squeeze(clusters[lowest_p_cluster]) + ch_inds = np.unique(space_inds) + time_inds = np.unique(time_inds) + + # get topography for F stat + t_map = T_obs[time_inds, ...].mean(axis=0) + + # get signals at the sensors contributing to the cluster + sig_times = contrast[0].times[time_inds] + + # create spatial mask + mask = np.zeros((t_map.shape[0], 1), dtype=bool) + mask[ch_inds, :] = True + + # initialize figure + fig, ax_topo = plt.subplots(1, 1, figsize=(10, 3), layout="constrained") + + # plot average test statistic and mark significant sensors + t_evoked = mne.EvokedArray(t_map[:, np.newaxis], contrast[0].info, tmin=0) + t_evoked.plot_topomap( + times=0, + mask=mask, + axes=ax_topo, + cmap="Reds", + vlim=(np.min, np.max), + show=False, + colorbar=False, + mask_params=dict(markersize=10), + ) + image = ax_topo.images[0] + + # remove the title that would otherwise say "0.000 s" + ax_topo.set_title("") + + # create additional axes (for ERF and colorbar) + divider = make_axes_locatable(ax_topo) + + # add axes for colorbar + ax_colorbar = divider.append_axes("right", size="5%", pad=0.05) + plt.colorbar(image, cax=ax_colorbar) + ax_topo.set_xlabel( + "Averaged t-map ({:0.3f} - {:0.3f} s)".format(*sig_times[[0, -1]]) + ) + + # add new axis for time courses and plot time courses + ax_signals = divider.append_axes("right", size="300%", pad=1.2) + title = f"Cluster #1, {len(ch_inds)} sensor" + if len(ch_inds) > 1: + title += "s (mean)" + mne.viz.plot_compare_evokeds( + evokeds, + title=title, + picks=ch_inds, + axes=ax_signals, + colors=colors, + show=False, + split_legend=True, + truncate_yaxis="auto", + ) + + # plot temporal cluster extent + ymin, ymax = ax_signals.get_ylim() + ax_signals.fill_betweenx( + (ymin, ymax), sig_times[0], sig_times[-1], color="green", alpha=0.3 + ) + + plt.show() + + return None From e8770fd0d7f1854cddc59b8fdf5bab414202eff6 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Fri, 14 Jun 2024 19:02:45 +0200 Subject: [PATCH 44/88] tested dataframe function and results, cleaned up --- .../76_new_cluster_test_api.py | 187 +++++++++--------- 1 file changed, 95 insertions(+), 92 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 4e2b3af8f6d..3f001251ba5 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -5,7 +5,6 @@ from mpl_toolkits.axes_grid1 import make_axes_locatable import mne -# eventually we want to use the _permutation_cluster_test function # import and load dataset path_to_p3 = Path("C:/Users/Carina/mne_data/ERP_CORE_P3") @@ -85,7 +84,6 @@ def old_api_cluster(n_permutations: int = 10000, seed: int = 1234): X=data, stat_fun=None, threshold=None, tail=0, kind="within" ) - # adjacency = mne.channels.find_ch_adjacency(contrast[0].info, ch_type='eeg') # Run the analysis T_obs, clusters, cluster_p_values, H0 = ( mne.stats.cluster_level._permutation_cluster_test( @@ -115,25 +113,15 @@ def old_api_cluster(n_permutations: int = 10000, seed: int = 1234): return T_obs, clusters, cluster_p_values, H0 - -# fit cluster test with dataframe as input -# 
create condition list that repeats 5times 1 and then 5 times 0 -# 1 = target, 0 = non-target -# condition = 5 * [1] + 5 * [0] - -# 1 = target, 0 = non-target -# contrast, target_only, non_target_only = prep_sample_data() - -# evokeds_list = target_only + non_target_only - - -def create_random_evokeds_id_condition_list(evoked_data_a: list, evoked_data_b: list): +def create_random_evokeds_id_condition_list(): """ Create a list of shuffled participant IDs, conditions, and evoked data. # Keep the participant IDs and conditions paired but shuffle the order of the evoked data. """ import random + _ , evoked_data_a, evoked_data_b = prep_sample_data() + # Example participant IDs participant_ids = ["p1", "p2", "p3", "p4", "p5"] * 2 @@ -162,42 +150,42 @@ def create_random_evokeds_id_condition_list(evoked_data_a: list, evoked_data_b: return shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data -def create_random_paired_evokeds_list(evoked_data_a: list, evoked_data_b: list): +def create_random_paired_evokeds_list(): """ Create a list of shuffled evoked data where each pair of target and non-target evoked data is shuffled together. """ import random + _, evoked_data_a, evoked_data_b = prep_sample_data() - # Create a list of tuples where each tuple contains an evoked data and its corresponding label - evoked_pairs = [(evoked, 1) for evoked in evoked_data_a] + [ - (evoked, 0) for evoked in evoked_data_b - ] + # Ensure evoked_data_a and evoked_data_b are of the same length + assert len(evoked_data_a) == len(evoked_data_b), "evoked_data_a and evoked_data_b must have the same length" + + # Create a list of participant indices + participant_indices = list(range(len(evoked_data_a))) - # Shuffle the list of tuples - random.shuffle(evoked_pairs) + # Shuffle the list of participant indices + random.shuffle(participant_indices) - # Separate the shuffled list back into evoked data and labels - shuffled_evoked_data, shuffled_labels = zip(*evoked_pairs) + # Reorder evoked data according to the shuffled participant indices + shuffled_evoked_data_a = [evoked_data_a[i] for i in participant_indices] + shuffled_evoked_data_b = [evoked_data_b[i] for i in participant_indices] - # Convert the tuples back to lists - shuffled_evoked_data = list(shuffled_evoked_data) + # Combine the shuffled evoked data into a single list + shuffled_evoked_data = shuffled_evoked_data_a + shuffled_evoked_data_b + + # Combine the original evoked data into a single list + original_evoked_data = evoked_data_a + evoked_data_b - return shuffled_evoked_data + return original_evoked_data, shuffled_evoked_data # shuffle order of pairs -shuffled_evokeds_list = create_random_paired_evokeds_list(target_only, non_target_only) +original_evoked_data, shuffled_evoked_data = create_random_paired_evokeds_list() # shouldn't change the results (p-value is different though?) -shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data = ( - create_random_evokeds_id_condition_list( - evoked_data_a=target_only, evoked_data_b=non_target_only - ) -) - +shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data = create_random_evokeds_id_condition_list() def prepare_dataframe_for_cluster_function( - contrast: bool = False, evokeds: list = None, condition: list = None, subject_index: list = None, @@ -216,29 +204,39 @@ def prepare_dataframe_for_cluster_function( subject_index : list, optional List of subject IDs. Default is None. + Returns + ------- + df : DataFrame + The prepared DataFrame for the cluster test function. 
""" - # create an empty dataframe - df = pd.DataFrame() - - if contrast == True: - # check if evoked list is dividable by 2 - if len(evokeds) % 2 != 0: - raise ValueError("evokeds list needs to be dividable by 2") - if condition is not None: - # Convert lists to DataFrame for easier manipulation - df = pd.DataFrame( - { - "evoked": evokeds, - "condition": condition, - "subject_index": subject_index, - } - ) - - return df + # Initialize the DataFrame with evoked data + df = pd.DataFrame({ + "evoked": evokeds, + "condition": condition if condition is not None else np.nan, + "subject_index": subject_index if subject_index is not None else np.nan + }) + + return df +# run with original data +df = prepare_dataframe_for_cluster_function(evokeds=original_evoked_data, + condition=None, + subject_index=None) + +df = prepare_dataframe_for_cluster_function(evokeds=shuffled_evoked_data, + condition=None, + subject_index=None) + +df = prepare_dataframe_for_cluster_function(evokeds=shuffled_evoked_data, + condition=shuffled_conditions, + subject_index=shuffled_participant_ids) + + +cluster_test(df) def cluster_test( df: pd.DataFrame, + contrast: bool = True, n_permutations: int = 10000, seed: int = 1234, contrast_weights: list = [1, -1], @@ -267,43 +265,47 @@ def cluster_test( H0 : array The permuted test statistics. """ - if df.condition is not None: - # Extract unique conditions - unique_conditions = np.unique(df.condition) - if len(unique_conditions) != 2: - raise ValueError("Condition list needs to contain 2 unique values") - if df.subject_index is not None: + # Check if conditions and subject_index are present and valid + conditions_present = pd.notna(df['condition']).all() + subject_index_present = pd.notna(df['subject_index']).all() + + if contrast == 1: + if conditions_present: + # Extract unique conditions + unique_conditions = np.unique(df.condition) + if len(unique_conditions) != 2: + raise ValueError("Condition list needs to contain 2 unique values") # Initialize a list to hold the combined evoked data evokeds_data = [] - - # Process each subject's evoked data - for sub_id in df.subject_index.unique(): - sub_df = df[df.subject_index == sub_id] - - # Split evokeds list based on condition list for this subject - evokeds_a = sub_df[sub_df.condition == unique_conditions[0]][ - "evoked" - ].tolist() - evokeds_b = sub_df[sub_df.condition == unique_conditions[1]][ - "evoked" - ].tolist() - - if len(evokeds_a) != 1 or len(evokeds_b) != 1: - raise ValueError( - f"Subject {sub_id}: Each subject must have exactly one evoked for each condition" + if subject_index_present: + # Process each subject's evoked data + for sub_id in df.subject_index.unique(): + sub_df = df[df.subject_index == sub_id] + + # Split evokeds list based on condition list for this subject + evokeds_a = sub_df[sub_df.condition == unique_conditions[0]][ + "evoked" + ].tolist() + evokeds_b = sub_df[sub_df.condition == unique_conditions[1]][ + "evoked" + ].tolist() + + if len(evokeds_a) != 1 or len(evokeds_b) != 1: + raise ValueError( + f"Subject {sub_id}: Each subject must have exactly one evoked for each condition" + ) + + # Calculate contrast based on condition list + diff_evoked = mne.combine_evoked( + [evokeds_a[0], evokeds_b[0]], weights=contrast_weights ) - - # Calculate contrast based on condition list - diff_evoked = mne.combine_evoked( - [evokeds_a[0], evokeds_b[0]], weights=contrast_weights - ) - evokeds_data.append(diff_evoked) + evokeds_data.append(diff_evoked) else: # calculate length of evokeds list - n_evokeds = 
len(df.evokeds) + n_evokeds = len(df.evoked) # now split evokeds list in two lists - evokeds_a = df.evokeds[: n_evokeds // 2] - evokeds_b = df.evokeds[n_evokeds // 2 :] + evokeds_a = df.evoked[: n_evokeds // 2] + evokeds_b = df.evoked[n_evokeds // 2 :] # create contrast from evokeds_a and evokeds_b diff_evoked = [ mne.combine_evoked([evo_a, evo_b], weights=contrast_weights) @@ -311,7 +313,7 @@ def cluster_test( ] evokeds_data = diff_evoked else: - evokeds_data = df.evokeds + evokeds_data = df.evoked.tolist() # extract number of channels n_channels = evokeds_data[0].info["nchan"] @@ -330,19 +332,20 @@ def cluster_test( X=data, stat_fun=None, threshold=None, tail=0, kind="within" ) + # Run the analysis T_obs, clusters, cluster_p_values, H0 = ( mne.stats.cluster_level._permutation_cluster_test( [data], threshold=threshold, stat_fun=stat_fun, - n_jobs=-1, - max_step=1, - exclude=None, - step_down_p=0.05, - t_power=1, + n_jobs=-1, # takes all CPU cores + max_step=1, # maximum distance between samples (time points) + exclude=None, # exclude no time points or channels + step_down_p=0, # step down in jumps test + t_power=1, # weigh each location by its stats score out_type="indices", - check_disjoint=True, - buffer_size=None, + check_disjoint=False, + buffer_size=None, # block size for chunking the data n_permutations=n_permutations, tail=0, adjacency=adjacency, From a081d7d4ff53520abdc910fc75e3fa108d794a15 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 14 Jun 2024 12:24:30 +0000 Subject: [PATCH 45/88] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tutorials/stats-sensor-space/76_new_cluster_test_api.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 3f001251ba5..5d943985aa2 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -1,14 +1,17 @@ from pathlib import Path + import matplotlib.pyplot as plt import numpy as np import pandas as pd from mpl_toolkits.axes_grid1 import make_axes_locatable + import mne # import and load dataset path_to_p3 = Path("C:/Users/Carina/mne_data/ERP_CORE_P3") + def prep_sample_data(plot_evokeds: bool = False): """ Load the P3 dataset and extract the target, non-target and contrast evokeds. 
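The dataframe layout that prepare_dataframe_for_cluster_function and the reworked cluster_test above expect — one row per evoked, with paired condition and subject_index labels — can be exercised end-to-end on synthetic data. The following sketch is illustrative only and not part of the patch series; the channel names, sampling rate, and random seed are arbitrary assumptions:

import numpy as np
import pandas as pd

import mne

# Toy evokeds: 3 subjects x 2 conditions of random EEG-like data (illustrative).
info = mne.create_info(ch_names=["EEG 001", "EEG 002"], sfreq=100.0, ch_types="eeg")
rng = np.random.default_rng(42)
evokeds = [mne.EvokedArray(rng.normal(size=(2, 50)) * 1e-6, info) for _ in range(6)]

df = pd.DataFrame(
    {
        "evoked": evokeds,
        "condition": [1, 1, 1, 0, 0, 0],  # 1 = target, 0 = non-target
        "subject_index": ["p1", "p2", "p3", "p1", "p2", "p3"],
    }
)

# Per-subject paired contrast (condition 1 minus condition 0), mirroring the
# combine_evoked call in the diff above.
contrasts = []
for sub in df["subject_index"].unique():
    sub_df = df[df["subject_index"] == sub]
    evo_a = sub_df.loc[sub_df["condition"] == 1, "evoked"].iloc[0]
    evo_b = sub_df.loc[sub_df["condition"] == 0, "evoked"].iloc[0]
    contrasts.append(mne.combine_evoked([evo_a, evo_b], weights=[1, -1]))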
From d6d70c8b461523c87c472ad7d9c0b6e1e4403689 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Fri, 14 Jun 2024 19:04:48 +0200 Subject: [PATCH 46/88] added ToDos --- tutorials/stats-sensor-space/76_new_cluster_test_api.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 5d943985aa2..51ad611aa58 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -7,6 +7,8 @@ import mne +# TODO: implement formulaic design matrix for paired t-test +# TODO: @erik: add dataset to mne-data # import and load dataset path_to_p3 = Path("C:/Users/Carina/mne_data/ERP_CORE_P3") @@ -235,8 +237,6 @@ def prepare_dataframe_for_cluster_function( subject_index=shuffled_participant_ids) -cluster_test(df) - def cluster_test( df: pd.DataFrame, contrast: bool = True, @@ -471,3 +471,5 @@ def plot_cluster( plt.show() return None + +cluster_test(df) \ No newline at end of file From 834526146fd68d6bc243fcbd824543d055458938 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 14 Jun 2024 17:04:48 +0000 Subject: [PATCH 47/88] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../76_new_cluster_test_api.py | 55 ++++++++++++------- 1 file changed, 34 insertions(+), 21 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 51ad611aa58..08917f78f03 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -118,6 +118,7 @@ def old_api_cluster(n_permutations: int = 10000, seed: int = 1234): return T_obs, clusters, cluster_p_values, H0 + def create_random_evokeds_id_condition_list(): """ Create a list of shuffled participant IDs, conditions, and evoked data. @@ -125,7 +126,7 @@ def create_random_evokeds_id_condition_list(): """ import random - _ , evoked_data_a, evoked_data_b = prep_sample_data() + _, evoked_data_a, evoked_data_b = prep_sample_data() # Example participant IDs participant_ids = ["p1", "p2", "p3", "p4", "p5"] * 2 @@ -160,11 +161,14 @@ def create_random_paired_evokeds_list(): Create a list of shuffled evoked data where each pair of target and non-target evoked data is shuffled together. """ import random + _, evoked_data_a, evoked_data_b = prep_sample_data() # Ensure evoked_data_a and evoked_data_b are of the same length - assert len(evoked_data_a) == len(evoked_data_b), "evoked_data_a and evoked_data_b must have the same length" - + assert len(evoked_data_a) == len( + evoked_data_b + ), "evoked_data_a and evoked_data_b must have the same length" + # Create a list of participant indices participant_indices = list(range(len(evoked_data_a))) @@ -188,7 +192,10 @@ def create_random_paired_evokeds_list(): original_evoked_data, shuffled_evoked_data = create_random_paired_evokeds_list() # shouldn't change the results (p-value is different though?) 
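The open question in the comment above (reordering intact pairs should not change a paired test, yet the reported p-value moved) can be sanity-checked outside the cluster machinery: the parametric paired statistic is invariant to the order of the pairs, so any drift must come from the Monte Carlo permutations, which only reproduce exactly when the seed is fixed. A minimal check, assuming only NumPy and SciPy:

import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
a = rng.normal(size=10)
b = rng.normal(size=10)

order = rng.permutation(10)  # same (a_i, b_i) pairs, new order
t_orig = stats.ttest_rel(a, b)
t_shuf = stats.ttest_rel(a[order], b[order])

# Identical up to floating-point roundoff, because each pair stays intact.
print(np.isclose(t_orig.statistic, t_shuf.statistic))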
-shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data = create_random_evokeds_id_condition_list() +shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data = ( + create_random_evokeds_id_condition_list() +) + def prepare_dataframe_for_cluster_function( evokeds: list = None, @@ -215,26 +222,31 @@ def prepare_dataframe_for_cluster_function( The prepared DataFrame for the cluster test function. """ # Initialize the DataFrame with evoked data - df = pd.DataFrame({ - "evoked": evokeds, - "condition": condition if condition is not None else np.nan, - "subject_index": subject_index if subject_index is not None else np.nan - }) + df = pd.DataFrame( + { + "evoked": evokeds, + "condition": condition if condition is not None else np.nan, + "subject_index": subject_index if subject_index is not None else np.nan, + } + ) return df + # run with original data -df = prepare_dataframe_for_cluster_function(evokeds=original_evoked_data, - condition=None, - subject_index=None) +df = prepare_dataframe_for_cluster_function( + evokeds=original_evoked_data, condition=None, subject_index=None +) -df = prepare_dataframe_for_cluster_function(evokeds=shuffled_evoked_data, - condition=None, - subject_index=None) +df = prepare_dataframe_for_cluster_function( + evokeds=shuffled_evoked_data, condition=None, subject_index=None +) -df = prepare_dataframe_for_cluster_function(evokeds=shuffled_evoked_data, - condition=shuffled_conditions, - subject_index=shuffled_participant_ids) +df = prepare_dataframe_for_cluster_function( + evokeds=shuffled_evoked_data, + condition=shuffled_conditions, + subject_index=shuffled_participant_ids, +) def cluster_test( @@ -269,8 +281,8 @@ def cluster_test( The permuted test statistics. """ # Check if conditions and subject_index are present and valid - conditions_present = pd.notna(df['condition']).all() - subject_index_present = pd.notna(df['subject_index']).all() + conditions_present = pd.notna(df["condition"]).all() + subject_index_present = pd.notna(df["subject_index"]).all() if contrast == 1: if conditions_present: @@ -472,4 +484,5 @@ def plot_cluster( return None -cluster_test(df) \ No newline at end of file + +cluster_test(df) From 0373195a6cfba2bed5299aa2d883e64c5134de82 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Wed, 19 Jun 2024 19:28:07 +0200 Subject: [PATCH 48/88] added formula support and implemented suggestions --- .../76_new_cluster_test_api.py | 51 ++++++++++++++++--- 1 file changed, 45 insertions(+), 6 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 08917f78f03..eef90a2612b 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -225,8 +225,8 @@ def prepare_dataframe_for_cluster_function( df = pd.DataFrame( { "evoked": evokeds, - "condition": condition if condition is not None else np.nan, - "subject_index": subject_index if subject_index is not None else np.nan, + "condition": condition if condition is not None else pd.NA, + "subject_index": subject_index if subject_index is not None else pd.NA, } ) @@ -251,10 +251,11 @@ def prepare_dataframe_for_cluster_function( def cluster_test( df: pd.DataFrame, - contrast: bool = True, + formula: str = None, # Wilkinson notation formula for design matrix + contrast: bool = True, # will be replaced by formulaic design matrix n_permutations: int = 10000, - seed: int = 1234, - contrast_weights: list = [1, -1], + seed: None | int | 
np.random.RandomState = None, + contrast_weights: list = [1, -1], # will be replaced by formulaic design matrix ): """ Run the cluster test using the new API. @@ -284,6 +285,22 @@ def cluster_test( conditions_present = pd.notna(df["condition"]).all() subject_index_present = pd.notna(df["subject_index"]).all() + # add a data column to the dataframe (numpy array) + df["data"] = [evoked.data for evoked in df.evoked] + + # convert wide format to long format + df_long = convert_wide_to_long(df) + + # check if formula is present + if formula is not None: + import formulaic + + # create design matrix based on formula + # Create the design matrix using formulaic + y, X = formulaic.model_matrix(formula, df_long) + + # what to do with the design matrix? + if contrast == 1: if conditions_present: # Extract unique conditions @@ -378,6 +395,29 @@ def cluster_test( return T_obs, clusters, cluster_p_values, H0 +# Convert wide format to long format +def convert_wide_to_long(df): + long_format_data = [] + for idx, row in df.iterrows(): + condition = row['condition'] + subject_index = row['subject_index'] + data_2d = row['data'] + + for channel in range(data_2d.shape[0]): + for timepoint in range(data_2d.shape[1]): + long_format_data.append({ + 'condition': condition, + 'subject_index': subject_index, + 'channel': channel, + 'timepoint': timepoint, + 'value': data_2d[channel, timepoint] + }) + + df_long = pd.DataFrame(long_format_data) + return df_long + +df_long = convert_wide_to_long(df) + def plot_cluster( contrast, target_only, non_target_only, T_obs, clusters, cluster_p_values @@ -482,7 +522,6 @@ def plot_cluster( plt.show() - return None cluster_test(df) From 8bc44f968614f217d398fdc1894d8af1a0787115 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 19 Jun 2024 17:28:23 +0000 Subject: [PATCH 49/88] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../76_new_cluster_test_api.py | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index eef90a2612b..7c0abc95fae 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -251,11 +251,11 @@ def prepare_dataframe_for_cluster_function( def cluster_test( df: pd.DataFrame, - formula: str = None, # Wilkinson notation formula for design matrix - contrast: bool = True, # will be replaced by formulaic design matrix + formula: str = None, # Wilkinson notation formula for design matrix + contrast: bool = True, # will be replaced by formulaic design matrix n_permutations: int = 10000, seed: None | int | np.random.RandomState = None, - contrast_weights: list = [1, -1], # will be replaced by formulaic design matrix + contrast_weights: list = [1, -1], # will be replaced by formulaic design matrix ): """ Run the cluster test using the new API. 
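For readers unfamiliar with formulaic, the model_matrix call introduced above takes a Wilkinson-style formula plus a long-format table and returns the outcome vector and design matrix as a pair, which is exactly how the patch unpacks it. A toy illustration (column names mirror convert_wide_to_long; the formula string is an assumption, since the patch has not fixed one yet):

import pandas as pd

import formulaic

# Long-format table like the one convert_wide_to_long() produces.
df_long = pd.DataFrame(
    {
        "value": [0.1, 0.2, 0.3, 0.4],
        "condition": [0, 1, 0, 1],
        "subject_index": ["p1", "p1", "p2", "p2"],
        "channel": [0, 0, 0, 0],
        "timepoint": [0, 1, 0, 1],
    }
)

# The left side of "~" becomes the outcome y, the right side becomes the
# design matrix X (here an intercept column plus the condition regressor).
y, X = formulaic.model_matrix("value ~ condition", df_long)
print(X)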
@@ -395,27 +395,31 @@ def cluster_test( return T_obs, clusters, cluster_p_values, H0 + # Convert wide format to long format def convert_wide_to_long(df): long_format_data = [] for idx, row in df.iterrows(): - condition = row['condition'] - subject_index = row['subject_index'] - data_2d = row['data'] - + condition = row["condition"] + subject_index = row["subject_index"] + data_2d = row["data"] + for channel in range(data_2d.shape[0]): for timepoint in range(data_2d.shape[1]): - long_format_data.append({ - 'condition': condition, - 'subject_index': subject_index, - 'channel': channel, - 'timepoint': timepoint, - 'value': data_2d[channel, timepoint] - }) - + long_format_data.append( + { + "condition": condition, + "subject_index": subject_index, + "channel": channel, + "timepoint": timepoint, + "value": data_2d[channel, timepoint], + } + ) + df_long = pd.DataFrame(long_format_data) return df_long + df_long = convert_wide_to_long(df) @@ -523,5 +527,4 @@ def plot_cluster( plt.show() - cluster_test(df) From 654a3504412571b4fc72f10be71a9bb686048c05 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sat, 22 Jun 2024 11:10:13 +0200 Subject: [PATCH 50/88] fixed linting errors --- .../76_new_cluster_test_api.py | 35 +++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 7c0abc95fae..2f1d55383d2 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -6,6 +6,7 @@ from mpl_toolkits.axes_grid1 import make_axes_locatable import mne +from mne.utils import _soft_import_ # TODO: implement formulaic design matrix for paired t-test # TODO: @erik: add dataset to mne-data @@ -15,9 +16,7 @@ def prep_sample_data(plot_evokeds: bool = False): - """ - Load the P3 dataset and extract the target, non-target and contrast evokeds. - """ + """Load the P3 dataset.""" # Define the range of participant IDs participant_ids = range(15, 20) # This will cover 015 to 019 @@ -25,7 +24,7 @@ def prep_sample_data(plot_evokeds: bool = False): # Loop over each participant ID and generate the corresponding filename for pid in participant_ids: - # Create the filename using an f-string, ensuring the participant ID is zero-padded to 3 digits + # Create the filename using an f-string, ID is zero-padded to 3 digits filename_p3 = f"sub-{pid:03d}_ses-P3_task-P3_ave.fif" # Print the filename (or perform your desired operations on it) @@ -67,7 +66,8 @@ def prep_sample_data(plot_evokeds: bool = False): def old_api_cluster(n_permutations: int = 10000, seed: int = 1234): """ - Run the cluster test using the old API to get a bechmark result for the new API. + Run the cluster test using the old API to get a benchmark result for the new API. + Currently implementing a paired t-test with contrast between participants. """ contrast, target_only, non_target_only = prep_sample_data() @@ -122,7 +122,8 @@ def old_api_cluster(n_permutations: int = 10000, seed: int = 1234): def create_random_evokeds_id_condition_list(): """ Create a list of shuffled participant IDs, conditions, and evoked data. - # Keep the participant IDs and conditions paired but shuffle the order of the evoked data. + + # Keep the participant IDs and conditions paired but shuffle the order of the data. 
""" import random @@ -158,7 +159,10 @@ def create_random_evokeds_id_condition_list(): def create_random_paired_evokeds_list(): """ - Create a list of shuffled evoked data where each pair of target and non-target evoked data is shuffled together. + Create shuffled paired evoked data. + + Create a list of shuffled evoked data where each pair of target + and non-target evoked data is shuffled together. """ import random @@ -255,10 +259,11 @@ def cluster_test( contrast: bool = True, # will be replaced by formulaic design matrix n_permutations: int = 10000, seed: None | int | np.random.RandomState = None, - contrast_weights: list = [1, -1], # will be replaced by formulaic design matrix + contrast_weights: list = (1, -1), # will be replaced by formulaic design matrix ): """ Run the cluster test using the new API. + # currently supports paired t-test with contrast or with list of conditions Parameters @@ -293,12 +298,14 @@ def cluster_test( # check if formula is present if formula is not None: - import formulaic + formulaic = _soft_import_("formulaic") # soft import # create design matrix based on formula # Create the design matrix using formulaic y, X = formulaic.model_matrix(formula, df_long) + # sign flip for paired t-test + # what to do with the design matrix? if contrast == 1: @@ -324,7 +331,7 @@ def cluster_test( if len(evokeds_a) != 1 or len(evokeds_b) != 1: raise ValueError( - f"Subject {sub_id}: Each subject must have exactly one evoked for each condition" + f"Subject {sub_id}: subject must have one evoked per cond" ) # Calculate contrast based on condition list @@ -398,6 +405,14 @@ def cluster_test( # Convert wide format to long format def convert_wide_to_long(df): + """ + Convert a DataFrame from wide to long. + + Parameters + ---------- + df : pd.DataFrame + DataFrame in wide format. 
+ """ long_format_data = [] for idx, row in df.iterrows(): condition = row["condition"] From d1ed8a104375b77d7bccbd115235490b9668f712 Mon Sep 17 00:00:00 2001 From: Eric Larson Date: Tue, 25 Jun 2024 12:25:21 -0400 Subject: [PATCH 51/88] ENH: Add dataset [skip azp] [skip actions] --- mne/datasets/config.py | 4 ++-- pyproject.toml | 3 +++ tutorials/stats-sensor-space/76_new_cluster_test_api.py | 7 +++---- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/mne/datasets/config.py b/mne/datasets/config.py index 2cd937dbdee..1fb4282c513 100644 --- a/mne/datasets/config.py +++ b/mne/datasets/config.py @@ -88,7 +88,7 @@ # here: ↓↓↓↓↓↓↓↓ RELEASES = dict( testing="0.154", - misc="0.27", + misc="0.30", phantom_kit="0.2", ucl_opm_auditory="0.2", ) @@ -129,7 +129,7 @@ ) MNE_DATASETS["misc"] = dict( archive_name=f"{MISC_VERSIONED}.tar.gz", # 'mne-misc-data', - hash="md5:e343d3a00cb49f8a2f719d14f4758afe", + hash="md5:201d35531d3c03701cf50e38bb73481f", url=( "https://codeload.github.com/mne-tools/mne-misc-data/tar.gz/" f'{RELEASES["misc"]}' diff --git a/pyproject.toml b/pyproject.toml index 5427bfe16dc..47e54f4a5a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -108,6 +108,7 @@ full-no-qt = [ "defusedxml", "neo", "antio", + "formulaic", ] full = ["mne[full-no-qt]", "PyQt6!=6.6.0", "PyQt6-Qt6!=6.6.0,!=6.7.0"] full-pyqt6 = ["mne[full]"] @@ -146,6 +147,7 @@ test_extra = [ "snirf", "neo", "mne-bids", + "formulaic", ] # Dependencies for building the documentation @@ -158,6 +160,7 @@ doc = [ "sphinxcontrib-towncrier", "memory_profiler", "neo", + "formulaic", "seaborn!=0.11.2", "sphinx_copybutton", "sphinx-design", diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 2f1d55383d2..8eb7637df53 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -6,13 +6,12 @@ from mpl_toolkits.axes_grid1 import make_axes_locatable import mne -from mne.utils import _soft_import_ +from mne.utils import _soft_import # TODO: implement formulaic design matrix for paired t-test -# TODO: @erik: add dataset to mne-data # import and load dataset -path_to_p3 = Path("C:/Users/Carina/mne_data/ERP_CORE_P3") +path_to_p3 = mne.datasets.misc.data_path() / "ERP_CORE" / "P3" def prep_sample_data(plot_evokeds: bool = False): @@ -298,7 +297,7 @@ def cluster_test( # check if formula is present if formula is not None: - formulaic = _soft_import_("formulaic") # soft import + formulaic = _soft_import("formulaic") # soft import # create design matrix based on formula # Create the design matrix using formulaic From c634a44ffee8c7de971008ba6f2d4fa088f9874e Mon Sep 17 00:00:00 2001 From: Eric Larson Date: Tue, 25 Jun 2024 12:26:57 -0400 Subject: [PATCH 52/88] FIX: One more [skip azp] [skip actions] --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index a0dbdf5ec49..45898f4fd5b 100644 --- a/environment.yml +++ b/environment.yml @@ -65,3 +65,4 @@ dependencies: - lazy_loader - defusedxml - python-neo + - formulaic From 0c2eb4f7736fcc085e93befbe7be9aaea97d9f60 Mon Sep 17 00:00:00 2001 From: Eric Larson Date: Tue, 25 Jun 2024 12:39:41 -0400 Subject: [PATCH 53/88] FIX: Title [skip azp] [skip actions] --- .../stats-sensor-space/76_new_cluster_test_api.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 
8eb7637df53..f9c4f61ad5f 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -1,3 +1,15 @@ +""" +.. _tut-new-cluster-test-api: + +==================== +New cluster test API +==================== + +This tutorial shows how to use the new API for cluster testing. +""" +# License: BSD-3-Clause +# Copyright the MNE-Python contributors. + from pathlib import Path import matplotlib.pyplot as plt From f46a79c1e94ad8eacc310243b542d220951dd068 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sun, 30 Jun 2024 20:11:28 +0200 Subject: [PATCH 54/88] first draft of formulaic paired t-test --- .../76_new_cluster_test_api.py | 342 ++++++++++++------ 1 file changed, 224 insertions(+), 118 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index f9c4f61ad5f..6a3a966bbcc 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -15,12 +15,13 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd +import scipy from mpl_toolkits.axes_grid1 import make_axes_locatable import mne from mne.utils import _soft_import -# TODO: implement formulaic design matrix for paired t-test +# TODO: test function and update docstrings # import and load dataset path_to_p3 = mne.datasets.misc.data_path() / "ERP_CORE" / "P3" @@ -248,15 +249,6 @@ def prepare_dataframe_for_cluster_function( return df -# run with original data -df = prepare_dataframe_for_cluster_function( - evokeds=original_evoked_data, condition=None, subject_index=None -) - -df = prepare_dataframe_for_cluster_function( - evokeds=shuffled_evoked_data, condition=None, subject_index=None -) - df = prepare_dataframe_for_cluster_function( evokeds=shuffled_evoked_data, condition=shuffled_conditions, @@ -267,24 +259,56 @@ def prepare_dataframe_for_cluster_function( def cluster_test( df: pd.DataFrame, formula: str = None, # Wilkinson notation formula for design matrix - contrast: bool = True, # will be replaced by formulaic design matrix n_permutations: int = 10000, seed: None | int | np.random.RandomState = None, - contrast_weights: list = (1, -1), # will be replaced by formulaic design matrix + tail: int = 0, # 0 for two-tailed, 1 for greater, -1 for less + n_jobs: int = 1, # how many cores to use + adjacency: tuple = None, + max_step: int = 1, # maximum distance between samples (time points) + exclude: list = None, # exclude no time points or channels + step_down_p: int = 0, # step down in jumps test + t_power: int = 1, # weigh each location by its stats score + out_type: str = "indices", + check_disjoint: bool = False, + buffer_size: int = None, # block size for chunking the data ): """ Run the cluster test using the new API. - # currently supports paired t-test with contrast or with list of conditions + # currently supports paired t-test Parameters ---------- dataframe : pd.DataFrame Dataframe with evoked data, conditions and subject IDs. + formula : str, optional + Wilkinson notation formula for design matrix. Default is None. n_permutations : int, optional Number of permutations. Default is 10000. + seed : None | int | np.random.RandomState, optional + Seed for the random number generator. Default is None. + tail : int, optional + 0 for two-tailed, 1 for greater, -1 for less. Default is 0. + n_jobs : int, optional + How many cores to use. Default is 1. + adjacency : None, optional + Adjacency matrix. 
Default is None. + max_step : int, optional + Maximum distance between samples (time points). Default is 1. + exclude : np.Array, optional + Exclude no time points or channels. Default is None. + step_down_p : int, optional + Step down in jumps test. Default is 0. + t_power : int, optional + Weigh each location by its stats score. Default is 1. + out_type : str, optional + Output type. Default is "indices". + check_disjoint : bool, optional + Check if clusters are disjoint. Default is False. + buffer_size : int, optional + Block size for chunking the data. Default is None. seed : int, optional - Random seed. Default is 1234. + Seed for the random number generator. Default is None. Returns ------- @@ -297,108 +321,78 @@ def cluster_test( H0 : array The permuted test statistics. """ - # Check if conditions and subject_index are present and valid - conditions_present = pd.notna(df["condition"]).all() - subject_index_present = pd.notna(df["subject_index"]).all() - + # for now this assumes a dataframe with a column for evoked data # add a data column to the dataframe (numpy array) df["data"] = [evoked.data for evoked in df.evoked] - # convert wide format to long format - df_long = convert_wide_to_long(df) + # extract number of channels and timepoints + # (eventually should also allow for frequency) + n_channels, n_timepoints = df["data"][0].shape + + # convert wide format to long format for formulaic + df_long = unpack_time_and_channels(df) + + # Pivot the DataFrame + pivot_df = df_long.pivot_table( + index=["subject_index", "channel", "timepoint"], + columns="condition", + values="value", + ).reset_index() + + # if not 2 unique conditions raise error + if len(pd.unique(df.condition)) != 2: + raise ValueError("Condition list needs to contain 2 unique values") + + # Compute the difference (assuming there are only 2 conditions) + pivot_df["y"] = pivot_df[0] - pivot_df[1] + + # Optional: Clean up the DataFrame + pivot_df = pivot_df[["subject_index", "channel", "timepoint", "y"]] # check if formula is present if formula is not None: - formulaic = _soft_import("formulaic") # soft import + formulaic = _soft_import( + "formulaic", purpose="set up Design Matrix" + ) # soft import (not a dependency for MNE) - # create design matrix based on formula + # for the paired t-test y is the difference between conditions + # X is the design matrix with a column with 1s and 0s for each participant # Create the design matrix using formulaic - y, X = formulaic.model_matrix(formula, df_long) - - # sign flip for paired t-test - - # what to do with the design matrix? 
- - if contrast == 1: - if conditions_present: - # Extract unique conditions - unique_conditions = np.unique(df.condition) - if len(unique_conditions) != 2: - raise ValueError("Condition list needs to contain 2 unique values") - # Initialize a list to hold the combined evoked data - evokeds_data = [] - if subject_index_present: - # Process each subject's evoked data - for sub_id in df.subject_index.unique(): - sub_df = df[df.subject_index == sub_id] - - # Split evokeds list based on condition list for this subject - evokeds_a = sub_df[sub_df.condition == unique_conditions[0]][ - "evoked" - ].tolist() - evokeds_b = sub_df[sub_df.condition == unique_conditions[1]][ - "evoked" - ].tolist() - - if len(evokeds_a) != 1 or len(evokeds_b) != 1: - raise ValueError( - f"Subject {sub_id}: subject must have one evoked per cond" - ) - - # Calculate contrast based on condition list - diff_evoked = mne.combine_evoked( - [evokeds_a[0], evokeds_b[0]], weights=contrast_weights - ) - evokeds_data.append(diff_evoked) - else: - # calculate length of evokeds list - n_evokeds = len(df.evoked) - # now split evokeds list in two lists - evokeds_a = df.evoked[: n_evokeds // 2] - evokeds_b = df.evoked[n_evokeds // 2 :] - # create contrast from evokeds_a and evokeds_b - diff_evoked = [ - mne.combine_evoked([evo_a, evo_b], weights=contrast_weights) - for evo_a, evo_b in zip(evokeds_a, evokeds_b) - ] - evokeds_data = diff_evoked + y, X = formulaic.model_matrix(formula, pivot_df) else: - evokeds_data = df.evoked.tolist() - - # extract number of channels - n_channels = evokeds_data[0].info["nchan"] - - # loop over rows and extract data from evokeds - data_array = np.array([evoked.data for evoked in evokeds_data]) + raise ValueError( + "Formula is required and needs to be a string in Wilkinson notation." 
+ ) - # find the dimension that is equal to n_channels - if data_array.shape[1] == n_channels: - # reshape to channels as last dimension - data = data_array.transpose(0, 2, 1) + # now prep design matrix outcome variable for input into MNE cluster function + # we initially had first channels, then timepoints, + # now we need first timepoints, then channels + y_for_cluster = y.values.reshape(-1, n_channels, n_timepoints).transpose(0, 2, 1) - adjacency, _ = mne.channels.find_ch_adjacency(evokeds_data[0].info, ch_type="eeg") + adjacency, _ = mne.channels.find_ch_adjacency(df["evoked"][0].info, ch_type="eeg") + # define stat function and threshold stat_fun, threshold = mne.stats.cluster_level._check_fun( - X=data, stat_fun=None, threshold=None, tail=0, kind="within" + X=y_for_cluster, stat_fun=None, threshold=None, tail=0, kind="within" ) - # Run the analysis + # Run the cluster-based permutation test T_obs, clusters, cluster_p_values, H0 = ( mne.stats.cluster_level._permutation_cluster_test( - [data], + [y_for_cluster], + n_permutations=10000, threshold=threshold, stat_fun=stat_fun, - n_jobs=-1, # takes all CPU cores - max_step=1, # maximum distance between samples (time points) - exclude=None, # exclude no time points or channels - step_down_p=0, # step down in jumps test - t_power=1, # weigh each location by its stats score - out_type="indices", - check_disjoint=False, - buffer_size=None, # block size for chunking the data - n_permutations=n_permutations, - tail=0, + tail=tail, + n_jobs=n_jobs, adjacency=adjacency, + max_step=max_step, # maximum distance between samples (time points) + exclude=exclude, # exclude no time points or channels + step_down_p=step_down_p, # step down in jumps test + t_power=t_power, # weigh each location by its stats score + out_type=out_type, + check_disjoint=check_disjoint, + buffer_size=buffer_size, # block size for chunking the data seed=seed, ) ) @@ -414,39 +408,44 @@ def cluster_test( return T_obs, clusters, cluster_p_values, H0 -# Convert wide format to long format -def convert_wide_to_long(df): +def unpack_time_and_channels(df): """ - Convert a DataFrame from wide to long. + Extract the time and channel data from the DataFrame. Parameters ---------- df : pd.DataFrame DataFrame in wide format. 
""" - long_format_data = [] - for idx, row in df.iterrows(): - condition = row["condition"] - subject_index = row["subject_index"] - data_2d = row["data"] - - for channel in range(data_2d.shape[0]): - for timepoint in range(data_2d.shape[1]): - long_format_data.append( - { - "condition": condition, - "subject_index": subject_index, - "channel": channel, - "timepoint": timepoint, - "value": data_2d[channel, timepoint], - } - ) + # Extracting all necessary data using list comprehensions for better performance + long_format_data = [ + { + "condition": row["condition"], + "subject_index": row["subject_index"], + "channel": channel, + "timepoint": timepoint, + "value": row["data"][channel, timepoint], + } + for idx, row in df.iterrows() + for channel in range(row["data"].shape[0]) + for timepoint in range(row["data"].shape[1]) + ] + # Creating the long format DataFrame df_long = pd.DataFrame(long_format_data) + return df_long -df_long = convert_wide_to_long(df) +# Example usage +# Sample wide format DataFrame +df_wide = pd.DataFrame( + { + "condition": ["A", "B"], + "subject_index": [1, 2], + "data": [np.array([[1, 2, 3], [4, 5, 6]]), np.array([[7, 8, 9], [10, 11, 12]])], + } +) def plot_cluster( @@ -553,4 +552,111 @@ def plot_cluster( plt.show() -cluster_test(df) +# translated the limo permutation ttest from matlab to python +def limo_ttest_permute(Data, n_perm=None): + """ + Pseudo one-sample t-test using sign-test with permutations. + + Parameters + ---------- + Data (numpy.ndarray): A matrix of data for the one-sample t-test. + Shape can be (n_channels, n_var, n_obs) or + (n_var, n_obs). + n_perm (int, optional): Number of permutations to perform. + If None, it defaults based on the number of observations. + + Returns + ------- + t_vals (numpy.ndarray): t-values under H0. + p_vals (numpy.ndarray): p-values under H0. + dfe (int): Degrees of freedom. + """ + # Check inputs and reshape if necessary + if Data.ndim == 3: + n_channels, n_var, n_obs = Data.shape + else: + n_channels = 1 + n_var, n_obs = Data.shape + Data = Data[np.newaxis, ...] + + # Warn if the number of observations is very small + if n_obs < 7: + n_psbl_prms = 2**n_obs + print( + f"Due to the very limited number of observations, " + f"the total number of possible permutations is small ({n_psbl_prms}). " + "Thus, only a limited number of p-values are possible " + "and the test might be overly conservative." + ) + + # Set up permutation test + if n_obs <= 12: + n_perm = 2**n_obs # total number of possible permutations + exact = True + print( + "Due to the limited number of observations, all possible permutations " + "of the data will be computed instead of random permutations." 
+        )
+    else:
+        exact = False
+        if n_perm is None:
+            n_perm = 1000
+
+    print(f"Executing permutation test with {n_perm} permutations...")
+
+    # Initialize variables
+    t_vals = np.full(
+        (n_channels, n_var, n_perm), np.nan
+    )  # Array to store t-values for each permutation
+    sqrt_nXnM1 = np.sqrt(
+        n_obs * (n_obs - 1)
+    )  # Precompute constant for t-value calculation
+    dfe = n_obs - 1  # Degrees of freedom
+
+    if exact:
+        # Use all possible permutations
+        for perm in range(n_perm):
+            # Set sign of each trial / participant's data
+            temp = np.array(
+                [int(x) for x in bin(perm)[2:].zfill(n_obs)]
+            )  # Convert perm index to binary array
+            sn = np.where(temp == 0, -1, 1)  # Map 0 to -1 and 1 to 1
+            sn_mtrx = np.tile(sn, (n_var, 1))  # Repeat sn for each variable
+
+            for c in range(n_channels):
+                data = Data[c, :, :]
+                d_perm = data * sn_mtrx  # Apply sign flip to data
+
+                # Compute t-score of permuted data
+                sm = np.sum(d_perm, axis=1)  # Sum of permuted data
+                mn = sm / n_obs  # Mean of permuted data
+                sm_sqrs = (
+                    np.sum(d_perm**2, axis=1) - (sm**2) / n_obs
+                )  # Sum of squares for standard error
+                stder = np.sqrt(sm_sqrs) / sqrt_nXnM1  # Standard error
+                t_vals[c, :, perm] = mn / stder  # Compute t-values
+
+    else:
+        # Use random permutations
+        for perm in range(n_perm):
+            # Randomly set sign of each trial / participant's data
+            sn = (np.random.rand(n_obs) > 0.5) * 2 - 1  # Generate random sign flips
+            sn_mtrx = np.tile(sn, (n_var, 1))  # Repeat sn for each variable
+
+            for c in range(n_channels):
+                data = Data[c, :, :]
+                d_perm = data * sn_mtrx  # Apply sign flip to data
+
+                # Compute t-score of permuted data
+                sm = np.sum(d_perm, axis=1)  # Sum of permuted data
+                mn = sm / n_obs  # Mean of permuted data
+                sm_sqrs = (
+                    np.sum(d_perm**2, axis=1) - (sm**2) / n_obs
+                )  # Sum of squares for standard error
+                stder = np.sqrt(sm_sqrs) / sqrt_nXnM1  # Standard error
+                t_vals[c, :, perm] = mn / stder  # Compute t-values
+
+    # Compute p-values from t-values (two-tailed, t distribution)
+    p_vals = 2 * scipy.stats.t.cdf(-np.abs(t_vals), dfe)
+
+    return t_vals, p_vals, dfe

From 5d1cbae78a354321aef1f7f2fa5ecf6881f1533c Mon Sep 17 00:00:00 2001
From: CarinaFo
Date: Sat, 6 Jul 2024 10:36:55 +0200
Subject: [PATCH 55/88] first draft without cluster plotting class implemented

---
 mne/stats/cluster_level.py                    | 293 +++++++
 .../76_new_cluster_test_api.py | 722 +++--------
 2 files changed, 392 insertions(+), 623 deletions(-)

diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py
index 50743c104ef..e50991254fe 100644
--- a/mne/stats/cluster_level.py
+++ b/mne/stats/cluster_level.py
@@ -4,12 +4,17 @@
 # License: BSD-3-Clause
 # Copyright the MNE-Python contributors.
 
+import matplotlib.pyplot as plt
 import numpy as np
+import pandas as pd
+from mpl_toolkits.axes_grid1 import make_axes_locatable
 from scipy import ndimage, sparse
 from scipy.sparse.csgraph import connected_components
 from scipy.stats import f as fstat
 from scipy.stats import t as tstat
 
 from .. 
import EvokedArray +from ..channels import find_ch_adjacency from ..fixes import has_numba, jit from ..parallel import parallel_func from ..source_estimate import MixedSourceEstimate, SourceEstimate, VolSourceEstimate @@ -18,6 +23,7 @@ ProgressBar, _check_option, _pl, + _soft_import, _validate_type, check_random_state, logger, @@ -25,6 +31,7 @@ verbose, warn, ) +from ..viz import plot_compare_evokeds from .parametric import f_oneway, ttest_1samp_no_p @@ -1723,3 +1730,289 @@ def summarize_clusters_stc( data_summary[:, 0] = np.sum(data_summary, axis=1) return klass(data_summary, vertices, tmin, tstep, subject) + + +def cluster_test( + df: pd.DataFrame, + formula: str = None, # Wilkinson notation formula for design matrix + n_permutations: int = 10000, + seed: None | int | np.random.RandomState = None, + tail: int = 0, # 0 for two-tailed, 1 for greater, -1 for less + n_jobs: int = 1, # how many cores to use + adjacency: tuple = None, + max_step: int = 1, # maximum distance between samples (time points) + exclude: list = None, # exclude no time points or channels + step_down_p: int = 0, # step down in jumps test + t_power: int = 1, # weigh each location by its stats score + out_type: str = "indices", + check_disjoint: bool = False, + buffer_size: int = None, # block size for chunking the data +): + """ + Run the cluster test using the new API. + + # currently supports paired t-test + + Parameters + ---------- + dataframe : pd.DataFrame + Dataframe with evoked data, conditions and subject IDs. + formula : str, optional + Wilkinson notation formula for design matrix. Default is None. + n_permutations : int, optional + Number of permutations. Default is 10000. + seed : None | int | np.random.RandomState, optional + Seed for the random number generator. Default is None. + tail : int, optional + 0 for two-tailed, 1 for greater, -1 for less. Default is 0. + n_jobs : int, optional + How many cores to use. Default is 1. + adjacency : None, optional + Adjacency matrix. Default is None. + max_step : int, optional + Maximum distance between samples (time points). Default is 1. + exclude : np.Array, optional + Exclude no time points or channels. Default is None. + step_down_p : int, optional + Step down in jumps test. Default is 0. + t_power : int, optional + Weigh each location by its stats score. Default is 1. + out_type : str, optional + Output type. Default is "indices". + check_disjoint : bool, optional + Check if clusters are disjoint. Default is False. + buffer_size : int, optional + Block size for chunking the data. Default is None. + seed : int, optional + Seed for the random number generator. Default is None. + + Returns + ------- + T_obs : array + The observed test statistic. + clusters : list + List of clusters. + cluster_p_values : array + Array of cluster p-values. + H0 : array + The permuted test statistics. 
+ """ + # for now this assumes a dataframe with a column for evoked data or epochs + # add a data column to the dataframe (numpy array) + df["data"] = [evoked.data for evoked in df.evoked] + + # extract number of channels and timepoints + # (eventually should also allow for frequency) + n_channels, n_timepoints = df["data"][0].shape + + # convert wide format to long format for formulaic + df_long = unpack_time_and_channels(df) + + # Pivot the DataFrame + pivot_df = df_long.pivot_table( + index=["subject_index", "channel", "timepoint"], + columns="condition", + values="value", + ).reset_index() + + # if not 2 unique conditions raise error + if len(pd.unique(df.condition)) != 2: + raise ValueError("Condition list needs to contain 2 unique values") + + # Get the unique conditions + conditions = np.unique(df.condition) + + # Compute the difference (assuming there are only 2 conditions) + pivot_df["evoked"] = pivot_df[conditions[0]] - pivot_df[conditions[1]] + + # Optional: Clean up the DataFrame + pivot_df = pivot_df[["subject_index", "channel", "timepoint", "evoked"]] + + # check if formula is present + if formula is not None: + formulaic = _soft_import( + "formulaic", purpose="set up Design Matrix" + ) # soft import (not a dependency for MNE) + + # for the paired t-test y is the difference between conditions + # X is the design matrix with a column with 1s and 0s for each participant + # Create the design matrix using formulaic + y, X = formulaic.model_matrix(formula, pivot_df) + else: + raise ValueError( + "Formula is required and needs to be a string in Wilkinson notation." + ) + + # now prep design matrix outcome variable for input into MNE cluster function + # we initially had first channels, then timepoints, + # now we need first timepoints, then channels + y_for_cluster = y.values.reshape(-1, n_channels, n_timepoints).transpose(0, 2, 1) + + adjacency, _ = find_ch_adjacency(df["evoked"][0].info, ch_type="eeg") + + # define stat function and threshold + stat_fun, threshold = _check_fun( + X=y_for_cluster, stat_fun=None, threshold=None, tail=0, kind="within" + ) + + # Run the cluster-based permutation test + T_obs, clusters, cluster_p_values, H0 = _permutation_cluster_test( + [y_for_cluster], + n_permutations=10000, + threshold=threshold, + stat_fun=stat_fun, + tail=tail, + n_jobs=n_jobs, + adjacency=adjacency, + max_step=max_step, # maximum distance between samples (time points) + exclude=exclude, # exclude no time points or channels + step_down_p=step_down_p, # step down in jumps test + t_power=t_power, # weigh each location by its stats score + out_type=out_type, + check_disjoint=check_disjoint, + buffer_size=buffer_size, # block size for chunking the data + seed=seed, + ) + + print(min(cluster_p_values)) + + return T_obs, clusters, cluster_p_values, H0 + + +def unpack_time_and_channels(df): + """ + Extract the time and channel data from the DataFrame. + + Parameters + ---------- + df : pd.DataFrame + DataFrame in wide format. 
+ """ + # Extracting all necessary data using list comprehensions for better performance + long_format_data = [ + { + "condition": row["condition"], + "subject_index": row["subject_index"], + "channel": channel, + "timepoint": timepoint, + "value": row["data"][channel, timepoint], + } + for idx, row in df.iterrows() + for channel in range(row["data"].shape[0]) + for timepoint in range(row["data"].shape[1]) + ] + + # Creating the long format DataFrame + df_long = pd.DataFrame(long_format_data) + + return df_long + + +def plot_cluster( + contrast, target_only, non_target_only, T_obs, clusters, cluster_p_values +): + """ + Plot the cluster with the lowest p-value. + + Parameters + ---------- + contrast : list + List of contrast evoked objects. + target_only : list + List of target evoked objects. + non_target_only : list + List of non-target evoked objects. + T_obs : array + The observed test statistic. + clusters : list + List of clusters. + cluster_p_values : array + Array of cluster p-values. + + Returns + ------- + None + + """ + # configure variables for visualization + colors = {"target": "crimson", "non-target": "steelblue"} + + # organize data for plotting + evokeds = {"target": target_only, "non-target": non_target_only} + + lowest_p_cluster = np.argmin(cluster_p_values) + + # plot the cluster with the lowest p-value + time_inds, space_inds = np.squeeze(clusters[lowest_p_cluster]) + ch_inds = np.unique(space_inds) + time_inds = np.unique(time_inds) + + # get topography for F stat + t_map = T_obs[time_inds, ...].mean(axis=0) + + # get signals at the sensors contributing to the cluster + sig_times = contrast[0].times[time_inds] + + # create spatial mask + mask = np.zeros((t_map.shape[0], 1), dtype=bool) + mask[ch_inds, :] = True + + # initialize figure + fig, ax_topo = plt.subplots(1, 1, figsize=(10, 3), layout="constrained") + + # plot average test statistic and mark significant sensors + t_evoked = EvokedArray(t_map[:, np.newaxis], contrast[0].info, tmin=0) + t_evoked.plot_topomap( + times=0, + mask=mask, + axes=ax_topo, + cmap="Reds", + vlim=(np.min, np.max), + show=False, + colorbar=False, + mask_params=dict(markersize=10), + ) + image = ax_topo.images[0] + + # remove the title that would otherwise say "0.000 s" + ax_topo.set_title("") + + # soft import? 
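(An editorial aside on this "soft import?" question: the commented-out attempt just below passes a submodule attribute path to MNE's _soft_import helper, which — as far as I can tell — is meant for optional top-level packages, roughly

    formulaic = _soft_import("formulaic", purpose="set up design matrix")

and since mpl_toolkits.axes_grid1 ships with matplotlib itself, which MNE already depends on, the plain module-level import used here should be fine.)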
+ # make_axes_locatable = _soft_import( + # "mpl_toolkits.axes_grid1.make_axes_locatable", + # purpose="plot cluster results" + # ) # soft import (not a dependency for MNE) + + # create additional axes (for ERF and colorbar) + divider = make_axes_locatable(ax_topo) + + # add axes for colorbar + ax_colorbar = divider.append_axes("right", size="5%", pad=0.05) + plt.colorbar(image, cax=ax_colorbar) + ax_topo.set_xlabel( + "Averaged t-map ({:0.3f} - {:0.3f} s)".format(*sig_times[[0, -1]]) + ) + + # add new axis for time courses and plot time courses + ax_signals = divider.append_axes("right", size="300%", pad=1.2) + title = f"Cluster #1, {len(ch_inds)} sensor" + if len(ch_inds) > 1: + title += "s (mean)" + plot_compare_evokeds( + evokeds, + title=title, + picks=ch_inds, + axes=ax_signals, + colors=colors, + show=False, + split_legend=True, + truncate_yaxis="auto", + ) + + # plot temporal cluster extent + ymin, ymax = ax_signals.get_ylim() + ax_signals.fill_betweenx( + (ymin, ymax), sig_times[0], sig_times[-1], color="green", alpha=0.3 + ) + + plt.show() diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 6a3a966bbcc..ec8bd8275a1 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -1,662 +1,138 @@ """ .. _tut-new-cluster-test-api: -==================== -New cluster test API -==================== +=============================================================== +New cluster test API that allows for Wilkinson style formulas +=============================================================== This tutorial shows how to use the new API for cluster testing. +This script shows how to estimate significant clusters in +evoked contrast data of multiple subjects. +It uses a non-parametric statistical procedure based on permutations and +cluster level statistics. + +The procedure consists of: + + - loading evoked data from multiple subjects + - construct a dataframe that contains the difference between conditions + - run the new cluster test function + +Here, the unit of observation are evokeds from multiple subjects (2nd level analysis). + +For more information on cluster-based permutation testing in MNE-Python, +see also: :ref:`tut-cluster-one-samp-tfr`. """ +# Authors: Carina Forster +# # License: BSD-3-Clause # Copyright the MNE-Python contributors. 
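(Editorial aside, before the tutorial code: the core idea of the Wilkinson formula here is that formulaic turns the formula string into an outcome vector and a design matrix. A minimal sketch on made-up toy data, assuming the optional formulaic package is installed — not part of the tutorial itself:

    import pandas as pd
    from formulaic import model_matrix

    # toy per-subject condition differences, one row per subject
    toy = pd.DataFrame(
        {
            "evoked": [0.2, -0.1, 0.4, 0.0],
            "subject_index": [1, 2, 3, 4],
        }
    )

    # lhs (y) is the outcome, rhs (X) is an intercept plus subject dummy columns
    y, X = model_matrix("evoked ~ 1 + C(subject_index)", toy)
    print(X)

This mirrors the y, X = formulaic.model_matrix(formula, pivot_df) call used inside cluster_test.)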
+# %% + from pathlib import Path -import matplotlib.pyplot as plt -import numpy as np import pandas as pd -import scipy -from mpl_toolkits.axes_grid1 import make_axes_locatable import mne -from mne.utils import _soft_import - -# TODO: test function and update docstrings -# import and load dataset +# Set parameters +# -------------- +# Define the path to the P3 dataset path_to_p3 = mne.datasets.misc.data_path() / "ERP_CORE" / "P3" +# Define the range of participant IDs +participant_ids = range(15, 20) # This will cover 015 to 019 -def prep_sample_data(plot_evokeds: bool = False): - """Load the P3 dataset.""" - # Define the range of participant IDs - participant_ids = range(15, 20) # This will cover 015 to 019 - - evokeds_allsubs = [] - - # Loop over each participant ID and generate the corresponding filename - for pid in participant_ids: - # Create the filename using an f-string, ID is zero-padded to 3 digits - filename_p3 = f"sub-{pid:03d}_ses-P3_task-P3_ave.fif" - - # Print the filename (or perform your desired operations on it) - print(filename_p3) - - p3_file_path = Path(path_to_p3) / filename_p3 - - evokeds = mne.read_evokeds(p3_file_path) - - # add to list - evokeds_allsubs.append(evokeds) - - target_only = [evoked[0] for evoked in evokeds_allsubs] - non_target_only = [evoked[1] for evoked in evokeds_allsubs] - contrast = [evoked[2] for evoked in evokeds_allsubs] - - if plot_evokeds: - # plot the grand average - mne.grand_average(target_only).plot() - mne.grand_average(non_target_only).plot() - mne.grand_average(contrast).plot() - - # create contrast from evokeds target and non-target - diff_evoked = [ - mne.combine_evoked([evokeds_a, evokeds_b], weights=[1, -1]) - for evokeds_a, evokeds_b in zip(target_only, non_target_only) - ] - - if plot_evokeds: - mne.grand_average(diff_evoked).plot() - - # crop the evokeds in the post stimulus window - contrast = [evokeds.crop(tmin=-0.1, tmax=0.6) for evokeds in contrast] - target_only = [evokeds.crop(tmin=-0.1, tmax=0.6) for evokeds in target_only] - non_target_only = [evokeds.crop(tmin=-0.1, tmax=0.6) for evokeds in non_target_only] - - return contrast, target_only, non_target_only - - -def old_api_cluster(n_permutations: int = 10000, seed: int = 1234): - """ - Run the cluster test using the old API to get a benchmark result for the new API. - - Currently implementing a paired t-test with contrast between participants. 
- """ - contrast, target_only, non_target_only = prep_sample_data() - - # extract the data for each evoked and store in numpy array - data = np.array([evoked.data for evoked in contrast]) - - # shape should be (n_subjects, n_channels, n_times) - data.shape - - # reshape to channels as last dimension - data = data.transpose(0, 2, 1) - - data.shape +# store the evoked data of all subjects +evokeds_allsubs = [] - adjacency, _ = mne.channels.find_ch_adjacency(contrast[0].info, ch_type="eeg") +# Loop over each participant ID and generate the corresponding filename +for pid in participant_ids: + # Create the filename using an f-string, ID is zero-padded to 3 digits + filename_p3 = f"sub-{pid:03d}_ses-P3_task-P3_ave.fif" - stat_fun, threshold = mne.stats.cluster_level._check_fun( - X=data, stat_fun=None, threshold=None, tail=0, kind="within" - ) + # Create the full path to the file + p3_file_path = Path(path_to_p3) / filename_p3 - # Run the analysis - T_obs, clusters, cluster_p_values, H0 = ( - mne.stats.cluster_level._permutation_cluster_test( - [data], - threshold=threshold, - stat_fun=stat_fun, - n_jobs=-1, # takes all CPU cores - max_step=1, # maximum distance between samples (time points) - exclude=None, # exclude no time points or channels - step_down_p=0, # step down in jumps test - t_power=1, # weigh each location by its stats score - out_type="indices", - check_disjoint=False, - buffer_size=None, # block size for chunking the data - n_permutations=n_permutations, - tail=0, - adjacency=adjacency, - seed=seed, - ) - ) + # load the evoked data + evokeds = mne.read_evokeds(p3_file_path) - print(min(cluster_p_values)) + # add subjects evoked data to list + evokeds_allsubs.append(evokeds) - plot_cluster( - contrast, target_only, non_target_only, T_obs, clusters, cluster_p_values - ) +# the P3b dataset is part of the freely available ERP CORE dataset +# participants were presented with a visual oddball task +# and the P3b component was analyzed +# the conditions of interest are the target (rare visual stimuli) +# and non-target stimuli (frequency visual stimuli) - return T_obs, clusters, cluster_p_values, H0 +# let's extract the target and non-target evokeds +target_only = [evoked[0] for evoked in evokeds_allsubs] +non_target_only = [evoked[1] for evoked in evokeds_allsubs] +# let's first have a look at the data +# create contrast from target and non-target evokeds +diff_evoked = [ + mne.combine_evoked([evokeds_a, evokeds_b], weights=[1, -1]) + for evokeds_a, evokeds_b in zip(target_only, non_target_only) +] -def create_random_evokeds_id_condition_list(): - """ - Create a list of shuffled participant IDs, conditions, and evoked data. +# plot the grand average of the difference signal +mne.grand_average(diff_evoked).plot() +# plot the topography of the difference signal +mne.grand_average(diff_evoked).plot_topomap() - # Keep the participant IDs and conditions paired but shuffle the order of the data. 
- """ - import random +# we can see that the strongest difference is around 400 ms in +# visual channels (occipital region) - _, evoked_data_a, evoked_data_b = prep_sample_data() +# Next we prepare a dataframe for the cluster test function +# the dataframe should contain the contrast evoked data and the subject index +# each row in the dataframe should represent one observation (evoked data) - # Example participant IDs - participant_ids = ["p1", "p2", "p3", "p4", "p5"] * 2 +# save the evoked data for both conditions in one list +evokeds_conditions = target_only + non_target_only - # Combine the evoked data into a single list - all_evoked_data = evoked_data_a + evoked_data_b +# set up a list that defines the condition for each evoked data +# this will be used to create the conditions column in the dataframe +conditions = ["target"] * len(target_only) + ["non-target"] * len(non_target_only) - # Create a corresponding list of conditions - conditions = [1] * len(evoked_data_a) + [0] * len(evoked_data_b) +# finally add a column that defines the subject index +# this will be used to create the subject_index column in the dataframe +# we multiply the participant_ids by 2 to account for the two conditions +subject_index = list(participant_ids) * 2 - # Combine the participant IDs, conditions, and evoked data into a list of tuples - combined_list = list(zip(participant_ids, conditions, all_evoked_data)) - - # Shuffle the combined list - random.shuffle(combined_list) - - # Separate the shuffled list back into participant IDs, conditions, and evoked data - shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data = zip( - *combined_list - ) - - # Convert the tuples back to lists - shuffled_participant_ids = list(shuffled_participant_ids) - shuffled_conditions = list(shuffled_conditions) - shuffled_evoked_data = list(shuffled_evoked_data) - - return shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data - - -def create_random_paired_evokeds_list(): - """ - Create shuffled paired evoked data. - - Create a list of shuffled evoked data where each pair of target - and non-target evoked data is shuffled together. - """ - import random - - _, evoked_data_a, evoked_data_b = prep_sample_data() - - # Ensure evoked_data_a and evoked_data_b are of the same length - assert len(evoked_data_a) == len( - evoked_data_b - ), "evoked_data_a and evoked_data_b must have the same length" - - # Create a list of participant indices - participant_indices = list(range(len(evoked_data_a))) - - # Shuffle the list of participant indices - random.shuffle(participant_indices) - - # Reorder evoked data according to the shuffled participant indices - shuffled_evoked_data_a = [evoked_data_a[i] for i in participant_indices] - shuffled_evoked_data_b = [evoked_data_b[i] for i in participant_indices] - - # Combine the shuffled evoked data into a single list - shuffled_evoked_data = shuffled_evoked_data_a + shuffled_evoked_data_b - - # Combine the original evoked data into a single list - original_evoked_data = evoked_data_a + evoked_data_b - - return original_evoked_data, shuffled_evoked_data - - -# shuffle order of pairs -original_evoked_data, shuffled_evoked_data = create_random_paired_evokeds_list() -# shouldn't change the results (p-value is different though?) 
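(A brief check on the "(p-value is different though?)" question in the removed helper above: reordering whole pairs cannot change a paired statistic, so any difference most likely comes from the permutation schedule — with a random, non-exhaustive set of sign flips, reordering subjects changes which flips are drawn for a given seed. A toy verification with SciPy, independent of the MNE code:

    import numpy as np
    from scipy.stats import ttest_rel

    rng = np.random.default_rng(0)
    a = rng.normal(size=10)
    b = a + rng.normal(loc=0.3, size=10)

    # shuffle the pair order, keeping each (a_i, b_i) pair intact
    order = rng.permutation(10)

    t1, p1 = ttest_rel(a, b)
    t2, p2 = ttest_rel(a[order], b[order])
    assert np.isclose(t1, t2) and np.isclose(p1, p2)  # identical results
)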
- -shuffled_participant_ids, shuffled_conditions, shuffled_evoked_data = ( - create_random_evokeds_id_condition_list() +# create the dataframe +df = pd.DataFrame( + { + "evoked": evokeds_conditions, + "condition": conditions, + "subject_index": subject_index, + } ) +# now we can run the cluster test function +# we will use the new API that allows for Wilkinson style formulas +# the formula should be a string in Wilkinson notation -def prepare_dataframe_for_cluster_function( - evokeds: list = None, - condition: list = None, - subject_index: list = None, -): - """ - Prepare a dataframe for the cluster test function. - - Parameters - ---------- - contrast : bool, optional - If True, a contrast is calculated. Default is False. - evokeds : list, optional - List of evoked objects. Default is None. - condition : list, optional - List of conditions for each evoked object. Default is None. - subject_index : list, optional - List of subject IDs. Default is None. - - Returns - ------- - df : DataFrame - The prepared DataFrame for the cluster test function. - """ - # Initialize the DataFrame with evoked data - df = pd.DataFrame( - { - "evoked": evokeds, - "condition": condition if condition is not None else pd.NA, - "subject_index": subject_index if subject_index is not None else pd.NA, - } - ) - - return df - - -df = prepare_dataframe_for_cluster_function( - evokeds=shuffled_evoked_data, - condition=shuffled_conditions, - subject_index=shuffled_participant_ids, -) +# we want to test whether there is a significant difference between +# target and non-target stimuli in the post-stimulus window +# we will use a cluster-based permutation paired t-test for this +# let's first define the formula based on Wilkinson notation +formula = "evoked ~ 1 + C(subject_index)" -def cluster_test( - df: pd.DataFrame, - formula: str = None, # Wilkinson notation formula for design matrix - n_permutations: int = 10000, - seed: None | int | np.random.RandomState = None, - tail: int = 0, # 0 for two-tailed, 1 for greater, -1 for less - n_jobs: int = 1, # how many cores to use - adjacency: tuple = None, - max_step: int = 1, # maximum distance between samples (time points) - exclude: list = None, # exclude no time points or channels - step_down_p: int = 0, # step down in jumps test - t_power: int = 1, # weigh each location by its stats score - out_type: str = "indices", - check_disjoint: bool = False, - buffer_size: int = None, # block size for chunking the data -): - """ - Run the cluster test using the new API. - - # currently supports paired t-test - - Parameters - ---------- - dataframe : pd.DataFrame - Dataframe with evoked data, conditions and subject IDs. - formula : str, optional - Wilkinson notation formula for design matrix. Default is None. - n_permutations : int, optional - Number of permutations. Default is 10000. - seed : None | int | np.random.RandomState, optional - Seed for the random number generator. Default is None. - tail : int, optional - 0 for two-tailed, 1 for greater, -1 for less. Default is 0. - n_jobs : int, optional - How many cores to use. Default is 1. - adjacency : None, optional - Adjacency matrix. Default is None. - max_step : int, optional - Maximum distance between samples (time points). Default is 1. - exclude : np.Array, optional - Exclude no time points or channels. Default is None. - step_down_p : int, optional - Step down in jumps test. Default is 0. - t_power : int, optional - Weigh each location by its stats score. Default is 1. - out_type : str, optional - Output type. 
Default is "indices". - check_disjoint : bool, optional - Check if clusters are disjoint. Default is False. - buffer_size : int, optional - Block size for chunking the data. Default is None. - seed : int, optional - Seed for the random number generator. Default is None. - - Returns - ------- - T_obs : array - The observed test statistic. - clusters : list - List of clusters. - cluster_p_values : array - Array of cluster p-values. - H0 : array - The permuted test statistics. - """ - # for now this assumes a dataframe with a column for evoked data - # add a data column to the dataframe (numpy array) - df["data"] = [evoked.data for evoked in df.evoked] - - # extract number of channels and timepoints - # (eventually should also allow for frequency) - n_channels, n_timepoints = df["data"][0].shape - - # convert wide format to long format for formulaic - df_long = unpack_time_and_channels(df) - - # Pivot the DataFrame - pivot_df = df_long.pivot_table( - index=["subject_index", "channel", "timepoint"], - columns="condition", - values="value", - ).reset_index() - - # if not 2 unique conditions raise error - if len(pd.unique(df.condition)) != 2: - raise ValueError("Condition list needs to contain 2 unique values") - - # Compute the difference (assuming there are only 2 conditions) - pivot_df["y"] = pivot_df[0] - pivot_df[1] - - # Optional: Clean up the DataFrame - pivot_df = pivot_df[["subject_index", "channel", "timepoint", "y"]] - - # check if formula is present - if formula is not None: - formulaic = _soft_import( - "formulaic", purpose="set up Design Matrix" - ) # soft import (not a dependency for MNE) - - # for the paired t-test y is the difference between conditions - # X is the design matrix with a column with 1s and 0s for each participant - # Create the design matrix using formulaic - y, X = formulaic.model_matrix(formula, pivot_df) - else: - raise ValueError( - "Formula is required and needs to be a string in Wilkinson notation." 
- ) - - # now prep design matrix outcome variable for input into MNE cluster function - # we initially had first channels, then timepoints, - # now we need first timepoints, then channels - y_for_cluster = y.values.reshape(-1, n_channels, n_timepoints).transpose(0, 2, 1) - - adjacency, _ = mne.channels.find_ch_adjacency(df["evoked"][0].info, ch_type="eeg") - - # define stat function and threshold - stat_fun, threshold = mne.stats.cluster_level._check_fun( - X=y_for_cluster, stat_fun=None, threshold=None, tail=0, kind="within" - ) - - # Run the cluster-based permutation test - T_obs, clusters, cluster_p_values, H0 = ( - mne.stats.cluster_level._permutation_cluster_test( - [y_for_cluster], - n_permutations=10000, - threshold=threshold, - stat_fun=stat_fun, - tail=tail, - n_jobs=n_jobs, - adjacency=adjacency, - max_step=max_step, # maximum distance between samples (time points) - exclude=exclude, # exclude no time points or channels - step_down_p=step_down_p, # step down in jumps test - t_power=t_power, # weigh each location by its stats score - out_type=out_type, - check_disjoint=check_disjoint, - buffer_size=buffer_size, # block size for chunking the data - seed=seed, - ) - ) - - print(min(cluster_p_values)) - - # need to adjust plotting function for contrast only data - contrast, evokeds_a, evokeds_b = prep_sample_data() - - # plot cluster - plot_cluster(contrast, evokeds_a, evokeds_b, T_obs, clusters, cluster_p_values) - - return T_obs, clusters, cluster_p_values, H0 - - -def unpack_time_and_channels(df): - """ - Extract the time and channel data from the DataFrame. - - Parameters - ---------- - df : pd.DataFrame - DataFrame in wide format. - """ - # Extracting all necessary data using list comprehensions for better performance - long_format_data = [ - { - "condition": row["condition"], - "subject_index": row["subject_index"], - "channel": channel, - "timepoint": timepoint, - "value": row["data"][channel, timepoint], - } - for idx, row in df.iterrows() - for channel in range(row["data"].shape[0]) - for timepoint in range(row["data"].shape[1]) - ] - - # Creating the long format DataFrame - df_long = pd.DataFrame(long_format_data) - - return df_long - - -# Example usage -# Sample wide format DataFrame -df_wide = pd.DataFrame( - { - "condition": ["A", "B"], - "subject_index": [1, 2], - "data": [np.array([[1, 2, 3], [4, 5, 6]]), np.array([[7, 8, 9], [10, 11, 12]])], - } +# run the cluster test +T_obs, clusters, cluster_p_values, H0 = mne.stats.cluster_level.cluster_test( + df=df, formula=formula ) - -def plot_cluster( - contrast, target_only, non_target_only, T_obs, clusters, cluster_p_values -): - """ - Plot the cluster with the lowest p-value. - - Parameters - ---------- - contrast : list - List of contrast evoked objects. - target_only : list - List of target evoked objects. - non_target_only : list - List of non-target evoked objects. - T_obs : array - The observed test statistic. - clusters : list - List of clusters. - cluster_p_values : array - Array of cluster p-values. 
- - Returns - ------- - None - - """ - # configure variables for visualization - colors = {"target": "crimson", "non-target": "steelblue"} - - # organize data for plotting - evokeds = {"target": target_only, "non-target": non_target_only} - - lowest_p_cluster = np.argmin(cluster_p_values) - - # plot the cluster with the lowest p-value - time_inds, space_inds = np.squeeze(clusters[lowest_p_cluster]) - ch_inds = np.unique(space_inds) - time_inds = np.unique(time_inds) - - # get topography for F stat - t_map = T_obs[time_inds, ...].mean(axis=0) - - # get signals at the sensors contributing to the cluster - sig_times = contrast[0].times[time_inds] - - # create spatial mask - mask = np.zeros((t_map.shape[0], 1), dtype=bool) - mask[ch_inds, :] = True - - # initialize figure - fig, ax_topo = plt.subplots(1, 1, figsize=(10, 3), layout="constrained") - - # plot average test statistic and mark significant sensors - t_evoked = mne.EvokedArray(t_map[:, np.newaxis], contrast[0].info, tmin=0) - t_evoked.plot_topomap( - times=0, - mask=mask, - axes=ax_topo, - cmap="Reds", - vlim=(np.min, np.max), - show=False, - colorbar=False, - mask_params=dict(markersize=10), - ) - image = ax_topo.images[0] - - # remove the title that would otherwise say "0.000 s" - ax_topo.set_title("") - - # create additional axes (for ERF and colorbar) - divider = make_axes_locatable(ax_topo) - - # add axes for colorbar - ax_colorbar = divider.append_axes("right", size="5%", pad=0.05) - plt.colorbar(image, cax=ax_colorbar) - ax_topo.set_xlabel( - "Averaged t-map ({:0.3f} - {:0.3f} s)".format(*sig_times[[0, -1]]) - ) - - # add new axis for time courses and plot time courses - ax_signals = divider.append_axes("right", size="300%", pad=1.2) - title = f"Cluster #1, {len(ch_inds)} sensor" - if len(ch_inds) > 1: - title += "s (mean)" - mne.viz.plot_compare_evokeds( - evokeds, - title=title, - picks=ch_inds, - axes=ax_signals, - colors=colors, - show=False, - split_legend=True, - truncate_yaxis="auto", - ) - - # plot temporal cluster extent - ymin, ymax = ax_signals.get_ylim() - ax_signals.fill_betweenx( - (ymin, ymax), sig_times[0], sig_times[-1], color="green", alpha=0.3 - ) - - plt.show() - - -# translated the limo permutation ttest from matlab to python -def limo_ttest_permute(Data, n_perm=None): - """ - Pseudo one-sample t-test using sign-test with permutations. - - Parameters - ---------- - Data (numpy.ndarray): A matrix of data for the one-sample t-test. - Shape can be (n_channels, n_var, n_obs) or - (n_var, n_obs). - n_perm (int, optional): Number of permutations to perform. - If None, it defaults based on the number of observations. - - Returns - ------- - t_vals (numpy.ndarray): t-values under H0. - p_vals (numpy.ndarray): p-values under H0. - dfe (int): Degrees of freedom. - """ - # Check inputs and reshape if necessary - if Data.ndim == 3: - n_channels, n_var, n_obs = Data.shape - else: - n_channels = 1 - n_var, n_obs = Data.shape - Data = Data[np.newaxis, ...] - - # Warn if the number of observations is very small - if n_obs < 7: - n_psbl_prms = 2**n_obs - print( - f"Due to the very limited number of observations, " - f"the total number of possible permutations is small ({n_psbl_prms}). " - "Thus, only a limited number of p-values are possible " - "and the test might be overly conservative." 
- ) - - # Set up permutation test - if n_obs <= 12: - n_perm = 2**n_obs # total number of possible permutations - exact = True - print( - "Due to the limited number of observations, all possible permutations " - "of the data will be computed instead of random permutations." - ) - else: - exact = False - if n_perm is None: - n_perm = 1000 - - print(f"Executing permutation test with {n_perm} permutations...") - - # Initialize variables - t_vals = np.full( - (n_channels, n_var, n_perm), np.nan - ) # Array to store t-values for each permutation - sqrt_nXnM1 = np.sqrt( - n_obs * (n_obs - 1) - ) # Precompute constant for t-value calculation - dfe = n_obs - 1 # Degrees of freedom - - if exact: - # Use all possible permutations - for perm in range(n_perm): - # Set sign of each trial / participant's data - temp = np.array( - [int(x) for x in bin(perm)[2:].zfill(n_obs)] - ) # Convert perm index to binary array - sn = np.where(temp == 0, -1, 1) # Map 0 to -1 and 1 to 1 - sn_mtrx = np.tile(sn, (n_var, 1)).T # Repeat sn for each variable - - for c in range(n_channels): - data = Data[c, :, :] - d_perm = data * sn_mtrx # Apply sign flip to data - - # Compute t-score of permuted data - sm = np.sum(d_perm, axis=1) # Sum of permuted data - mn = sm / n_obs # Mean of permuted data - sm_sqrs = ( - np.sum(d_perm**2, axis=1) - (sm**2) / n_obs - ) # Sum of squares for standard error - stder = np.sqrt(sm_sqrs) / sqrt_nXnM1 # Standard error - t_vals[c, :, perm] = mn / stder # Compute t-values - - else: - # Use random permutations - for perm in range(n_perm): - # Randomly set sign of each trial / participant's data - sn = (np.random.rand(n_obs) > 0.5) * 2 - 1 # Generate random sign flips - sn_mtrx = np.tile(sn, (n_var, 1)) # Repeat sn for each variable - - for c in range(n_channels): - data = Data[c, :, :] - d_perm = data * sn_mtrx # Apply sign flip to data - - # Compute t-score of permuted data - sm = np.sum(d_perm, axis=1) # Sum of permuted data - mn = sm / n_obs # Mean of permuted data - sm_sqrs = ( - np.sum(d_perm**2, axis=1) - (sm**2) / n_obs - ) # Sum of squares for standard error - stder = np.sqrt(sm_sqrs) / sqrt_nXnM1 # Standard error - t_vals[c, :, perm] = mn / stder # Compute t-values - - # Compute p-values from t-values - p_vals = 2 * scipy.stats.cdf(-np.abs(t_vals), dfe) - - return t_vals, p_vals, dfe +# finally let's plot the results +# we plot the cluster with the lowest p-value +# and the topomap of the significant cluster +# we can see that there is something going on around 400 ms +# in the visual channels +# however the cluster is not significant which is not surprising +# given the small sample size (only 5 subjects) +mne.stats.cluster_level.plot_cluster( + diff_evoked, target_only, non_target_only, T_obs, clusters, cluster_p_values +) From 268d0cfb801d60df8cab9d1d83ebf1a7c1837e6f Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sat, 6 Jul 2024 11:01:12 +0200 Subject: [PATCH 56/88] cleaned up plotting function --- mne/stats/cluster_level.py | 61 ++++++++++--------- .../76_new_cluster_test_api.py | 6 +- 2 files changed, 36 insertions(+), 31 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index e50991254fe..bd1c2c90970 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1749,14 +1749,14 @@ def cluster_test( buffer_size: int = None, # block size for chunking the data ): """ - Run the cluster test using the new API. + Run a cluster permutation test based on formulaic input. 
- # currently supports paired t-test + # currently only supports paired t-test on evokeds or epochs Parameters ---------- dataframe : pd.DataFrame - Dataframe with evoked data, conditions and subject IDs. + Dataframe with evoked/epoched data, conditions and subject IDs. formula : str, optional Wilkinson notation formula for design matrix. Default is None. n_permutations : int, optional @@ -1788,6 +1788,7 @@ def cluster_test( Returns ------- + TODO: turn this into a class for further plotting T_obs : array The observed test statistic. clusters : list @@ -1808,7 +1809,7 @@ def cluster_test( # convert wide format to long format for formulaic df_long = unpack_time_and_channels(df) - # Pivot the DataFrame + # pivot the DataFrame pivot_df = df_long.pivot_table( index=["subject_index", "channel", "timepoint"], columns="condition", @@ -1819,7 +1820,7 @@ def cluster_test( if len(pd.unique(df.condition)) != 2: raise ValueError("Condition list needs to contain 2 unique values") - # Get the unique conditions + # get the unique conditions conditions = np.unique(df.condition) # Compute the difference (assuming there are only 2 conditions) @@ -1843,9 +1844,8 @@ def cluster_test( "Formula is required and needs to be a string in Wilkinson notation." ) - # now prep design matrix outcome variable for input into MNE cluster function - # we initially had first channels, then timepoints, - # now we need first timepoints, then channels + # now prep design matrix for input into MNE cluster function + # cluster functions expects channels as list dimension y_for_cluster = y.values.reshape(-1, n_channels, n_timepoints).transpose(0, 2, 1) adjacency, _ = find_ch_adjacency(df["evoked"][0].info, ch_type="eeg") @@ -1858,7 +1858,7 @@ def cluster_test( # Run the cluster-based permutation test T_obs, clusters, cluster_p_values, H0 = _permutation_cluster_test( [y_for_cluster], - n_permutations=10000, + n_permutations=n_permutations, threshold=threshold, stat_fun=stat_fun, tail=tail, @@ -1874,19 +1874,24 @@ def cluster_test( seed=seed, ) - print(min(cluster_p_values)) + print(f"smallest cluster p-value: {min(cluster_p_values)}") return T_obs, clusters, cluster_p_values, H0 -def unpack_time_and_channels(df): +def unpack_time_and_channels(df: pd.DataFrame = None) -> pd.DataFrame: """ - Extract the time and channel data from the DataFrame. + Extract timepoints and channels and convert to long. Parameters ---------- df : pd.DataFrame DataFrame in wide format. + + Returns + ------- + df_long : pd.DataFrame + DataFrame in long format. """ # Extracting all necessary data using list comprehensions for better performance long_format_data = [ @@ -1908,20 +1913,18 @@ def unpack_time_and_channels(df): return df_long -def plot_cluster( - contrast, target_only, non_target_only, T_obs, clusters, cluster_p_values -): +def plot_cluster(cond_dict, T_obs, clusters, cluster_p_values): """ Plot the cluster with the lowest p-value. + 2D cluster plotted with topoplot on the left and evoked signals on the right. + Timepoints that are part of the cluster are + highlighted in green on the evoked signals. + Parameters ---------- - contrast : list - List of contrast evoked objects. - target_only : list - List of target evoked objects. - non_target_only : list - List of non-target evoked objects. + cond_dict : dict + Dictionary with conditions as keys and evoked data as values. T_obs : array The observed test statistic. 
clusters : list @@ -1934,11 +1937,13 @@ def plot_cluster( None """ - # configure variables for visualization - colors = {"target": "crimson", "non-target": "steelblue"} + # extract condition labels from the dictionary + cond_keys = list(cond_dict.keys()) + # extract the evokeds from the dictionary + cond_values = list(cond_dict.values()) - # organize data for plotting - evokeds = {"target": target_only, "non-target": non_target_only} + # configure variables for visualization + colors = {cond_keys[0]: "crimson", cond_keys[1]: "steelblue"} lowest_p_cluster = np.argmin(cluster_p_values) @@ -1951,7 +1956,7 @@ def plot_cluster( t_map = T_obs[time_inds, ...].mean(axis=0) # get signals at the sensors contributing to the cluster - sig_times = contrast[0].times[time_inds] + sig_times = cond_values[0][0].times[time_inds] # create spatial mask mask = np.zeros((t_map.shape[0], 1), dtype=bool) @@ -1961,7 +1966,7 @@ def plot_cluster( fig, ax_topo = plt.subplots(1, 1, figsize=(10, 3), layout="constrained") # plot average test statistic and mark significant sensors - t_evoked = EvokedArray(t_map[:, np.newaxis], contrast[0].info, tmin=0) + t_evoked = EvokedArray(t_map[:, np.newaxis], cond_values[0][0].info, tmin=0) t_evoked.plot_topomap( times=0, mask=mask, @@ -1999,7 +2004,7 @@ def plot_cluster( if len(ch_inds) > 1: title += "s (mean)" plot_compare_evokeds( - evokeds, + cond_dict, title=title, picks=ch_inds, axes=ax_signals, diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index ec8bd8275a1..a88904a5b5b 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -125,6 +125,8 @@ T_obs, clusters, cluster_p_values, H0 = mne.stats.cluster_level.cluster_test( df=df, formula=formula ) +# set up conditions dictionary for cluster plots +conditions_dict = {"target": target_only, "non-target": non_target_only} # finally let's plot the results # we plot the cluster with the lowest p-value @@ -133,6 +135,4 @@ # in the visual channels # however the cluster is not significant which is not surprising # given the small sample size (only 5 subjects) -mne.stats.cluster_level.plot_cluster( - diff_evoked, target_only, non_target_only, T_obs, clusters, cluster_p_values -) +mne.stats.cluster_level.plot_cluster(conditions_dict, T_obs, clusters, cluster_p_values) From 2f722bdac329e30911aad847e401871ed4f23dd8 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sat, 6 Jul 2024 11:53:49 +0200 Subject: [PATCH 57/88] implemented cluser results class --- mne/stats/cluster_level.py | 213 +++++++++--------- .../76_new_cluster_test_api.py | 23 +- 2 files changed, 124 insertions(+), 112 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index bd1c2c90970..f82fe8d7dec 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1788,15 +1788,8 @@ def cluster_test( Returns ------- - TODO: turn this into a class for further plotting - T_obs : array - The observed test statistic. - clusters : list - List of clusters. - cluster_p_values : array - Array of cluster p-values. - H0 : array - The permuted test statistics. + ClusterResult + Object containing the results of the cluster permutation test. 
""" # for now this assumes a dataframe with a column for evoked data or epochs # add a data column to the dataframe (numpy array) @@ -1876,7 +1869,7 @@ def cluster_test( print(f"smallest cluster p-value: {min(cluster_p_values)}") - return T_obs, clusters, cluster_p_values, H0 + return ClusterResult(T_obs, clusters, cluster_p_values, H0) def unpack_time_and_channels(df: pd.DataFrame = None) -> pd.DataFrame: @@ -1913,111 +1906,127 @@ def unpack_time_and_channels(df: pd.DataFrame = None) -> pd.DataFrame: return df_long -def plot_cluster(cond_dict, T_obs, clusters, cluster_p_values): +class ClusterResult: """ - Plot the cluster with the lowest p-value. - - 2D cluster plotted with topoplot on the left and evoked signals on the right. - Timepoints that are part of the cluster are - highlighted in green on the evoked signals. + Object containing the results of the cluster permutation test. Parameters ---------- - cond_dict : dict - Dictionary with conditions as keys and evoked data as values. - T_obs : array + T_obs : np.ndarray The observed test statistic. clusters : list List of clusters. - cluster_p_values : array - Array of cluster p-values. - - Returns - ------- - None - + cluster_p_values : np.ndarray + P-values for each cluster. + H0 : np.ndarray + Max cluster level stats observed under permutation. """ - # extract condition labels from the dictionary - cond_keys = list(cond_dict.keys()) - # extract the evokeds from the dictionary - cond_values = list(cond_dict.values()) - - # configure variables for visualization - colors = {cond_keys[0]: "crimson", cond_keys[1]: "steelblue"} - - lowest_p_cluster = np.argmin(cluster_p_values) - - # plot the cluster with the lowest p-value - time_inds, space_inds = np.squeeze(clusters[lowest_p_cluster]) - ch_inds = np.unique(space_inds) - time_inds = np.unique(time_inds) - - # get topography for F stat - t_map = T_obs[time_inds, ...].mean(axis=0) - - # get signals at the sensors contributing to the cluster - sig_times = cond_values[0][0].times[time_inds] - - # create spatial mask - mask = np.zeros((t_map.shape[0], 1), dtype=bool) - mask[ch_inds, :] = True - - # initialize figure - fig, ax_topo = plt.subplots(1, 1, figsize=(10, 3), layout="constrained") - - # plot average test statistic and mark significant sensors - t_evoked = EvokedArray(t_map[:, np.newaxis], cond_values[0][0].info, tmin=0) - t_evoked.plot_topomap( - times=0, - mask=mask, - axes=ax_topo, - cmap="Reds", - vlim=(np.min, np.max), - show=False, - colorbar=False, - mask_params=dict(markersize=10), - ) - image = ax_topo.images[0] - # remove the title that would otherwise say "0.000 s" - ax_topo.set_title("") + def __init__(self, T_obs, clusters, cluster_p_values, H0): + self.T_obs = T_obs + self.clusters = clusters + self.cluster_p_values = cluster_p_values + self.H0 = H0 + + def plot_cluster(self, cond_dict: dict = None): + """ + Plot the cluster with the lowest p-value. + + 2D cluster plotted with topoplot on the left and evoked signals on the right. + Timepoints that are part of the cluster are + highlighted in green on the evoked signals. + + Parameters + ---------- + cond_dict : dict + Dictionary with condition labels as keys and evoked objects as values. 
+ + Returns + ------- + None + + """ + # extract condition labels from the dictionary + cond_keys = list(cond_dict.keys()) + # extract the evokeds from the dictionary + cond_values = list(cond_dict.values()) + + # configure variables for visualization + colors = {cond_keys[0]: "crimson", cond_keys[1]: "steelblue"} + + lowest_p_cluster = np.argmin(self.cluster_p_values) + + # plot the cluster with the lowest p-value + time_inds, space_inds = np.squeeze(self.clusters[lowest_p_cluster]) + ch_inds = np.unique(space_inds) + time_inds = np.unique(time_inds) + + # get topography for F stat + t_map = self.T_obs[time_inds, ...].mean(axis=0) + + # get signals at the sensors contributing to the cluster + sig_times = cond_values[0][0].times[time_inds] + + # create spatial mask + mask = np.zeros((t_map.shape[0], 1), dtype=bool) + mask[ch_inds, :] = True + + # initialize figure + fig, ax_topo = plt.subplots(1, 1, figsize=(10, 3), layout="constrained") + + # plot average test statistic and mark significant sensors + t_evoked = EvokedArray(t_map[:, np.newaxis], cond_values[0][0].info, tmin=0) + t_evoked.plot_topomap( + times=0, + mask=mask, + axes=ax_topo, + cmap="Reds", + vlim=(np.min, np.max), + show=False, + colorbar=False, + mask_params=dict(markersize=10), + ) + image = ax_topo.images[0] - # soft import? - # make_axes_locatable = _soft_import( - # "mpl_toolkits.axes_grid1.make_axes_locatable", - # purpose="plot cluster results" - # ) # soft import (not a dependency for MNE) + # remove the title that would otherwise say "0.000 s" + ax_topo.set_title("") - # create additional axes (for ERF and colorbar) - divider = make_axes_locatable(ax_topo) + # soft import? + # make_axes_locatable = _soft_import( + # "mpl_toolkits.axes_grid1.make_axes_locatable", + # purpose="plot cluster results" + # ) # soft import (not a dependency for MNE) - # add axes for colorbar - ax_colorbar = divider.append_axes("right", size="5%", pad=0.05) - plt.colorbar(image, cax=ax_colorbar) - ax_topo.set_xlabel( - "Averaged t-map ({:0.3f} - {:0.3f} s)".format(*sig_times[[0, -1]]) - ) + # create additional axes (for ERF and colorbar) + divider = make_axes_locatable(ax_topo) - # add new axis for time courses and plot time courses - ax_signals = divider.append_axes("right", size="300%", pad=1.2) - title = f"Cluster #1, {len(ch_inds)} sensor" - if len(ch_inds) > 1: - title += "s (mean)" - plot_compare_evokeds( - cond_dict, - title=title, - picks=ch_inds, - axes=ax_signals, - colors=colors, - show=False, - split_legend=True, - truncate_yaxis="auto", + # add axes for colorbar + ax_colorbar = divider.append_axes("right", size="5%", pad=0.05) + plt.colorbar(image, cax=ax_colorbar) + ax_topo.set_xlabel( + "Averaged t-map ({:0.3f} - {:0.3f} s)".format(*sig_times[[0, -1]]) ) - # plot temporal cluster extent - ymin, ymax = ax_signals.get_ylim() - ax_signals.fill_betweenx( - (ymin, ymax), sig_times[0], sig_times[-1], color="green", alpha=0.3 - ) + # add new axis for time courses and plot time courses + ax_signals = divider.append_axes("right", size="300%", pad=1.2) + title = f"Cluster #1, {len(ch_inds)} sensor" + if len(ch_inds) > 1: + title += "s (mean)" + plot_compare_evokeds( + cond_dict, + title=title, + picks=ch_inds, + axes=ax_signals, + colors=colors, + show=False, + split_legend=True, + truncate_yaxis="auto", + ) + + # plot temporal cluster extent + ymin, ymax = ax_signals.get_ylim() + ax_signals.fill_betweenx( + (ymin, ymax), sig_times[0], sig_times[-1], color="green", alpha=0.3 + ) - plt.show() + plt.show() diff --git 
a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index a88904a5b5b..3acfd21f7f0 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -15,7 +15,8 @@ - loading evoked data from multiple subjects - construct a dataframe that contains the difference between conditions - - run the new cluster test function + - run the new cluster test function with formula in Wilkinson notation + - plot the results with the ClusterResults Class Here, the unit of observation are evokeds from multiple subjects (2nd level analysis). @@ -121,18 +122,20 @@ # let's first define the formula based on Wilkinson notation formula = "evoked ~ 1 + C(subject_index)" -# run the cluster test -T_obs, clusters, cluster_p_values, H0 = mne.stats.cluster_level.cluster_test( - df=df, formula=formula -) +# run the cluster test and return the cluster_result object +cluster_result = mne.stats.cluster_level.cluster_test(df=df, formula=formula) + +# note that we ran an exact test due to the small sample size (only 15 permutations) + # set up conditions dictionary for cluster plots conditions_dict = {"target": target_only, "non-target": non_target_only} -# finally let's plot the results +# finally let's plot the results using the ClusterResults class + # we plot the cluster with the lowest p-value -# and the topomap of the significant cluster + # we can see that there is something going on around 400 ms -# in the visual channels -# however the cluster is not significant which is not surprising +# in the visual channels (topomap on the left) +# however the cluster is not significant which is unsurprising # given the small sample size (only 5 subjects) -mne.stats.cluster_level.plot_cluster(conditions_dict, T_obs, clusters, cluster_p_values) +cluster_result.plot_cluster(cond_dict=conditions_dict) From fb75cfd66f2fb8e2fbdc55a9907931441f2dd13f Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Thu, 18 Jul 2024 14:35:02 +0200 Subject: [PATCH 58/88] fixed codespell --- mne/stats/cluster_level.py | 55 +++++++++++-------- .../76_new_cluster_test_api.py | 34 +++++++----- 2 files changed, 50 insertions(+), 39 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index f82fe8d7dec..bb1f31ef6fd 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1813,8 +1813,14 @@ def cluster_test( if len(pd.unique(df.condition)) != 2: raise ValueError("Condition list needs to contain 2 unique values") - # get the unique conditions - conditions = np.unique(df.condition) + # Get unique elements and the indices of their first occurrences + unique_elements, indices = np.unique(df.condition, return_index=True) + + # Sort unique elements by the indices of their first occurrences + conditions = unique_elements[np.argsort(indices)] + + # print the contrast used for the paired t-test + print(f"Contrast used for paired t-test: {conditions[0]} - {conditions[1]}") # Compute the difference (assuming there are only 2 conditions) pivot_df["evoked"] = pivot_df[conditions[0]] - pivot_df[conditions[1]] @@ -1961,8 +1967,8 @@ def plot_cluster(self, cond_dict: dict = None): ch_inds = np.unique(space_inds) time_inds = np.unique(time_inds) - # get topography for F stat - t_map = self.T_obs[time_inds, ...].mean(axis=0) + # get topography for t stat + t_map = self.T_obs[time_inds, ...].mean(axis=0).astype(int) # get signals at the sensors contributing to the cluster sig_times = 
cond_values[0][0].times[time_inds] @@ -1980,11 +1986,11 @@ def plot_cluster(self, cond_dict: dict = None): times=0, mask=mask, axes=ax_topo, - cmap="Reds", - vlim=(np.min, np.max), + cmap="RdBu_r", show=False, colorbar=False, mask_params=dict(markersize=10), + scalings=1.00, ) image = ax_topo.images[0] @@ -2001,32 +2007,33 @@ def plot_cluster(self, cond_dict: dict = None): divider = make_axes_locatable(ax_topo) # add axes for colorbar - ax_colorbar = divider.append_axes("right", size="5%", pad=0.05) - plt.colorbar(image, cax=ax_colorbar) + ax_colorbar = divider.append_axes("right", size="5%", pad=0.1) + cbar = plt.colorbar(image, cax=ax_colorbar) + cbar.set_label("t-value") ax_topo.set_xlabel( - "Averaged t-map ({:0.3f} - {:0.3f} s)".format(*sig_times[[0, -1]]) + "average from {:0.3f} to {:0.3f} s".format(*sig_times[[0, -1]]) ) # add new axis for time courses and plot time courses - ax_signals = divider.append_axes("right", size="300%", pad=1.2) - title = f"Cluster #1, {len(ch_inds)} sensor" - if len(ch_inds) > 1: - title += "s (mean)" - plot_compare_evokeds( - cond_dict, - title=title, - picks=ch_inds, - axes=ax_signals, - colors=colors, - show=False, - split_legend=True, - truncate_yaxis="auto", - ) + ax_signals = divider.append_axes("right", size="300%", pad=1.3) + title = f"Signal averaged over {len(ch_inds)} sensor(s)" + plot_compare_evokeds( + cond_dict, + title=title, + picks=ch_inds, + axes=ax_signals, + colors=colors, + show=False, + split_legend=True, + truncate_yaxis="auto", + truncate_xaxis=False, + ) + plt.legend(frameon=False, loc="upper left") # plot temporal cluster extent ymin, ymax = ax_signals.get_ylim() ax_signals.fill_betweenx( - (ymin, ymax), sig_times[0], sig_times[-1], color="green", alpha=0.3 + (ymin, ymax), sig_times[0], sig_times[-1], color="grey", alpha=0.3 ) plt.show() diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 3acfd21f7f0..842e0543b0b 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -6,8 +6,9 @@ =============================================================== This tutorial shows how to use the new API for cluster testing. -This script shows how to estimate significant clusters in -evoked contrast data of multiple subjects. +The new API allows for Wilkinson style formulas and allows for more flexibility in +the design of the test. Here we will demonstrate how to use the new API for +a standard paired t-test on evoked data from multiple subjects. It uses a non-parametric statistical procedure based on permutations and cluster level statistics. @@ -16,7 +17,7 @@ - loading evoked data from multiple subjects - construct a dataframe that contains the difference between conditions - run the new cluster test function with formula in Wilkinson notation - - plot the results with the ClusterResults Class + - plot the results with the new ClusterResults API Here, the unit of observation are evokeds from multiple subjects (2nd level analysis). 
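(Editorial aside, as background for the "exact test" remark that follows: for a paired contrast the permutation null is built by flipping the sign of each subject's difference, and with 5 subjects all 2**5 = 32 sign patterns can be enumerated; half of them are mirror images under a two-tailed test, which is presumably where the "15 permutations" note below comes from. A self-contained toy sketch of that procedure, without the cluster step — plain NumPy, not the MNE implementation:

    import numpy as np

    rng = np.random.default_rng(42)

    # toy paired differences: one value per "subject" (e.g. a mean amplitude)
    diffs = rng.normal(loc=0.5, scale=1.0, size=5)
    n = len(diffs)

    def t_stat(d):
        # one-sample t statistic against zero
        return d.mean() / (d.std(ddof=1) / np.sqrt(len(d)))

    observed_t = t_stat(diffs)

    # enumerate all 2**n sign flips (exact test, feasible for small n)
    perm_t = []
    for flip in range(2**n):
        signs = np.array([1 if (flip >> i) & 1 else -1 for i in range(n)])
        perm_t.append(t_stat(diffs * signs))
    perm_t = np.array(perm_t)

    # two-tailed p-value: proportion of permutations at least as extreme
    p = np.mean(np.abs(perm_t) >= np.abs(observed_t))
    print(f"observed t = {observed_t:.2f}, exact permutation p = {p:.3f}")
)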
@@ -41,13 +42,14 @@ # Define the path to the P3 dataset path_to_p3 = mne.datasets.misc.data_path() / "ERP_CORE" / "P3" -# Define the range of participant IDs -participant_ids = range(15, 20) # This will cover 015 to 019 +# Define the range of participant IDs (we only have 5 participants in the dataset) +participant_ids = range(15, 20) # This will cover participant 15 to 19 # store the evoked data of all subjects evokeds_allsubs = [] # Loop over each participant ID and generate the corresponding filename +# to load the evoked data for pid in participant_ids: # Create the filename using an f-string, ID is zero-padded to 3 digits filename_p3 = f"sub-{pid:03d}_ses-P3_task-P3_ave.fif" @@ -58,21 +60,22 @@ # load the evoked data evokeds = mne.read_evokeds(p3_file_path) - # add subjects evoked data to list + # add single subjects evoked data to a list evokeds_allsubs.append(evokeds) # the P3b dataset is part of the freely available ERP CORE dataset # participants were presented with a visual oddball task # and the P3b component was analyzed # the conditions of interest are the target (rare visual stimuli) -# and non-target stimuli (frequency visual stimuli) +# and non-target stimuli (frequent visual stimuli) # let's extract the target and non-target evokeds target_only = [evoked[0] for evoked in evokeds_allsubs] non_target_only = [evoked[1] for evoked in evokeds_allsubs] # let's first have a look at the data -# create contrast from target and non-target evokeds + +# create contrast target - non-target diff_evoked = [ mne.combine_evoked([evokeds_a, evokeds_b], weights=[1, -1]) for evokeds_a, evokeds_b in zip(target_only, non_target_only) @@ -84,7 +87,7 @@ mne.grand_average(diff_evoked).plot_topomap() # we can see that the strongest difference is around 400 ms in -# visual channels (occipital region) +# central-parietal channels with a stronger evoked signal for target stimuli # Next we prepare a dataframe for the cluster test function # the dataframe should contain the contrast evoked data and the subject index @@ -93,7 +96,7 @@ # save the evoked data for both conditions in one list evokeds_conditions = target_only + non_target_only -# set up a list that defines the condition for each evoked data +# create a list that defines the condition for each evoked data # this will be used to create the conditions column in the dataframe conditions = ["target"] * len(target_only) + ["non-target"] * len(non_target_only) @@ -102,7 +105,7 @@ # we multiply the participant_ids by 2 to account for the two conditions subject_index = list(participant_ids) * 2 -# create the dataframe +# create the dataframe containing the evoked data, the condition and the subject index df = pd.DataFrame( { "evoked": evokeds_conditions, @@ -122,20 +125,21 @@ # let's first define the formula based on Wilkinson notation formula = "evoked ~ 1 + C(subject_index)" -# run the cluster test and return the cluster_result object +# run the new cluster test API and return the new cluster_result object cluster_result = mne.stats.cluster_level.cluster_test(df=df, formula=formula) # note that we ran an exact test due to the small sample size (only 15 permutations) # set up conditions dictionary for cluster plots +# this is necessary for plotting the evoked data and the cluster result on top conditions_dict = {"target": target_only, "non-target": non_target_only} # finally let's plot the results using the ClusterResults class # we plot the cluster with the lowest p-value - +cluster_result.plot_cluster(cond_dict=conditions_dict) # we can see that 
there is something going on around 400 ms
-# in the visual channels (topomap on the left)
+# with a stronger signal for target trials in right central-parietal channels
+
 # however the cluster is not significant which is unsurprising
 # given the small sample size (only 5 subjects)
-cluster_result.plot_cluster(cond_dict=conditions_dict)

From a87ffed0dedb2e3e27f5b6b99be0db8ae6a32d55 Mon Sep 17 00:00:00 2001
From: CarinaFo
Date: Mon, 22 Jul 2024 20:22:20 +0200
Subject: [PATCH 59/88] first review

---
 mne/stats/cluster_level.py | 384 ++++++++++++++++++++++++++-----------
 1 file changed, 272 insertions(+), 112 deletions(-)

diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py
index bb1f31ef6fd..847c464259c 100644
--- a/mne/stats/cluster_level.py
+++ b/mne/stats/cluster_level.py
@@ -4,6 +4,10 @@
 # License: BSD-3-Clause
 # Copyright the MNE-Python contributors.
 
+from __future__ import annotations
+
+from typing import Literal
+
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
@@ -13,12 +17,13 @@
 from scipy.stats import f as fstat
 from scipy.stats import t as tstat
 
-from .. import EvokedArray
-from ..channels import find_ch_adjacency
+from .. import Epochs, Evoked
+from ..epochs import EpochsArray, EvokedArray
 from ..fixes import has_numba, jit
 from ..parallel import parallel_func
 from ..source_estimate import MixedSourceEstimate, SourceEstimate, VolSourceEstimate
 from ..source_space import SourceSpaces
+from ..time_frequency import AverageTFR, AverageTFRArray, EpochsTFR, EpochsTFRArray
 from ..utils import (
     ProgressBar,
     _check_option,
@@ -938,7 +943,7 @@ def _permutation_cluster_test(
     sample_shape = X[0].shape[1:]
     for x in X:
         if x.shape[1:] != sample_shape:
-            raise ValueError("All samples mush have the same size")
+            raise ValueError("All samples must have the same size")
 
     # flatten the last dimensions in case the data is high dimensional
     X = [np.reshape(x, (x.shape[0], -1)) for x in X]
@@ -1732,21 +1737,186 @@ def summarize_clusters_stc(
     return klass(data_summary, vertices, tmin, tstep, subject)
 
 
+def validate_input_dataframe(df: pd.DataFrame, formula: str):
+    """
+    Validate the input dataframe for the cluster permutation test.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Dataframe with 3 columns (subject_index, condition, data).
+    formula : str
+        Wilkinson style formula for the design matrix.
+
+    Returns
+    -------
+    dv_name : str
+        Name of the dependent variable.
+    """
+    # extract dependent variable name from formula
+    formulaic = _soft_import(
+        "formulaic", purpose="set up Design Matrix"
+    )  # soft import (not a dependency for MNE)
+    formula = formulaic.Formula(formula)
+    dv_name = str(formula.lhs)
+
+    # check if all necessary columns are present
+    if dv_name not in df.columns:
+        raise ValueError(
+            "DataFrame needs to contain a column with the dependent "
+            "variable name as defined in the formula"
+        )
+    if "condition" not in df.columns:
+        raise ValueError("DataFrame needs to contain a condition column")
+    if "subject_index" not in df.columns:
+        raise ValueError("DataFrame needs to contain a subject_index column")
+
+    # check if the data column contains only valid types
+    check_column_types(df[dv_name])
+
+    # check if the shape of the data is consistent
+    if not all(data.data.shape == df[dv_name][0].data.shape for data in df[dv_name]):
+        raise ValueError("Data objects need to have the same shape")
+
+    # check if the condition column contains only 2 unique values
+    if len(pd.unique(df.condition)) != 2:
+        raise ValueError("Currently only 2 conditions are supported.")
+
+    return dv_name
+
+
+def check_column_types(input_data: pd.Series):
+    """
+    Check if the column types are valid for the cluster permutation test.
+
+    Parameters
+    ----------
+    input_data : pd.Series
+        Column of MNE data objects to be checked for the cluster permutation test.
+    """
+    # Get the type of the first element
+    first_type = type(input_data.iloc[0])
+
+    # Define the possible valid types
+    valid_types = (
+        Evoked,
+        EvokedArray,
+        Epochs,
+        EpochsArray,
+        AverageTFR,
+        EpochsTFR,
+        EpochsTFRArray,
+        AverageTFRArray,
+    )
+
+    # Check if the type of the first element is a valid type
+    # (exact type check; subclasses other than those listed are rejected)
+    if first_type not in valid_types:
+        raise ValueError(f"Object type '{first_type}' is not a valid type.")
+
+    # Check if all elements are of the same type as the first one
+    if not all(isinstance(data, first_type) for data in input_data):
+        raise ValueError("Data column must contain objects of the same type.")
+
+
+def prepare_data_for_cluster_test(input_df: pd.DataFrame, dv_name: str):
+    """
+    Prepare the data for the cluster permutation test.
+
+    Parameters
+    ----------
+    input_df : pd.DataFrame
+        Dataframe with subject_index, condition and data columns.
+    dv_name : str
+        Name of the dependent variable column.
+
+    Returns
+    -------
+    tuple
+        The long-format dataframe, the dimensionality of the data arrays
+        and the data dimensions (channels, [frequencies,] timepoints).
+ """ + # extract data and add to dataframe + input_df["data"] = [data.data for data in input_df[dv_name]] + + # extract dimensions from time series or time-frequency data + first_data_obj = input_df["data"].iloc[0] + if isinstance(first_data_obj, (Epochs, Evoked, EpochsArray, EvokedArray)): + n_channels, n_timepoints = first_data_obj.get_data().shape + if isinstance( + first_data_obj, (AverageTFR, EpochsTFR, AverageTFRArray, EpochsTFRArray) + ): + n_channels, n_freqs, n_timepoints = first_data_obj.get_data().shape + + reshaped_data = [] + + for idx, row in input_df.iterrows(): + subject_index = row["subject_index"] + condition = row["condition"] + data_array = row["data"] + + if data_array.ndim == 2: + n_channels, n_timepoints = data_array.shape + # timepoints are the columns + df_temp = pd.DataFrame( + data_array, columns=[f"timepoint_{i}" for i in range(n_timepoints)] + ) + df_temp["channel"] = range(n_channels) + df_temp["subject_index"] = subject_index + df_temp["condition"] = condition + + reshaped_data.append(df_temp) + + elif data_array.ndim == 3: + n_channels, n_freqs, n_timepoints = data_array.shape + # timepoints are the columns + df_temp = pd.DataFrame( + data_array.reshape(-1, n_timepoints), + columns=[f"timepoint_{i}" for i in range(n_timepoints)], + ) + df_temp["frequency"] = np.repeat(range(n_freqs), n_channels) + df_temp["channel"] = np.tile(range(n_channels), n_freqs) + df_temp["subject_index"] = subject_index + df_temp["condition"] = condition + + reshaped_data.append(df_temp) + + else: + raise ValueError(f"Unsupported data array dimensions: {data_array.ndim}") + # combine the reshaped data + combined_df = pd.concat(reshaped_data, ignore_index=True) + # Convert the dataframe to long format + id_vars = ["subject_index", "condition", "channel"] + if "frequency" in combined_df.columns: + id_vars.append("frequency") + + reshaped_df = pd.melt( + combined_df, id_vars=id_vars, var_name="timepoint", value_name="value" + ) + + # rename column and convert to integer + reshaped_df["timepoint"] = ( + reshaped_df["timepoint"].str.replace("timepoint_", "").astype(int) + ) + + # return the reshaped dataframe and dimensions + if data_array.ndim == 2: + return reshaped_df, data_array.ndim, n_channels, n_timepoints + elif data_array.ndim == 3: + return reshaped_df, data_array.ndim, n_channels, n_freqs, n_timepoints + + def cluster_test( df: pd.DataFrame, - formula: str = None, # Wilkinson notation formula for design matrix - n_permutations: int = 10000, + formula: str, # Wilkinson notation formula for design matrix + paired_test: bool, # whether to run a paired t-test or unpaired test + n_permutations: int = 1024, # same default as in old API seed: None | int | np.random.RandomState = None, - tail: int = 0, # 0 for two-tailed, 1 for greater, -1 for less + tail: Literal[-1, 0, 1] = 0, # 0 for two-tailed, 1 for greater, -1 for less n_jobs: int = 1, # how many cores to use - adjacency: tuple = None, + adjacency: tuple | None = None, max_step: int = 1, # maximum distance between samples (time points) - exclude: list = None, # exclude no time points or channels + exclude: list | None = None, # exclude no time points or channels step_down_p: int = 0, # step down in jumps test t_power: int = 1, # weigh each location by its stats score - out_type: str = "indices", + out_type: Literal["indices", "mask"] = "indices", check_disjoint: bool = False, - buffer_size: int = None, # block size for chunking the data + buffer_size: int | None = None, # block size for chunking the data ): """ Run a cluster 
permutation test based on formulaic input. @@ -1755,12 +1925,14 @@ def cluster_test( Parameters ---------- - dataframe : pd.DataFrame - Dataframe with evoked/epoched data, conditions and subject IDs. - formula : str, optional - Wilkinson notation formula for design matrix. Default is None. + df : pd.DataFrame + Dataframe with 3 columns (subject_index, condition, evoked). + formula : str + Wilkinson notation formula for design matrix. + paired_test: bool + Whether to run a paired t-test. n_permutations : int, optional - Number of permutations. Default is 10000. + Number of permutations. Default is 1024. seed : None | int | np.random.RandomState, optional Seed for the random number generator. Default is None. tail : int, optional @@ -1768,7 +1940,7 @@ def cluster_test( n_jobs : int, optional How many cores to use. Default is 1. adjacency : None, optional - Adjacency matrix. Default is None. + Provide a adjacency matrix. Default is None. max_step : int, optional Maximum distance between samples (time points). Default is 1. exclude : np.Array, optional @@ -1791,27 +1963,38 @@ def cluster_test( ClusterResult Object containing the results of the cluster permutation test. """ - # for now this assumes a dataframe with a column for evoked data or epochs - # add a data column to the dataframe (numpy array) - df["data"] = [evoked.data for evoked in df.evoked] - - # extract number of channels and timepoints - # (eventually should also allow for frequency) - n_channels, n_timepoints = df["data"][0].shape - - # convert wide format to long format for formulaic - df_long = unpack_time_and_channels(df) - - # pivot the DataFrame - pivot_df = df_long.pivot_table( - index=["subject_index", "channel", "timepoint"], - columns="condition", - values="value", - ).reset_index() - - # if not 2 unique conditions raise error - if len(pd.unique(df.condition)) != 2: - raise ValueError("Condition list needs to contain 2 unique values") + # check if formula is present + if formula is None: + raise ValueError("Wilkinson style formula is required.") + + # validate the input dataframe and return name of dependent variable + dv_name = validate_input_dataframe(df, formula) + + # prepare the data for the cluster permutation test + prep_result = prepare_data_for_cluster_test(df, dv_name) + + if prep_result[1] == 2: + # pivot the dataframe based on condition for later subtraction + pivot_df = ( + prep_result[0] + .pivot_table( + index=["subject_index", "channel", "timepoint"], + columns="condition", + values="value", + ) + .reset_index() + ) + elif prep_result[1] == 3: + # pivot the dataframe based on condition for later subtraction + pivot_df = ( + prep_result[0] + .pivot_table( + index=["subject_index", "channel", "frequency", "timepoint"], + columns="condition", + values="value", + ) + .reset_index() + ) # Get unique elements and the indices of their first occurrences unique_elements, indices = np.unique(df.condition, return_index=True) @@ -1819,41 +2002,51 @@ def cluster_test( # Sort unique elements by the indices of their first occurrences conditions = unique_elements[np.argsort(indices)] - # print the contrast used for the paired t-test - print(f"Contrast used for paired t-test: {conditions[0]} - {conditions[1]}") + # store the contrast for the clusterResults object + contrast = f"{conditions[0]} - {conditions[1]}" - # Compute the difference (assuming there are only 2 conditions) - pivot_df["evoked"] = pivot_df[conditions[0]] - pivot_df[conditions[1]] + # print the contrast used for the paired t-test so the user knows + # 
what is subtracted from what + logger.info(f"Contrast used for paired t-test: {contrast}") - # Optional: Clean up the DataFrame - pivot_df = pivot_df[["subject_index", "channel", "timepoint", "evoked"]] + # Compute the difference (assuming there are only 2 conditions) + pivot_df[dv_name] = pivot_df[conditions[0]] - pivot_df[conditions[1]] + + # for the paired t-test y is the difference between conditions + # X is the design matrix with a column with 1s and 0s for each participant + # Create the design matrix using formulaic + formulaic = _soft_import( + "formulaic", purpose="set up Design Matrix" + ) # soft import (not a dependency for MNE) + y, X = formulaic.model_matrix(formula, pivot_df) + + # Prepare design matrix for input into MNE cluster function + # MNE cluster functions expect channels as the last dimension + + if prep_result[1] == 2: + # Reshape y.values into a 3D array: (participants, n_channels, n_timepoints) + y_reshaped = y.values.reshape(-1, prep_result[2], prep_result[3]) + # Transpose the array to have channels as the last dimension + y_for_cluster = y_reshaped.transpose(0, 2, 1) + elif prep_result[1] == 3: + # Reshape y.values into a 4D array: + # (participants, n_channels, n_freqs, n_timepoints) + y_reshaped = y.values.reshape( + -1, prep_result[2], prep_result[3], prep_result[4] + ) + # Transpose the array to have channels as the last dimension + y_for_cluster = y_reshaped.transpose(0, 3, 2, 1) - # check if formula is present - if formula is not None: - formulaic = _soft_import( - "formulaic", purpose="set up Design Matrix" - ) # soft import (not a dependency for MNE) - - # for the paired t-test y is the difference between conditions - # X is the design matrix with a column with 1s and 0s for each participant - # Create the design matrix using formulaic - y, X = formulaic.model_matrix(formula, pivot_df) + if paired_test: + # define stat function and threshold + stat_fun, threshold = _check_fun( + X=y_for_cluster, stat_fun=None, threshold=None, tail=0, kind="within" + ) else: - raise ValueError( - "Formula is required and needs to be a string in Wilkinson notation." + # define stat function and threshold + stat_fun, threshold = _check_fun( + X=y_for_cluster, stat_fun=None, threshold=None, tail=0, kind="between" ) - - # now prep design matrix for input into MNE cluster function - # cluster functions expects channels as list dimension - y_for_cluster = y.values.reshape(-1, n_channels, n_timepoints).transpose(0, 2, 1) - - adjacency, _ = find_ch_adjacency(df["evoked"][0].info, ch_type="eeg") - - # define stat function and threshold - stat_fun, threshold = _check_fun( - X=y_for_cluster, stat_fun=None, threshold=None, tail=0, kind="within" - ) - # Run the cluster-based permutation test T_obs, clusters, cluster_p_values, H0 = _permutation_cluster_test( [y_for_cluster], @@ -1878,40 +2071,6 @@ def cluster_test( return ClusterResult(T_obs, clusters, cluster_p_values, H0) -def unpack_time_and_channels(df: pd.DataFrame = None) -> pd.DataFrame: - """ - Extract timepoints and channels and convert to long. - - Parameters - ---------- - df : pd.DataFrame - DataFrame in wide format. - - Returns - ------- - df_long : pd.DataFrame - DataFrame in long format. 
- """ - # Extracting all necessary data using list comprehensions for better performance - long_format_data = [ - { - "condition": row["condition"], - "subject_index": row["subject_index"], - "channel": channel, - "timepoint": timepoint, - "value": row["data"][channel, timepoint], - } - for idx, row in df.iterrows() - for channel in range(row["data"].shape[0]) - for timepoint in range(row["data"].shape[1]) - ] - - # Creating the long format DataFrame - df_long = pd.DataFrame(long_format_data) - - return df_long - - class ClusterResult: """ Object containing the results of the cluster permutation test. @@ -1928,13 +2087,19 @@ class ClusterResult: Max cluster level stats observed under permutation. """ - def __init__(self, T_obs, clusters, cluster_p_values, H0): + def __init__( + self, + T_obs: np.typing.NDArray, + clusters: list, + cluster_p_values: np.typing.NDArray, + H0: np.typing.NDArray, + ): self.T_obs = T_obs self.clusters = clusters self.cluster_p_values = cluster_p_values self.H0 = H0 - def plot_cluster(self, cond_dict: dict = None): + def plot_cluster(self, condition_labels: dict): """ Plot the cluster with the lowest p-value. @@ -1944,18 +2109,13 @@ def plot_cluster(self, cond_dict: dict = None): Parameters ---------- - cond_dict : dict + condition_labels : dict Dictionary with condition labels as keys and evoked objects as values. - - Returns - ------- - None - """ # extract condition labels from the dictionary - cond_keys = list(cond_dict.keys()) + cond_keys = list(condition_labels.keys()) # extract the evokeds from the dictionary - cond_values = list(cond_dict.values()) + cond_values = list(condition_labels.values()) # configure variables for visualization colors = {cond_keys[0]: "crimson", cond_keys[1]: "steelblue"} @@ -2018,7 +2178,7 @@ def plot_cluster(self, cond_dict: dict = None): ax_signals = divider.append_axes("right", size="300%", pad=1.3) title = f"Signal averaged over {len(ch_inds)} sensor(s)" plot_compare_evokeds( - cond_dict, + condition_labels, title=title, picks=ch_inds, axes=ax_signals, From 1f857ad2d97e684b45378c38a943e3763519fd86 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Mon, 22 Jul 2024 20:44:43 +0200 Subject: [PATCH 60/88] quick clean up --- .../76_new_cluster_test_api.py | 27 ++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 842e0543b0b..efbc6d5e3f0 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -29,7 +29,7 @@ # License: BSD-3-Clause # Copyright the MNE-Python contributors. 
-# %% +# %% Load the required packages from pathlib import Path @@ -37,6 +37,8 @@ import mne +# %% Load the P3 dataset + # Set parameters # -------------- # Define the path to the P3 dataset @@ -69,6 +71,8 @@ # the conditions of interest are the target (rare visual stimuli) # and non-target stimuli (frequent visual stimuli) +# %% visually inspect the evoked data for each condition + # let's extract the target and non-target evokeds target_only = [evoked[0] for evoked in evokeds_allsubs] non_target_only = [evoked[1] for evoked in evokeds_allsubs] @@ -89,7 +93,8 @@ # we can see that the strongest difference is around 400 ms in # central-parietal channels with a stronger evoked signal for target stimuli -# Next we prepare a dataframe for the cluster test function +# %% Prepare the dataframe for the new cluster test API + # the dataframe should contain the contrast evoked data and the subject index # each row in the dataframe should represent one observation (evoked data) @@ -114,7 +119,8 @@ } ) -# now we can run the cluster test function +# %% run the cluster test function with formulaic input + # we will use the new API that allows for Wilkinson style formulas # the formula should be a string in Wilkinson notation @@ -123,12 +129,21 @@ # we will use a cluster-based permutation paired t-test for this # let's first define the formula based on Wilkinson notation +# we want to predict the evoked difference signal based on the subject +# the cluster test randomly permutes the subject label +# the 1 in the formula represents the intercept which is always included +# C is a categorical variable that will be dummy coded formula = "evoked ~ 1 + C(subject_index)" # run the new cluster test API and return the new cluster_result object -cluster_result = mne.stats.cluster_level.cluster_test(df=df, formula=formula) +cluster_result = mne.stats.cluster_level.cluster_test( + df=df, formula=formula, paired_test=True, adjacency=None +) + +# note that we ran an exact test due to the small sample size +# (only 15 permutations) -# note that we ran an exact test due to the small sample size (only 15 permutations) +# %% plot the results # set up conditions dictionary for cluster plots # this is necessary for plotting the evoked data and the cluster result on top @@ -137,7 +152,7 @@ # finally let's plot the results using the ClusterResults class # we plot the cluster with the lowest p-value -cluster_result.plot_cluster(cond_dict=conditions_dict) +cluster_result.plot_cluster(condition_labels=conditions_dict) # we can see that there is something going on around 400 ms # with a stronger signal for target trials in right central-parietal channels From 450738bcbdfa472297c2fd9f02c3c5ee454bae1b Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Mon, 22 Jul 2024 21:52:00 +0200 Subject: [PATCH 61/88] test compare_old_vs_new_cluster_API --- mne/stats/tests/test_cluster_level.py | 150 +++++++++++++++++++++++++- 1 file changed, 149 insertions(+), 1 deletion(-) diff --git a/mne/stats/tests/test_cluster_level.py b/mne/stats/tests/test_cluster_level.py index e319d018328..097754f097b 100644 --- a/mne/stats/tests/test_cluster_level.py +++ b/mne/stats/tests/test_cluster_level.py @@ -6,6 +6,7 @@ from functools import partial import numpy as np +import pandas as pd import pytest from numpy.testing import ( assert_allclose, @@ -15,10 +16,20 @@ ) from scipy import linalg, sparse, stats -from mne import MixedSourceEstimate, SourceEstimate, SourceSpaces, VolSourceEstimate +from mne import ( + EvokedArray, + MixedSourceEstimate, + 
SourceEstimate, + SourceSpaces, + VolSourceEstimate, + create_info, +) from mne.fixes import _eye_array from mne.stats import combine_adjacency, ttest_ind_no_p from mne.stats.cluster_level import ( + _check_fun, + _permutation_cluster_test, + cluster_test, f_oneway, permutation_cluster_1samp_test, permutation_cluster_test, @@ -27,6 +38,7 @@ summarize_clusters_stc, ttest_1samp_no_p, ) +from mne.time_frequency import AverageTFRArray from mne.utils import _record_warnings, catch_logging n_space = 50 @@ -867,3 +879,139 @@ def test_output_equiv(shape, out_type, adjacency, threshold): assert out_type == "indices" got_mask[np.ix_(*clu)] = n assert_array_equal(got_mask, want_mask) + + +def create_sample_data_cluster_test(): + """Create sample data to test new cluster API.""" + # Prepare some dummy data + n_subjects = 20 + n_conditions = 2 + n_channels = 5 + n_timepoints = 8 + n_freqs = 3 + + # Create dummy data + dummy_data_2d = [ + np.random.rand(n_channels, n_timepoints) + for _ in range(n_subjects * n_conditions) + ] + dummy_data_3d = [ + np.random.rand(n_channels, n_freqs, n_timepoints) + for _ in range(n_subjects * n_conditions) + ] + + # Create a DataFrame with dummy data + df_2d = pd.DataFrame( + { + "subject_index": np.repeat(range(n_subjects), n_conditions), + "condition": np.tile(["cond1", "cond2"], n_subjects), + "data": dummy_data_2d, + } + ) + + df_3d = pd.DataFrame( + { + "subject_index": np.repeat(range(n_subjects), n_conditions), + "condition": np.tile(["cond1", "cond2"], n_subjects), + "data": dummy_data_3d, + } + ) + + return df_2d, df_3d + + +def compare_old_and_new_cluster_api(): + """Make sure old and new cluster API results are the same.""" + # load sample data + df_2d, df_3d = create_sample_data_cluster_test() + + # mandatory parameters for new cluster API + formula = "evoked ~ 1 + C(subject_index)" + + data_to_test = [df_2d, df_3d] + + # save 2D and 3D data results for both old and new API + result_old_api_all = [] + result_new_api_all = [] + d_all = [] + + for df in data_to_test: + # Pivot the DataFrame to have conditions as columns for old API + pivot_df = df.pivot(index="subject_index", columns="condition", values="data") + + # Subtract condition 2 data from condition 1 data for each subject + pivot_df["cond_diff"] = pivot_df.apply( + lambda row: row["cond1"] - row["cond1"], axis=1 + ) + + # Extract the 'cond_diff' column as a numpy array + cond_diff_array = np.stack(pivot_df["cond_diff"].values) + + # extract data and reshape for old API + if pivot_df.cond_diff[0].ndim == 2: + # reshape to channels as last dimension + d = cond_diff_array.transpose(0, 2, 1) + else: + # reshape 3D data to channels as last dimension + d = cond_diff_array.transpose(0, 3, 2, 1) + + # define test statistic + stat_fun, threshold = _check_fun( + X=d, stat_fun=None, threshold=None, tail=0, kind="within" + ) + + # Run old cluster api + result_old_api = _permutation_cluster_test( + [d], + threshold=threshold, + stat_fun=stat_fun, + n_jobs=-1, # takes all CPU cores + max_step=1, # maximum distance between samples (time points) + exclude=None, # exclude no time points or channels + step_down_p=0, # step down in jumps test + t_power=1, # weigh each location by its stats score + out_type="indices", + check_disjoint=False, + buffer_size=None, # block size for chunking the data + n_permutations=1024, + tail=0, + adjacency=None, + seed=42, + ) + result_old_api_all.append(result_old_api) + d_all.append(d) + + if df.data[0].ndim == 2: + # convert each row in data column into evoked object + df["evoked"] = 
df["data"].apply( + lambda x: EvokedArray( + x, create_info(df.data[0].shape[0], 1000.0, "eeg") + ) + ) + else: + # convert each row in data column into evoked object + df["evoked"] = df["data"].apply( + lambda x: AverageTFRArray( + create_info(df.data[0].shape[0], 1000.0, "eeg"), + x, + times=np.arange(df.data[0].shape[2]), + freqs=np.arange(df.data[0].shape[1]), + ) + ) + + # run the new cluster test API and return the new cluster_result object + cluster_result = cluster_test( + df=df, formula=formula, paired_test=True, adjacency=None, seed=42 + ) + result_new_api_all.append(cluster_result) + + # compare old and new API results both for 2D and 3D data + for result_old_api, result_new_api in zip(result_old_api_all, result_new_api_all): + # compare the cluster statistics + assert_array_equal(result_old_api[0], result_new_api.T_obs) + + # compare the cluster indices + assert_array_equal(result_old_api[1], result_new_api.clusters) + + # compare the cluster p-values + assert_array_equal(result_old_api[2], result_new_api.cluster_p_values) From d41efbe73f5b5998679b6de910789d61bcbd9bd0 Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Thu, 25 Jul 2024 12:17:36 -0500 Subject: [PATCH 62/88] simplify tests Co-authored-by: Carina Forster --- mne/stats/tests/test_cluster_level.py | 136 ++++++++------------------ 1 file changed, 41 insertions(+), 95 deletions(-) diff --git a/mne/stats/tests/test_cluster_level.py b/mne/stats/tests/test_cluster_level.py index 097754f097b..f9be3693441 100644 --- a/mne/stats/tests/test_cluster_level.py +++ b/mne/stats/tests/test_cluster_level.py @@ -17,6 +17,7 @@ from scipy import linalg, sparse, stats from mne import ( + EpochsArray, EvokedArray, MixedSourceEstimate, SourceEstimate, @@ -27,8 +28,6 @@ from mne.fixes import _eye_array from mne.stats import combine_adjacency, ttest_ind_no_p from mne.stats.cluster_level import ( - _check_fun, - _permutation_cluster_test, cluster_test, f_oneway, permutation_cluster_1samp_test, @@ -38,7 +37,7 @@ summarize_clusters_stc, ttest_1samp_no_p, ) -from mne.time_frequency import AverageTFRArray +from mne.time_frequency import AverageTFRArray, EpochsTFRArray from mne.utils import _record_warnings, catch_logging n_space = 50 @@ -920,98 +919,45 @@ def create_sample_data_cluster_test(): return df_2d, df_3d -def compare_old_and_new_cluster_api(): - """Make sure old and new cluster API results are the same.""" - # load sample data - df_2d, df_3d = create_sample_data_cluster_test() - - # mandatory parameters for new cluster API - formula = "evoked ~ 1 + C(subject_index)" - - data_to_test = [df_2d, df_3d] - - # save 2D and 3D data results for both old and new API - result_old_api_all = [] - result_new_api_all = [] - d_all = [] - - for df in data_to_test: - # Pivot the DataFrame to have conditions as columns for old API - pivot_df = df.pivot(index="subject_index", columns="condition", values="data") - - # Subtract condition 2 data from condition 1 data for each subject - pivot_df["cond_diff"] = pivot_df.apply( - lambda row: row["cond1"] - row["cond1"], axis=1 - ) - - # Extract the 'cond_diff' column as a numpy array - cond_diff_array = np.stack(pivot_df["cond_diff"].values) - - # extract data and reshape for old API - if pivot_df.cond_diff[0].ndim == 2: - # reshape to channels as last dimension - d = cond_diff_array.transpose(0, 2, 1) - else: - # reshape 3D data to channels as last dimension - d = cond_diff_array.transpose(0, 3, 2, 1) - - # define test statistic - stat_fun, threshold = _check_fun( - X=d, stat_fun=None, threshold=None, tail=0, 
kind="within" - ) - - # Run old cluster api - result_old_api = _permutation_cluster_test( - [d], - threshold=threshold, - stat_fun=stat_fun, - n_jobs=-1, # takes all CPU cores - max_step=1, # maximum distance between samples (time points) - exclude=None, # exclude no time points or channels - step_down_p=0, # step down in jumps test - t_power=1, # weigh each location by its stats score - out_type="indices", - check_disjoint=False, - buffer_size=None, # block size for chunking the data - n_permutations=1024, - tail=0, - adjacency=None, - seed=42, - ) - result_old_api_all.append(result_old_api) - d_all.append(d) - - if df.data[0].ndim == 2: - # convert each row in data column into evoked object - df["evoked"] = df["data"].apply( - lambda x: EvokedArray( - x, create_info(df.data[0].shape[0], 1000.0, "eeg") - ) - ) - else: - # convert each row in data column into evoked object - df["evoked"] = df["data"].apply( - lambda x: AverageTFRArray( - create_info(df.data[0].shape[0], 1000.0, "eeg"), - x, - times=np.arange(df.data[0].shape[2]), - freqs=np.arange(df.data[0].shape[1]), - ) - ) - - # run the new cluster test API and return the new cluster_result object - cluster_result = cluster_test( - df=df, formula=formula, paired_test=True, adjacency=None, seed=42 +def test_compare_old_and_new_cluster_api(): + """Test for same results from old and new APIs.""" + condition1_1d, condition2_1d, condition1_2d, condition2_2d = _get_conditions() + df_1d = pd.DataFrame( + dict( + data=[condition1_1d, condition2_1d], + condition=["a", "b"], ) - result_new_api_all.append(cluster_result) - - # compare old and new API results both for 2D and 3D data - for result_old_api, result_new_api in zip(result_old_api_all, result_new_api_all): - # compare the cluster statistics - assert_array_equal(result_old_api[0], result_new_api.T_obs) + ) + kwargs = dict(n_permutations=100, tail=1, seed=1, buffer_size=None, out_type="mask") + F_obs, clusters, cluster_pvals, H0 = permutation_cluster_test( + [condition1_1d, condition2_1d], **kwargs + ) + formula = "data ~ condition" + cluster_result = cluster_test(df_1d, formula, **kwargs) + assert_array_equal(cluster_result.H0, H0) + assert_array_equal(cluster_result.stat_obs, F_obs) + assert_array_equal(cluster_result.cluster_p_values, cluster_pvals) + assert cluster_result.clusters == clusters - # compare the cluster indices - assert_array_equal(result_old_api[1], result_new_api.clusters) - # compare the cluster p-values - assert_array_equal(result_old_api[2], result_new_api.cluster_p_values) +@pytest.mark.parametrize( + "Inst", (EpochsArray, EvokedArray, EpochsTFRArray, AverageTFRArray) +) +def test_new_cluster_api(Inst): + """Test handling different MNE objects in the cluster API.""" + pd = pytest.importorskip("pandas") + + n_epo, n_chan, n_freq, n_times = 2, 3, 5, 7 + shape = (n_chan, n_times) + if Inst in (EpochsArray, EpochsTFRArray): + shape = (n_epo,) + shape + if Inst in (EpochsTFRArray, AverageTFRArray): + shape = shape[:-1] + (n_freq, shape[-1]) + + info = create_info(...) + inst1 = Inst(np.random.normal(shape, ...), info=info) + inst2 = Inst(np.random.normal(shape, ...), info=info) + + df = pd.DataFrame(dict(data=[inst1, inst2], condition=["a", "b"])) + result = cluster_test(df, "data~condition", ...) 
+ assert result # TODO do something more interesting here From 9523fae7996f0006cd5c379767d858e6cc2694ef Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Thu, 25 Jul 2024 12:23:49 -0500 Subject: [PATCH 63/88] refactor cluster_test Co-authored-by: Eric Larson Co-authored-by: Carina Forster --- mne/stats/cluster_level.py | 419 ++++++++++++------------------------- 1 file changed, 139 insertions(+), 280 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index 847c464259c..20b54f1f592 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -17,13 +17,12 @@ from scipy.stats import f as fstat from scipy.stats import t as tstat -from .. import Epochs, Evoked -from ..epochs import EpochsArray, EvokedArray +from .. import BaseEpochs, Evoked, EvokedArray from ..fixes import has_numba, jit from ..parallel import parallel_func from ..source_estimate import MixedSourceEstimate, SourceEstimate, VolSourceEstimate from ..source_space import SourceSpaces -from ..time_frequency import AverageTFR, AverageTFRArray, EpochsTFR, EpochsTFRArray +from ..time_frequency import BaseTFR from ..utils import ( ProgressBar, _check_option, @@ -1737,191 +1736,65 @@ def summarize_clusters_stc( return klass(data_summary, vertices, tmin, tstep, subject) -def validate_input_dataframe(df: pd.DataFrame, formula: str): - """ - Validate the input dataframe for the cluster permutation test. - - Parameters - ---------- - df : pd.DataFrame - Dataframe with 3 columns (subject_index, condition, data). - formula : formulaic.ModelSpec - Wilkinson style Formula for the design matrix. - - Returns - ------- - dv_name : str - Name of the dependent variable. - """ - # extract dependent variable name from formula - formulaic = _soft_import( - "formulaic", purpose="set up Design Matrix" - ) # soft import (not a dependency for MNE) - formula = formulaic.Formula(formula) - dv_name = str(formula.lhs) - +def _validate_cluster_df(df: pd.DataFrame, dv_name: str, iv_name: str): # check if all necessary columns are present - if dv_name not in df.columns: - raise ValueError("""DataFrame needs to contain a column - with the dependent variable name - as defined in the formula""") - if "condition" not in df.columns: - raise ValueError("DataFrame needs to contain a condition column") - if "subject_index" not in df.columns: - raise ValueError("DataFrame needs to contain a subject_index column") - - # check if the data column contains only valid types - check_column_types(df[dv_name]) - + missing = ({dv_name} | {iv_name}) - set(df.columns) + sep = '", "' + if missing: + raise ValueError( + f"DataFrame must contain a column named for each term in `formula`. " + f"Column{_pl(missing)} missing for term{_pl(missing)} " + f'"{sep.join(missing)}".' + ) + # check if the data column contains valid (and consistent) instance types + inst = df[dv_name].iloc[0] + valid_types = (Evoked, BaseEpochs, BaseTFR, np.ndarray) + _validate_type(inst, valid_types, f"Data in dependent variable column '{dv_name}'") + all_types = set(df[dv_name].map(type)) + all_type_names = ", ".join([type(x).__name__ for x in all_types]) + prologue = f"Data in dependent variable column '{dv_name}' must all have " + if len(all_types) > 1: + raise ValueError( + f"{prologue} the same type, but found types {{{all_type_names}}}." 
+ ) # check if the shape of the data is consistent - if not all(data.data.shape == df[dv_name][0].data.shape for data in df[dv_name]): - raise ValueError("Data objects need to have the same shape") - - # check if the condition column contains only 2 unique values - if len(pd.unique(df.condition)) != 2: - raise ValueError("currently only supports 2 conditions.") - - return dv_name - - -def check_column_types(input_data: np.ndarray): - """ - Check if the column types are valid for the cluster permutation test. - - Parameters - ---------- - input_data : np.Array - Data to be checked for the cluster permutation test. - """ - # Get the type of the first element - first_type = type(input_data.iloc[0]) - - # Define the possible valid types - valid_types = ( - Evoked, - EvokedArray, - Epochs, - EpochsArray, - AverageTFR, - EpochsTFR, - EpochsTFRArray, - AverageTFRArray, - ) - - # Check if the type of the first element is a valid type - if first_type not in valid_types: - raise ValueError(f"Object type '{first_type}' is not a valid type.") - - # Check if all elements are of the same type as the first one - if not all(isinstance(data, first_type) for data in input_data): - raise ValueError("Data column must contain objects of the same type.") - - -def prepare_data_for_cluster_test(input_df: pd.DataFrame, dv_name: str): - """ - Prepare the data for the cluster permutation test. - - Parameters - ---------- - input_data : np.ndarray - Data to be prepared for the cluster permutation test. - - Returns - ------- - data : np.Array - Data prepared for the cluster permutation test. - """ - # extract data and add to dataframe - input_df["data"] = [data.data for data in input_df[dv_name]] - - # extract dimensions from time series or time-frequency data - first_data_obj = input_df["data"].iloc[0] - if isinstance(first_data_obj, (Epochs, Evoked, EpochsArray, EvokedArray)): - n_channels, n_timepoints = first_data_obj.get_data().shape - if isinstance( - first_data_obj, (AverageTFR, EpochsTFR, AverageTFRArray, EpochsTFRArray) - ): - n_channels, n_freqs, n_timepoints = first_data_obj.get_data().shape - - reshaped_data = [] - - for idx, row in input_df.iterrows(): - subject_index = row["subject_index"] - condition = row["condition"] - data_array = row["data"] - - if data_array.ndim == 2: - n_channels, n_timepoints = data_array.shape - # timepoints are the columns - df_temp = pd.DataFrame( - data_array, columns=[f"timepoint_{i}" for i in range(n_timepoints)] - ) - df_temp["channel"] = range(n_channels) - df_temp["subject_index"] = subject_index - df_temp["condition"] = condition - - reshaped_data.append(df_temp) - - elif data_array.ndim == 3: - n_channels, n_freqs, n_timepoints = data_array.shape - # timepoints are the columns - df_temp = pd.DataFrame( - data_array.reshape(-1, n_timepoints), - columns=[f"timepoint_{i}" for i in range(n_timepoints)], - ) - df_temp["frequency"] = np.repeat(range(n_freqs), n_channels) - df_temp["channel"] = np.tile(range(n_channels), n_freqs) - df_temp["subject_index"] = subject_index - df_temp["condition"] = condition - - reshaped_data.append(df_temp) - - else: - raise ValueError(f"Unsupported data array dimensions: {data_array.ndim}") - # combine the reshaped data - combined_df = pd.concat(reshaped_data, ignore_index=True) - # Convert the dataframe to long format - id_vars = ["subject_index", "condition", "channel"] - if "frequency" in combined_df.columns: - id_vars.append("frequency") - - reshaped_df = pd.melt( - combined_df, id_vars=id_vars, var_name="timepoint", value_name="value" - 
) - - # rename column and convert to integer - reshaped_df["timepoint"] = ( - reshaped_df["timepoint"].str.replace("timepoint_", "").astype(int) - ) - - # return the reshaped dataframe and dimensions - if data_array.ndim == 2: - return reshaped_df, data_array.ndim, n_channels, n_timepoints - elif data_array.ndim == 3: - return reshaped_df, data_array.ndim, n_channels, n_freqs, n_timepoints + if isinstance(inst, np.ndarray): + all_shapes = set(df[dv_name].map(lambda x: x.shape[1:])) # first dim may vary + elif isinstance(inst, BaseEpochs): + all_shapes = set(df[dv_name].map(lambda x: x.get_data().shape[1:])) + else: + all_shapes = set(df[dv_name].map(lambda x: x.get_data().shape)) + if len(all_shapes) > 1: + raise ValueError( + f"{prologue} consistent shape, but {len(all_shapes)} different " + f"shapes were found: {'; '.join(all_shapes)}." + ) + return all_types.pop() +@verbose def cluster_test( df: pd.DataFrame, - formula: str, # Wilkinson notation formula for design matrix - paired_test: bool, # whether to run a paired t-test or unpaired test - n_permutations: int = 1024, # same default as in old API - seed: None | int | np.random.RandomState = None, - tail: Literal[-1, 0, 1] = 0, # 0 for two-tailed, 1 for greater, -1 for less - n_jobs: int = 1, # how many cores to use + formula: str, + *, + within_id: str | None = None, + stat_fun: callable | None = None, + tail: Literal[-1, 0, 1] = 0, + threshold=None, + n_permutations: int = 1024, adjacency: tuple | None = None, - max_step: int = 1, # maximum distance between samples (time points) - exclude: list | None = None, # exclude no time points or channels - step_down_p: int = 0, # step down in jumps test - t_power: int = 1, # weigh each location by its stats score - out_type: Literal["indices", "mask"] = "indices", + max_step: int = 1, + exclude: list | None = None, + step_down_p: int = 0, + t_power: int = 1, check_disjoint: bool = False, - buffer_size: int | None = None, # block size for chunking the data + out_type: Literal["indices", "mask"] = "indices", + seed: None | int | np.random.RandomState = None, + buffer_size: int | None = None, + n_jobs: int = 1, + verbose=None, ): - """ - Run a cluster permutation test based on formulaic input. - - # currently only supports paired t-test on evokeds or epochs + """Run a cluster permutation test from a DataFrame and a formula. Parameters ---------- @@ -1929,16 +1802,14 @@ def cluster_test( Dataframe with 3 columns (subject_index, condition, evoked). formula : str Wilkinson notation formula for design matrix. - paired_test: bool - Whether to run a paired t-test. - n_permutations : int, optional - Number of permutations. Default is 1024. - seed : None | int | np.random.RandomState, optional - Seed for the random number generator. Default is None. + within_id : None | str + Name of column in ``df`` to use in identifying within-group contrasts. + stat_fun : None | callable + Statistical function to use. tail : int, optional 0 for two-tailed, 1 for greater, -1 for less. Default is 0. - n_jobs : int, optional - How many cores to use. Default is 1. + n_permutations : int, optional + Number of permutations. Default is 1024. adjacency : None, optional Provide a adjacency matrix. Default is None. max_step : int, optional @@ -1949,107 +1820,86 @@ def cluster_test( Step down in jumps test. Default is 0. t_power : int, optional Weigh each location by its stats score. Default is 1. - out_type : str, optional - Output type. Default is "indices". check_disjoint : bool, optional Check if clusters are disjoint. 
Default is False. + out_type : str, optional + Output type. Default is "indices". + seed : None | int | np.random.RandomState, optional + Seed for the random number generator. Default is None. buffer_size : int, optional Block size for chunking the data. Default is None. - seed : int, optional - Seed for the random number generator. Default is None. + n_jobs : int, optional + How many cores to use. Default is 1. + %(verbose)s Returns ------- ClusterResult Object containing the results of the cluster permutation test. """ - # check if formula is present - if formula is None: - raise ValueError("Wilkinson style formula is required.") - - # validate the input dataframe and return name of dependent variable - dv_name = validate_input_dataframe(df, formula) - - # prepare the data for the cluster permutation test - prep_result = prepare_data_for_cluster_test(df, dv_name) - - if prep_result[1] == 2: - # pivot the dataframe based on condition for later subtraction - pivot_df = ( - prep_result[0] - .pivot_table( - index=["subject_index", "channel", "timepoint"], - columns="condition", - values="value", - ) - .reset_index() - ) - elif prep_result[1] == 3: - # pivot the dataframe based on condition for later subtraction - pivot_df = ( - prep_result[0] - .pivot_table( - index=["subject_index", "channel", "frequency", "timepoint"], - columns="condition", - values="value", - ) - .reset_index() + # parse formula + formulaic = _soft_import("formulaic", purpose="parse formula for clustering") + parser = formulaic.parser.DefaultFormulaParser(include_intercept=False) + formula = formulaic.Formula(formula, _parser=parser) + dv_name = str(np.array(formula.lhs.root).item()) + iv_name = str(np.array(formula.rhs.root).item()) + # validate the input dataframe and return the type of the data column entries + _dtype = _validate_cluster_df(df, dv_name, iv_name) + + # for within_subject + _validate_type(within_id, (str, None), "within_id") + if within_id: + df = df.copy(deep=False) # Don't mutate input dataframe row order! 
+ df.sort_values([iv_name, within_id], inplace=True) + counts = df[within_id].value_counts() + if any(counts != 2): + raise ValueError("Badness 10000") + + # extract the data + + def _extract_data_array(series): + return np.concatenate(series.values) + + def _extract_data_mne(series): + return np.array( + series.map(lambda inst: inst.get_data().swapaxes(-2, -1)).to_list() ) - # Get unique elements and the indices of their first occurrences - unique_elements, indices = np.unique(df.condition, return_index=True) - - # Sort unique elements by the indices of their first occurrences - conditions = unique_elements[np.argsort(indices)] - - # store the contrast for the clusterResults object - contrast = f"{conditions[0]} - {conditions[1]}" - - # print the contrast used for the paired t-test so the user knows - # what is subtracted from what - logger.info(f"Contrast used for paired t-test: {contrast}") - - # Compute the difference (assuming there are only 2 conditions) - pivot_df[dv_name] = pivot_df[conditions[0]] - pivot_df[conditions[1]] - - # for the paired t-test y is the difference between conditions - # X is the design matrix with a column with 1s and 0s for each participant - # Create the design matrix using formulaic - formulaic = _soft_import( - "formulaic", purpose="set up Design Matrix" - ) # soft import (not a dependency for MNE) - y, X = formulaic.model_matrix(formula, pivot_df) - - # Prepare design matrix for input into MNE cluster function - # MNE cluster functions expect channels as the last dimension - - if prep_result[1] == 2: - # Reshape y.values into a 3D array: (participants, n_channels, n_timepoints) - y_reshaped = y.values.reshape(-1, prep_result[2], prep_result[3]) - # Transpose the array to have channels as the last dimension - y_for_cluster = y_reshaped.transpose(0, 2, 1) - elif prep_result[1] == 3: - # Reshape y.values into a 4D array: - # (participants, n_channels, n_freqs, n_timepoints) - y_reshaped = y.values.reshape( - -1, prep_result[2], prep_result[3], prep_result[4] - ) - # Transpose the array to have channels as the last dimension - y_for_cluster = y_reshaped.transpose(0, 3, 2, 1) + def _extract_data_tfr(series): + return series.map(lambda inst: inst.get_data().swapaxes(-3, -1)).to_list() - if paired_test: - # define stat function and threshold - stat_fun, threshold = _check_fun( - X=y_for_cluster, stat_fun=None, threshold=None, tail=0, kind="within" - ) + if _dtype is np.ndarray: + func = _extract_data_array + elif _dtype is BaseTFR: + func = _extract_data_tfr else: - # define stat function and threshold - stat_fun, threshold = _check_fun( - X=y_for_cluster, stat_fun=None, threshold=None, tail=0, kind="between" - ) + func = _extract_data_mne + # convert to a list-like X for clustering + X = df.groupby(iv_name).agg({dv_name: func})[dv_name].to_list() + + # determine test type + if len(X) == 1: + kind = "within" + elif len(X) > 2: + kind = "between" + elif len(set(x.shape for x in X)) > 1: + kind = "between" + # by now we know there are exactly 2 elements in X, and their shapes match + elif within_id in df: + kind = "within" + X = X[0] - X[1] + else: + kind = "between" + + # define stat function and threshold + stat_fun, threshold = _check_fun( + X=X, stat_fun=stat_fun, threshold=threshold, tail=tail, kind=kind + ) + if kind == "within": + X = [X] # Run the cluster-based permutation test - T_obs, clusters, cluster_p_values, H0 = _permutation_cluster_test( - [y_for_cluster], + stat_obs, clusters, cluster_p_values, H0 = _permutation_cluster_test( + X, 
n_permutations=n_permutations, threshold=threshold, stat_fun=stat_fun, @@ -2066,9 +1916,9 @@ def cluster_test( seed=seed, ) - print(f"smallest cluster p-value: {min(cluster_p_values)}") + # print(f"smallest cluster p-value: {min(cluster_p_values)}") - return ClusterResult(T_obs, clusters, cluster_p_values, H0) + return ClusterResult(stat_obs, clusters, cluster_p_values, H0, stat_fun) class ClusterResult: @@ -2077,7 +1927,7 @@ class ClusterResult: Parameters ---------- - T_obs : np.ndarray + stat_obs : np.ndarray The observed test statistic. clusters : list List of clusters. @@ -2089,15 +1939,24 @@ class ClusterResult: def __init__( self, - T_obs: np.typing.NDArray, + stat_obs: np.typing.NDArray, clusters: list, cluster_p_values: np.typing.NDArray, H0: np.typing.NDArray, + stat_fun: callable, ): - self.T_obs = T_obs + self.stat_obs = stat_obs self.clusters = clusters self.cluster_p_values = cluster_p_values self.H0 = H0 + self.stat_fun = stat_fun + # TODO improve detection of stat name (e.g. unpaired T)? + if stat_fun is f_oneway: + self.stat_name = "F-statistic" + elif stat_fun is ttest_1samp_no_p: + self.stat_name = "paired T-statistic" + else: + self.stat_name = "test statistic" def plot_cluster(self, condition_labels: dict): """ @@ -2128,7 +1987,7 @@ def plot_cluster(self, condition_labels: dict): time_inds = np.unique(time_inds) # get topography for t stat - t_map = self.T_obs[time_inds, ...].mean(axis=0).astype(int) + t_map = self.stat_obs[time_inds, ...].mean(axis=0).astype(int) # get signals at the sensors contributing to the cluster sig_times = cond_values[0][0].times[time_inds] @@ -2169,7 +2028,7 @@ def plot_cluster(self, condition_labels: dict): # add axes for colorbar ax_colorbar = divider.append_axes("right", size="5%", pad=0.1) cbar = plt.colorbar(image, cax=ax_colorbar) - cbar.set_label("t-value") + cbar.set_label(self.stat_name) ax_topo.set_xlabel( "average from {:0.3f} to {:0.3f} s".format(*sig_times[[0, -1]]) ) From 9661492e5fcac3962cc04f926a0cfac60c0745c0 Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Thu, 25 Jul 2024 12:25:12 -0500 Subject: [PATCH 64/88] make tutorial match modified API Co-authored-by: Carina Forster --- tutorials/stats-sensor-space/76_new_cluster_test_api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index efbc6d5e3f0..83b4f019b6f 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -133,11 +133,11 @@ # the cluster test randomly permutes the subject label # the 1 in the formula represents the intercept which is always included # C is a categorical variable that will be dummy coded -formula = "evoked ~ 1 + C(subject_index)" +formula = "evoked ~ condition" # run the new cluster test API and return the new cluster_result object cluster_result = mne.stats.cluster_level.cluster_test( - df=df, formula=formula, paired_test=True, adjacency=None + df=df, formula=formula, within_id="subject_index" ) # note that we ran an exact test due to the small sample size From cac05598c1de8c565fc5b823929472a3b2c9535c Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Thu, 25 Jul 2024 12:30:30 -0500 Subject: [PATCH 65/88] remove unused test helper func --- mne/stats/tests/test_cluster_level.py | 39 --------------------------- 1 file changed, 39 deletions(-) diff --git a/mne/stats/tests/test_cluster_level.py b/mne/stats/tests/test_cluster_level.py index 
f9be3693441..00989e3e00c 100644 --- a/mne/stats/tests/test_cluster_level.py +++ b/mne/stats/tests/test_cluster_level.py @@ -880,45 +880,6 @@ def test_output_equiv(shape, out_type, adjacency, threshold): assert_array_equal(got_mask, want_mask) -def create_sample_data_cluster_test(): - """Create sample data to test new cluster API.""" - # Prepare some dummy data - n_subjects = 20 - n_conditions = 2 - n_channels = 5 - n_timepoints = 8 - n_freqs = 3 - - # Create dummy data - dummy_data_2d = [ - np.random.rand(n_channels, n_timepoints) - for _ in range(n_subjects * n_conditions) - ] - dummy_data_3d = [ - np.random.rand(n_channels, n_freqs, n_timepoints) - for _ in range(n_subjects * n_conditions) - ] - - # Create a DataFrame with dummy data - df_2d = pd.DataFrame( - { - "subject_index": np.repeat(range(n_subjects), n_conditions), - "condition": np.tile(["cond1", "cond2"], n_subjects), - "data": dummy_data_2d, - } - ) - - df_3d = pd.DataFrame( - { - "subject_index": np.repeat(range(n_subjects), n_conditions), - "condition": np.tile(["cond1", "cond2"], n_subjects), - "data": dummy_data_3d, - } - ) - - return df_2d, df_3d - - def test_compare_old_and_new_cluster_api(): """Test for same results from old and new APIs.""" condition1_1d, condition2_1d, condition1_2d, condition2_2d = _get_conditions() From 47ac8380ea2c1efe046b87e1475cc7bf930962e9 Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Thu, 25 Jul 2024 12:33:11 -0500 Subject: [PATCH 66/88] vulture allowlist update --- tools/vulture_allowlist.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/vulture_allowlist.py b/tools/vulture_allowlist.py index d612d0ec5ed..f030b4d4346 100644 --- a/tools/vulture_allowlist.py +++ b/tools/vulture_allowlist.py @@ -146,3 +146,6 @@ _qt_raise_window _qt_disable_paint _qt_get_stylesheet + +# used in tutorial, not sure why shows up +plot_cluster From 033c1585783ee524f8743d7b82ba2b2077b7e6d4 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sun, 28 Jul 2024 13:00:41 +0200 Subject: [PATCH 67/88] included BaseTFR in validate_cluster_df --- mne/stats/cluster_level.py | 43 +++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index 20b54f1f592..001979461bc 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1737,18 +1737,24 @@ def summarize_clusters_stc( def _validate_cluster_df(df: pd.DataFrame, dv_name: str, iv_name: str): + """Validate the input DataFrame for cluster tests.""" # check if all necessary columns are present - missing = ({dv_name} | {iv_name}) - set(df.columns) + missing = ({dv_name} | {iv_name}) - set(df.columns) # should be empty sep = '", "' - if missing: + if missing: # if not empty, there are missing columns raise ValueError( f"DataFrame must contain a column named for each term in `formula`. " - f"Column{_pl(missing)} missing for term{_pl(missing)} " + f"Column{_pl(missing)} missing for term{_pl(missing)} " # _pl = pluralize f'"{sep.join(missing)}".' 
) # check if the data column contains valid (and consistent) instance types inst = df[dv_name].iloc[0] - valid_types = (Evoked, BaseEpochs, BaseTFR, np.ndarray) + valid_types = ( + Evoked, + BaseEpochs, + BaseTFR, + np.ndarray, + ) # Base covers all Epochs and TFRs _validate_type(inst, valid_types, f"Data in dependent variable column '{dv_name}'") all_types = set(df[dv_name].map(type)) all_type_names = ", ".join([type(x).__name__ for x in all_types]) @@ -1759,8 +1765,10 @@ def _validate_cluster_df(df: pd.DataFrame, dv_name: str, iv_name: str): ) # check if the shape of the data is consistent if isinstance(inst, np.ndarray): - all_shapes = set(df[dv_name].map(lambda x: x.shape[1:])) # first dim may vary - elif isinstance(inst, BaseEpochs): + all_shapes = set( + df[dv_name].map(lambda x: x.shape[1:]) + ) # first dim may vary (participants or epochs) + elif isinstance(inst, (BaseEpochs | BaseTFR)): # should include BaseTFR? all_shapes = set(df[dv_name].map(lambda x: x.get_data().shape[1:])) else: all_shapes = set(df[dv_name].map(lambda x: x.get_data().shape)) @@ -1769,14 +1777,14 @@ def _validate_cluster_df(df: pd.DataFrame, dv_name: str, iv_name: str): f"{prologue} consistent shape, but {len(all_shapes)} different " f"shapes were found: {'; '.join(all_shapes)}." ) - return all_types.pop() + return all_types.pop() # return the type of the data column entries @verbose def cluster_test( df: pd.DataFrame, formula: str, - *, + *, # end of positional-only parameters within_id: str | None = None, stat_fun: callable | None = None, tail: Literal[-1, 0, 1] = 0, @@ -1799,9 +1807,10 @@ def cluster_test( Parameters ---------- df : pd.DataFrame - Dataframe with 3 columns (subject_index, condition, evoked). + Dataframe containing the data, dependent and independent variables. formula : str - Wilkinson notation formula for design matrix. + Wilkinson notation formula for design matrix. The names of the dependent + and independent variable should match the columns in the dataframe. within_id : None | str Name of column in ``df`` to use in identifying within-group contrasts. 
stat_fun : None | callable @@ -1841,8 +1850,10 @@ def cluster_test( formulaic = _soft_import("formulaic", purpose="parse formula for clustering") parser = formulaic.parser.DefaultFormulaParser(include_intercept=False) formula = formulaic.Formula(formula, _parser=parser) + # extract the dependent and independent variable names dv_name = str(np.array(formula.lhs.root).item()) iv_name = str(np.array(formula.rhs.root).item()) + # validate the input dataframe and return the type of the data column entries _dtype = _validate_cluster_df(df, dv_name, iv_name) @@ -1853,10 +1864,9 @@ def cluster_test( df.sort_values([iv_name, within_id], inplace=True) counts = df[within_id].value_counts() if any(counts != 2): - raise ValueError("Badness 10000") - - # extract the data + raise ValueError("for paired tttest, each subject must have 2 observations") + # extract the data from the dataframe def _extract_data_array(series): return np.concatenate(series.values) @@ -1874,15 +1884,16 @@ def _extract_data_tfr(series): func = _extract_data_tfr else: func = _extract_data_mne + # convert to a list-like X for clustering X = df.groupby(iv_name).agg({dv_name: func})[dv_name].to_list() # determine test type if len(X) == 1: - kind = "within" + kind = "within" # data already subtracted elif len(X) > 2: kind = "between" - elif len(set(x.shape for x in X)) > 1: + elif len(set(x.shape for x in X)) > 1: # check if shapes match kind = "between" # by now we know there are exactly 2 elements in X, and their shapes match elif within_id in df: @@ -1916,8 +1927,6 @@ def _extract_data_tfr(series): seed=seed, ) - # print(f"smallest cluster p-value: {min(cluster_p_values)}") - return ClusterResult(stat_obs, clusters, cluster_p_values, H0, stat_fun) From 2c2f341707cf99525508b934dcb040b94aed26c5 Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sun, 28 Jul 2024 13:35:40 +0200 Subject: [PATCH 68/88] comments on cluster_test function --- mne/stats/cluster_level.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index 001979461bc..8b4c9f15d10 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1857,7 +1857,7 @@ def cluster_test( # validate the input dataframe and return the type of the data column entries _dtype = _validate_cluster_df(df, dv_name, iv_name) - # for within_subject + # for within_subject designs, check if each subject has 2 observations _validate_type(within_id, (str, None), "within_id") if within_id: df = df.copy(deep=False) # Don't mutate input dataframe row order! 
@@ -1870,7 +1870,7 @@ def cluster_test( def _extract_data_array(series): return np.concatenate(series.values) - def _extract_data_mne(series): + def _extract_data_mne(series): # 2D data return np.array( series.map(lambda inst: inst.get_data().swapaxes(-2, -1)).to_list() ) @@ -1893,21 +1893,26 @@ def _extract_data_tfr(series): kind = "within" # data already subtracted elif len(X) > 2: kind = "between" - elif len(set(x.shape for x in X)) > 1: # check if shapes match + elif ( + len(set(x.shape for x in X)) > 1 + ): # check if there are unequal observations in each group kind = "between" # by now we know there are exactly 2 elements in X, and their shapes match elif within_id in df: kind = "within" X = X[0] - X[1] - else: + else: # what would be another else cas kind = "between" # define stat function and threshold stat_fun, threshold = _check_fun( X=X, stat_fun=stat_fun, threshold=threshold, tail=tail, kind=kind ) - if kind == "within": + + # check_fun doesn't work with list input` + if kind == "within": # will this create an issue for already subtracted data? X = [X] + # Run the cluster-based permutation test stat_obs, clusters, cluster_p_values, H0 = _permutation_cluster_test( X, From e9b5fa29522ddcc7f4ddab1a3533e4014dc06fcd Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sun, 28 Jul 2024 14:11:54 +0200 Subject: [PATCH 69/88] updated clusterResult class and plot function --- mne/stats/cluster_level.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index 8b4c9f15d10..eebca26d35c 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1964,7 +1964,8 @@ def __init__( self.cluster_p_values = cluster_p_values self.H0 = H0 self.stat_fun = stat_fun - # TODO improve detection of stat name (e.g. unpaired T)? + + # unpaired t-test is f_oneway if stat_fun is f_oneway: self.stat_name = "F-statistic" elif stat_fun is ttest_1samp_no_p: @@ -1972,7 +1973,7 @@ def __init__( else: self.stat_name = "test statistic" - def plot_cluster(self, condition_labels: dict): + def plot_cluster_time_sensor(self, condition_labels: dict): """ Plot the cluster with the lowest p-value. @@ -1985,13 +1986,20 @@ def plot_cluster(self, condition_labels: dict): condition_labels : dict Dictionary with condition labels as keys and evoked objects as values. 
""" + # define colorblind friendly colors + colorblind_palette = ["#4daf4a", "#f781bf"] + # extract condition labels from the dictionary cond_keys = list(condition_labels.keys()) # extract the evokeds from the dictionary cond_values = list(condition_labels.values()) # configure variables for visualization - colors = {cond_keys[0]: "crimson", cond_keys[1]: "steelblue"} + colors = { + cond_keys[0]: colorblind_palette[0], + cond_keys[1]: colorblind_palette[1], + } + line_styles = {cond_keys[0]: "-", cond_keys[1]: "--"} lowest_p_cluster = np.argmin(self.cluster_p_values) @@ -2044,18 +2052,23 @@ def plot_cluster(self, condition_labels: dict): cbar = plt.colorbar(image, cax=ax_colorbar) cbar.set_label(self.stat_name) ax_topo.set_xlabel( - "average from {:0.3f} to {:0.3f} s".format(*sig_times[[0, -1]]) + "Spatial cluster extent:\n averaged from {:0.3f} to {:0.3f} s".format( + *sig_times[[0, -1]] + ) ) # add new axis for time courses and plot time courses ax_signals = divider.append_axes("right", size="300%", pad=1.3) - title = f"Signal averaged over {len(ch_inds)} sensor(s)" + title = ( + f"Temporal cluster extent:\nSignal averaged over {len(ch_inds)} sensor(s)" + ) plot_compare_evokeds( condition_labels, title=title, picks=ch_inds, axes=ax_signals, colors=colors, + linestyles=line_styles, show=False, split_legend=True, truncate_yaxis="auto", From 2fd17d338313a7ab25f80c6f547b154d112e692a Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sun, 28 Jul 2024 14:12:12 +0200 Subject: [PATCH 70/88] updated function call for plotting --- tutorials/stats-sensor-space/76_new_cluster_test_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py index 83b4f019b6f..b7f933d127b 100644 --- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py +++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py @@ -152,7 +152,7 @@ # finally let's plot the results using the ClusterResults class # we plot the cluster with the lowest p-value -cluster_result.plot_cluster(condition_labels=conditions_dict) +cluster_result.plot_cluster_time_sensor(condition_labels=conditions_dict) # we can see that there is something going on around 400 ms # with a stronger signal for target trials in right central-parietal channels From 150c530817bc691c546798d77c43bb68f35f032c Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Sun, 28 Jul 2024 14:14:18 +0200 Subject: [PATCH 71/88] changed color --- mne/stats/cluster_level.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index eebca26d35c..9b65807bd38 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1987,7 +1987,7 @@ def plot_cluster_time_sensor(self, condition_labels: dict): Dictionary with condition labels as keys and evoked objects as values. 
""" # define colorblind friendly colors - colorblind_palette = ["#4daf4a", "#f781bf"] + colorblind_palette = ["#4daf4a", "#984ea3"] # extract condition labels from the dictionary cond_keys = list(condition_labels.keys()) From 3cc9e2c7f1159851141bbe8d45d77af807e5d429 Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Thu, 1 Aug 2024 12:30:11 -0500 Subject: [PATCH 72/88] docstring/docdict cleanups and fixes --- mne/stats/cluster_level.py | 68 +++++++++--------- mne/utils/docs.py | 138 ++++++++++++++++++++++--------------- 2 files changed, 118 insertions(+), 88 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index 9b65807bd38..cd86a40e22a 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1789,12 +1789,12 @@ def cluster_test( stat_fun: callable | None = None, tail: Literal[-1, 0, 1] = 0, threshold=None, - n_permutations: int = 1024, - adjacency: tuple | None = None, - max_step: int = 1, - exclude: list | None = None, - step_down_p: int = 0, - t_power: int = 1, + n_permutations: str | int = 1024, + adjacency: sparse.spmatrix | False = False, + max_step: int = 1, # TODO may need to provide `max_step_time` and `max_step_freq` + exclude: list | None = None, # TODO needs rethink because user passes MNE objects + step_down_p: float = 0.0, + t_power: float = 1.0, check_disjoint: bool = False, out_type: Literal["indices", "mask"] = "indices", seed: None | int | np.random.RandomState = None, @@ -1812,35 +1812,41 @@ def cluster_test( Wilkinson notation formula for design matrix. The names of the dependent and independent variable should match the columns in the dataframe. within_id : None | str - Name of column in ``df`` to use in identifying within-group contrasts. - stat_fun : None | callable - Statistical function to use. - tail : int, optional - 0 for two-tailed, 1 for greater, -1 for less. Default is 0. - n_permutations : int, optional - Number of permutations. Default is 1024. - adjacency : None, optional - Provide a adjacency matrix. Default is None. + Name of column in ``df`` to use in identifying within-group contrasts. If + ``None``, will perform a between-group test. Ignored if the number of groups + (unique values in the independent variable column of ``df``) is greater than 2. + %(stat_fun_clust_both)s + %(tail_clust)s + %(threshold_clust_both)s + %(n_permutations_clust_all)s + %(adjacency_clust_both)s max_step : int, optional Maximum distance between samples (time points). Default is 1. - exclude : np.Array, optional - Exclude no time points or channels. Default is None. - step_down_p : int, optional - Step down in jumps test. Default is 0. - t_power : int, optional - Weigh each location by its stats score. Default is 1. - check_disjoint : bool, optional - Check if clusters are disjoint. Default is False. - out_type : str, optional - Output type. Default is "indices". - seed : None | int | np.random.RandomState, optional - Seed for the random number generator. Default is None. - buffer_size : int, optional - Block size for chunking the data. Default is None. - n_jobs : int, optional - How many cores to use. Default is 1. + exclude : array-like of bool | None + Mask to apply to the data to exclude certain points from clustering + (e.g., medial wall vertices). Should be the same shape as the channels/vertices + dimension of the data objects. If ``None``, no points are excluded. 
+    %(step_down_p_clust)s
+    %(t_power_clust)s
+    check_disjoint : bool
+        Whether to check if the ``adjacency`` matrix can be separated into disjoint
+        sets before clustering. This may lead to faster clustering, especially if
+        the "time" and/or "frequency" dimensions are large.
+    %(out_type_clust)s
+    %(seed)s
+    buffer_size : int | None
+        Block size to use when computing test statistics. This can significantly
+        reduce memory usage when ``n_jobs > 1`` and memory sharing between
+        processes is enabled (see :func:`mne.set_cache_dir`), because the data will be
+        shared between processes and each process only needs to allocate space for
+        a small block of locations at a time.
+    %(n_jobs)s
     %(verbose)s

+    Notes
+    -----
+    %(threshold_clust_t_or_f_notes)s
+
     Returns
     -------
     ClusterResult
diff --git a/mne/utils/docs.py b/mne/utils/docs.py
index 0fa9288bec2..624b2e309e8 100644
--- a/mne/utils/docs.py
+++ b/mne/utils/docs.py
@@ -144,61 +144,54 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75):
     formatting. This can add overhead so is meant only for debugging.
 """

-docdict["adjacency_clust"] = """
-adjacency : scipy.sparse.spmatrix | None | False
+_adjacency_clust_template = """
+adjacency : scipy.sparse.spmatrix | {param_none}False
     Defines adjacency between locations in the data, where "locations" can be
     spatial vertices, frequency bins, time points, etc. For spatial vertices
     (i.e. sensor space data), see :func:`mne.channels.find_ch_adjacency` or
     :func:`mne.spatial_inter_hemi_adjacency`. For source space data, see
-    :func:`mne.spatial_src_adjacency` or
-    :func:`mne.spatio_temporal_src_adjacency`. If ``False``, assumes
-    no adjacency (each location is treated as independent and unconnected).
-    If ``None``, a regular lattice adjacency is assumed, connecting
-    each {sp} location to its neighbor(s) along the last dimension
-    of {{eachgrp}} ``{{x}}``{lastdim}.
+    :func:`mne.spatial_src_adjacency` or :func:`mne.spatio_temporal_src_adjacency`.
+    If ``False``, assumes no adjacency (each location is treated as independent and
+    unconnected).{if_none}
     If ``adjacency`` is a matrix, it is assumed to be symmetric (only the
     upper triangular half is used) and must be square with dimension equal to
-    ``{{x}}.shape[-1]`` {parone} or ``{{x}}.shape[-1] * {{x}}.shape[-2]``
-    {partwo} or (optionally)
-    ``{{x}}.shape[-1] * {{x}}.shape[-2] * {{x}}.shape[-3]``
-    {parthree}.{memory}
+    the product of the last 1, 2, or 3 data dimensions (e.g., for time-frequency data:
+    n_channels, n_channels * n_freqs, or n_channels * n_freqs * n_times).{memory}
+"""
+_if_none = """ If ``None``, a regular lattice adjacency is assumed, connecting
+    each {spatial}location to its neighbor(s) along the last dimension
+    of {the_data}.
 """

-mem = (
-    " If spatial adjacency is uniform in time, it is recommended to use "
-    "a square matrix with dimension ``{x}.shape[-1]`` (n_vertices) to save "
-    "memory and computation, and to use ``max_step`` to define the extent "
-    "of temporal adjacency to consider when clustering."
-)
-comb = " The function `mne.stats.combine_adjacency` may be useful for 4D data."
 st = dict(
-    sp="spatial",
-    lastdim="",
-    parone="(n_vertices)",
-    partwo="(n_times * n_vertices)",
-    parthree="(n_times * n_freqs * n_vertices)",
-    memory=mem,
+    param_none="None | ",
+    if_none=_if_none.format(spatial="spatial ", the_data="{eachgrp} ``{x}``"),
+    memory="""
+    If spatial adjacency is uniform in time, it is recommended to use a square matrix
+    with dimension ``{x}.shape[-1]`` (n_vertices) to save memory and computation,
+    and to use ``max_step`` to define the extent of temporal adjacency to consider when
+    clustering.
+""",
 )
 tf = dict(
-    sp="",
-    lastdim=" (or the last two dimensions if ``{x}`` is 2D)",
-    parone="(for 2D data)",
-    partwo="(for 3D data)",
-    parthree="(for 4D data)",
-    memory=comb,
+    param_none="None | ",
+    if_none=_if_none.format(
+        spatial="",
+        the_data="{eachgrp} ``{x}`` (or the last two dimensions if ``{x}`` is 2D)",
+    ),
+    memory="""
+    The function `mne.stats.combine_adjacency` may be useful for 4D data.
+""",
 )
-nogroups = dict(eachgrp="", x="X")
+nogrps = dict(eachgrp="", x="X")
 groups = dict(eachgrp="each group ", x="X[k]")
-docdict["adjacency_clust_1"] = (
-    docdict["adjacency_clust"].format(**tf).format(**nogroups)
-)
-docdict["adjacency_clust_n"] = docdict["adjacency_clust"].format(**tf).format(**groups)
-docdict["adjacency_clust_st1"] = (
-    docdict["adjacency_clust"].format(**st).format(**nogroups)
-)
-docdict["adjacency_clust_stn"] = (
-    docdict["adjacency_clust"].format(**st).format(**groups)
+
+docdict["adjacency_clust_1"] = _adjacency_clust_template.format(**tf).format(**nogrps)
+docdict["adjacency_clust_both"] = _adjacency_clust_template.format(
+    param_none="", if_none="", memory=""
 )
+docdict["adjacency_clust_n"] = _adjacency_clust_template.format(**tf).format(**groups)
+docdict["adjacency_clust_st1"] = _adjacency_clust_template.format(**st).format(**nogrps)
+docdict["adjacency_clust_stn"] = _adjacency_clust_template.format(**st).format(**groups)

 docdict["adjust_dig_chpi"] = """
 adjust_dig : bool
@@ -708,7 +701,7 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75):
 docdict["check_disjoint_clust"] = """
 check_disjoint : bool
-    Whether to check if the connectivity matrix can be separated into disjoint
+    Whether to check if the ``adjacency`` matrix can be separated into disjoint
     sets before clustering. This may lead to faster clustering, especially if
     the second dimension of ``X`` (usually the "time" dimension) is large.
 """
@@ -1416,7 +1409,7 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75):
 """

 docdict["exclude_clust"] = """
-exclude : bool array or None
+exclude : array-like of bool | None
     Mask to apply to the data to exclude certain points from clustering
     (e.g., medial wall vertices). Should be the same shape as ``X``. If
     ``None``, no points are excluded.
@@ -3962,7 +3955,7 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75):
 seed : None | int | instance of ~numpy.random.RandomState
     A seed for the NumPy random number generator (RNG). If ``None`` (default),
     the seed will be obtained from the operating system
-    (see :class:`~numpy.random.RandomState` for details), meaning it will most 
+    (see :class:`~numpy.random.RandomState` for details), meaning it will most
     likely produce different output every time this function or method is run.
     To achieve reproducible results, pass a value here to explicitly initialize
     the RNG with a defined state.
@@ -4253,16 +4246,23 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75):
     channel names in the file will be used when possible.
 """

-_stat_fun_clust_base = """
+_stat_fun_template = """
 stat_fun : callable | None
     Function called to calculate the test statistic. Must accept 1D-array as
-    input and return a 1D array. If ``None`` (the default), uses
-    `mne.stats.{}`.
+    input and return a 1D array. If ``None`` (the default), uses {}.
 """

-docdict["stat_fun_clust_f"] = _stat_fun_clust_base.format("f_oneway")
+docdict["stat_fun_clust_both"] = _stat_fun_template.format(
+    """:func:`mne.stats.ttest_1samp_no_p`
+    for paired tests and :func:`mne.stats.f_oneway` for unpaired tests or tests of
+    more than 2 groups."""
+)
+
+docdict["stat_fun_clust_f"] = _stat_fun_template.format(":func:`mne.stats.f_oneway`")

-docdict["stat_fun_clust_t"] = _stat_fun_clust_base.format("ttest_1samp_no_p")
+docdict["stat_fun_clust_t"] = _stat_fun_template.format(
+    ":func:`mne.stats.ttest_1samp_no_p`"
+)

 docdict["static"] = """
 static : instance of SpatialImage
@@ -4473,10 +4473,10 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75):
 threshold : float | dict | None
     The so-called "cluster forming threshold" in the form of a test statistic
     (note: this is not an alpha level / "p-value").
-    If numeric, vertices with data values more extreme than ``threshold`` will
-    be used to form clusters. If ``None``, {} will be chosen
+    If numeric, vertices with stat values more extreme than ``threshold`` will
+    be used to form clusters. If ``None``, {which_thresh} will be chosen
     automatically that corresponds to a p-value of 0.05 for the given number of
-    observations (only valid when using {}). If ``threshold`` is a
+    observations (only valid when using {which_stat}). If ``threshold`` is a
     :class:`dict` (with keys ``'start'`` and ``'step'``) then threshold-free
     cluster enhancement (TFCE) will be used (see the
     :ref:`TFCE example <tfce_example>` and :footcite:`SmithNichols2009`).
@@ -4484,8 +4484,14 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75):
     a particular p-value for one-tailed or two-tailed tests.
 """

-f_test = ("an F-threshold", "an F-statistic")
-docdict["threshold_clust_f"] = _threshold_clust_base.format(*f_test)
+docdict["threshold_clust_both"] = _threshold_clust_base.format(
+    which_thresh="a t- or F-threshold",
+    which_stat="``stat_fun=None``, i.e., a paired t-test or one-way F-test",
+)
+
+docdict["threshold_clust_f"] = _threshold_clust_base.format(
+    which_thresh="an F-threshold", which_stat="an F-statistic"
+)

 docdict["threshold_clust_f_notes"] = """
 For computing a ``threshold`` based on a p-value, use the conversion
 from :meth:`scipy.stats.rv_continuous.ppf`::

     pval = 0.001  # arbitrary
@@ -4497,8 +4503,9 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75):
     thresh = scipy.stats.f.ppf(1 - pval, dfn=dfn, dfd=dfd)  # F distribution
 """

-t_test = ("a t-threshold", "a t-statistic")
-docdict["threshold_clust_t"] = _threshold_clust_base.format(*t_test)
+docdict["threshold_clust_t"] = _threshold_clust_base.format(
+    which_thresh="a t-threshold", which_stat="a t-statistic"
+)

 docdict["threshold_clust_t_notes"] = """
 For computing a ``threshold`` based on a p-value, use the conversion
@@ -4512,6 +4519,23 @@ def _reflow_param_docstring(docstring, has_first_line=True, width=75):
 For testing the lower tail (``tail=-1``), don't subtract ``pval`` from 1.
""" +docdict["threshold_clust_t_or_f_notes"] = """ +For computing a ``threshold`` based on a p-value, use the conversion +from :meth:`scipy.stats.rv_continuous.ppf`:: + + pval = 0.001 # arbitrary + # for t-statistic + df = n_observations - 1 # degrees of freedom for the t-test + thresh = scipy.stats.t.ppf(1 - pval / 2, df) # two-tailed, t distribution + # for f-statistic + dfn = n_conditions - 1 # degrees of freedom numerator + dfd = n_observations - n_conditions # degrees of freedom denominator + thresh = scipy.stats.f.ppf(1 - pval, dfn=dfn, dfd=dfd) # F distribution + +For a one-tailed test (``tail=1``), don't divide the p-value by 2. +For testing the lower tail (``tail=-1``), don't subtract ``pval`` from 1. +""" + docdict["time_bandwidth_tfr"] = """ time_bandwidth : float ``≥ 2.0`` Product between the temporal window length (in seconds) and the *full* From 2c27a6989e5b304966ac085d134f7ee14d3a562a Mon Sep 17 00:00:00 2001 From: CarinaFo Date: Mon, 5 Aug 2024 13:15:37 +0200 Subject: [PATCH 73/88] implemented Dan's comments --- mne/stats/cluster_level.py | 65 ++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 30 deletions(-) diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py index cd86a40e22a..0b5b35889f6 100644 --- a/mne/stats/cluster_level.py +++ b/mne/stats/cluster_level.py @@ -1768,7 +1768,7 @@ def _validate_cluster_df(df: pd.DataFrame, dv_name: str, iv_name: str): all_shapes = set( df[dv_name].map(lambda x: x.shape[1:]) ) # first dim may vary (participants or epochs) - elif isinstance(inst, (BaseEpochs | BaseTFR)): # should include BaseTFR? + elif isinstance(inst, (BaseEpochs | BaseTFR)): all_shapes = set(df[dv_name].map(lambda x: x.get_data().shape[1:])) else: all_shapes = set(df[dv_name].map(lambda x: x.get_data().shape)) @@ -1790,7 +1790,7 @@ def cluster_test( tail: Literal[-1, 0, 1] = 0, threshold=None, n_permutations: str | int = 1024, - adjacency: sparse.spmatrix | False = False, + adjacency: sparse.spmatrix | None | False = None, # should be None (default) max_step: int = 1, # TODO may need to provide `max_step_time` and `max_step_freq` exclude: list | None = None, # TODO needs rethink because user passes MNE objects step_down_p: float = 0.0, @@ -1810,7 +1810,7 @@ def cluster_test( Dataframe containing the data, dependent and independent variables. formula : str Wilkinson notation formula for design matrix. The names of the dependent - and independent variable should match the columns in the dataframe. + and independent variable should match the columns in ``df``. within_id : None | str Name of column in ``df`` to use in identifying within-group contrasts. If ``None``, will perform a between-group test. 
@@ -1870,7 +1870,7 @@
     df.sort_values([iv_name, within_id], inplace=True)
     counts = df[within_id].value_counts()
     if any(counts != 2):
-        raise ValueError("for paired tttest, each subject must have 2 observations")
+        raise ValueError("for paired t-test, each subject must have 2 observations")

     # extract the data from the dataframe
     def _extract_data_array(series):
         return np.concatenate(series.values)
@@ -1907,7 +1907,7 @@ def _extract_data_tfr(series):
     elif within_id in df:
         kind = "within"
         X = X[0] - X[1]
-    else:  # what would be another else case?
+    else:  # 2 elements in X but no within_id provided → unpaired test
        kind = "between"

     # define stat function and threshold
@@ -1971,7 +1971,7 @@ def __init__(
         self.H0 = H0
         self.stat_fun = stat_fun

-        # unpaired t-test is f_oneway
+        # unpaired t-test equivalent to f_oneway w/ 2 groups
         if stat_fun is f_oneway:
             self.stat_name = "F-statistic"
         elif stat_fun is ttest_1samp_no_p:
@@ -1979,7 +1979,15 @@
         else:
             self.stat_name = "test statistic"

-    def plot_cluster_time_sensor(self, condition_labels: dict):
+    def plot_cluster_time_sensor(
+        self,
+        condition_labels: dict,
+        colors: list | dict | None = None,
+        linestyles: list | dict | None = None,
+        cmap_evokeds: None | str | tuple = None,
+        cmap_topo: None | str | tuple = None,
+        ci: float | bool | callable() | None = None,
+    ):
         """
         Plot the cluster with the lowest p-value.

@@ -1991,21 +1999,23 @@
         ----------
         condition_labels : dict
             Dictionary with condition labels as keys and evoked objects as values.
+        colors : list|dict|None
+            Colors to use when plotting the ERP lines and confidence bands.
+        linestyles : list|dict|None
+            Styles to use when plotting the ERP lines.
+        cmap_evokeds : None|str|tuple
+            Colormap from which to draw color values when plotting the ERP lines.
+        cmap_topo : matplotlib colormap
+            Colormap to use for the topomap.
+        ci : float|bool|callable()|None
+            Confidence band around each ERP time series.
         """
-        # define colorblind friendly colors
-        colorblind_palette = ["#4daf4a", "#984ea3"]
-
         # extract condition labels from the dictionary
         cond_keys = list(condition_labels.keys())

         # extract the evokeds from the dictionary
         cond_values = list(condition_labels.values())

-        # configure variables for visualization
-        colors = {
-            cond_keys[0]: colorblind_palette[0],
-            cond_keys[1]: colorblind_palette[1],
-        }
-
+        linestyles = {cond_keys[0]: "-", cond_keys[1]: "--"}

         lowest_p_cluster = np.argmin(self.cluster_p_values)
@@ -2033,7 +2043,7 @@ def plot_cluster_time_sensor(
             times=0,
             mask=mask,
             axes=ax_topo,
-            cmap="RdBu_r",
+            cmap=cmap_topo,
             show=False,
             colorbar=False,
             mask_params=dict(markersize=10),
@@ -2042,13 +2052,11 @@
         image = ax_topo.images[0]

         # remove the title that would otherwise say "0.000 s"
-        ax_topo.set_title("")
-
-        # soft import?
-        # make_axes_locatable = _soft_import(
-        #     "mpl_toolkits.axes_grid1.make_axes_locatable",
-        #     purpose="plot cluster results"
-        # )  # soft import (not a dependency for MNE)
+        ax_topo.set_title(
+            "Spatial cluster extent:\n averaged from {:0.3f} to {:0.3f} s".format(
+                *sig_times[[0, -1]]
+            )
+        )

         # create additional axes (for ERF and colorbar)
         divider = make_axes_locatable(ax_topo)
@@ -2057,11 +2065,6 @@
         ax_colorbar = divider.append_axes("right", size="5%", pad=0.1)
         cbar = plt.colorbar(image, cax=ax_colorbar)
         cbar.set_label(self.stat_name)
-        ax_topo.set_xlabel(
-            "Spatial cluster extent:\n averaged from {:0.3f} to {:0.3f} s".format(
-                *sig_times[[0, -1]]
-            )
-        )

         # add new axis for time courses and plot time courses
         ax_signals = divider.append_axes("right", size="300%", pad=1.3)
@@ -2074,11 +2077,13 @@
             picks=ch_inds,
             axes=ax_signals,
             colors=colors,
-            linestyles=line_styles,
+            linestyles=linestyles,
+            cmap=cmap_evokeds,
             show=False,
             split_legend=True,
             truncate_yaxis="auto",
             truncate_xaxis=False,
+            ci=ci,
         )
         plt.legend(frameon=False, loc="upper left")

From 2664ee218a73a8526726df0ccdb3d24fa912329c Mon Sep 17 00:00:00 2001
From: CarinaFo
Date: Mon, 5 Aug 2024 13:22:41 +0200
Subject: [PATCH 74/88] implemented Dan's comments

---
 tutorials/stats-sensor-space/76_new_cluster_test_api.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py
index b7f933d127b..fb928f89d0a 100644
--- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py
+++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py
@@ -152,7 +152,7 @@

 # finally let's plot the results using the ClusterResults class
 # we plot the cluster with the lowest p-value
-cluster_result.plot_cluster_time_sensor(condition_labels=conditions_dict)
+cluster_result.plot_cluster_time_sensor(condition_labels=conditions_dict, ci=True)

 # we can see that there is something going on around 400 ms
 # with a stronger signal for target trials in right central-parietal channels

From 492754436fe8fa8e69f1419f2971e1c63a0a2b58 Mon Sep 17 00:00:00 2001
From: CarinaFo
Date: Mon, 5 Aug 2024 16:11:23 +0200
Subject: [PATCH 75/88] test for handling different MNE objects - test is failing

---
 mne/stats/tests/test_cluster_level.py | 101 +++++++++++++++++++++++---
 1 file changed, 90 insertions(+), 11 deletions(-)

diff --git a/mne/stats/tests/test_cluster_level.py b/mne/stats/tests/test_cluster_level.py
index 00989e3e00c..4391c51f238 100644
--- a/mne/stats/tests/test_cluster_level.py
+++ b/mne/stats/tests/test_cluster_level.py
@@ -904,21 +904,100 @@ def test_compare_old_and_new_cluster_api():
 @pytest.mark.parametrize(
     "Inst", (EpochsArray, EvokedArray, EpochsTFRArray, AverageTFRArray)
 )
+@pytest.mark.filterwarnings('ignore:Ignoring argument "tail":RuntimeWarning')
 def test_new_cluster_api(Inst):
     """Test handling different MNE objects in the cluster API."""
     pd = pytest.importorskip("pandas")

-    n_epo, n_chan, n_freq, n_times = 2, 3, 5, 7
-    shape = (n_chan, n_times)
-    if Inst in (EpochsArray, EpochsTFRArray):
-        shape = (n_epo,) + shape
-    if Inst in (EpochsTFRArray, AverageTFRArray):
-        shape = shape[:-1] + (n_freq, shape[-1])
+    n_epo, n_chan, n_freq, n_times = 2, 3, 4, 5
+    info = create_info(ch_names=n_chan, sfreq=1000, ch_types="eeg")
+    # Introduce a significant difference in a specific region, time, and frequency
+    region_start = 1
+    region_end = 2
+    time_start = 2
+    time_end = 4
+    freq_start = 2
+    freq_end = 4
+
+    if Inst == EpochsArray:
+        # Create random data for EpochsArray
+        inst1 = Inst(np.random.randn(n_epo, n_chan, n_times), info=info)
+        # Adding a constant to create a difference
+        data_copy = inst1.get_data().copy()  # no data attribute for EpochsArray
+        data_copy[:, region_start:region_end, time_start:time_end] += (
+            2  # Modify the copy
+        )
+        inst2 = Inst(
+            data=data_copy, info=info
+        )  # Use the modified copy as a new instance
+
+    elif Inst == EvokedArray:
+        # Create random data for EvokedArray
+        inst1 = Inst(np.random.randn(n_chan, n_times), info=info)
+        data_copy = inst1.data.copy()
+        data_copy[region_start:region_end, time_start:time_end] += 2
+        inst2 = Inst(data=data_copy, info=info)
+
+    elif Inst == EpochsTFRArray:
+        # Create random data for EpochsTFRArray
+        data_tfr1 = np.random.randn(n_epo, n_chan, n_freq, n_times)
+        data_tfr2 = np.random.randn(n_epo, n_chan, n_freq, n_times)
+        inst1 = Inst(
+            data=data_tfr1, info=info, times=np.arange(n_times), freqs=np.arange(n_freq)
+        )
+        inst2 = Inst(
+            data=data_tfr2, info=info, times=np.arange(n_times), freqs=np.arange(n_freq)
+        )
+        data_tfr2 = inst2.data.copy()
+        data_tfr2[
+            :, region_start:region_end, freq_start:freq_end, time_start:time_end
+        ] += 2
+        inst2 = Inst(
+            data=data_tfr2, info=info, times=np.arange(n_times), freqs=np.arange(n_freq)
+        )
+
+    elif Inst == AverageTFRArray:
+        # Create random data for AverageTFRArray
+        data_tfr1 = np.random.randn(n_chan, n_freq, n_times)
+        data_tfr2 = np.random.randn(n_chan, n_freq, n_times)
+        inst1 = Inst(
+            data=data_tfr1, info=info, times=np.arange(n_times), freqs=np.arange(n_freq)
+        )
+        inst2 = Inst(
+            data=data_tfr2, info=info, times=np.arange(n_times), freqs=np.arange(n_freq)
+        )
+        data_tfr2 = inst2.data.copy()
+        data_tfr2[
+            region_start:region_end, freq_start:freq_end, time_start:time_end
+        ] += 2
+        inst2 = Inst(
+            data=data_tfr2, info=info, times=np.arange(n_times), freqs=np.arange(n_freq)
+        )

-    info = create_info(...)
-    inst1 = Inst(np.random.normal(shape, ...), info=info)
-    inst2 = Inst(np.random.normal(shape, ...), info=info)
+    # test old and new API with sample data
     df = pd.DataFrame(dict(data=[inst1, inst2], condition=["a", "b"]))
-    result = cluster_test(df, "data~condition", ...)
-    assert result  # TODO do something more interesting here
+    kwargs = dict(n_permutations=100, seed=1, tail=1, buffer_size=None, out_type="mask")
+
+    result_new_api = cluster_test(df, "data~condition", **kwargs)
+
+    # make sure channels are last dimension for old API
+    if Inst == EpochsArray:
+        inst1 = inst1.get_data().transpose(0, 2, 1)
+        inst2 = inst2.get_data().transpose(0, 2, 1)
+    elif Inst == EpochsTFRArray:
+        inst1 = inst1.data.transpose(0, 3, 2, 1)
+        inst2 = inst2.data.transpose(0, 3, 2, 1)
+    elif Inst == AverageTFRArray:
+        inst1 = inst1.data.transpose(2, 1, 0)
+        inst2 = inst2.data.transpose(2, 1, 0)
+    else:
+        inst1 = inst1.data.transpose(1, 0)
+        inst2 = inst2.data.transpose(1, 0)
+
+    F_obs, clusters, cluster_pvals, H0 = permutation_cluster_test(
+        [inst1, inst2], **kwargs
+    )
+    assert_array_equal(result_new_api.H0, H0)
+    assert_array_equal(result_new_api.stat_obs, F_obs)
+    assert_array_equal(result_new_api.cluster_p_values, cluster_pvals)
+    assert result_new_api.clusters == clusters

From 006acdf9d87f21d180fa6993540fcccefa281829 Mon Sep 17 00:00:00 2001
From: CarinaFo
Date: Tue, 6 Aug 2024 16:54:31 +0200
Subject: [PATCH 76/88] adjusted test to account for multiple subjects

---
 mne/stats/tests/test_cluster_level.py | 40 ++++++++++++++++++++++-----
 1 file changed, 33 insertions(+), 7 deletions(-)

diff --git a/mne/stats/tests/test_cluster_level.py b/mne/stats/tests/test_cluster_level.py
index 4391c51f238..9b10aacabf5 100644
--- a/mne/stats/tests/test_cluster_level.py
+++ b/mne/stats/tests/test_cluster_level.py
@@ -909,7 +909,7 @@ def test_new_cluster_api(Inst):
     """Test handling different MNE objects in the cluster API."""
     pd = pytest.importorskip("pandas")

-    n_epo, n_chan, n_freq, n_times = 2, 3, 4, 5
+    n_subs, n_epo, n_chan, n_freq, n_times = 2, 2, 3, 4, 5
     info = create_info(ch_names=n_chan, sfreq=1000, ch_types="eeg")
     # Introduce a significant difference in a specific region, time, and frequency
     region_start = 1
@@ -974,9 +974,25 @@
             data=data_tfr2, info=info, times=np.arange(n_times), freqs=np.arange(n_freq)
         )

-    # test old and new API with sample data
-    df = pd.DataFrame(dict(data=[inst1, inst2], condition=["a", "b"]))
-    kwargs = dict(n_permutations=100, seed=1, tail=1, buffer_size=None, out_type="mask")
+    if Inst == EvokedArray or Inst == AverageTFRArray:
+        # Generate random noise
+        noise = np.random.normal(loc=0, scale=0.1, size=inst1.data.shape)
+        # add noise to the data of the second subject
+        inst1_n = inst1.copy()
+        inst1_n.data = inst1.data + noise
+        inst2_n = inst2.copy()
+        inst2_n.data = inst2.data + noise
+        data = [inst1, inst2, inst1_n, inst2_n]
+        conds = ["a", "b"] * n_subs
+    else:
+        data = [inst1, inst2]
+        conds = ["a", "b"]
+
+    df = pd.DataFrame(dict(data=data, condition=conds))
+
+    kwargs = dict(
+        n_permutations=100, seed=42, tail=1, buffer_size=None, out_type="mask"
+    )

     result_new_api = cluster_test(df, "data~condition", **kwargs)

     # make sure channels are last dimension for old API
     if Inst == EpochsArray:
         inst1 = inst1.get_data().transpose(0, 2, 1)
         inst2 = inst2.get_data().transpose(0, 2, 1)
     elif Inst == EpochsTFRArray:
         inst1 = inst1.data.transpose(0, 3, 2, 1)
         inst2 = inst2.data.transpose(0, 3, 2, 1)
     elif Inst == AverageTFRArray:
         inst1 = inst1.data.transpose(2, 1, 0)
         inst2 = inst2.data.transpose(2, 1, 0)
+        inst1_n = inst1_n.data.transpose(2, 1, 0)
+        inst2_n = inst2_n.data.transpose(2, 1, 0)
+        # combine the data of the two subjects
+        inst1 = np.concatenate([inst1[np.newaxis, :], inst1_n[np.newaxis, :]], axis=0)
+        inst2 = np.concatenate([inst2[np.newaxis, :], inst2_n[np.newaxis, :]], axis=0)
     else:
         inst1 = inst1.data.transpose(1, 0)
         inst2 = inst2.data.transpose(1, 0)
+        inst1_n = inst1_n.data.transpose(1, 0)
+        inst2_n = inst2_n.data.transpose(1, 0)
+        # combine the data of the two subjects
+        inst1 = np.concatenate([inst1[np.newaxis, :], inst1_n[np.newaxis, :]], axis=0)
+        inst2 = np.concatenate([inst2[np.newaxis, :], inst2_n[np.newaxis, :]], axis=0)

     F_obs, clusters, cluster_pvals, H0 = permutation_cluster_test(
         [inst1, inst2], **kwargs
     )
-    assert_array_equal(result_new_api.H0, H0)
-    assert_array_equal(result_new_api.stat_obs, F_obs)
-    assert_array_equal(result_new_api.cluster_p_values, cluster_pvals)
+    assert_array_almost_equal(result_new_api.H0, H0)
+    assert_array_almost_equal(result_new_api.stat_obs, F_obs)
+    assert_array_almost_equal(result_new_api.cluster_p_values, cluster_pvals)
     assert result_new_api.clusters == clusters

From f0f4cba540e8d51f93d46b32b8f98340f7d9044c Mon Sep 17 00:00:00 2001
From: Daniel McCloy
Date: Sat, 10 Aug 2024 17:39:40 -0500
Subject: [PATCH 77/88] refactor df validation to return bools

---
 mne/stats/cluster_level.py | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py
index 0b5b35889f6..7dff9a41a0a 100644
--- a/mne/stats/cluster_level.py
+++ b/mne/stats/cluster_level.py
@@ -35,6 +35,7 @@
     verbose,
     warn,
 )
+from ..utils.mixin import GetEpochsMixin
 from ..viz import plot_compare_evokeds
 from .parametric import f_oneway, ttest_1samp_no_p

@@ -1777,7 +1778,11 @@ def _validate_cluster_df(df: pd.DataFrame, dv_name: str, iv_name: str):
             f"{prologue} consistent shape, but {len(all_shapes)} different "
             f"shapes were found: {'; '.join(all_shapes)}."
         )
-    return all_types.pop()  # return the type of the data column entries
+    obj_type = all_types.pop()
+    is_epo = GetEpochsMixin in obj_type.__mro__
+    is_tfr = BaseTFR in obj_type.__mro__
+    is_arr = np.ndarray in obj_type.__mro__
+    return is_epo, is_tfr, is_arr

 @verbose
@@ -1861,7 +1866,7 @@
     iv_name = str(np.array(formula.rhs.root).item())

     # validate the input dataframe and return the type of the data column entries
-    _dtype = _validate_cluster_df(df, dv_name, iv_name)
+    is_epo, is_tfr, is_arr = _validate_cluster_df(df, dv_name, iv_name)

     # for within_subject designs, check if each subject has 2 observations
     _validate_type(within_id, (str, None), "within_id")
@@ -1873,23 +1878,18 @@
     # extract the data from the dataframe
-    def _extract_data_array(series):
-        return np.concatenate(series.values)
+    outer_func = np.concatenate if is_epo or is_arr else np.array
+    axes = (-3, -1) if is_tfr else (-2, -1)

-    def _extract_data_mne(series):  # 2D data
-        return np.array(
-            series.map(lambda inst: inst.get_data().swapaxes(-2, -1)).to_list()
+    def func_mne(series):
+        return outer_func(
+            series.map(lambda inst: inst.get_data().swapaxes(*axes)).to_list()
         )

-    def _extract_data_tfr(series):
-        return series.map(lambda inst: inst.get_data().swapaxes(-3, -1)).to_list()
+    def func_array(series):
+        return outer_func(series.values)

-    if _dtype is np.ndarray:
-        func = _extract_data_array
-    elif _dtype is BaseTFR:
-        func = _extract_data_tfr
-    else:
-        func = _extract_data_mne
+    func = func_array if is_arr else func_mne

     # convert to a list-like X for clustering
     X = df.groupby(iv_name).agg({dv_name: func})[dv_name].to_list()

From 346e3ce1270bcffa19edeeb8de3949b078729862 Mon Sep 17 00:00:00 2001
From: Daniel McCloy
Date: Sat, 10 Aug 2024 17:40:14 -0500
Subject: [PATCH 78/88] unrelated typing fix

---
 mne/stats/cluster_level.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py
index 7dff9a41a0a..dcaf3e615b0 100644
--- a/mne/stats/cluster_level.py
+++ b/mne/stats/cluster_level.py
@@ -1986,7 +1986,7 @@ def plot_cluster_time_sensor(
         linestyles: list | dict | None = None,
         cmap_evokeds: None | str | tuple = None,
         cmap_topo: None | str | tuple = None,
-        ci: float | bool | callable() | None = None,
+        ci: float | bool | callable | None = None,
     ):
         """
         Plot the cluster with the lowest p-value.

From a49d2cd8e888786b61e7c7ce29ad3d56c3b7be2e Mon Sep 17 00:00:00 2001
From: Daniel McCloy
Date: Sat, 10 Aug 2024 17:41:19 -0500
Subject: [PATCH 79/88] rework test

---
 mne/stats/tests/test_cluster_level.py | 164 ++++++++++----------------
 1 file changed, 60 insertions(+), 104 deletions(-)

diff --git a/mne/stats/tests/test_cluster_level.py b/mne/stats/tests/test_cluster_level.py
index 9b10aacabf5..ed0d830bdfd 100644
--- a/mne/stats/tests/test_cluster_level.py
+++ b/mne/stats/tests/test_cluster_level.py
@@ -909,121 +909,77 @@ def test_new_cluster_api(Inst):
     """Test handling different MNE objects in the cluster API."""
     pd = pytest.importorskip("pandas")

-    n_subs, n_epo, n_chan, n_freq, n_times = 2, 2, 3, 4, 5
+    rng = np.random.default_rng(seed=8675309)
+    is_epo = Inst in (EpochsTFRArray, EpochsArray)
+    is_tfr = Inst in (EpochsTFRArray, AverageTFRArray)
+
+    n_epo, n_chan, n_freq, n_times = 6, 3, 4, 5
+
+    # prepare the dimensions of the simulated data, then simulate
+    size = (n_chan,)
+    if is_epo:
+        size = (n_epo, *size)
+    if is_tfr:
+        size = (*size, n_freq)
+    size = (*size, n_times)
+    data = rng.normal(size=size)
+
+    # construct the instance
     info = create_info(ch_names=n_chan, sfreq=1000, ch_types="eeg")
-    # Introduce a significant difference in a specific region, time, and frequency
-    region_start = 1
-    region_end = 2
-    time_start = 2
-    time_end = 4
-    freq_start = 2
-    freq_end = 4
-
-    if Inst == EpochsArray:
-        # Create random data for EpochsArray
-        inst1 = Inst(np.random.randn(n_epo, n_chan, n_times), info=info)
-        # Adding a constant to create a difference
-        data_copy = inst1.get_data().copy()  # no data attribute for EpochsArray
-        data_copy[:, region_start:region_end, time_start:time_end] += (
-            2  # Modify the copy
-        )
-        inst2 = Inst(
-            data=data_copy, info=info
-        )  # Use the modified copy as a new instance
-
-    elif Inst == EvokedArray:
-        # Create random data for EvokedArray
-        inst1 = Inst(np.random.randn(n_chan, n_times), info=info)
-        data_copy = inst1.data.copy()
-        data_copy[region_start:region_end, time_start:time_end] += 2
-        inst2 = Inst(data=data_copy, info=info)
-
-    elif Inst == EpochsTFRArray:
-        # Create random data for EpochsTFRArray
-        data_tfr1 = np.random.randn(n_epo, n_chan, n_freq, n_times)
-        data_tfr2 = np.random.randn(n_epo, n_chan, n_freq, n_times)
-        inst1 = Inst(
-            data=data_tfr1, info=info, times=np.arange(n_times), freqs=np.arange(n_freq)
-        )
-        inst2 = Inst(
-            data=data_tfr2, info=info, times=np.arange(n_times), freqs=np.arange(n_freq)
-        )
-        data_tfr2 = inst2.data.copy()
-        data_tfr2[
-            region_start:region_end, freq_start:freq_end, time_start:time_end
-        ] += 2
-        inst2 = Inst(
-            data=data_tfr2, info=info, times=np.arange(n_times), freqs=np.arange(n_freq)
-        )
-
-    if Inst == EvokedArray or Inst == AverageTFRArray:
-        # Generate random noise
-        noise = np.random.normal(loc=0, scale=0.1, size=inst1.data.shape)
-        # add noise to the data of the second subject
-        inst1_n = inst1.copy()
-        inst1_n.data = inst1.data + noise
-        inst2_n = inst2.copy()
-        inst2_n.data = inst2.data + noise
-        data = [inst1, inst2, inst1_n, inst2_n]
-        conds = ["a", "b"] * n_subs
+    kw = dict(times=np.arange(n_times), freqs=np.arange(n_freq)) if is_tfr else dict()
+    cond_a = Inst(data=data, info=info, **kw)
+    cond_b = cond_a.copy()
+    # introduce a significant difference in a specific region, time, and frequency
+    ch_start, ch_end = 0, 2  # 2 channels
+    t_start, t_end = 2, 4  # 2 times
+    f_start, f_end = 2, 4  # 2 freqs
+    if is_tfr:
+        cond_b._data[..., ch_start:ch_end, f_start:f_end, t_start:t_end] += 2
+    else:
+        cond_b._data[..., ch_start:ch_end, t_start:t_end] += 2
+    # for Evokeds/AverageTFRs, we create fake "subjects" as our observations within each
+    # condition. We add a bit of noise while we do so.
+    if not is_epo:
+        insts = list()
+        for cond in cond_a, cond_b:
+            for _n in range(n_epo):
+                if not _n:
+                    insts.append(cond)
+                    continue
+                _cond = cond.copy()
+                _cond.data += rng.normal(scale=0.1, size=_cond.data.shape)
+                insts.append(_cond)
+        conds = np.repeat(["a", "b"], n_epo).tolist()
     else:
-        data = [inst1, inst2]
-        conds = ["a", "b"]
+        # For Epochs(TFR)Array, each epoch is an observation and they're already
+        # noisy/non-identical, so no duplication / noise-addition necessary.
+        insts = [cond_a, cond_b]
+        conds = ["a", "b"]

-    df = pd.DataFrame(dict(data=data, condition=conds))
+    # run new clustering API
+    df = pd.DataFrame(dict(data=insts, condition=conds))
     kwargs = dict(
         n_permutations=100, seed=42, tail=1, buffer_size=None, out_type="mask"
     )
     result_new_api = cluster_test(df, "data~condition", **kwargs)

     # make sure channels are last dimension for old API
-    if Inst == EpochsArray:
-        inst1 = inst1.get_data().transpose(0, 2, 1)
-        inst2 = inst2.get_data().transpose(0, 2, 1)
-    elif Inst == EpochsTFRArray:
-        inst1 = inst1.data.transpose(0, 3, 2, 1)
-        inst2 = inst2.data.transpose(0, 3, 2, 1)
-    elif Inst == AverageTFRArray:
-        inst1 = inst1.data.transpose(2, 1, 0)
-        inst2 = inst2.data.transpose(2, 1, 0)
-        inst1_n = inst1_n.data.transpose(2, 1, 0)
-        inst2_n = inst2_n.data.transpose(2, 1, 0)
-        # combine the data of the two subjects
-        inst1 = np.concatenate([inst1[np.newaxis, :], inst1_n[np.newaxis, :]], axis=0)
-        inst2 = np.concatenate([inst2[np.newaxis, :], inst2_n[np.newaxis, :]], axis=0)
+    if is_epo:
+        axes = (0, 3, 2, 1) if is_tfr else (0, 2, 1)
+        X = [cond_a.get_data().transpose(*axes), cond_b.get_data().transpose(*axes)]
     else:
-        inst1 = inst1.data.transpose(1, 0)
-        inst2 = inst2.data.transpose(1, 0)
-        inst1_n = inst1_n.data.transpose(1, 0)
-        inst2_n = inst2_n.data.transpose(1, 0)
-        # combine the data of the two subjects
-        inst1 = np.concatenate([inst1[np.newaxis, :], inst1_n[np.newaxis, :]], axis=0)
-        inst2 = np.concatenate([inst2[np.newaxis, :], inst2_n[np.newaxis, :]], axis=0)
+        axes = (2, 1, 0) if is_tfr else (1, 0)
+        Xa = list()
+        Xb = list()
+        for inst, cond in zip(insts, conds):
+            container = Xa if cond == "a" else Xb
+            container.append(inst.get_data().transpose(*axes))
+        X = [np.stack(Xa), np.stack(Xb)]

-    F_obs, clusters, cluster_pvals, H0 = permutation_cluster_test(
-        [inst1, inst2], **kwargs
-    )
+    F_obs, clusters, cluster_pvals, H0 = permutation_cluster_test(X, **kwargs)
     assert_array_almost_equal(result_new_api.H0, H0)
     assert_array_almost_equal(result_new_api.stat_obs, F_obs)
     assert_array_almost_equal(result_new_api.cluster_p_values, cluster_pvals)
-    assert result_new_api.clusters == clusters
+    assert len(result_new_api.clusters) == len(clusters)
+    for clu1, clu2 in zip(result_new_api.clusters, clusters):
+        assert_array_equal(clu1, clu2)

From a01182b56ea7c932db51405748f2d731c45f6d92 Mon Sep 17 00:00:00 2001
From: Daniel McCloy
Date: Mon, 12 Aug 2024 09:08:27 -0500
Subject: [PATCH 80/88] minor cleanup

---
 mne/stats/cluster_level.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py
index dcaf3e615b0..f640ba9634f 100644
--- a/mne/stats/cluster_level.py
+++ b/mne/stats/cluster_level.py
@@ -1878,18 +1878,18 @@ def cluster_test(
         raise ValueError("for paired t-test, each subject must have 2 observations")

     # extract the data from the dataframe
-    outer_func = np.concatenate if is_epo or is_arr else np.array
+    outer_func = np.concatenate if is_epo else np.array
     axes = (-3, -1) if is_tfr else (-2, -1)

+    def func_arr(series):
+        return np.concatenate(series.values)
+
     def func_mne(series):
         return outer_func(
             series.map(lambda inst: inst.get_data().swapaxes(*axes)).to_list()
         )

-    def func_array(series):
-        return outer_func(series.values)
-
-    func = func_array if is_arr else func_mne
+    func = func_arr if is_arr else func_mne

     # convert to a list-like X for clustering
     X = df.groupby(iv_name).agg({dv_name: func})[dv_name].to_list()

From 0984b61313bd76d14279e80f5e8db190d8c0e62d Mon Sep 17 00:00:00 2001
From: Daniel McCloy
Date: Mon, 12 Aug 2024 09:16:24 -0500
Subject: [PATCH 81/88] fix imports

---
 mne/stats/cluster_level.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py
index f640ba9634f..2999a73c07c 100644
--- a/mne/stats/cluster_level.py
+++ b/mne/stats/cluster_level.py
@@ -17,13 +17,15 @@
 from scipy.stats import f as fstat
 from scipy.stats import t as tstat

-from .. import BaseEpochs, Evoked, EvokedArray
+from ..epochs import BaseEpochs, EvokedArray
+from ..evoked import Evoked
 from ..fixes import has_numba, jit
 from ..parallel import parallel_func
 from ..source_estimate import MixedSourceEstimate, SourceEstimate, VolSourceEstimate
 from ..source_space import SourceSpaces
 from ..time_frequency import BaseTFR
 from ..utils import (
+    GetEpochsMixin,
     ProgressBar,
     _check_option,
     _pl,
@@ -35,7 +37,6 @@
     verbose,
     warn,
 )
-from ..utils.mixin import GetEpochsMixin
 from ..viz import plot_compare_evokeds
 from .parametric import f_oneway, ttest_1samp_no_p

From 6322499b88e414a97cc6d6642de4529594d22ec5 Mon Sep 17 00:00:00 2001
From: Daniel McCloy
Date: Mon, 12 Aug 2024 09:16:35 -0500
Subject: [PATCH 82/88] use MRO in test too

---
 mne/stats/tests/test_cluster_level.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mne/stats/tests/test_cluster_level.py b/mne/stats/tests/test_cluster_level.py
index ed0d830bdfd..fc2af127a96 100644
--- a/mne/stats/tests/test_cluster_level.py
+++ b/mne/stats/tests/test_cluster_level.py
@@ -37,8 +37,8 @@
     summarize_clusters_stc,
     ttest_1samp_no_p,
 )
-from mne.time_frequency import AverageTFRArray, EpochsTFRArray
-from mne.utils import _record_warnings, catch_logging
+from mne.time_frequency import AverageTFRArray, BaseTFR, EpochsTFRArray
+from mne.utils import GetEpochsMixin, _record_warnings, catch_logging

 n_space = 50
@@ -910,8 +910,8 @@ def test_new_cluster_api(Inst):
     pd = pytest.importorskip("pandas")

     rng = np.random.default_rng(seed=8675309)
-    is_epo = Inst in (EpochsTFRArray, EpochsArray)
-    is_tfr = Inst in (EpochsTFRArray, AverageTFRArray)
+    is_epo = GetEpochsMixin in Inst.__mro__
+    is_tfr = BaseTFR in Inst.__mro__

     n_epo, n_chan, n_freq, n_times = 6, 3, 4, 5

From a04b8a3e1031890b32ef817eebf01966e49620fd Mon Sep 17 00:00:00 2001
From: Daniel McCloy
Date: Thu, 22 Aug 2024 15:23:39 -0500
Subject: [PATCH 83/88] fix vulture allowlist

---
 tools/vulture_allowlist.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/vulture_allowlist.py b/tools/vulture_allowlist.py
index f030b4d4346..edc3bdf9811 100644
--- a/tools/vulture_allowlist.py
+++ b/tools/vulture_allowlist.py
@@ -148,4 +148,4 @@
 _qt_get_stylesheet

 # used in tutorial, not sure why shows up
-plot_cluster
+plot_cluster_time_sensor

From f1d39bf6dccf2186e5d9198e73e8d20068425acf Mon Sep 17 00:00:00 2001
From: Daniel McCloy
Date: Thu, 22 Aug 2024 15:40:08 -0500
Subject: [PATCH 84/88] fix nesting and type hints

---
 mne/stats/cluster_level.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py
index 2999a73c07c..a86a6dfafe4 100644
--- a/mne/stats/cluster_level.py
+++ b/mne/stats/cluster_level.py
@@ -10,7 +10,6 @@

 import matplotlib.pyplot as plt
 import numpy as np
-import pandas as pd
 from mpl_toolkits.axes_grid1 import make_axes_locatable
 from scipy import ndimage, sparse
 from scipy.sparse.csgraph import connected_components
@@ -40,6 +39,10 @@
 from ..viz import plot_compare_evokeds
 from .parametric import f_oneway, ttest_1samp_no_p

+# need this at top-level of file due to type hints
+pd = _soft_import("pandas", purpose="DataFrame integration")
+DataFrame = getattr(pd, "DataFrame", None)
+

 def _get_buddies_fallback(r, s, neighbors, indices=None):
     if indices is None:
@@ -1738,7 +1741,7 @@
-def _validate_cluster_df(df: pd.DataFrame, dv_name: str, iv_name: str):
+def _validate_cluster_df(df: DataFrame, dv_name: str, iv_name: str):

From 987ea433c14d7bab513b28c4463357a500db7a57 Mon Sep 17 00:00:00 2001
From: Daniel McCloy
Date: Thu, 22 Aug 2024 16:42:55 -0500
Subject: [PATCH 85/88] strict=False

---
 mne/stats/cluster_level.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mne/stats/cluster_level.py b/mne/stats/cluster_level.py
index a86a6dfafe4..8f24c0c4a0c 100644
--- a/mne/stats/cluster_level.py
+++ b/mne/stats/cluster_level.py
@@ -40,7 +40,7 @@
 from .parametric import f_oneway, ttest_1samp_no_p

 # need this at top-level of file due to type hints
-pd = _soft_import("pandas", purpose="DataFrame integration")
+pd = _soft_import("pandas", purpose="DataFrame integration", strict=False)
 DataFrame = getattr(pd, "DataFrame", None)

From 78829b43e14b3447ff0bec06d14c338838de90a4 Mon Sep 17 00:00:00 2001
From: Daniel McCloy
Date: Thu, 22 Aug 2024 16:58:16 -0500
Subject: [PATCH 86/88] nest import in test file too

---
 mne/stats/tests/test_cluster_level.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mne/stats/tests/test_cluster_level.py b/mne/stats/tests/test_cluster_level.py
index fc2af127a96..06a87a07477 100644
--- a/mne/stats/tests/test_cluster_level.py
+++ b/mne/stats/tests/test_cluster_level.py
@@ -6,7 +6,6 @@
 from functools import partial

 import numpy as np
-import pandas as pd
 import pytest
 from numpy.testing import (
     assert_allclose,
@@ -882,6 +881,7 @@ def test_output_equiv(shape, out_type, adjacency, threshold):

 def test_compare_old_and_new_cluster_api():
     """Test for same results from old and new APIs."""
+    pd = pytest.importorskip("pandas")
     condition1_1d, condition2_1d, condition1_2d, condition2_2d = _get_conditions()
     df_1d = pd.DataFrame(
         dict(

From 372bccacaa9522586fb811b00ac2f85d7adeb3d1 Mon Sep 17 00:00:00 2001
From: CarinaFo
Date: Wed, 2 Oct 2024 13:51:31 +0200
Subject: [PATCH 87/88] clean up pyproject mess

---
 pyproject.toml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 83d9479dccc..0c36566021d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -146,8 +146,6 @@ test_extra = [
   "snirf",
   "neo",
   "mne-bids",
-  "
-  ",
 ]

 # Dependencies for building the documentation

From 4da84634d1b265355f9e75c60d324ef6f1b29dd2 Mon Sep 17 00:00:00 2001
From: CarinaFo
Date: Wed, 2 Oct 2024 13:53:08 +0200
Subject: [PATCH 88/88] add n_permutations, plotting, added min_cluster_p_value

---
 tutorials/stats-sensor-space/76_new_cluster_test_api.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tutorials/stats-sensor-space/76_new_cluster_test_api.py b/tutorials/stats-sensor-space/76_new_cluster_test_api.py
index fb928f89d0a..0e5aee91432 100644
--- a/tutorials/stats-sensor-space/76_new_cluster_test_api.py
+++ b/tutorials/stats-sensor-space/76_new_cluster_test_api.py
@@ -139,6 +139,10 @@
 cluster_result = mne.stats.cluster_level.cluster_test(
     df=df, formula=formula, within_id="subject_index"
 )
+# TODO: add n_permutations to cluster_result
+
+# print the lowest cluster p-value
+print(f"The lowest cluster p-value is: {cluster_result.cluster_p_values.min()}")

 # note that we ran an exact test due to the small sample size
 # (only 15 permutations)
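
For readers trying out this branch, here is a minimal, self-contained sketch of the
API the series introduces. It is illustrative only: it simulates EvokedArray
"subjects" instead of loading the ERP CORE data used in the tutorial, and it assumes
the names appearing in the patches above (the provisional
``mne.stats.cluster_level.cluster_test`` location, the ``data``/``condition``/
``subject_index`` column names, and the ``formula``/``within_id``/``seed`` arguments);
none of this is final, released MNE API.

    import numpy as np
    import pandas as pd
    import mne

    rng = np.random.default_rng(42)
    info = mne.create_info(ch_names=4, sfreq=250.0, ch_types="eeg")
    n_subjects, n_times = 10, 50

    # one EvokedArray per subject and condition; condition "b" gets an offset
    # so that a cluster can emerge
    evokeds_a = [
        mne.EvokedArray(rng.normal(size=(4, n_times)), info)
        for _ in range(n_subjects)
    ]
    evokeds_b = [
        mne.EvokedArray(rng.normal(size=(4, n_times)) + 0.5, info)
        for _ in range(n_subjects)
    ]

    # long-format DataFrame: one row per observation
    df = pd.DataFrame(
        dict(
            data=evokeds_a + evokeds_b,
            condition=["a"] * n_subjects + ["b"] * n_subjects,
            subject_index=list(range(n_subjects)) * 2,
        )
    )

    # "data~condition" is Wilkinson notation (dependent ~ independent);
    # passing within_id requests a paired (within-subject) test
    result = mne.stats.cluster_level.cluster_test(
        df=df,
        formula="data~condition",
        within_id="subject_index",
        n_permutations=1024,
        seed=42,
    )
    print(f"lowest cluster p-value: {result.cluster_p_values.min()}")

Passing ``within_id`` triggers the ``kind == "within"`` branch shown earlier in the
series (observations are subtracted per subject before clustering, using
``ttest_1samp_no_p``); omitting it falls back to an unpaired test via ``f_oneway``.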