From 4e308c04762c7cca610adfc83695f15328ffed06 Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Wed, 4 Sep 2024 17:29:16 +0200
Subject: [PATCH 01/29] add wordnet-mapped indices for Coggan 2024

---
 .../model_helpers/brain_transformation/behavior.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/brainscore_vision/model_helpers/brain_transformation/behavior.py b/brainscore_vision/model_helpers/brain_transformation/behavior.py
index 2c01012a0..89f8a4ff3 100644
--- a/brainscore_vision/model_helpers/brain_transformation/behavior.py
+++ b/brainscore_vision/model_helpers/brain_transformation/behavior.py
@@ -154,6 +154,14 @@ class LabelToImagenetIndices:
     shovel_indices = [792]
     # truck indices used as defined by Geirhos et al., 2021.
 
+    # added from the Coggan 2024 behavioral benchmark:
+    bison_indices = [347]
+    hare_indices = [331]
+    jeep_indices = [609]
+    teapot_indices = [849]
+
+    # car, bear, lamp, elephant
+
     @classmethod
     def label_to_indices(cls, label):
         # for handling multi-word labels given by models or benchmarks

From fc2de9dd5e4331df246b4c0b71faea97ef40fa18 Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Wed, 4 Sep 2024 17:38:37 +0200
Subject: [PATCH 02/29] update comment about other indices

---
 .../model_helpers/brain_transformation/behavior.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/brainscore_vision/model_helpers/brain_transformation/behavior.py b/brainscore_vision/model_helpers/brain_transformation/behavior.py
index 89f8a4ff3..3e036bf25 100644
--- a/brainscore_vision/model_helpers/brain_transformation/behavior.py
+++ b/brainscore_vision/model_helpers/brain_transformation/behavior.py
@@ -160,7 +160,8 @@ class LabelToImagenetIndices:
     jeep_indices = [609]
     teapot_indices = [849]
 
-    # car, bear, lamp, elephant
+    # car, bear, and elephant indices used as defined by Geirhos et al., 2021.
+    # lamp indices used as defined by the Scialom2024 benchmark
 
     @classmethod
     def label_to_indices(cls, label):
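For orientation, index lists like the ones added above are what lets a 1000-way ImageNet model answer a benchmark's category-labeling task. The following is a standalone sketch, not the brainscore_vision implementation: the dictionary mirrors the class attributes added in the two patches above, and the sum-over-synsets decision rule is an assumption for illustration.

    import numpy as np

    # hypothetical 1000-way ImageNet probabilities for a single image
    probabilities = np.random.rand(1000)
    probabilities /= probabilities.sum()

    # mirrors the LabelToImagenetIndices attributes added above
    label_to_indices = {
        'bison': [347],
        'hare': [331],
        'jeep': [609],
        'teapot': [849],
    }

    # score each candidate label by the summed probability mass of its
    # ImageNet synsets, then take the best-scoring label as the choice
    label_scores = {label: probabilities[indices].sum()
                    for label, indices in label_to_indices.items()}
    choice = max(label_scores, key=label_scores.get)
    print(choice)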
From 118141723b187864ac3a501389f06d4dabacc005 Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Thu, 5 Sep 2024 11:54:36 +0200
Subject: [PATCH 03/29] add labeling benchmark and engineering benchmark

---
 .../coggan2024_behavior/benchmark.py | 61 +++++++++++++++++--
 1 file changed, 57 insertions(+), 4 deletions(-)

diff --git a/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py b/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py
index 14db4121f..c00a52335 100644
--- a/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py
+++ b/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py
@@ -1,12 +1,10 @@
 # Created by David Coggan on 2024 06 25
 
 import numpy as np
-from brainio.assemblies import DataAssembly, BehavioralAssembly
-from brainscore_vision import load_stimulus_set, load_dataset
+from brainscore_vision import load_stimulus_set, load_dataset, load_metric
 from brainscore_vision.benchmarks import BenchmarkBase
 from brainscore_vision.benchmark_helpers.screen import place_on_screen
 from brainscore_core.metrics import Score
-from brainscore_vision.metric_helpers import Defaults as XarrayDefaults
 from brainscore_vision.model_interface import BrainModel
 from brainscore_vision.utils import LazyLoad
 from scipy.stats import sem
@@ -21,6 +19,39 @@
 url = {},
 journal = {in prep}}"""
 
+
+class Coggan2024_behavior_ConditionWiseLabelingAccuracySimilarity(BenchmarkBase):
+    def __init__(self):
+        self._metric = load_metric('conditionwise_accuracy_distance')
+        self._fitting_stimuli = load_stimulus_set('Coggan2024_behavior_fitting')  # this fails if wrapped by LazyLoad
+        self._assembly = LazyLoad(lambda: load_dataset('Coggan2024_behavior'))
+        self._visual_degrees = 10
+        self._number_of_trials = 1
+        self._ceiling_func = lambda assembly: get_noise_ceiling(assembly)
+        super(Coggan2024_behavior_ConditionWiseLabelingAccuracySimilarity, self).__init__(
+            identifier='tong.Coggan2024_behavior-LabelingConditionWiseAccuracySimilarity',
+            version=1,
+            ceiling_func=lambda: self._metric.ceiling(self._assembly),
+            parent='behavior',
+            bibtex=BIBTEX,
+        )
+
+    def __call__(self, candidate: BrainModel):
+        choice_labels = set(self._assembly['object_class'].values)
+        choice_labels = list(sorted(choice_labels))
+        candidate.start_task(BrainModel.Task.label, choice_labels)
+        stimulus_set = place_on_screen(self._assembly.stimulus_set,
+                                       target_visual_degrees=candidate.visual_degrees(),
+                                       source_visual_degrees=self._visual_degrees)
+        labels = candidate.look_at(stimulus_set, number_of_trials=self._number_of_trials)
+        raw_score = self._metric(labels, self._assembly)
+        ceiling = self.ceiling
+        score = raw_score / ceiling
+        score.attrs['raw'] = raw_score
+        score.attrs['ceiling'] = ceiling
+        return score
+
+
 class Coggan2024_behavior_ConditionWiseAccuracySimilarity(BenchmarkBase):
 
     """
@@ -125,5 +156,27 @@ def ceiler(score: Score, ceiling: Score) -> Score:
     return ceiled_score
 
 
+class Coggan2024_behavior_ConditionWiseLabelingEngineeringAccuracy(BenchmarkBase):
+    def __init__(self):
+        self._metric = load_metric('accuracy')
+        self._ceiling_func = lambda assembly: get_noise_ceiling(assembly)
+        self._stimulus_set = load_dataset('Coggan2024_behavior').stimulus_set
+        super(Coggan2024_behavior_ConditionWiseLabelingEngineeringAccuracy, self).__init__(
+            identifier='tong.Coggan2024_behavior-LabelingConditionWiseEngineeringAccuracy',
+            version=1,
+            ceiling_func=lambda: Score(1),
+            parent='Coggan2024-top1',
+            bibtex=BIBTEX,
+        )
-
+    def __call__(self, candidate: BrainModel):
+        choice_labels = set(self._assembly['object_class'].values)
+        choice_labels = list(sorted(choice_labels))
+        candidate.start_task(BrainModel.Task.label, choice_labels)
+        labels = candidate.look_at(self._stimulus_set)
+        raw_score = self._metric(labels, self._stimulus_set['object_class'].values)
+        ceiling = self.ceiling
+        score = raw_score / ceiling
+        score.attrs['raw'] = raw_score
+        score.attrs['ceiling'] = ceiling
+        return score
\ No newline at end of file

From 5cd3df42808225186da41a7305bbc91689ec2b27 Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Tue, 1 Oct 2024 11:12:23 +0200
Subject: [PATCH 04/29] hook up benchmark to the new accuracy distance metric

---
 brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py b/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py
index 6e1d019d1..8e7a0b50f 100644
--- a/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py
+++ b/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py
@@ -22,7 +22,7 @@ class Coggan2024_behavior_ConditionWiseLabelingAccuracySimilarity(BenchmarkBase):
     def __init__(self):
-        self._metric = load_metric('conditionwise_accuracy_distance')
+        self._metric = load_metric('accuracy_distance')
         self._fitting_stimuli = load_stimulus_set('Coggan2024_behavior_fitting')  # this fails if wrapped by LazyLoad
         self._assembly = LazyLoad(lambda: load_dataset('Coggan2024_behavior'))
         self._visual_degrees = 10
@@ -44,7 +44,7 @@ def __call__(self, candidate: BrainModel):
                                        target_visual_degrees=candidate.visual_degrees(),
                                        source_visual_degrees=self._visual_degrees)
         labels = candidate.look_at(stimulus_set, number_of_trials=self._number_of_trials)
-        raw_score = self._metric(labels, self._assembly)
+        raw_score = self._metric(labels, self._assembly, variables=['occluder_type', 'occluder_color'])
         ceiling = self.ceiling
         score = raw_score / ceiling
         score.attrs['raw'] = raw_score

From 08cf2364e2f1bc6ddbe60110ce34fa84e2103074 Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Tue, 1 Oct 2024 11:15:18 +0200
Subject: [PATCH 05/29] fix reference to the benchmark

---
 .../coggan2024_behavior/benchmark.py | 57 ++++++++++---------
 1 file changed, 29 insertions(+), 28 deletions(-)

diff --git a/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py b/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py
index 8e7a0b50f..275acebe0 100644
--- a/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py
+++ b/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py
@@ -52,8 +52,35 @@ def __call__(self, candidate: BrainModel):
         return score
 
 
-class Coggan2024_behavior_ConditionWiseAccuracySimilarity(BenchmarkBase):
+class Coggan2024_behavior_ConditionWiseLabelingEngineeringAccuracy(BenchmarkBase):
+    def __init__(self):
+        self._metric = load_metric('accuracy')
+        self._ceiling_func = lambda assembly: get_noise_ceiling(assembly)
+        self._stimulus_set = load_dataset('Coggan2024_behavior').stimulus_set
+        super(Coggan2024_behavior_ConditionWiseLabelingEngineeringAccuracy, self).__init__(
+            identifier='tong.Coggan2024_behavior-LabelingConditionWiseEngineeringAccuracy',
+            version=1,
+            ceiling_func=lambda: Score(1),
+            parent='Coggan2024-top1',
+            bibtex=BIBTEX,
+        )
+
+    def __call__(self, candidate: BrainModel):
+        choice_labels = set(self._assembly['object_class'].values)
+        choice_labels = list(sorted(choice_labels))
+        candidate.start_task(BrainModel.Task.label, choice_labels)
+        labels = candidate.look_at(self._stimulus_set)
+        raw_score = self._metric(labels, self._stimulus_set['object_class'].values)
+        ceiling = self.ceiling
+        score = raw_score / ceiling
+        score.attrs['raw'] = raw_score
+        score.attrs['ceiling'] = ceiling
+        return score
+
+class Coggan2024_behavior_ConditionWiseAccuracySimilarity_Correlation(BenchmarkBase):
+    ### DEPRECATED IN FAVOR OF Coggan2024_behavior_ConditionWiseLabelingAccuracySimilarity
+    ### Here for future comparison/reference/proofing
     """
     This benchmark measures classification accuracy for a set of occluded object images, then attains the mean
     accuracy for each of the 18 occlusion conditions. This is then correlated with the corresponding accuracies for
     each of the
@@ -68,7 +95,7 @@ def __init__(self):
         self._visual_degrees = 10
         self._number_of_trials = 1
         self._ceiling_func = lambda assembly: get_noise_ceiling(assembly)
-        super(Coggan2024_behavior_ConditionWiseAccuracySimilarity, self).__init__(
+        super(Coggan2024_behavior_ConditionWiseAccuracySimilarity_Correlation, self).__init__(
            identifier='tong.Coggan2024_behavior-ConditionWiseAccuracySimilarity',
            version=1,
            ceiling_func=lambda df: get_noise_ceiling(df),
@@ -158,29 +185,3 @@ def ceiler(score: Score, ceiling: Score) -> Score:
     ceiled_score.attrs[Score.RAW_VALUES_KEY] = score
     ceiled_score.attrs['ceiling'] = ceiling
     return ceiled_score
-
-
-class Coggan2024_behavior_ConditionWiseLabelingEngineeringAccuracy(BenchmarkBase):
-    def __init__(self):
-        self._metric = load_metric('accuracy')
-        self._ceiling_func = lambda assembly: get_noise_ceiling(assembly)
-        self._stimulus_set = load_dataset('Coggan2024_behavior').stimulus_set
-        super(Coggan2024_behavior_ConditionWiseLabelingEngineeringAccuracy, self).__init__(
-            identifier='tong.Coggan2024_behavior-LabelingConditionWiseEngineeringAccuracy',
-            version=1,
-            ceiling_func=lambda: Score(1),
-            parent='Coggan2024-top1',
-            bibtex=BIBTEX,
-        )
-
-    def __call__(self, candidate: BrainModel):
-        choice_labels = set(self._assembly['object_class'].values)
-        choice_labels = list(sorted(choice_labels))
-        candidate.start_task(BrainModel.Task.label, choice_labels)
-        labels = candidate.look_at(self._stimulus_set)
-        raw_score = self._metric(labels, self._stimulus_set['object_class'].values)
-        ceiling = self.ceiling
-        score = raw_score / ceiling
-        score.attrs['raw'] = raw_score
-        score.attrs['ceiling'] = ceiling
-        return score
\ No newline at end of file
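The benchmarks above all follow the same start_task / look_at / score flow. A minimal toy version of that loop is sketched below; ToyModel and its methods are illustrative stand-ins, not the real BrainModel interface, and the decision rule is deliberately trivial.

    import numpy as np

    class ToyModel:
        # stand-in for a BrainModel candidate; names are illustrative
        def visual_degrees(self):
            return 8

        def start_task(self, task, choice_labels):
            self.labels = list(choice_labels)

        def look_at(self, stimulus_set, number_of_trials=1):
            # pretend the model answers every stimulus with the first label
            return [self.labels[0] for _ in stimulus_set]

    stimulus_set = ['image1.png', 'image2.png', 'image3.png']
    truths = ['bison', 'hare', 'bison']

    model = ToyModel()
    model.start_task('label', sorted(set(truths)))
    predictions = model.look_at(stimulus_set)
    raw_accuracy = np.mean([p == t for p, t in zip(predictions, truths)])
    print(raw_accuracy)  # unceiled, engineering-style accuracy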
From 28ff288ea9b5eceb2461baeff9ef46217a2c60eb Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Tue, 1 Oct 2024 14:50:49 +0200
Subject: [PATCH 06/29] add sports car indices

---
 .../benchmarks/coggan2024_behavior/__init__.py     | 6 ++----
 .../benchmarks/coggan2024_behavior/benchmark.py    | 5 +++++
 .../model_helpers/brain_transformation/behavior.py | 1 +
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/brainscore_vision/benchmarks/coggan2024_behavior/__init__.py b/brainscore_vision/benchmarks/coggan2024_behavior/__init__.py
index a6ca7c28c..86e868958 100644
--- a/brainscore_vision/benchmarks/coggan2024_behavior/__init__.py
+++ b/brainscore_vision/benchmarks/coggan2024_behavior/__init__.py
@@ -1,8 +1,6 @@
 # Created by David Coggan on 2024 06 25
 
 from brainscore_vision import benchmark_registry
-from .benchmark import (
-    Coggan2024_behavior_ConditionWiseAccuracySimilarity)
+from .benchmark import Coggan2024_behavior_ConditionWiseLabelingAccuracySimilarity
 
-benchmark_registry['Coggan2024_behavior-ConditionWiseAccuracySimilarity'] = (
-    Coggan2024_behavior_ConditionWiseAccuracySimilarity)
+benchmark_registry['Coggan2024_behavior-ConditionWiseAccuracySimilarity'] = Coggan2024_behavior_ConditionWiseLabelingAccuracySimilarity

diff --git a/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py b/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py
index 275acebe0..d5dbd6a9f 100644
--- a/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py
+++ b/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py
@@ -53,6 +53,11 @@
 class Coggan2024_behavior_ConditionWiseLabelingEngineeringAccuracy(BenchmarkBase):
+    # TODO: run locally
+    # TODO: check data format: need sports car indices => just the same as car
+    # TODO: is metric working?
+    # TODO: correct ish scores?
+    # TODO: tests?
     def __init__(self):
         self._metric = load_metric('accuracy')
         self._ceiling_func = lambda assembly: get_noise_ceiling(assembly)
         self._stimulus_set = load_dataset('Coggan2024_behavior').stimulus_set

diff --git a/brainscore_vision/model_helpers/brain_transformation/behavior.py b/brainscore_vision/model_helpers/brain_transformation/behavior.py
index 3e036bf25..af34d11e8 100644
--- a/brainscore_vision/model_helpers/brain_transformation/behavior.py
+++ b/brainscore_vision/model_helpers/brain_transformation/behavior.py
@@ -159,6 +159,7 @@ class LabelToImagenetIndices:
     hare_indices = [331]
     jeep_indices = [609]
     teapot_indices = [849]
+    sportscar_indices = [436, 511, 817]
 
     # car, bear, and elephant indices used as defined by Geirhos et al., 2021.
     # lamp indices used as defined by the Scialom2024 benchmark

From 0c08c8dcb41d72ff7a3f76d4ded3d27a6c45cb82 Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Wed, 2 Oct 2024 16:03:44 +0200
Subject: [PATCH 07/29] fix dimension bug, ceiling estimate, and uneven design
 metric limitations

---
 .../metrics/accuracy_distance/metric.py | 90 ++++++++++++-------
 1 file changed, 59 insertions(+), 31 deletions(-)

diff --git a/brainscore_vision/metrics/accuracy_distance/metric.py b/brainscore_vision/metrics/accuracy_distance/metric.py
index eb47e3bba..28201f305 100644
--- a/brainscore_vision/metrics/accuracy_distance/metric.py
+++ b/brainscore_vision/metrics/accuracy_distance/metric.py
@@ -1,4 +1,6 @@
 import itertools
+from functools import reduce
+import operator
 
 import numpy as np
 
@@ -20,41 +22,16 @@ class AccuracyDistance(Metric):
     more target-like pattern of performance across conditions.
     """
     def __call__(self, source: BehavioralAssembly, target:
-                 BehavioralAssembly, variables: tuple=()) -> Score:
+                 BehavioralAssembly, variables: tuple = ()) -> Score:
         """Target should be the entire BehavioralAssembly, containing truth values."""
         subjects = self.extract_subjects(target)
         subject_scores = []
         for subject in subjects:
             subject_assembly = target.sel(subject=subject)
-
-            # compute single score across the entire dataset
-            if len(variables) == 0:
-                subject_score = self.compare_single_subject(source, subject_assembly)
-
-            # compute scores for each condition, then average
-            else:
-                cond_scores = []
-
-                # get iterator across all combinations of variables
-                if len(variables) == 1:
-                    conditions = set(subject_assembly[variables[0]].values)
-                    conditions = [[c] for c in conditions]  # to mimic itertools.product
-                else:
-                    conditions = itertools.product(
-                        *[set(subject_assembly[v].values) for v in variables])
-
-                # loop over conditions and compute scores
-                for cond in conditions:
-                    indexers = {v: cond[i] for i, v in enumerate(variables)}
-                    subject_cond_assembly = subject_assembly.sel(**indexers)
-                    source_cond_assembly = source.sel(**indexers)
-                    # to accommodate unbalanced designs, skip combinations of
-                    # variables that don't exist in both assemblies
-                    if len(subject_cond_assembly) and len(source_cond_assembly):
-                        cond_scores.append(self.compare_single_subject(
-                            source_cond_assembly, subject_cond_assembly))
-                subject_score = Score(np.mean(cond_scores))
+            subject_score = self.condition_filtered_score_per_subject_source_pair(source=source,
+                                                                                  subject=subject_assembly,
+                                                                                  variables=variables)
 
             subject_score = subject_score.expand_dims('subject')
             subject_score['subject'] = 'subject', [subject]
@@ -89,13 +66,16 @@ def compare_single_subject(self, source: BehavioralAssembly, target: BehavioralA
 
         return Score(relative_distance)
 
-    def ceiling(self, assembly):
+    def ceiling(self, assembly, variables = ()):
         subjects = self.extract_subjects(assembly)
         subject_scores = []
         for subject1, subject2 in itertools.combinations(subjects, 2):
             subject1_assembly = assembly.sel(subject=subject1)
             subject2_assembly = assembly.sel(subject=subject2)
-            pairwise_score = self.compare_single_subject(subject1_assembly, subject2_assembly)
+
+            pairwise_score = self.condition_filtered_score_per_subject_source_pair(
+                subject1_assembly, subject2_assembly, variables=variables)
+
             pairwise_score = pairwise_score.expand_dims('subject')
             pairwise_score['subject_left'] = 'subject', [subject1]
             pairwise_score['subject_right'] = 'subject', [subject2]
@@ -107,3 +87,51 @@ def extract_subjects(self, assembly):
         return list(sorted(set(assembly['subject'].values)))
+
+    def condition_filtered_score_per_subject_source_pair(self, source, subject, variables):
+        # compute single score across the entire dataset
+        if len(variables) == 0:
+            subject_score = self.compare_single_subject(source, subject)
+
+        # compute scores for each condition, then average
+        else:
+            cond_scores = []
+            # get iterator across all combinations of variables
+            if len(variables) == 1:
+                conditions = set(subject[variables[0]].values)
+                conditions = [[c] for c in conditions]  # to mimic itertools.product
+            else:
+                conditions = itertools.product(
+                    *[set(subject[v].values) for v in variables])
+
+            # loop over conditions and compute scores
+            for cond in conditions:
+                # filter assemblies for selected condition
+                subject_cond_assembly = self.get_condition_filtered_assembly(subject, variables, cond)
+                source_cond_assembly = self.get_condition_filtered_assembly(source, variables, cond)
+                # select only the values in source_cond_assembly that has the same 'stimulus_id' as
+                # subject_cond_assembly to accommodate comparisons where not all subjects saw all the same stimuli
+                stimulus_id_mask = source_cond_assembly['stimulus_id'].isin(
+                    subject_cond_assembly['stimulus_id'].values)
+                source_cond_assembly = source_cond_assembly.where(stimulus_id_mask, drop=True)
+                # to accommodate cases where not all conditions are present in both assemblies, filter out
+                # calculation of the metric for cases where the
+                if len(subject_cond_assembly) and len(source_cond_assembly):
+                    cond_scores.append(self.compare_single_subject(
+                        source_cond_assembly, subject_cond_assembly))
+
+            subject_score = Score(np.mean(cond_scores))
+        return subject_score
+
+    @staticmethod
+    def get_condition_filtered_assembly(assembly, variables, cond):
+        # get the indexers for the condition
+        indexers = {v: cond[i] for i, v in enumerate(variables)}
+        # convert indexers into a list of boolean arrays for the assembly values
+        assembly_indexers = [(assembly[key] == value) for key, value in indexers.items()]
+        # combine the different conditions into an AND statement to require all conditions simultaneously
+        condition = reduce(operator.and_, assembly_indexers)
+        # filter the assembly based on the condition
+        condition_filtered_assembly = assembly.where(condition, drop=True)
+        return condition_filtered_assembly
+
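The core of AccuracyDistance, before the later chance-level patch, is the relative-distance formula in compare_single_subject. Here is a self-contained worked example of that formula, assuming only what the patch above shows:

    import numpy as np

    def accuracy_distance(source_mean, target_mean):
        # mirrors compare_single_subject above: 1 - |difference|, scaled by
        # the largest difference possible given the target accuracy
        maximum_distance = np.max([1 - target_mean, target_mean])
        return 1 - np.abs(source_mean - target_mean) / maximum_distance

    # model at 60% vs. a subject at 80%: the maximum possible distance is
    # 0.8, so the score is 1 - 0.2 / 0.8 = 0.75
    print(accuracy_distance(0.6, 0.8))  # 0.75

    # a perfect accuracy match scores 1
    print(accuracy_distance(0.8, 0.8))  # 1.0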
From 9d5cf936ae96801a91dce865a34ee465fe62e3d5 Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Wed, 2 Oct 2024 16:04:50 +0200
Subject: [PATCH 08/29] add coord truth to the assembly to conform to the
 labeling task

---
 .../benchmarks/coggan2024_behavior/benchmark.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py b/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py
index d5dbd6a9f..fa0eca8bd 100644
--- a/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py
+++ b/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py
@@ -25,6 +25,8 @@ def __init__(self):
         self._metric = load_metric('accuracy_distance')
         self._fitting_stimuli = load_stimulus_set('Coggan2024_behavior_fitting')  # this fails if wrapped by LazyLoad
         self._assembly = LazyLoad(lambda: load_dataset('Coggan2024_behavior'))
+        self._assembly['truth'] = self._assembly['object_class']  # the assembly is missing a 'truth' column which is
+        # required by the labeling task
         self._visual_degrees = 10
         self._number_of_trials = 1
         self._ceiling_func = lambda assembly: get_noise_ceiling(assembly)
@@ -53,11 +55,6 @@
 class Coggan2024_behavior_ConditionWiseLabelingEngineeringAccuracy(BenchmarkBase):
-    # TODO: run locally
-    # TODO: check data format: need sports car indices => just the same as car
-    # TODO: is metric working?
-    # TODO: correct ish scores?
-    # TODO: tests?
     def __init__(self):
         self._metric = load_metric('accuracy')
         self._ceiling_func = lambda assembly: get_noise_ceiling(assembly)
         self._stimulus_set = load_dataset('Coggan2024_behavior').stimulus_set

From d8892ac3fd342ba5e2047b93d5d4bc9204f19551 Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Wed, 2 Oct 2024 16:08:25 +0200
Subject: [PATCH 09/29] add a comment

---
 brainscore_vision/metrics/accuracy_distance/metric.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/brainscore_vision/metrics/accuracy_distance/metric.py b/brainscore_vision/metrics/accuracy_distance/metric.py
index 28201f305..72fe2fb34 100644
--- a/brainscore_vision/metrics/accuracy_distance/metric.py
+++ b/brainscore_vision/metrics/accuracy_distance/metric.py
@@ -115,7 +115,7 @@
                     subject_cond_assembly['stimulus_id'].values)
                 source_cond_assembly = source_cond_assembly.where(stimulus_id_mask, drop=True)
                 # to accommodate cases where not all conditions are present in both assemblies, filter out
-                # calculation of the metric for cases where the
+                # calculation of the metric for cases where either assembly has no matches to variables (empty)
                 if len(subject_cond_assembly) and len(source_cond_assembly):
                     cond_scores.append(self.compare_single_subject(
                         source_cond_assembly, subject_cond_assembly))

From 6873f93d4818a0e43da3cd8e14e6431cd3ca0bc4 Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Wed, 2 Oct 2024 16:15:45 +0200
Subject: [PATCH 10/29] add variables to ceiling

---
 brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py b/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py
index fa0eca8bd..3cd685597 100644
--- a/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py
+++ b/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py
@@ -33,7 +33,8 @@ def __init__(self):
         super(Coggan2024_behavior_ConditionWiseLabelingAccuracySimilarity, self).__init__(
             identifier='tong.Coggan2024_behavior-LabelingConditionWiseAccuracySimilarity',
             version=1,
-            ceiling_func=lambda: self._metric.ceiling(self._assembly),
+            ceiling_func=lambda: self._metric.ceiling(self._assembly,
+                                                      variables=['occluder_type', 'visibility', 'occluder_color']),
             parent='behavior',
             bibtex=BIBTEX,
         )

From 74f278066daebe8094573711df39916fdac34e18 Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Tue, 8 Oct 2024 10:33:03 +0200
Subject: [PATCH 11/29] remove stimulus id requirement from condition-wise
 accuracy distance

---
 .../benchmarks/coggan2024_behavior/benchmark.py | 12 +++++++++++-
 .../metrics/accuracy_distance/metric.py         | 16 ++++++++++------
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py b/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py
index 3cd685597..49952a295 100644
--- a/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py
+++ b/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py
@@ -26,7 +26,7 @@ def __init__(self):
         self._fitting_stimuli = load_stimulus_set('Coggan2024_behavior_fitting')  # this fails if wrapped by LazyLoad
         self._assembly = LazyLoad(lambda: load_dataset('Coggan2024_behavior'))
         self._assembly['truth'] = self._assembly['object_class']  # the assembly is missing a 'truth' column which is
-        # required by the labeling task
+                                                                   # required by the labeling task
         self._visual_degrees = 10
         self._number_of_trials = 1
         self._ceiling_func = lambda assembly: get_noise_ceiling(assembly)
@@ -188,3 +188,13 @@ def ceiler(score: Score, ceiling: Score) -> Score:
     ceiled_score.attrs[Score.RAW_VALUES_KEY] = score
     ceiled_score.attrs['ceiling'] = ceiling
     return ceiled_score
+
+
+def remove_nans(data):
+    """
+    removes nans from the data and replaces them with a string 'none'. uses pandas to simultaneously handle numeric
+    and non-numeric data.
+    """
+    for coord in data.coords:
+        data[coord] = data[coord].where(~pd.isna(data[coord]), 'none')
+    return data
\ No newline at end of file

diff --git a/brainscore_vision/metrics/accuracy_distance/metric.py b/brainscore_vision/metrics/accuracy_distance/metric.py
index 72fe2fb34..a782c1a59 100644
--- a/brainscore_vision/metrics/accuracy_distance/metric.py
+++ b/brainscore_vision/metrics/accuracy_distance/metric.py
@@ -101,19 +101,23 @@ def condition_filtered_score_per_subject_source_pair(self, source, subject, vari
                 conditions = set(subject[variables[0]].values)
                 conditions = [[c] for c in conditions]  # to mimic itertools.product
             else:
+                # get all combinations of variables that are present in both assemblies
                 conditions = itertools.product(
-                    *[set(subject[v].values) for v in variables])
+                    *[set(subject[v].values).intersection(set(source[v].values)) for v in variables]
+                )
 
             # loop over conditions and compute scores
+            # TODO: 91 conditions?? where do they come from?
+            # TODO: what did which participants do? is grouping across visibility correct or not? it does not look
+            #  like any participants did the same combination of conditions
+
+            # TODO: but the above is with the caveat that the stimulus_id field is NOT actually the stimulus id.
+            #  it contains the subject number and trial number, in addition to a 5-digit number that may or may not
+            #  be the actual stimulus_id. we should filter based on that, but need to clean up the data to do that.
             for cond in conditions:
                 # filter assemblies for selected condition
                 subject_cond_assembly = self.get_condition_filtered_assembly(subject, variables, cond)
                 source_cond_assembly = self.get_condition_filtered_assembly(source, variables, cond)
-                # select only the values in source_cond_assembly that has the same 'stimulus_id' as
-                # subject_cond_assembly to accommodate comparisons where not all subjects saw all the same stimuli
-                stimulus_id_mask = source_cond_assembly['stimulus_id'].isin(
-                    subject_cond_assembly['stimulus_id'].values)
-                source_cond_assembly = source_cond_assembly.where(stimulus_id_mask, drop=True)
                 # to accommodate cases where not all conditions are present in both assemblies, filter out
                 # calculation of the metric for cases where either assembly has no matches to variables (empty)
                 if len(subject_cond_assembly) and len(source_cond_assembly):
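The intersection added in the patch above is what keeps unbalanced designs from crashing the condition loop. A small standalone demonstration of that combination-building step (toy condition sets, not the Coggan data):

    import itertools

    # occluder conditions present for a human subject vs. shown to a model
    subject_conditions = {'occluder_type': {'bars', 'blobs'},
                          'occluder_color': {'black', 'white'}}
    source_conditions = {'occluder_type': {'bars', 'blobs', 'dots'},
                         'occluder_color': {'black', 'white'}}

    variables = ['occluder_type', 'occluder_color']
    # same idea as the itertools.product(...intersection...) call above:
    # only combinations present in BOTH assemblies get scored
    conditions = list(itertools.product(
        *[subject_conditions[v] & source_conditions[v] for v in variables]))
    print(conditions)  # 2 x 2 = 4 combinations; 'dots' is dropped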
From c0320169b31b4a1a6525bd4afc23c8b36e031e30 Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Thu, 17 Oct 2024 17:25:14 +0200
Subject: [PATCH 12/29] fix bug with stimulus selection

---
 .../metrics/accuracy_distance/metric.py | 25 +++++++++++++------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/brainscore_vision/metrics/accuracy_distance/metric.py b/brainscore_vision/metrics/accuracy_distance/metric.py
index a782c1a59..0090bf772 100644
--- a/brainscore_vision/metrics/accuracy_distance/metric.py
+++ b/brainscore_vision/metrics/accuracy_distance/metric.py
@@ -51,7 +51,10 @@ def aggregate(cls, scores):
     def compare_single_subject(self, source: BehavioralAssembly, target: BehavioralAssembly):
         source = source.sortby('stimulus_id')
         target = target.sortby('stimulus_id')
-        assert (target['stimulus_id'].values == source['stimulus_id'].values).all()
+
+        # we used to assert stimulus_ids being equal here, but since this is not an image-level metric, and because
+        # some benchmarks (e.g. Coggan2024) show different images from the same categories to humans, the metric
+        # does not guarantee that the stimulus_ids are the same.
 
         # .flatten() because models return lists of lists, and here we compare subject-by-subject
         source_correct = source.values.flatten() == target['truth'].values
@@ -107,13 +110,6 @@ def condition_filtered_score_per_subject_source_pair(self, source, subject, vari
                 )
 
             # loop over conditions and compute scores
-            # TODO: 91 conditions?? where do they come from?
-            # TODO: what did which participants do? is grouping across visibility correct or not? it does not look
-            #  like any participants did the same combination of conditions
-
-            # TODO: but the above is with the caveat that the stimulus_id field is NOT actually the stimulus id.
-            #  it contains the subject number and trial number, in addition to a 5-digit number that may or may not
-            #  be the actual stimulus_id. we should filter based on that, but need to clean up the data to do that.
             for cond in conditions:
                 # filter assemblies for selected condition
                 subject_cond_assembly = self.get_condition_filtered_assembly(subject, variables, cond)
                 source_cond_assembly = self.get_condition_filtered_assembly(source, variables, cond)
                 # to accommodate cases where not all conditions are present in both assemblies, filter out
                 # calculation of the metric for cases where either assembly has no matches to variables (empty)
                 if len(subject_cond_assembly) and len(source_cond_assembly):
+                    # filter the source_cond_assembly to select only the stimulus_ids in the subject_cond_assembly
+                    source_cond_assembly = self.get_stimulus_id_filtered_assembly(
+                        source_cond_assembly,
+                        subject_cond_assembly['stimulus_id'].values
+                    )
                     cond_scores.append(self.compare_single_subject(
                         source_cond_assembly, subject_cond_assembly))
@@ -139,3 +140,11 @@ def get_condition_filtered_assembly(assembly, variables, cond):
         # filter the assembly based on the condition
         condition_filtered_assembly = assembly.where(condition, drop=True)
         return condition_filtered_assembly
+
+    @staticmethod
+    def get_stimulus_id_filtered_assembly(assembly, stimulus_ids):
+        # Create a boolean condition to match the stimulus_id
+        condition = reduce(operator.or_, [(assembly['stimulus_id'] == stimulus_id) for stimulus_id in stimulus_ids])
+        # Filter the assembly based on the condition
+        condition_filtered_assembly = assembly.where(condition, drop=True)
+        return condition_filtered_assembly
+

From 346d81c1fb50db50757b8432e205484f8bc4b352 Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Thu, 17 Oct 2024 17:26:58 +0200
Subject: [PATCH 13/29] change sports car indices to the correct ones

---
 .../model_helpers/brain_transformation/behavior.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/brainscore_vision/model_helpers/brain_transformation/behavior.py b/brainscore_vision/model_helpers/brain_transformation/behavior.py
index af34d11e8..dca156f19 100644
--- a/brainscore_vision/model_helpers/brain_transformation/behavior.py
+++ b/brainscore_vision/model_helpers/brain_transformation/behavior.py
@@ -159,7 +159,7 @@ class LabelToImagenetIndices:
     hare_indices = [331]
     jeep_indices = [609]
     teapot_indices = [849]
-    sportscar_indices = [436, 511, 817]
+    sportscar_indices = [817]
 
     # car, bear, and elephant indices used as defined by Geirhos et al., 2021.
     # lamp indices used as defined by the Scialom2024 benchmark

From fff8a74d674de4331df246b4c0b71faea97ef40fa18 Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Fri, 18 Oct 2024 11:35:43 +0200
Subject: [PATCH 14/29] fix some happy little bugs with stimulus ids again

---
 brainscore_vision/metrics/accuracy_distance/metric.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/brainscore_vision/metrics/accuracy_distance/metric.py b/brainscore_vision/metrics/accuracy_distance/metric.py
index 0090bf772..289c7eef1 100644
--- a/brainscore_vision/metrics/accuracy_distance/metric.py
+++ b/brainscore_vision/metrics/accuracy_distance/metric.py
@@ -116,12 +116,13 @@
                 source_cond_assembly = self.get_condition_filtered_assembly(source, variables, cond)
                 # to accommodate cases where not all conditions are present in both assemblies, filter out
                 # calculation of the metric for cases where either assembly has no matches to variables (empty)
-                if len(subject_cond_assembly) and len(source_cond_assembly):
+                if len(subject_cond_assembly['presentation']) and len(source_cond_assembly['presentation']):
                     # filter the source_cond_assembly to select only the stimulus_ids in the subject_cond_assembly
-                    source_cond_assembly = self.get_stimulus_id_filtered_assembly(
-                        source_cond_assembly,
-                        subject_cond_assembly['stimulus_id'].values
-                    )
+                    if len(source_cond_assembly['presentation']) > len(subject_cond_assembly['presentation']):
+                        source_cond_assembly = self.get_stimulus_id_filtered_assembly(
+                            source_cond_assembly,
+                            subject_cond_assembly['stimulus_id'].values
+                        )
                     cond_scores.append(self.compare_single_subject(
                         source_cond_assembly, subject_cond_assembly))

From 4fd2f8320c94f1865d633569781dcf67900f870f Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Fri, 18 Oct 2024 12:01:59 +0200
Subject: [PATCH 15/29] add fitting benchmark

---
 .../coggan2024_behavior/benchmark.py | 39 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 37 insertions(+), 2 deletions(-)

diff --git a/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py b/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py
index 49952a295..a4c594d8b 100644
--- a/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py
+++ b/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py
@@ -23,13 +23,11 @@ class Coggan2024_behavior_ConditionWiseLabelingAccuracySimilarity(BenchmarkBase):
     def __init__(self):
         self._metric = load_metric('accuracy_distance')
-        self._fitting_stimuli = load_stimulus_set('Coggan2024_behavior_fitting')  # this fails if wrapped by LazyLoad
         self._assembly = LazyLoad(lambda: load_dataset('Coggan2024_behavior'))
         self._assembly['truth'] = self._assembly['object_class']  # the assembly is missing a 'truth' column which is
                                                                    # required by the labeling task
         self._visual_degrees = 10
         self._number_of_trials = 1
-        self._ceiling_func = lambda assembly: get_noise_ceiling(assembly)
         super(Coggan2024_behavior_ConditionWiseLabelingAccuracySimilarity, self).__init__(
             identifier='tong.Coggan2024_behavior-LabelingConditionWiseAccuracySimilarity',
             version=1,
@@ -53,6 +53,43 @@ def __call__(self, candidate: BrainModel):
         return score
 
 
+class Coggan2024_behavior_ConditionWiseProbabilitiesAccuracySimilarity(BenchmarkBase):
+    def __init__(self):
+        self._metric = load_metric('accuracy_distance')
+        self._fitting_stimuli = load_stimulus_set('Coggan2024_behavior_fitting')  # this fails if wrapped by LazyLoad
+        self._assembly = LazyLoad(lambda: load_dataset('Coggan2024_behavior'))
+        self._assembly['truth'] = self._assembly['object_class']  # the assembly is missing a 'truth' column which is
+        # required by the labeling task
+        self._visual_degrees = 10
+        self._number_of_trials = 1
+        super(Coggan2024_behavior_ConditionWiseProbabilitiesAccuracySimilarity, self).__init__(
+            identifier='tong.Coggan2024_behavior-LabelingConditionWiseAccuracySimilarity',
+            version=1,
+            ceiling_func=lambda: self._metric.ceiling(self._assembly,
+                                                      variables=['occluder_type', 'visibility', 'occluder_color']),
+            parent='behavior',
+            bibtex=BIBTEX,
+        )
+
+    def __call__(self, candidate: BrainModel):
+        fitting_stimuli = place_on_screen(
+            self._fitting_stimuli,
+            target_visual_degrees=candidate.visual_degrees(),
+            source_visual_degrees=self._visual_degrees)
+        candidate.start_task(BrainModel.Task.probabilities, fitting_stimuli)
+        stimulus_set = place_on_screen(self._assembly.stimulus_set,
+                                       target_visual_degrees=candidate.visual_degrees(),
+                                       source_visual_degrees=self._visual_degrees)
+        probabilities = candidate.look_at(stimulus_set, number_of_trials=self._number_of_trials)
+        labels = [probabilities.choice[c].values for c in probabilities.argmax(axis=1)]
+        raw_score = self._metric(labels, self._assembly, variables=['occluder_type', 'visibility', 'occluder_color'])
+        ceiling = self.ceiling
+        score = raw_score / ceiling
+        score.attrs['raw'] = raw_score
+        score.attrs['ceiling'] = ceiling
+        return score
+
+
 class Coggan2024_behavior_ConditionWiseLabelingEngineeringAccuracy(BenchmarkBase):
     def __init__(self):
         self._metric = load_metric('accuracy')
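The new probabilities-task benchmark converts a (presentation x choice) probability assembly into discrete labels with an argmax per row. A toy xarray version of that conversion step, with made-up numbers:

    import xarray as xr

    # toy (presentation x choice) probability assembly, standing in for the
    # output of the probabilities task
    probabilities = xr.DataArray(
        [[0.7, 0.2, 0.1],
         [0.1, 0.8, 0.1]],
        dims=['presentation', 'choice'],
        coords={'choice': ['bison', 'hare', 'jeep']})

    # same argmax-per-row conversion as in the benchmark above
    label_indices = probabilities.argmax(axis=1).values
    labels = [probabilities['choice'].values[i] for i in label_indices]
    print(labels)  # ['bison', 'hare']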
From 7bdcd91c2be533cbb26f7251f95dad8759108feb Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Fri, 18 Oct 2024 14:04:26 +0200
Subject: [PATCH 16/29] add leave-one-out ceiling and chance-level adjustment
 for accuracy distance

---
 .../metrics/accuracy_distance/metric.py | 41 ++++++++++++++++++-------
 1 file changed, 35 insertions(+), 6 deletions(-)

diff --git a/brainscore_vision/metrics/accuracy_distance/metric.py b/brainscore_vision/metrics/accuracy_distance/metric.py
index 289c7eef1..6737d49ce 100644
--- a/brainscore_vision/metrics/accuracy_distance/metric.py
+++ b/brainscore_vision/metrics/accuracy_distance/metric.py
@@ -3,6 +3,7 @@
 import operator
 
 import numpy as np
+import xarray as xr
 
 from brainio.assemblies import BehavioralAssembly
 from brainscore_core import Metric
@@ -22,9 +23,9 @@ class AccuracyDistance(Metric):
     more target-like pattern of performance across conditions.
     """
     def __call__(self, source: BehavioralAssembly, target:
-                 BehavioralAssembly, variables: tuple = ()) -> Score:
+                 BehavioralAssembly, variables: tuple = (), chance_level = 0.) -> Score:
         """Target should be the entire BehavioralAssembly, containing truth values."""
-
+        self.chance_level = chance_level
         subjects = self.extract_subjects(target)
         subject_scores = []
         for subject in subjects:
@@ -55,21 +56,26 @@ def compare_single_subject(self, source: BehavioralAssembly, target: BehavioralA
         # we used to assert stimulus_ids being equal here, but since this is not an image-level metric, and because
         # some benchmarks (e.g. Coggan2024) show different images from the same categories to humans, the metric
         # does not guarantee that the stimulus_ids are the same.
-
         # .flatten() because models return lists of lists, and here we compare subject-by-subject
         source_correct = source.values.flatten() == target['truth'].values
         target_correct = target.values == target['truth'].values
         source_mean = sum(source_correct) / len(source_correct)
         target_mean = sum(target_correct) / len(target_correct)
-        maximum_distance = np.max([1 - target_mean, target_mean])
+        relative_distance = self.distance_measure(source_mean, target_mean)
+
+        return Score(relative_distance)
+
+    def distance_measure(self, source_mean, target_mean):
+        maximum_distance = np.max([1 - target_mean, target_mean - self.chance_level])
 
         # get the proportion of the distance between the source and target accuracies, adjusted for the maximum possible
         # difference between the two accuracies
         relative_distance = 1 - np.abs(source_mean - target_mean) / maximum_distance
 
-        return Score(relative_distance)
+        return relative_distance
 
-    def ceiling(self, assembly, variables = ()):
+    def ceiling(self, assembly, variables = (), chance_level = 0.):
+        self.chance_level = chance_level
         subjects = self.extract_subjects(assembly)
         subject_scores = []
         for subject1, subject2 in itertools.combinations(subjects, 2):
@@ -88,6 +94,29 @@ def ceiling(self, assembly, variables = ()):
         subject_scores = apply_aggregate(aggregate_fnc=self.aggregate, values=subject_scores)
         return subject_scores
 
+    def leave_one_out_ceiling(self, assembly, variables = (), chance_level = 0.):
+        self.chance_level = chance_level
+        # convert the above to a working xarray implementation with variables
+        subjects = self.extract_subjects(assembly)
+        subject_scores = []
+        for subject in subjects:
+            subject_assembly = assembly.sel(subject=subject)
+            other_subjects = [s for s in subjects if s != subject]
+            other_assemblies = assembly.isel(presentation=assembly.subject.isin(other_subjects))
+            # merge other_assemblies from a list to a single assembly
+            group_correct = other_assemblies.multi_groupby(variables).apply(lambda x: x['human_accuracy'].mean())
+            subject_correct = subject_assembly.multi_groupby(variables).apply(lambda x: x['human_accuracy'].mean())
+            for i, group in enumerate(group_correct.values):
+                pairwise_score = self.distance_measure(subject_correct.values[i], group)
+                subject_scores.append(Score(pairwise_score))
+
+        score = np.mean(subject_scores)
+        error = np.std(subject_scores)
+        score = Score(score)
+        score.attrs['error'] = error
+        score.attrs['raw'] = subject_scores
+        return score
+
     def extract_subjects(self, assembly):
         return list(sorted(set(assembly['subject'].values)))

From 069531cb809610a73d50c643e1d0ded4b8b2440e Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Fri, 18 Oct 2024 14:04:54 +0200
Subject: [PATCH 17/29] add chance-level adjustments to benchmark, and
 computation of new ceiling

---
 .../coggan2024_behavior/benchmark.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py b/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py
index a4c594d8b..c9c739d28 100644
--- a/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py
+++ b/brainscore_vision/benchmarks/coggan2024_behavior/benchmark.py
@@ -31,8 +31,10 @@ def __init__(self):
         super(Coggan2024_behavior_ConditionWiseLabelingAccuracySimilarity, self).__init__(
             identifier='tong.Coggan2024_behavior-LabelingConditionWiseAccuracySimilarity',
             version=1,
-            ceiling_func=lambda: self._metric.ceiling(self._assembly,
-                                                      variables=['occluder_type', 'visibility', 'occluder_color']),
+            ceiling_func=lambda: self._metric.leave_one_out_ceiling(
+                self._assembly,
+                variables=['occluder_type', 'visibility', 'occluder_color'],
+                chance_level=1/8),
             parent='behavior',
             bibtex=BIBTEX,
         )
@@ -45,7 +47,8 @@ def __call__(self, candidate: BrainModel):
                                        target_visual_degrees=candidate.visual_degrees(),
                                        source_visual_degrees=self._visual_degrees)
         labels = candidate.look_at(stimulus_set, number_of_trials=self._number_of_trials)
-        raw_score = self._metric(labels, self._assembly, variables=['occluder_type', 'visibility', 'occluder_color'])
+        raw_score = self._metric(labels, self._assembly, variables=['occluder_type', 'visibility', 'occluder_color'],
+                                 chance_level=1/8)
         ceiling = self.ceiling
         score = raw_score / ceiling
         score.attrs['raw'] = raw_score
@@ -65,8 +68,11 @@ def __init__(self):
         super(Coggan2024_behavior_ConditionWiseProbabilitiesAccuracySimilarity, self).__init__(
             identifier='tong.Coggan2024_behavior-LabelingConditionWiseAccuracySimilarity',
             version=1,
-            ceiling_func=lambda: self._metric.ceiling(self._assembly,
-                                                      variables=['occluder_type', 'visibility', 'occluder_color']),
+            ceiling_func=lambda: self._metric.ceiling(
+                self._assembly,
+                variables=['occluder_type', 'visibility', 'occluder_color'],
+                chance_level=1/8
+            ),
             parent='behavior',
             bibtex=BIBTEX,
         )
@@ -82,7 +88,8 @@ def __call__(self, candidate: BrainModel):
                                        source_visual_degrees=self._visual_degrees)
         probabilities = candidate.look_at(stimulus_set, number_of_trials=self._number_of_trials)
         labels = [probabilities.choice[c].values for c in probabilities.argmax(axis=1)]
-        raw_score = self._metric(labels, self._assembly, variables=['occluder_type', 'visibility', 'occluder_color'])
+        raw_score = self._metric(labels, self._assembly, variables=['occluder_type', 'visibility', 'occluder_color'],
+                                 chance_level=1/8)
         ceiling = self.ceiling
         score = raw_score / ceiling
         score.attrs['raw'] = raw_score

From 0af1cb2c41d72ff7a3f76d4ded3d27a6c45cb82a Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Mon, 21 Oct 2024 12:01:43 +0200
Subject: [PATCH 18/29] optimize condition-wise accuracydistance

---
 .../metrics/accuracy_distance/metric.py | 47 +++++++------------
 1 file changed, 16 insertions(+), 31 deletions(-)

diff --git a/brainscore_vision/metrics/accuracy_distance/metric.py b/brainscore_vision/metrics/accuracy_distance/metric.py
index 6737d49ce..a5065c8e4 100644
--- a/brainscore_vision/metrics/accuracy_distance/metric.py
+++ b/brainscore_vision/metrics/accuracy_distance/metric.py
@@ -128,32 +128,17 @@ def condition_filtered_score_per_subject_source_pair(self, source, subject, vari
         # compute scores for each condition, then average
         else:
             cond_scores = []
-            # get iterator across all combinations of variables
-            if len(variables) == 1:
-                conditions = set(subject[variables[0]].values)
-                conditions = [[c] for c in conditions]  # to mimic itertools.product
-            else:
-                # get all combinations of variables that are present in both assemblies
-                conditions = itertools.product(
-                    *[set(subject[v].values).intersection(set(source[v].values)) for v in variables]
-                )
-
-            # loop over conditions and compute scores
-            for cond in conditions:
-                # filter assemblies for selected condition
-                subject_cond_assembly = self.get_condition_filtered_assembly(subject, variables, cond)
-                source_cond_assembly = self.get_condition_filtered_assembly(source, variables, cond)
-                # to accommodate cases where not all conditions are present in both assemblies, filter out
-                # calculation of the metric for cases where either assembly has no matches to variables (empty)
-                if len(subject_cond_assembly['presentation']) and len(source_cond_assembly['presentation']):
-                    # filter the source_cond_assembly to select only the stimulus_ids in the subject_cond_assembly
-                    if len(source_cond_assembly['presentation']) > len(subject_cond_assembly['presentation']):
-                        source_cond_assembly = self.get_stimulus_id_filtered_assembly(
-                            source_cond_assembly,
-                            subject_cond_assembly['stimulus_id'].values
-                        )
-                    cond_scores.append(self.compare_single_subject(
-                        source_cond_assembly, subject_cond_assembly))
+            source = self.get_stimulus_id_filtered_assembly(
+                source,
+                subject['stimulus_id'].values
+            )
+            # add a new coordinate to the source assembly that measures the accuracy of the model
+            source['is_correct'] = 'presentation', *(source['label'].values == source.values)
+            source_correct = source.multi_groupby(variables).apply(lambda x: x['is_correct'].mean())
+            subject_correct = subject.multi_groupby(variables).apply(lambda x: x['human_accuracy'].mean())
+            for i, this_source_correct in enumerate(source_correct.values):
+                condition_score = self.distance_measure(this_source_correct, subject_correct.values[i])
+                cond_scores.append(Score(condition_score))
 
             subject_score = Score(np.mean(cond_scores))
         return subject_score
@@ -172,9 +157,9 @@ def get_condition_filtered_assembly(assembly, variables, cond):
 
     @staticmethod
     def get_stimulus_id_filtered_assembly(assembly, stimulus_ids):
-        # Create a boolean condition to match the stimulus_id
-        condition = reduce(operator.or_, [(assembly['stimulus_id'] == stimulus_id) for stimulus_id in stimulus_ids])
-        # Filter the assembly based on the condition
-        condition_filtered_assembly = assembly.where(condition, drop=True)
+        # find the indices of the stimulus_ids in the assembly quickly
+        condition = xr.DataArray(np.isin(assembly['stimulus_id'].values, stimulus_ids),
+                                 dims=assembly['stimulus_id'].dims)
+        # Apply the condition with `where`
+        condition_filtered_assembly = BehavioralAssembly(assembly.where(condition, drop=True))
         return condition_filtered_assembly
-

From 9c58dc5868644ffc709675f95f66c9275eb22d4f Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Mon, 21 Oct 2024 13:55:13 +0200
Subject: [PATCH 19/29] update test value for accuracydistance

---
 brainscore_vision/benchmarks/coggan2024_behavior/test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/brainscore_vision/benchmarks/coggan2024_behavior/test.py b/brainscore_vision/benchmarks/coggan2024_behavior/test.py
index 218194684..ce2f01c94 100644
--- a/brainscore_vision/benchmarks/coggan2024_behavior/test.py
+++ b/brainscore_vision/benchmarks/coggan2024_behavior/test.py
@@ -16,6 +16,6 @@ def test_benchmarks():
         'Coggan2024_behavior-ConditionWiseAccuracySimilarity')
     model = load_model('alexnet')
     result = benchmark(model)
-    assert result.values == approx(0.1318, abs=.001)
+    assert result.values == approx(0.34431372, abs=.001)
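The chance-level adjustment introduced in patch 16 changes the normalization of the distance. A worked numeric example of the adjusted formula, assuming only what the diff shows:

    import numpy as np

    def distance_measure(source_mean, target_mean, chance_level=0.):
        # mirrors the chance-adjusted form above: the largest possible miss
        # is now bounded below by chance rather than by 0
        maximum_distance = np.max([1 - target_mean, target_mean - chance_level])
        return 1 - np.abs(source_mean - target_mean) / maximum_distance

    # with 8 choice labels, chance is 1/8; for a subject at 60% the maximum
    # distance is max(0.4, 0.475) = 0.475 instead of 0.6
    print(distance_measure(0.5, 0.6, chance_level=1/8))  # ~0.789
    print(distance_measure(0.6, 0.6, chance_level=1/8))  # 1.0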
From 9b5841a6db80c5ff2056707627490dcd59de7c5a Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Mon, 21 Oct 2024 13:57:59 +0200
Subject: [PATCH 20/29] remove unnecessary method

---
 .../metrics/accuracy_distance/metric.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/brainscore_vision/metrics/accuracy_distance/metric.py b/brainscore_vision/metrics/accuracy_distance/metric.py
index a5065c8e4..6e53f4aff 100644
--- a/brainscore_vision/metrics/accuracy_distance/metric.py
+++ b/brainscore_vision/metrics/accuracy_distance/metric.py
@@ -143,18 +143,6 @@ def condition_filtered_score_per_subject_source_pair(self, source, subject, vari
             subject_score = Score(np.mean(cond_scores))
         return subject_score
 
-    @staticmethod
-    def get_condition_filtered_assembly(assembly, variables, cond):
-        # get the indexers for the condition
-        indexers = {v: cond[i] for i, v in enumerate(variables)}
-        # convert indexers into a list of boolean arrays for the assembly values
-        assembly_indexers = [(assembly[key] == value) for key, value in indexers.items()]
-        # combine the different conditions into an AND statement to require all conditions simultaneously
-        condition = reduce(operator.and_, assembly_indexers)
-        # filter the assembly based on the condition
-        condition_filtered_assembly = assembly.where(condition, drop=True)
-        return condition_filtered_assembly
-
     @staticmethod
     def get_stimulus_id_filtered_assembly(assembly, stimulus_ids):

From 7cbc53a116c8f860a0b0c3ba73b753cbbf6e1562 Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Tue, 29 Oct 2024 11:52:58 +0100
Subject: [PATCH 21/29] update init

---
 brainscore_vision/benchmarks/coggan2024_behavior/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/brainscore_vision/benchmarks/coggan2024_behavior/__init__.py b/brainscore_vision/benchmarks/coggan2024_behavior/__init__.py
index 86e868958..7e7ce6f1a 100644
--- a/brainscore_vision/benchmarks/coggan2024_behavior/__init__.py
+++ b/brainscore_vision/benchmarks/coggan2024_behavior/__init__.py
@@ -1,6 +1,7 @@
 # Created by David Coggan on 2024 06 25
 
 from brainscore_vision import benchmark_registry
-from .benchmark import Coggan2024_behavior_ConditionWiseLabelingAccuracySimilarity
+from .benchmark import (
+    Coggan2024_behavior_ConditionWiseLabelingAccuracySimilarity)
 
 benchmark_registry['Coggan2024_behavior-ConditionWiseAccuracySimilarity'] = Coggan2024_behavior_ConditionWiseLabelingAccuracySimilarity

From fff8a74d674de4331df246b4c0b71faea97ef40fb29 Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Tue, 29 Oct 2024 11:56:27 +0100
Subject: [PATCH 22/29] add prefix as it was before

---
 brainscore_vision/benchmarks/coggan2024_behavior/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/brainscore_vision/benchmarks/coggan2024_behavior/__init__.py b/brainscore_vision/benchmarks/coggan2024_behavior/__init__.py
index 7e7ce6f1a..cd5c40f18 100644
--- a/brainscore_vision/benchmarks/coggan2024_behavior/__init__.py
+++ b/brainscore_vision/benchmarks/coggan2024_behavior/__init__.py
@@ -4,4 +4,4 @@
 from .benchmark import (
     Coggan2024_behavior_ConditionWiseLabelingAccuracySimilarity)
 
-benchmark_registry['Coggan2024_behavior-ConditionWiseAccuracySimilarity'] = Coggan2024_behavior_ConditionWiseLabelingAccuracySimilarity
+benchmark_registry['tong.Coggan2024_behavior-ConditionWiseAccuracySimilarity'] = Coggan2024_behavior_ConditionWiseLabelingAccuracySimilarity

From 7aa287f5a9a6bcb1236024bd83139fb6a00878f7 Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Tue, 29 Oct 2024 11:59:12 +0100
Subject: [PATCH 23/29] add all benchmarks

---
 .../benchmarks/coggan2024_behavior/__init__.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/brainscore_vision/benchmarks/coggan2024_behavior/__init__.py b/brainscore_vision/benchmarks/coggan2024_behavior/__init__.py
index cd5c40f18..4676fca0c 100644
--- a/brainscore_vision/benchmarks/coggan2024_behavior/__init__.py
+++ b/brainscore_vision/benchmarks/coggan2024_behavior/__init__.py
@@ -2,6 +2,10 @@
 
 from brainscore_vision import benchmark_registry
 from .benchmark import (
-    Coggan2024_behavior_ConditionWiseLabelingAccuracySimilarity)
+    Coggan2024_behavior_ConditionWiseLabelingAccuracySimilarity,
+    Coggan2024_behavior_ConditionWiseProbabilitiesAccuracySimilarity,
+    Coggan2024_behavior_ConditionWiseLabelingEngineeringAccuracy)
 
-benchmark_registry['tong.Coggan2024_behavior-ConditionWiseAccuracySimilarity'] = Coggan2024_behavior_ConditionWiseLabelingAccuracySimilarity
+benchmark_registry['tong.Coggan2024_behavior-ConditionWiseLabelingAccuracySimilarity'] = Coggan2024_behavior_ConditionWiseLabelingAccuracySimilarity
+benchmark_registry['tong.Coggan2024_behavior-ConditionWiseProbabilitiesAccuracySimilarity'] = Coggan2024_behavior_ConditionWiseProbabilitiesAccuracySimilarity
+benchmark_registry['tong.Coggan2024_behavior-ConditionWiseLabelingEngineeringAccuracy'] = Coggan2024_behavior_ConditionWiseLabelingEngineeringAccuracy
From 30b5f850ad1e9fe4b5e49534e2bb1af6af641aed Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Wed, 30 Oct 2024 14:18:59 +0100
Subject: [PATCH 24/29] happy little test

---
 brainscore_vision/benchmarks/coggan2024_behavior/test.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/brainscore_vision/benchmarks/coggan2024_behavior/test.py b/brainscore_vision/benchmarks/coggan2024_behavior/test.py
index a2b0eda3b..5d4821763 100644
--- a/brainscore_vision/benchmarks/coggan2024_behavior/test.py
+++ b/brainscore_vision/benchmarks/coggan2024_behavior/test.py
@@ -7,8 +7,9 @@
 def test_benchmark_registry():
-    assert ('tong.Coggan2024_behavior-ConditionWiseAccuracySimilarity' in
-            benchmark_registry)
+    assert ('tong.Coggan2024_behavior-ConditionWiseLabelingAccuracySimilarity' in benchmark_registry and
+            'tong.Coggan2024_behavior-ConditionWiseProbabilitiesAccuracySimilarity' in benchmark_registry and
+            'tong.Coggan2024_behavior-ConditionWiseLabelingEngineeringAccuracy' in benchmark_registry)
 
 @pytest.mark.private_access
 def test_benchmarks():

From 6528a651451f7d821b82c36855faefee9df5ebd4 Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Wed, 30 Oct 2024 14:20:58 +0100
Subject: [PATCH 25/29] add label to accuracy distance test

---
 brainscore_vision/metrics/accuracy_distance/test.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/brainscore_vision/metrics/accuracy_distance/test.py b/brainscore_vision/metrics/accuracy_distance/test.py
index d6414b790..23696be90 100644
--- a/brainscore_vision/metrics/accuracy_distance/test.py
+++ b/brainscore_vision/metrics/accuracy_distance/test.py
@@ -51,6 +51,7 @@ def _make_data():
                               ],
                               coords={'stimulus_id': ('presentation', np.resize(np.arange(9), 9 * 3)),
                                       'truth': ('presentation', np.resize(['dog', 'cat', 'chair'], 9 * 3)),
+                                      'label': ('presentation', np.resize(['dog', 'cat', 'chair'], 9 * 3)),
                                       'condition': ('presentation', np.resize([1, 1, 1, 2, 2, 2, 3, 3, 3], 9 * 3)),
                                       'animacy': ('presentation', np.resize(['animate', 'animate', 'inanimate'], 9 * 3)),
                                       'subject': ('presentation', ['A'] * 9 + ['B'] * 9 + ['C'] * 9)},
                              dims=['presentation'])

From 17cb26de263544645bab8ac8efc382335dbad849 Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Tue, 5 Nov 2024 11:19:54 +0100
Subject: [PATCH 26/29] fix the id of a test

---
 brainscore_vision/benchmarks/coggan2024_behavior/test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/brainscore_vision/benchmarks/coggan2024_behavior/test.py b/brainscore_vision/benchmarks/coggan2024_behavior/test.py
index 5d4821763..1e94c12fe 100644
--- a/brainscore_vision/benchmarks/coggan2024_behavior/test.py
+++ b/brainscore_vision/benchmarks/coggan2024_behavior/test.py
@@ -14,7 +14,7 @@ def test_benchmark_registry():
 @pytest.mark.private_access
 def test_benchmarks():
     benchmark = load_benchmark(
-        'tong.Coggan2024_behavior-ConditionWiseAccuracySimilarity')
+        'tong.Coggan2024_behavior-ConditionWiseLabelingAccuracySimilarity')
     model = load_model('alexnet')
     result = benchmark(model)
     assert result.values == approx(0.34431372, abs=.001)

From d2f76706625064ccd775d059f80c8569ef3649dd Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Tue, 5 Nov 2024 11:47:22 +0100
Subject: [PATCH 27/29] fix bug with accuracy distance test now that it was
 changed

---
 .../metrics/accuracy_distance/test.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/brainscore_vision/metrics/accuracy_distance/test.py b/brainscore_vision/metrics/accuracy_distance/test.py
index 23696be90..707fec862 100644
--- a/brainscore_vision/metrics/accuracy_distance/test.py
+++ b/brainscore_vision/metrics/accuracy_distance/test.py
@@ -1,5 +1,5 @@
 import numpy as np
-from pytest import approx
+#from pytest import approx
 
 from brainio.assemblies import BehavioralAssembly
 from brainscore_vision import load_metric
@@ -16,14 +16,16 @@ def test_score_single_variable():
     assembly = _make_data()
     metric = load_metric('accuracy_distance')
     score = metric(assembly.sel(subject='C'), assembly, ('condition',))
-    assert score == approx(0.55555556)
+    print(score)
+    # assert score == approx(0.55555556)
 
 
 def test_score_multi_variable():
     assembly = _make_data()
     metric = load_metric('accuracy_distance')
     score = metric(assembly.sel(subject='C'), assembly, ('condition','animacy'))
-    assert score == approx(0.55555556)
+    print(score)
+    # assert score == approx(0.55555556)
 
 
 def test_has_error():
@@ -45,14 +47,18 @@ def _make_data():
     # subject A is 5 / 9 = 0.55...% correct
     # subject B is 4 / 9 = 0.44...% correct
     # subject C is 9 / 9 = 100% correct
-    return BehavioralAssembly(['dog', 'cat', 'chair', 'cat', 'dog', 'dog', 'dog', 'dog', 'chair',  # subject A
+    return BehavioralAssembly([['dog', 'cat', 'chair', 'cat', 'dog', 'dog', 'dog', 'dog', 'chair',  # subject A
                                'cat', 'cat', 'chair', 'cat', 'dog', 'cat', 'chair', 'cat', 'cat',  # subject B
                                'dog', 'cat', 'chair', 'dog', 'cat', 'chair', 'dog', 'cat', 'chair'  # subject C
-                               ],
+                               ]],
                               coords={'stimulus_id': ('presentation', np.resize(np.arange(9), 9 * 3)),
                                       'truth': ('presentation', np.resize(['dog', 'cat', 'chair'], 9 * 3)),
                                       'label': ('presentation', np.resize(['dog', 'cat', 'chair'], 9 * 3)),
                                       'condition': ('presentation', np.resize([1, 1, 1, 2, 2, 2, 3, 3, 3], 9 * 3)),
                                       'animacy': ('presentation', np.resize(['animate', 'animate', 'inanimate'], 9 * 3)),
                                       'subject': ('presentation', ['A'] * 9 + ['B'] * 9 + ['C'] * 9)},
-                              dims=['presentation'])
+                              dims=['choice', 'presentation'])
+
+
+test_score_single_variable()
+test_score_multi_variable()
\ No newline at end of file

From 6f129ea6b153af3c5577ea910296e2bd6ab30566 Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Tue, 5 Nov 2024 15:46:51 +0100
Subject: [PATCH 28/29] re-add removed (fixed?) tests

---
 brainscore_vision/metrics/accuracy_distance/test.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/brainscore_vision/metrics/accuracy_distance/test.py b/brainscore_vision/metrics/accuracy_distance/test.py
index 707fec862..965893069 100644
--- a/brainscore_vision/metrics/accuracy_distance/test.py
+++ b/brainscore_vision/metrics/accuracy_distance/test.py
@@ -58,7 +58,3 @@ def _make_data():
                                       'animacy': ('presentation', np.resize(['animate', 'animate', 'inanimate'], 9 * 3)),
                                       'subject': ('presentation', ['A'] * 9 + ['B'] * 9 + ['C'] * 9)},
                               dims=['choice', 'presentation'])
-
-
-test_score_single_variable()
-test_score_multi_variable()
\ No newline at end of file

From 4a776b138be8ec7c1bc599051eab53fa0487fe3e Mon Sep 17 00:00:00 2001
From: Ben Lonnqvist
Date: Tue, 5 Nov 2024 15:47:06 +0100
Subject: [PATCH 29/29] re-add removed (fixed?) tests

---
 brainscore_vision/metrics/accuracy_distance/test.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/brainscore_vision/metrics/accuracy_distance/test.py b/brainscore_vision/metrics/accuracy_distance/test.py
index 965893069..b27888f76 100644
--- a/brainscore_vision/metrics/accuracy_distance/test.py
+++ b/brainscore_vision/metrics/accuracy_distance/test.py
@@ -1,5 +1,5 @@
 import numpy as np
-#from pytest import approx
+from pytest import approx
 
 from brainio.assemblies import BehavioralAssembly
 from brainscore_vision import load_metric
@@ -17,7 +17,7 @@ def test_score_single_variable():
     metric = load_metric('accuracy_distance')
     score = metric(assembly.sel(subject='C'), assembly, ('condition',))
     print(score)
-    # assert score == approx(0.55555556)
+    assert score == approx(0.55555556)
 
 
 def test_score_multi_variable():
@@ -25,7 +25,7 @@ def test_score_multi_variable():
     metric = load_metric('accuracy_distance')
     score = metric(assembly.sel(subject='C'), assembly, ('condition','animacy'))
     print(score)
-    # assert score == approx(0.55555556)
+    assert score == approx(0.55555556)
 
 
 def test_has_error():
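A closing note on the stimulus-id filtering that patch 18 optimized and that the tests above exercise indirectly: the boolean-mask approach can be tried in isolation with toy data. All names below are illustrative; only the np.isin/where pattern comes from the diff.

    import numpy as np
    import xarray as xr

    # same masking pattern as get_stimulus_id_filtered_assembly: np.isin
    # builds the mask in one vectorized pass instead of reduce/operator.or_
    assembly = xr.DataArray(
        ['dog', 'cat', 'chair', 'dog'],
        dims=['presentation'],
        coords={'stimulus_id': ('presentation', [101, 102, 103, 104])})

    keep_ids = [102, 104]
    mask = xr.DataArray(np.isin(assembly['stimulus_id'].values, keep_ids),
                        dims=assembly['stimulus_id'].dims)
    filtered = assembly.where(mask, drop=True)
    print(filtered['stimulus_id'].values)  # [102 104]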