diff --git a/migrations/2020-12-20_pkl_to_nc.py b/migrations/2020-12-20_pkl_to_nc.py
new file mode 100644
index 000000000..517502d9b
--- /dev/null
+++ b/migrations/2020-12-20_pkl_to_nc.py
@@ -0,0 +1,128 @@
+"""One-off migration: convert pickled test assemblies to netCDF.
+
+Converts the pickles listed below - some checked in under
+``tests/test_metrics``, some stored in the ``brain-score-tests`` S3 bucket -
+into ``.nc`` files.  Targets that already exist are skipped.
+
+NOTE: ``pickle.load`` can execute arbitrary code; only run this against the
+project's own pickle files.
+"""
+import pickle
+from pathlib import Path
+
+import boto3
+from brainio_base.assemblies import BehavioralAssembly
+from brainio_collection.packaging import write_netcdf
+
+
+# pickles that live in tests/test_metrics of this repository
+local_pkl_names = [
+    'alexnet-probabilities.pkl',
+    'resnet34-probabilities.pkl',
+    'resnet18-probabilities.pkl'
+]
+
+
+# pickles that live under tests/test_benchmarks in the S3 bucket
+s3_pkl_names = [
+    "alexnet-freemanziemba2013.aperture-private.pkl",
+    "alexnet-majaj2015.private-features.12.pkl",
+    "CORnetZ-rajalingham2018public.pkl",
+    "cornet_s-kar2019.pkl",
+    "alexnet-sanghavi2020-features.12.pkl",
+    "alexnet-sanghavijozwik2020-features.12.pkl",
+    "alexnet-sanghavimurty2020-features.12.pkl",
+    "alexnet-rajalingham2020-features.12.pkl",
+]
+
+
+def local_pkls():
+    """Write a ``.nc`` file next to each pickle in ``local_pkl_names``.
+
+    The pickles are read from ``tests/test_metrics``; a pickle whose ``.nc``
+    file already exists is skipped.
+    """
+    target_dir_path = Path(__file__).parents[1] / "tests" / "test_metrics"
+    for pkl_name in local_pkl_names:
+        pkl_path = target_dir_path / pkl_name
+        nc_path = pkl_path.with_suffix(".nc")
+        if nc_path.exists():
+            print(f"{nc_path} already exists.  ")
+            continue
+        print(f"{nc_path} does not exist.  ")
+        with open(pkl_path, 'rb') as f:
+            unpickled = pickle.load(f)
+        #   write netcdf
+        write_netcdf(BehavioralAssembly(unpickled["data"]), str(nc_path))
+
+
+def s3_pkls():
+    """Convert the pickles in the ``brain-score-tests`` bucket to netCDF.
+
+    For every name in ``s3_pkl_names`` whose ``.nc`` object is not yet in the
+    bucket: download the pickle into the local ``test_pkl`` directory (unless
+    already there), write the ``.nc`` file (unless already there) and upload
+    it under ``tests/test_benchmarks``.
+
+    Uses the ``dicarlolab_jjpr`` AWS profile.
+    """
+    session = boto3.session.Session(profile_name="dicarlolab_jjpr")
+    s3 = session.client("s3")
+    bucket_name = "brain-score-tests"
+
+    def exists(key):
+        """Return whether ``key`` is present in the bucket."""
+        try:
+            s3.head_object(Bucket=bucket_name, Key=key)
+        except s3.exceptions.ClientError as error:
+            # head_object reports a missing key as a generic ClientError with
+            # code "404"; NoSuchKey (a ClientError subclass) is not raised.
+            code = error.response.get("Error", {}).get("Code")
+            if code in ("404", "NoSuchKey", "NotFound"):
+                return False
+            raise  # e.g. 403: do not mistake a permission problem for "missing"
+        return True
+
+    prefix_path = Path("tests", "test_benchmarks")
+    target_dir_path = Path(__file__).parent / "test_pkl"
+    # download_file does not create missing directories
+    target_dir_path.mkdir(parents=True, exist_ok=True)
+
+    for pkl_name in s3_pkl_names:
+        pkl_path = Path(pkl_name)
+        nc_path = pkl_path.with_suffix(".nc")
+        # S3 keys use forward slashes regardless of the local OS
+        object_key_pkl = (prefix_path / pkl_path).as_posix()
+        object_key_nc = (prefix_path / nc_path).as_posix()
+        target_file_pkl = target_dir_path / pkl_path
+        target_file_nc = target_dir_path / nc_path
+
+        if exists(object_key_nc):
+            print(f"{object_key_nc} already exists.  ")
+            continue
+        print(f"{object_key_nc} does not exist.  ")
+        if not target_file_nc.exists():
+            if not target_file_pkl.exists():
+                #   fetch file
+                s3.download_file(bucket_name, object_key_pkl, str(target_file_pkl))
+            #   unpickle
+            with open(target_file_pkl, 'rb') as f:
+                unpickled = pickle.load(f)
+            #   write netcdf
+            # NOTE(review): unlike local_pkls, the data is not wrapped in an
+            # assembly class here - presumably it already is one; confirm.
+            write_netcdf(unpickled["data"], str(target_file_nc))
+        #   upload
+        s3.upload_file(str(target_file_nc), bucket_name, object_key_nc)
+
+
+def main():
+    """Convert the local pickles first, then the ones stored in S3."""
+    # NOTE: original intent was to assert xarray is 0.12.3; not enforced here
+    local_pkls()
+    s3_pkls()
+
+
+if __name__ == '__main__':
+    main()
+
diff --git a/setup.py b/setup.py
index 831b9c5b9..d7a1283ab 100644
--- a/setup.py
+++ b/setup.py
@@ -28,7 +28,7 @@
     "tensorflow",
     "result_caching @ git+https://github.com/mschrimpf/result_caching",
     "jupyter",
-    "pandas==0.25.3",
+    "pandas",
     "pybtex",
     'peewee',
     'psycopg2-binary'
diff --git a/test_setup.sh b/test_setup.sh
index f3c017e86..2675e3462 100755
--- a/test_setup.sh
+++ b/test_setup.sh
@@ -3,7 +3,7 @@
 # get directory of this script (i.e. tests), following https://stackoverflow.com/a/246128/2225200
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 
-for f in alexnet-freemanziemba2013.aperture-private.pkl alexnet-majaj2015.private-features.12.pkl CORnetZ-rajalingham2018public.pkl cornet_s-kar2019.pkl alexnet-sanghavi2020-features.12.pkl alexnet-sanghavijozwik2020-features.12.pkl alexnet-sanghavimurty2020-features.12.pkl alexnet-rajalingham2020-features.12.pkl
+for f in alexnet-freemanziemba2013.aperture-private.nc alexnet-majaj2015.private-features.12.nc CORnetZ-rajalingham2018public.nc cornet_s-kar2019.nc alexnet-sanghavi2020-features.12.nc alexnet-sanghavijozwik2020-features.12.nc alexnet-sanghavimurty2020-features.12.nc alexnet-rajalingham2020-features.12.nc
 do
   aws --no-sign-request s3 cp s3://brain-score-tests/tests/test_benchmarks/${f} ${SCRIPT_DIR}/tests/test_benchmarks/
 done
diff --git a/tests/test_benchmarks/test___init__.py b/tests/test_benchmarks/test___init__.py
index 8ae25d025..aa292f2ff 100644
--- a/tests/test_benchmarks/test___init__.py
+++ b/tests/test_benchmarks/test___init__.py
@@ -1,5 +1,4 @@
 import os
-import pickle
 
 import numpy as np
 import pytest
@@ -7,10 +6,12 @@
 from pathlib import Path
 from pytest import approx
 from typing import List, Tuple
+import xarray as xr
 
 from brainscore.benchmarks import benchmark_pool, public_benchmark_pool, evaluation_benchmark_pool
 from brainscore.model_interface import BrainModel
 from tests.test_benchmarks import PrecomputedFeatures
+from brainio_base.assemblies import BehavioralAssembly
 
 
 class TestPoolList:
@@ -148,7 +149,7 @@ class TestPrecomputed:
         ('movshon.FreemanZiemba2013.V2-pls', approx(.459283, abs=.005)),
     ])
     def test_FreemanZiemba2013(self, benchmark, expected):
-        self.run_test(benchmark=benchmark, file='alexnet-freemanziemba2013.aperture-private.pkl', expected=expected)
+        self.run_test(benchmark=benchmark, file='alexnet-freemanziemba2013.aperture-private.nc', expected=expected)
 
     @pytest.mark.memory_intense
     @pytest.mark.parametrize('benchmark, expected', [
@@ -156,13 +157,12 @@ def test_FreemanZiemba2013(self, benchmark, expected):
         ('dicarlo.MajajHong2015.IT-pls', approx(.584053, abs=.005)),
     ])
     def test_MajajHong2015(self, benchmark, expected):
-        self.run_test(benchmark=benchmark, file='alexnet-majaj2015.private-features.12.pkl', expected=expected)
+        self.run_test(benchmark=benchmark, file='alexnet-majaj2015.private-features.12.nc', expected=expected)
 
     def run_test(self, benchmark, file, expected):
         benchmark = benchmark_pool[benchmark]
         precomputed_features = Path(__file__).parent / file
-        with open(precomputed_features, 'rb') as f:
-            precomputed_features = pickle.load(f)['data']
+        precomputed_features = BehavioralAssembly(xr.load_dataarray(precomputed_features))
         precomputed_features = precomputed_features.stack(presentation=['stimulus_path'])
         precomputed_paths = list(map(lambda f: Path(f).name, precomputed_features['stimulus_path'].values))
         # attach stimulus set meta
@@ -184,9 +184,8 @@ def run_test(self, benchmark, file, expected):
     @pytest.mark.slow
     def test_Kar2019ost_cornet_s(self):
         benchmark = benchmark_pool['dicarlo.Kar2019-ost']
-        precomputed_features = Path(__file__).parent / 'cornet_s-kar2019.pkl'
-        with open(precomputed_features, 'rb') as f:
-            precomputed_features = pickle.load(f)['data']
+        precomputed_features = Path(__file__).parent / 'cornet_s-kar2019.nc'
+        precomputed_features = BehavioralAssembly(xr.load_dataarray(precomputed_features))
         precomputed_features = PrecomputedFeatures(precomputed_features, visual_degrees=8)
         # score
         score = benchmark(precomputed_features).raw
@@ -194,9 +193,8 @@ def test_Kar2019ost_cornet_s(self):
 
     def test_Rajalingham2018public(self):
         # load features
-        precomputed_features = Path(__file__).parent / 'CORnetZ-rajalingham2018public.pkl'
-        with open(precomputed_features, 'rb') as f:
-            precomputed_features = pickle.load(f)['data']
+        precomputed_features = Path(__file__).parent / 'CORnetZ-rajalingham2018public.nc'
+        precomputed_features = BehavioralAssembly(xr.load_dataarray(precomputed_features))
         precomputed_features = PrecomputedFeatures(precomputed_features,
                                                    visual_degrees=8,  # doesn't matter, features are already computed
                                                    )
@@ -212,7 +210,7 @@ def test_Rajalingham2018public(self):
         ('dicarlo.Sanghavi2020.IT-pls', approx(.611347, abs=.015)),
     ])
     def test_Sanghavi2020(self, benchmark, expected):
-        self.run_test(benchmark=benchmark, file='alexnet-sanghavi2020-features.12.pkl', expected=expected)
+        self.run_test(benchmark=benchmark, file='alexnet-sanghavi2020-features.12.nc', expected=expected)
 
     @pytest.mark.memory_intense
     @pytest.mark.slow
@@ -221,7 +219,7 @@ def test_Sanghavi2020(self, benchmark, expected):
         ('dicarlo.SanghaviJozwik2020.IT-pls', approx(.590543, abs=.005)),
     ])
     def test_SanghaviJozwik2020(self, benchmark, expected):
-        self.run_test(benchmark=benchmark, file='alexnet-sanghavijozwik2020-features.12.pkl', expected=expected)
+        self.run_test(benchmark=benchmark, file='alexnet-sanghavijozwik2020-features.12.nc', expected=expected)
 
     @pytest.mark.memory_intense
     @pytest.mark.parametrize('benchmark, expected', [
@@ -229,7 +227,7 @@ def test_SanghaviJozwik2020(self, benchmark, expected):
         ('dicarlo.SanghaviMurty2020.IT-pls', approx(.53006, abs=.015)),
     ])
     def test_SanghaviMurty2020(self, benchmark, expected):
-        self.run_test(benchmark=benchmark, file='alexnet-sanghavimurty2020-features.12.pkl', expected=expected)
+        self.run_test(benchmark=benchmark, file='alexnet-sanghavimurty2020-features.12.nc', expected=expected)
 
     @pytest.mark.memory_intense
     @pytest.mark.slow
@@ -237,7 +235,7 @@ def test_SanghaviMurty2020(self, benchmark, expected):
         ('dicarlo.Rajalingham2020.IT-pls', approx(.147549, abs=.01)),
     ])
     def test_Rajalingham2020(self, benchmark, expected):
-        self.run_test(benchmark=benchmark, file='alexnet-rajalingham2020-features.12.pkl', expected=expected)
+        self.run_test(benchmark=benchmark, file='alexnet-rajalingham2020-features.12.nc', expected=expected)
 
 
 class TestVisualDegrees:
diff --git a/tests/test_benchmarks/test_rajalingham2018.py b/tests/test_benchmarks/test_rajalingham2018.py
index b3da8adf1..dae2e8c0f 100644
--- a/tests/test_benchmarks/test_rajalingham2018.py
+++ b/tests/test_benchmarks/test_rajalingham2018.py
@@ -1,7 +1,9 @@
 import numpy as np
 import os
+from pathlib import Path
 
 import pandas as pd
+import xarray as xr
 import pytest
 from pytest import approx
 
@@ -25,9 +27,8 @@ def test_ceiling(self):
                              ])
     def test_precomputed(self, model, expected_score):
         benchmark = DicarloRajalingham2018I2n()
-        probabilities = pd.read_pickle(os.path.join(os.path.dirname(__file__), '..', 'test_metrics',
-                                                    f'{model}-probabilities.pkl'))['data']
-        probabilities = BehavioralAssembly(probabilities)
+        probabilities = Path(__file__).parent.parent / 'test_metrics' / f'{model}-probabilities.nc'
+        probabilities = BehavioralAssembly(xr.load_dataarray(probabilities))
         candidate = PrecomputedProbabilities(probabilities)
         score = benchmark(candidate)
         assert score.raw.sel(aggregation='center') == approx(expected_score, abs=.005)
diff --git a/tests/test_metrics/alexnet-probabilities.nc b/tests/test_metrics/alexnet-probabilities.nc
new file mode 100644
index 000000000..fc5111922
Binary files /dev/null and b/tests/test_metrics/alexnet-probabilities.nc differ
diff --git a/tests/test_metrics/alexnet-probabilities.pkl b/tests/test_metrics/alexnet-probabilities.pkl
deleted file mode 100644
index 6dac86ad6..000000000
Binary files a/tests/test_metrics/alexnet-probabilities.pkl and /dev/null differ
diff --git a/tests/test_metrics/resnet18-probabilities.nc b/tests/test_metrics/resnet18-probabilities.nc
new file mode 100644
index 000000000..0e732d630
Binary files /dev/null and b/tests/test_metrics/resnet18-probabilities.nc differ
diff --git a/tests/test_metrics/resnet18-probabilities.pkl b/tests/test_metrics/resnet18-probabilities.pkl
deleted file mode 100644
index c18976c0d..000000000
Binary files a/tests/test_metrics/resnet18-probabilities.pkl and /dev/null differ
diff --git a/tests/test_metrics/resnet34-probabilities.nc b/tests/test_metrics/resnet34-probabilities.nc
new file mode 100644
index 000000000..2b157ec03
Binary files /dev/null and b/tests/test_metrics/resnet34-probabilities.nc differ
diff --git a/tests/test_metrics/resnet34-probabilities.pkl b/tests/test_metrics/resnet34-probabilities.pkl
deleted file mode 100644
index b5c34059c..000000000
Binary files a/tests/test_metrics/resnet34-probabilities.pkl and /dev/null differ
diff --git a/tests/test_metrics/test_behavior.py b/tests/test_metrics/test_behavior.py
index e75dc9bee..61ce4c7a6 100644
--- a/tests/test_metrics/test_behavior.py
+++ b/tests/test_metrics/test_behavior.py
@@ -1,6 +1,8 @@
 import os
+from pathlib import Path
 
 import pandas as pd
+import xarray as xr
 import pytest
 from pytest import approx
 
@@ -20,9 +22,8 @@ class TestI2N:
     def test_model(self, model, expected_score):
         # assemblies
         objectome = load_assembly()
-        probabilities = pd.read_pickle(os.path.join(os.path.dirname(__file__),
-                                                    f'{model}-probabilities.pkl'))['data']
-        probabilities = BehavioralAssembly(probabilities)
+        probabilities = Path(__file__).parent / f'{model}-probabilities.nc'
+        probabilities = BehavioralAssembly(xr.load_dataarray(probabilities))
         # metric
         i2n = I2n()
         score = i2n(probabilities, objectome)
diff --git a/tests/test_metrics/test_transformations.py b/tests/test_metrics/test_transformations.py
index 74384eab0..ea73b9787 100644
--- a/tests/test_metrics/test_transformations.py
+++ b/tests/test_metrics/test_transformations.py
@@ -154,14 +154,19 @@ def test_no_expand_raw_level(self):
         class RawMetricPlaceholder(Metric):
             def __call__(self, assembly, *args, **kwargs):
                 result = Score([assembly.values[0]], dims=['dim'])
-                raw = result.copy()
-                raw['dim_id'] = 'dim', [assembly.values[1]]
-                raw['division_coord'] = 'dim', [assembly.values[2]]
+                raw = Score(result.copy(), coords={
+                    'dim_id': ('dim', [assembly.values[1]]),
+                    'division_coord': ('dim', [assembly.values[2]])
+                })
                 result.attrs['raw'] = raw
                 return result
 
         metric = RawMetricPlaceholder()
         result = transformation(assembly, apply=metric)
+        assert result.dims == ("division_coord", "dim")
         assert hasattr(result, 'raw')
-        assert 'division_coord' not in result.raw  # no dimension
+        assert result.raw.dims == ("dim",)
+        assert 'division_coord' not in result.raw.dims  # no dimension
         assert hasattr(result.raw, 'division_coord')  # but a level
+        assert result.raw["dim"].variable.level_names == ["dim_id", "division_coord"]
+