From 6ffe1a44d233d57aadee2624acc60cc27dc1c3ec Mon Sep 17 00:00:00 2001 From: Katharina Eggensperger Date: Mon, 10 Aug 2020 17:25:51 +0200 Subject: [PATCH 1/6] init new version 0.0.4dev (#33) --- changelog.md | 2 ++ hpolib/__version__.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/changelog.md b/changelog.md index bb2083f4..016fb84d 100644 --- a/changelog.md +++ b/changelog.md @@ -1,3 +1,5 @@ +# 0.0.4 + # 0.0.3 * improve forwarding exceptions in containerized benchmarks * allow to set debug level with env variable diff --git a/hpolib/__version__.py b/hpolib/__version__.py index ffcc925a..3d8c7a09 100644 --- a/hpolib/__version__.py +++ b/hpolib/__version__.py @@ -1 +1 @@ -__version__ = '0.0.3' +__version__ = '0.0.4dev' From 63e2193d37ad1ca6c1ea1782c92c78c2ff0b1b26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20M=C3=BCller?= Date: Thu, 20 Aug 2020 12:24:08 +0200 Subject: [PATCH 2/6] Fix tests + More tests (#36) * MOVE Lock files to cache folder + FIX test cases Part 1 * TRAVIS should crash if an error occurs. * FIX test cases Part 2 * FIX test cases Part 3 * ADD codecov.yml * ADD test cases * REMOVE test_data_manager.py Uploaded a file for a future pull request --- .travis.yml | 1 + changelog.md | 3 +- ci_scripts/script.sh | 18 +++++-- codecov.yml | 25 +++++++++ hpolib/container/client_abstract_benchmark.py | 4 +- tests/test_abstract_benchmark.py | 12 +++++ tests/test_check_configuration.py | 49 +++++++++++++++-- tests/test_server.py | 45 ++++++++++++---- tests/test_tabular_benchmarks.py | 44 +++++++++------- tests/test_utils.py | 52 +++++++++++++++++++ tests/test_whitebox.py | 18 ++++--- 11 files changed, 219 insertions(+), 52 deletions(-) create mode 100644 codecov.yml create mode 100644 tests/test_abstract_benchmark.py create mode 100644 tests/test_utils.py diff --git a/.travis.yml b/.travis.yml index c39530b7..e9b744a5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,6 +12,7 @@ jobs: env: - RUN_TESTS="true" - USE_SINGULARITY="true" + - RUN_CODECOV="true" - python: "3.7" env: RUN_CODESTYLE="true" - python: "3.7" diff --git a/changelog.md b/changelog.md index 016fb84d..2fdfec6d 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,6 @@ # 0.0.4 - + * improve test coverage + # 0.0.3 * improve forwarding exceptions in containerized benchmarks * allow to set debug level with env variable diff --git a/ci_scripts/script.sh b/ci_scripts/script.sh index b14da464..bde23b1a 100644 --- a/ci_scripts/script.sh +++ b/ci_scripts/script.sh @@ -1,14 +1,24 @@ #!/usr/bin/env sh if [[ "$RUN_TESTS" == "true" ]]; then - if [[ "$USE_SINGULARITY" == "true" ]]; then - echo "Run tests with singularity support" - # Create the coverage report for the singularity example, since it covers more tests. + if [[ "$RUN_CODECOV" == "true" ]]; then + echo "Run tests with code coverage" pytest -sv --cov=hpolib tests/ + exit_code=$? + + echo "Run code coverage" codecov else - echo "Run tests without singularity support" + echo "Run tests without code coverage" pytest -sv tests/ + exit_code=$? + fi + + if [[ "$exit_code" -eq 0 ]]; then + echo "All test have passed." + else + echo "Some Tests have failed." 
+ exit 1 fi fi diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 00000000..0d4c689b --- /dev/null +++ b/codecov.yml @@ -0,0 +1,25 @@ +codecov: + require_ci_to_pass: yes + +coverage: + precision: 2 + round: down + range: "70...100" + +parsers: + gcov: + branch_detection: + conditional: yes + loop: yes + method: no + macro: no + +comment: + layout: "reach,diff,flags,tree" + behavior: default + require_changes: no + +ignore: + - "hpolib/benchmarks" + - "hpolib/util/dependencies.py" + - "hpolib/util/example_utils.py" \ No newline at end of file diff --git a/hpolib/container/client_abstract_benchmark.py b/hpolib/container/client_abstract_benchmark.py index 9f0c368c..1cf9b874 100644 --- a/hpolib/container/client_abstract_benchmark.py +++ b/hpolib/container/client_abstract_benchmark.py @@ -118,10 +118,8 @@ def _setup(self, benchmark_name: str, container_name: str, container_source: Opt # split-brain situation between any process that had been waiting for the deleted file, and any process # that attempted to lock the file after it had been deleted." # See: https://docs.openstack.org/oslo.concurrency/latest/admin/index.html - # We limit the number of lock file by having at most one lock file per benchmark and storing them in the - # temp folder, so that they are automatically deleted after reboot. @lockutils.synchronized('not_thread_process_safe', external=True, - lock_path=f'{self.config.socket_dir}/lock_{container_name}', delay=5) + lock_path=f'{self.config.cache_dir}/lock_{container_name}', delay=5) def download_container(container_dir, container_name, container_source): if not (container_dir / container_name).exists(): logger.debug('Going to pull the container from an online source.') diff --git a/tests/test_abstract_benchmark.py b/tests/test_abstract_benchmark.py new file mode 100644 index 00000000..8f2b0a9b --- /dev/null +++ b/tests/test_abstract_benchmark.py @@ -0,0 +1,12 @@ +import pytest + +from hpolib.abstract_benchmark import AbstractBenchmark + +with pytest.raises(NotImplementedError): + AbstractBenchmark.get_configuration_space() + +with pytest.raises(NotImplementedError): + AbstractBenchmark.get_fidelity_space() + +with pytest.raises(NotImplementedError): + AbstractBenchmark.get_meta_information() diff --git a/tests/test_check_configuration.py b/tests/test_check_configuration.py index 4a8412b6..e8f8038e 100644 --- a/tests/test_check_configuration.py +++ b/tests/test_check_configuration.py @@ -1,8 +1,8 @@ -from typing import Union, Tuple, Dict, List import unittest +from typing import Dict, Union import numpy as np - +import pytest from ConfigSpace import ConfigurationSpace from ConfigSpace import UniformFloatHyperparameter, UniformIntegerHyperparameter, \ CategoricalHyperparameter @@ -42,9 +42,13 @@ def tmp(_, configuration: Dict, **kwargs): @AbstractBenchmark._check_configuration def tmp(_, configuration: Dict, **kwargs): return configuration + tmp(self=self.foo, configuration={"flt": 0.2, "cat": 1, "itg": 1}) tmp(self=self.foo, configuration=self.foo.configuration_space.sample_configuration()) + self.assertRaises(Exception, tmp, {"self": self.foo, "configuration": {"flt": 0.2, "cat": 1}}) + self.assertRaises(Exception, tmp, {"self": self.foo, "configuration": {"flt": 10000, "cat": 500000}}) + self.assertRaises(Exception, tmp, {"self": self.foo, "configuration": [0.2, 1]}) def test_fidel_decorator(self): @AbstractBenchmark._check_fidelity @@ -68,8 +72,43 @@ def tmp(_, configuration: Dict, fidelity: Dict, **kwargs): 
configuration=self.foo.configuration_space.sample_configuration()) self.assertEqual(ret, sample_fidel) - try: + with pytest.raises(ValueError): tmp(self=self.foo, configuration=self.foo.configuration_space.sample_configuration(), f_cat=1) - except ValueError as e: - self.assertEqual(e.__str__(), "Fidelity parameter f_cat should not be part of kwargs") + + self.assertRaises(Exception, tmp, {"self": self.foo, + "configuration": self.foo.configuration_space.sample_configuration(), + "fidelity": {"f_cat": "b"}}) + self.assertRaises(TypeError, tmp, {"self": self.foo, + "configuration": self.foo.configuration_space.sample_configuration(), + "fidelity": [0.1]}) + + +class TestCheckUnittest2(unittest.TestCase): + + def setUp(self): + class Dummy(): + configuration_space = ConfigurationSpace(seed=1) + hp1 = UniformFloatHyperparameter("hp1", lower=0.0, upper=0.5, default_value=0.5) + hp2 = UniformFloatHyperparameter("hp2", lower=1.0, upper=1.5, default_value=1.5) + hp3 = UniformFloatHyperparameter("hp3", lower=2.0, upper=2.5, default_value=2.5) + configuration_space.add_hyperparameters([hp1, hp2, hp3]) + + self.foo = Dummy() + + def test_config_decorator(self): + @AbstractBenchmark._check_configuration + def tmp(_, configuration: Union[Dict, np.ndarray], **kwargs): + return configuration + + tmp(self=self.foo, configuration=np.array([0.25, 1.25, 2.25])) + + @AbstractBenchmark._configuration_as_array + def tmp(_, configuration: Dict, **kwargs): + return configuration + + result = tmp(self=self.foo, configuration=self.foo.configuration_space.get_default_configuration()) + assert np.array_equal(result, np.array([0.5, 1.5, 2.5])) + + result = tmp(self=self.foo, configuration=np.array([0.5, 1.5, 2.5])) + assert np.array_equal(result, np.array([0.5, 1.5, 2.5])) diff --git a/tests/test_server.py b/tests/test_server.py index e0fce902..7dc66324 100644 --- a/tests/test_server.py +++ b/tests/test_server.py @@ -1,17 +1,20 @@ -import pytest -import os -import logging import importlib +import logging +import os + + +def set_log_level(debug): + os.environ['HPOLIB_DEBUG'] = 'true' if debug else 'false' + import hpolib.container.client_abstract_benchmark as client + importlib.reload(client) def test_debug_env_variable_1(): - os.environ['HPOLIB_DEBUG'] = 'false' + set_log_level(False) from hpolib.container.client_abstract_benchmark import log_level assert log_level == logging.INFO - os.environ['HPOLIB_DEBUG'] = 'true' - import hpolib.container.client_abstract_benchmark as client - importlib.reload(client) + set_log_level(True) from hpolib.container.client_abstract_benchmark import log_level assert log_level == logging.DEBUG @@ -19,10 +22,7 @@ def test_debug_env_variable_1(): def test_debug_container(): # Test if the debug option works. Check if some debug output from the server is visible. 
- os.environ['HPOLIB_DEBUG'] = 'true' - - import hpolib.container.client_abstract_benchmark as client - importlib.reload(client) + set_log_level(True) from hpolib.container.benchmarks.ml.xgboost_benchmark import XGBoostBenchmark as Benchmark from hpolib.util.openml_data_manager import get_openmlcc18_taskids @@ -35,6 +35,29 @@ def test_debug_container(): cs = b.get_configuration_space() assert cs is not None + set_log_level(False) + + +def test_benchmark_encoder(): + from enum import Enum + class test_enum(Enum): + obj = 'name' + + def __str__(self): + return str(self.value) + + from hpolib.container.server_abstract_benchmark import BenchmarkEncoder + import json + import numpy as np + + enum_obj = test_enum.obj + enum_obj_str = json.dumps(enum_obj, cls=BenchmarkEncoder) + assert enum_obj_str == '"name"' + + array = np.array([1, 2, 3, 4]) + array_str = json.dumps(array, cls=BenchmarkEncoder) + assert array_str == '[1, 2, 3, 4]' + if __name__ == '__main__': test_debug_env_variable_1() diff --git a/tests/test_tabular_benchmarks.py b/tests/test_tabular_benchmarks.py index d78a8260..b89eebe0 100644 --- a/tests/test_tabular_benchmarks.py +++ b/tests/test_tabular_benchmarks.py @@ -1,8 +1,13 @@ +import logging + import pytest -import logging logging.basicConfig(level=logging.DEBUG) +import os + +os.environ['HPOLIB_DEBUG'] = 'true' + from hpolib.container.benchmarks.nas.tabular_benchmarks import SliceLocalizationBenchmark, \ NavalPropulsionBenchmark, ParkinsonsTelemonitoringBenchmark, ProteinStructureBenchmark @@ -10,43 +15,42 @@ def setup(): benchmark = SliceLocalizationBenchmark() default_config = benchmark.get_configuration_space(seed=1).get_default_configuration() - return default_config def test_tabular_benchmark_wrong_input(): - container_source, default_config = setup() + default_config = setup() benchmark = SliceLocalizationBenchmark(rng=1) - with pytest.raises(AssertionError): - benchmark.objective_function(configuration=default_config, budget=0) + with pytest.raises(ValueError): + benchmark.objective_function(configuration=default_config, fidelity=dict(budget=0)) with pytest.raises(ValueError): - benchmark.objective_function(configuration=default_config, budget=1, run_index=0.1) + benchmark.objective_function(configuration=default_config, fidelity=dict(budget=1), run_index=0.1) with pytest.raises(AssertionError): - benchmark.objective_function(configuration=default_config, budget=1, run_index=[4]) + benchmark.objective_function(configuration=default_config, fidelity=dict(budget=1), run_index=[4]) with pytest.raises(AssertionError): - benchmark.objective_function(configuration=default_config, budget=1, run_index=[]) + benchmark.objective_function(configuration=default_config, fidelity=dict(budget=1), run_index=[]) with pytest.raises(AssertionError): - benchmark.objective_function(configuration=default_config, budget=1, run_index=-1) + benchmark.objective_function(configuration=default_config, fidelity=dict(budget=1), run_index=-1) with pytest.raises(AssertionError): - benchmark.objective_function(configuration=default_config, budget=1, run_index=4) + benchmark.objective_function(configuration=default_config, fidelity=dict(budget=1), run_index=4) - with pytest.raises(AssertionError): - benchmark.objective_function(configuration=default_config, budget=101, run_index=3) + with pytest.raises(ValueError): + benchmark.objective_function(configuration=default_config, fidelity=dict(budget=101), run_index=3) benchmark = None def test_slice_benchmark(): - container_source, default_config = setup() + 
default_config = setup() benchmark = SliceLocalizationBenchmark(rng=1) - result = benchmark.objective_function(configuration=default_config, budget=1, run_index=[0, 1, 2, 3]) + result = benchmark.objective_function(configuration=default_config, fidelity=dict(budget=1), run_index=[0, 1, 2, 3]) mean = 0.01828 assert result['function_value'] == pytest.approx(mean, abs=0.0001) @@ -65,10 +69,10 @@ def test_slice_benchmark(): def test_naval_benchmark(): - container_source, default_config = setup() + default_config = setup() benchmark = NavalPropulsionBenchmark(rng=1) - result = benchmark.objective_function(configuration=default_config, budget=1, run_index=[0, 1, 2, 3]) + result = benchmark.objective_function(configuration=default_config, fidelity=dict(budget=1), run_index=[0, 1, 2, 3]) mean = 0.8928 assert result['function_value'] == pytest.approx(mean, abs=0.0001) @@ -87,10 +91,10 @@ def test_naval_benchmark(): def test_protein_benchmark(): - container_source, default_config = setup() + default_config = setup() benchmark = ProteinStructureBenchmark(rng=1) - result = benchmark.objective_function(configuration=default_config, budget=1, run_index=[0, 1, 2, 3]) + result = benchmark.objective_function(configuration=default_config, fidelity=dict(budget=1), run_index=[0, 1, 2, 3]) mean = 0.4474 assert result['function_value'] == pytest.approx(mean, abs=0.0001) @@ -109,10 +113,10 @@ def test_protein_benchmark(): def test_parkinson_benchmark(): - container_source, default_config = setup() + default_config = setup() benchmark = ParkinsonsTelemonitoringBenchmark(rng=1) - result = benchmark.objective_function(configuration=default_config, budget=1, run_index=[0, 1, 2, 3]) + result = benchmark.objective_function(configuration=default_config, fidelity=dict(budget=1), run_index=[0, 1, 2, 3]) mean = 0.7425 assert result['function_value'] == pytest.approx(mean, abs=0.0001) diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 00000000..141a0e8c --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,52 @@ +import numpy as np +import pytest + + +def test_example_utils(): + from hpolib.util.example_utils import get_travis_settings + + res = get_travis_settings('smac') + assert res['runcount-limit'] == 5 + + res = get_travis_settings('bohb') + assert res['max_budget'] == 2 + + with pytest.raises(ValueError): + res = get_travis_settings('unknown') + + +def test_example_utils_2(): + from hpolib.util.example_utils import set_env_variables_to_use_only_one_core + import os + set_env_variables_to_use_only_one_core() + assert os.environ['OMP_NUM_THREADS'] == '1' + assert os.environ['OPENBLAS_NUM_THREADS'] == '1' + assert os.environ['MKL_NUM_THREADS'] == '1' + assert os.environ['VECLIB_MAXIMUM_THREADS'] == '1' + assert os.environ['NUMEXPR_NUM_THREADS'] == '1' + assert os.environ['NUMEXPR_MAX_THREADS'] == '1' + + +def test_rng_helper(): + from hpolib.util.rng_helper import _cast_int_to_random_state + + rng = np.random.RandomState(123) + + with pytest.raises(ValueError): + _cast_int_to_random_state('not_an_int') + + assert rng == _cast_int_to_random_state(rng) + + rng = np.random.RandomState(123) + assert rng.random() == _cast_int_to_random_state(123).random() + + +def test_rng_helper_2(): + from hpolib.util.rng_helper import get_rng + + rng = get_rng(None, None) + assert isinstance(rng, np.random.RandomState) + + old_rng = np.random.RandomState(123) + rng = get_rng(None, old_rng) + assert rng == old_rng diff --git a/tests/test_whitebox.py b/tests/test_whitebox.py index 0bb852c5..d22d1155 100644 --- 
a/tests/test_whitebox.py +++ b/tests/test_whitebox.py @@ -1,12 +1,13 @@ +import logging + import numpy as np import pytest -from time import time -import logging logging.basicConfig(level=logging.DEBUG) try: import Pyro4 + skip_container_test = False except ImportError: skip_container_test = True @@ -23,11 +24,12 @@ def test_whitebox_without_container_xgb(): n_estimator = 32 subsample = 1 - result_dict = b.objective_function(configuration, n_estimators=n_estimator, subsample=subsample, rng=0) + result_dict = b.objective_function(configuration, fidelity=dict(n_estimators=n_estimator, subsample=subsample), + rng=0) valid_loss = result_dict['function_value'] - train_loss = result_dict['train_loss'] + train_loss = result_dict['info']['train_loss'] - result_dict = b.objective_function_test(configuration, n_estimators=n_estimator, rng=0) + result_dict = b.objective_function_test(configuration, fidelity=dict(n_estimators=n_estimator), rng=0) test_loss = result_dict['function_value'] assert np.isclose(train_loss, 0.1071, atol=0.001) @@ -49,10 +51,10 @@ def test_whitebox_with_container(): n_estimator = 32 subsample = 1 - result_dict = b.objective_function(configuration, n_estimators=n_estimator, subsample=subsample) + result_dict = b.objective_function(configuration, fidelity=dict(n_estimators=n_estimator, subsample=subsample)) valid_loss = result_dict['function_value'] - train_loss = result_dict['train_loss'] - result_dict = b.objective_function_test(configuration, n_estimators=n_estimator) + train_loss = result_dict['info']['train_loss'] + result_dict = b.objective_function_test(configuration, fidelity=dict(n_estimators=n_estimator)) test_loss = result_dict['function_value'] print(train_loss, valid_loss, test_loss) From 8368a4dc8513cebfb81eb8f931e6e11d7d17d26b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20M=C3=BCller?= Date: Wed, 26 Aug 2020 16:35:19 +0200 Subject: [PATCH 3/6] Nasbench201 (#37) * MOVE Lock files to cache folder + FIX test cases Part 1 * TRAVIS should crash if an error occurs. * FIX test cases Part 2 * FIX test cases Part 3 * ADD codecov.yml * ADD test cases * REMOVE test_data_manager.py Uploaded a file for a future pull request * ADD Nasbench201 * FIX paths in recipe * ADD small util function to activate / deactivate debugging on container side * NASBench201 test should partially use the container * NASBench201 takes List as data seed. This is because we send the payload as json to the container. And tuples are casted to lists by json. * FIX ConfigSpace is not deterministic via container The CS was not deterministic since the seed was not set after receiving the cs string from the contianer. * FIX dependencies for nasbench201 * FIX dependencies for nasbench201 * ADD Singularity dependencies to standard dependencies We need it everywhere, so I've removed the singularity tag and add the pyro and the oslo dependency to the default requirements. * [skip travis] Update README.md * [skip travis] nasbench_201 returns now the sum of the time per epochs as cost - Imporve the doc strings. Explain the cost computation + fix a mistake in the used identifiers - the nasbench201 returns now the exact sum of time needed to train a network for x epochs. - the evaluation costs in the objective function is now train cost + eval cost. So that it is equal to the return value of the objective_function_test. - Minor improvement in the logging. Moved the logging from the benchmark inti in the correct datamanager class. 
* [skip travis] Update test cases * Trigger Travis Co-authored-by: Katharina Eggensperger --- README.md | 4 +- .../container/tabular_benchmark_example.py | 2 +- examples/container/xgboost_with_container.py | 2 +- examples/w_optimizer/cartpole_bohb.py | 2 +- examples/w_optimizer/cartpole_hyperband.py | 2 +- .../w_optimizer/cartpole_succesive_halving.py | 2 +- extra_requirements/nasbench201.json | 3 - extra_requirements/singularity.json | 3 - hpolib/benchmarks/nas/nasbench_201.py | 443 ++++++++++++++++++ .../container/benchmarks/nas/nasbench_201.py | 4 - hpolib/container/client_abstract_benchmark.py | 6 +- hpolib/container/recipes/Singularity.template | 2 +- .../recipes/ml/Singularity.XGBoostBenchmark | 2 +- .../recipes/nas/Singularity.TabularBenchmarks | 2 +- .../recipes/nas/Singularity.nasbench_101 | 2 +- .../recipes/nas/Singularity.nasbench_201 | 36 +- .../container/recipes/rl/Singularity.Cartpole | 2 +- .../recipes/rl/Singularity.learnaBenchmark | 2 +- hpolib/util/container_utils.py | 23 + hpolib/util/data_manager.py | 109 ++++- hpolib/util/openml_data_manager.py | 3 +- requirements.txt | 4 +- tests/test_data_manager.py | 59 +++ tests/test_nasbench_201.py | 147 ++++++ tests/test_utils.py | 10 + 25 files changed, 810 insertions(+), 66 deletions(-) delete mode 100644 extra_requirements/nasbench201.json delete mode 100644 extra_requirements/singularity.json create mode 100644 hpolib/benchmarks/nas/nasbench_201.py create mode 100644 hpolib/util/container_utils.py create mode 100644 tests/test_data_manager.py create mode 100644 tests/test_nasbench_201.py diff --git a/README.md b/README.md index 1a6d456c..587babc4 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ HPOlib2 is a library for hyperparameter optimization and black-box optimization ## In 4 lines of code -Run a random configuration within a singularity container (requires singularity dependencies: `pip install .[singularity]`) +Run a random configuration within a singularity container ```python from hpolib.container.benchmarks.ml.xgboost_benchmark import XGBoostBenchmark b = XGBoostBenchmark(task_id=167149, container_source='library://phmueller/automl', rng=1) @@ -49,7 +49,7 @@ run the following: ``` git clone https://github.com/automl/HPOlib2.git cd HPOlib2 -pip install .[singularity] +pip install . ``` **Note:** This does not install *singularity (version 3.5)*. Please follow the steps described here: [user-guide](https://sylabs.io/guides/3.5/user-guide/quick_start.html#quick-installation-steps). diff --git a/examples/container/tabular_benchmark_example.py b/examples/container/tabular_benchmark_example.py index 1dbf592c..7ad5c8d0 100644 --- a/examples/container/tabular_benchmark_example.py +++ b/examples/container/tabular_benchmark_example.py @@ -11,7 +11,7 @@ hpolibrc. A second call, will first look into the data directory, if the container is already available, so it will not be downloaded twice. -Please install the necessary dependencies via ``pip install .[singularity]`` and singularity (v3.5). +Please install the necessary dependencies via ``pip install .`` and singularity (v3.5). 
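A condensed usage sketch of the containerized call pattern that the README snippet above and these examples rely on (it mirrors the whitebox tests further down in this patch; the task_id, container_source and fidelity values are illustrative only, not prescribed by the patch):

```python
# Minimal sketch: running a containerized benchmark after `pip install .`
# plus a local singularity (v3.5) installation. task_id / container_source
# are taken from the README example above and are illustrative only.
from hpolib.container.benchmarks.ml.xgboost_benchmark import XGBoostBenchmark

b = XGBoostBenchmark(task_id=167149, container_source='library://phmueller/automl', rng=1)
config = b.get_configuration_space(seed=1).sample_configuration()

# As of this patch series, fidelities are passed as a dict; the result dict
# exposes the validation loss as 'function_value' and extra metrics in 'info'.
result = b.objective_function(configuration=config,
                              fidelity=dict(n_estimators=32, subsample=1),
                              rng=1)
print(result['function_value'], result['info']['train_loss'])
```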
https://sylabs.io/guides/3.5/user-guide/quick_start.html#quick-installation-steps """ diff --git a/examples/container/xgboost_with_container.py b/examples/container/xgboost_with_container.py index 08034776..a5964f91 100644 --- a/examples/container/xgboost_with_container.py +++ b/examples/container/xgboost_with_container.py @@ -12,7 +12,7 @@ https://sylabs.io/guides/3.1/user-guide/quick_start.html#quick-installation-steps Furthermore, make sure to install the right dependencies for the hpolib via: -``pip3 install .[singularity]``. +``pip3 install .``. """ import argparse diff --git a/examples/w_optimizer/cartpole_bohb.py b/examples/w_optimizer/cartpole_bohb.py index 5be491bc..f9128d31 100644 --- a/examples/w_optimizer/cartpole_bohb.py +++ b/examples/w_optimizer/cartpole_bohb.py @@ -5,7 +5,7 @@ This example shows the usage of an Hyperparameter Tuner, such as BOHB on the cartpole benchmark. BOHB is a combination of Bayesian optimization and Hyperband. -Please install the necessary dependencies via ``pip install .[singularity]`` and singularity (v3.5). +Please install the necessary dependencies via ``pip install .`` and singularity (v3.5). https://sylabs.io/guides/3.5/user-guide/quick_start.html#quick-installation-steps """ diff --git a/examples/w_optimizer/cartpole_hyperband.py b/examples/w_optimizer/cartpole_hyperband.py index b816eb6f..8cbd465e 100644 --- a/examples/w_optimizer/cartpole_hyperband.py +++ b/examples/w_optimizer/cartpole_hyperband.py @@ -5,7 +5,7 @@ This example shows the usage of an Hyperparameter Tuner, such as SMAC on the cartpole benchmark. We use SMAC with Hyperband. -Please install the necessary dependencies via ``pip install .[singularity]`` and singularity (v3.5). +Please install the necessary dependencies via ``pip install .`` and singularity (v3.5). https://sylabs.io/guides/3.5/user-guide/quick_start.html#quick-installation-steps """ import logging diff --git a/examples/w_optimizer/cartpole_succesive_halving.py b/examples/w_optimizer/cartpole_succesive_halving.py index 94b02bb2..0ea255b6 100644 --- a/examples/w_optimizer/cartpole_succesive_halving.py +++ b/examples/w_optimizer/cartpole_succesive_halving.py @@ -5,7 +5,7 @@ This example shows the usage of an Hyperparameter Tuner, such as SMAC on the cartpole benchmark. We use SMAC with Successive Halving. -Please install the necessary dependencies via ``pip install .[singularity]`` and singularity (v3.5). +Please install the necessary dependencies via ``pip install .`` and singularity (v3.5). 
https://sylabs.io/guides/3.5/user-guide/quick_start.html#quick-installation-steps """ import logging diff --git a/extra_requirements/nasbench201.json b/extra_requirements/nasbench201.json deleted file mode 100644 index 9bbc53de..00000000 --- a/extra_requirements/nasbench201.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "nasbench_201": ["torch>=1.2.0,<=1.5.1","torchvision>=0.4.0"] -} \ No newline at end of file diff --git a/extra_requirements/singularity.json b/extra_requirements/singularity.json deleted file mode 100644 index 4ab3056f..00000000 --- a/extra_requirements/singularity.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "singularity": ["Pyro4==4.80","oslo.concurrency>=4.2.0"] -} \ No newline at end of file diff --git a/hpolib/benchmarks/nas/nasbench_201.py b/hpolib/benchmarks/nas/nasbench_201.py new file mode 100644 index 00000000..92443bb8 --- /dev/null +++ b/hpolib/benchmarks/nas/nasbench_201.py @@ -0,0 +1,443 @@ +""" +Interface to Benchmarks with Nas-Bench 201 + +https://github.com/D-X-Y/AutoDL-Projects/blob/master/docs/NAS-Bench-201.md + +How to use this benchmark: +-------------------------- + +We recommend using the containerized version of this benchmark. +If you want to use this benchmark locally (without running it via the corresponding container), +you need to perform the following steps. + + +1. Clone and install +==================== +Since the data is downloaded automatically, you dont have to do anything but installing the hpolib. + +Recommend: ``Python >= 3.6.0`` + +``` +cd /path/to/HPOlib3 +pip install . +``` + +For more info about the nasbench201, please have a look at +https://github.com/D-X-Y/AutoDL-Projects/blob/master/docs/NAS-Bench-201.md +""" +import logging +from typing import Union, Dict, List, Text, Tuple +from copy import deepcopy + +import ConfigSpace as CS +import numpy as np + +import hpolib.util.rng_helper as rng_helper +from hpolib.abstract_benchmark import AbstractBenchmark +from hpolib.util.data_manager import NASBench_201Data + +__version__ = '0.0.1' +MAX_NODES = 4 + +logger = logging.getLogger('NASBENCH201') + + +class NasBench201BaseBenchmark(AbstractBenchmark): + def __init__(self, dataset: str, + rng: Union[np.random.RandomState, int, None] = None, **kwargs): + """ + Benchmark interface to the NASBench201 Benchmarks. The NASBench201 contains + results for architectures on 4 different data sets. + + We have split the "api" file from NASBench201 in separate files per data set. + The original "api" file contains all data sets, but loading this single file took too much RAM. + + We recommend to not call this base class directly but using the correct subclass below. + + The parameter ``dataset`` indicates which data set was used for training. + + For each data set the metrics + 'train_acc1es', 'train_losses', 'train_times', 'eval_acc1es', 'eval_times', 'eval_losses' are available. + However, the data sets report them on different data splits (train, train + valid, test, valid or test+valid). + + Note: + - The parameter epoch is 0 indexed! + - In the original data, the training splits are always marked with the key 'train' but they use different + identifiers to refer to the available evaluation splits. We report them also in the table below. + + The table in the following shows the mapping from data set and metric to used split. + + |-------------------|---------------|-----------------------------------| + | Data set | train_* | eval_* (key in orig. 
data) | + |-------------------|---------------|-----------------------------------| + | 'cifar10-valid' | train | valid (x-valid) | + | 'cifar10' | train + valid | test (ori-test) | + | 'cifar100' | train | valid + test (ori-test) | + | 'ImageNet16-120' | train | valid + test (ori-test) | + |-------------------|---------------|-----------------------------------| + + + Some further remarks: + - cifar10-valid is trained on the train split and tested on the validation split. + - cifar10 is trained on the train *and* validation split and tested on the test split. + - The train metrics are dictionaries with epochs (e.g. 0, 1, 2) as key and the metric as value. + The evaluation metrics, however, have as key the identifiers, e.g. ori-test@0, with 0 indicating the epoch. + Also, each data set (except for cifar10) reports values for all 200 epochs for a metric on the specified + split (see first table) and a single value on the 200th epoch for the other splits. + Table 3 shows the available identifiers for each data set. + + |-------------------|------------------------------| + | Data set | eval*: values for epochs | + |-------------------|------------------------------| + | 'cifar10-valid' | x-valid: 0-199 | + | | ori-test: 199 | + | 'cifar10' | ori-test: 0-199 | + | 'cifar100' | ori-test: 0-199 | + | | x-valid: 199 | + | | x-test: 199 | + | 'ImageNet16-120' | ori-test: 0-199 | + | | x-valid: 199 | + | | x-test: 199 | + |-------------------|------------------------------| + + Parameters + ---------- + dataset : str + One of cifar10-valid, cifar10, cifar100, ImageNet16-120. + rng : np.random.RandomState, int, None + Random seed for the benchmark's random state. + """ + + super(NasBench201BaseBenchmark, self).__init__(rng=rng) + + data_manager = NASBench_201Data(dataset=dataset) + + self.data = data_manager.load() + + self.config_to_structure = NasBench201BaseBenchmark.config_to_structure_func(max_nodes=MAX_NODES) + + @AbstractBenchmark._configuration_as_dict + @AbstractBenchmark._check_configuration + @AbstractBenchmark._check_fidelity + def objective_function(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + data_seed: Union[List, Tuple, int, None] = (777, 888, 999), + **kwargs) -> Dict: + """ + Objective function for the NASBench201 benchmark. + This functions sends a query to NASBench201 and evaluates the configuration. + As already explained in the class definition, different data sets are trained on different splits. For example + cifar10 is trained on the train and validation split and tested on the test split. Therefore, different entries + are returned from the NASBench201 result. + + Overview of the used splits for training and testing and which are returned in the objective_function and + which in the objective_function_test. 
+ + |-------------------|-----------------------|---------------------------| + | | Returned by | Returned by | + | | objective_function | objective_function_test | + | Data set | train_* | eval_* | + |-------------------|-----------------------|---------------------------| + | 'cifar10-valid' | train | valid | + | 'cifar10' | train + valid | test | + | 'cifar100' | train | valid + test | + | 'ImageNet16-120' | train | valid + test | + |-------------------|-----------------------|---------------------------| + + Legend: + * = [losses, acc1es, times] + + Parameters + ---------- + configuration + fidelity: Dict, None + epoch: int - Values: [0, 199] + Number of epochs an architecture was trained. + Note: the number of epoch is 0 indexed! (Results after the first epoch: epoch = 0) + + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + data_seed : List, Tuple, None, int + The nasbench_201 benchmark include for each run 3 different seeds: 777, 888, 999. + The user can specify which seed to use. If more than one seed is given, the results are averaged + across the seeds but then the time needed for training is the sum of the costs per seed. + Note: + For some architectures (configurations) no run was available. We've set missing values to an + available value from another seed. Therefore, it is possible that run results are exactly the same for + different seeds. + + kwargs + + Returns + ------- + Dict - + function_value : training precision + cost : time to train the network + info : Dict + train_precision : float + train_losses : float + train_cost : float + Time needed to train the network for 'epoch' many epochs. If more than one seed is given, + this field is the sum of the training time per network + eval_precision : float + eval_losses : float + eval_cost : float + Time needed to train the network for 'epoch many epochs plus the time to evaluate the network on the + evaluation split. 
If more than one seed is given, this field is the sum of the eval cost per network + fidelity : Dict + used fidelities in this evaluation + """ + + # Check if the data set seeds are valid + assert isinstance(data_seed, List) or isinstance(data_seed, Tuple) or isinstance(data_seed, int), \ + f'data seed has unknown data type {type(data_seed)}, but should be tuple or int (777,888,999)' + + if isinstance(data_seed, List): + data_seed = tuple(data_seed) + + if isinstance(data_seed, int): + data_seed = (data_seed, ) + + assert len(set(data_seed) - {777, 888, 999}) == 0,\ + f'data seed can only contain the elements 777, 888, 999, but was {data_seed}' + + self.rng = rng_helper.get_rng(rng) + + structure = self.config_to_structure(configuration) + structure_str = structure.tostr() + + epoch = fidelity['epoch'] + + train_accuracies = [self.data[(seed, 'train_acc1es')][structure_str][epoch] for seed in data_seed] + train_losses = [self.data[(seed, 'train_losses')][structure_str][epoch] for seed in data_seed] + train_times = [np.sum(self.data[(seed, 'train_times')][structure_str][:epoch + 1]) for seed in data_seed] + + eval_accuracies = [self.data[(seed, 'eval_acc1es')][structure_str][epoch] for seed in data_seed] + eval_losses = [self.data[(seed, 'eval_losses')][structure_str][epoch] for seed in data_seed] + eval_times = [np.sum(self.data[(seed, 'eval_times')][structure_str][:epoch + 1]) for seed in data_seed] + + return {'function_value': float(100 - np.mean(train_accuracies)), + 'cost': float(np.sum(train_times)), + 'info': {'train_precision': float(100 - np.mean(train_accuracies)), + 'train_losses': float(np.mean(train_losses)), + 'train_cost': float(np.sum(train_times)), + 'eval_precision': float(100 - np.mean(eval_accuracies)), + 'eval_losses': float(np.mean(eval_losses)), + 'eval_cost': float(np.sum(train_times)) + float(np.sum(eval_times)), + 'fidelity': fidelity + } + } + + @AbstractBenchmark._configuration_as_dict + @AbstractBenchmark._check_configuration + def objective_function_test(self, configuration: Union[CS.Configuration, Dict], + fidelity: Union[Dict, None] = None, + rng: Union[np.random.RandomState, int, None] = None, + **kwargs) -> Dict: + """ + Get the validated results from the NASBench201. Runs a given configuration on the largest budget (here: 199). + The test function uses all data set seeds (777, 888, 999). + + See also :py:meth:`~hpolib.benchmarks.nas.nasbench_201.objective_function` + + Parameters + ---------- + configuration + fidelity: Dict, None + epoch: int - Values: [0, 199] + Number of epochs an architecture was trained. + Note: the number of epoch is 0 indexed! (Results after the first epoch: epoch = 0) + + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None + Random seed to use in the benchmark. + + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. 
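A condensed sketch of the query pattern this docstring describes, mirroring the test cases added later in this patch (the chosen dataset class and the sampled configuration are illustrative):

```python
# Minimal sketch of querying the local NAS-Bench-201 benchmark; the data
# files are downloaded automatically by the data manager on first use.
from hpolib.benchmarks.nas.nasbench_201 import Cifar10ValidNasBench201Benchmark

b = Cifar10ValidNasBench201Benchmark(rng=0)
config = b.get_configuration_space(seed=0).sample_configuration()

# epoch is 0-indexed, so epoch=199 queries the result after all 200 epochs.
result = b.objective_function(configuration=config,
                              fidelity={'epoch': 199},
                              data_seed=(777, 888, 999))
print(result['function_value'])           # 100 - mean train accuracy
print(result['info']['eval_precision'])   # what objective_function_test reports

# objective_function_test swaps in the evaluation split and its cost.
result_test = b.objective_function_test(configuration=config, fidelity={'epoch': 199})
print(result_test['function_value'], result_test['cost'])
```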
+ + kwargs + + Returns + ------- + Dict - + function_value : evaluation precision + cost : time to the network + time to validate + info : Dict + train_precision + train_losses + train_cost + eval_precision + eval_losses + eval_cost + fidelity : used fidelities in this evaluation + """ + + # The result dict should contain already all necessary information -> Just swap the function value from valid + # to test and the corresponding time cost + result = self.objective_function(configuration=configuration, fidelity=fidelity, data_seed=(777, 888, 999), + rng=rng, **kwargs) + result['function_value'] = result['info']['eval_precision'] + result['cost'] = result['info']['eval_cost'] + return result + + @staticmethod + def config_to_structure_func(max_nodes: int): + # From https://github.com/D-X-Y/AutoDL-Projects/blob/master/exps/algos/BOHB.py + # Author: https://github.com/D-X-Y [Xuanyi.Dong@student.uts.edu.au] + def config_to_structure(config): + genotypes = [] + for i in range(1, max_nodes): + x_list = [] + for j in range(i): + node_str = f'{i}<-{j}' + op_name = config[node_str] + x_list.append((op_name, j)) + genotypes.append(tuple(x_list)) + return NasBench201BaseBenchmark._Structure(genotypes) + return config_to_structure + + @staticmethod + def get_search_spaces(xtype: str, name: str) -> List[Text]: + # obtain the search space, i.e., a dict mapping the operation name into a python-function for this op + # From https://github.com/D-X-Y/AutoDL-Projects/blob/master/lib/models/__init__.py + # Author: https://github.com/D-X-Y [Xuanyi.Dong@student.uts.edu.au] + if xtype == 'cell': + NAS_BENCH_201 = ['none', 'skip_connect', 'nor_conv_1x1', 'nor_conv_3x3', 'avg_pool_3x3'] + SearchSpaceNames = {'nas-bench-201': NAS_BENCH_201} + assert name in SearchSpaceNames, 'invalid name [{:}] in {:}'.format(name, SearchSpaceNames.keys()) + return SearchSpaceNames[name] + else: + raise ValueError('invalid search-space type is {:}'.format(xtype)) + + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Return the CS representation of the search space. + From https://github.com/D-X-Y/AutoDL-Projects/blob/master/exps/algos/BOHB.py + Author: https://github.com/D-X-Y [Xuanyi.Dong@student.uts.edu.au] + + Parameters + ---------- + seed : int, None + Random seed for the configuration space. + + Returns + ------- + CS.ConfigurationSpace - + Containing the benchmark's hyperparameter + """ + seed = seed if seed is not None else np.random.randint(1, 100000) + cs = CS.ConfigurationSpace(seed=seed) + + search_space = NasBench201BaseBenchmark.get_search_spaces('cell', 'nas-bench-201') + hps = [CS.CategoricalHyperparameter(f'{i}<-{j}', search_space) for i in range(1, MAX_NODES) for j in range(i)] + cs.add_hyperparameters(hps) + return cs + + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters for + the NAS Benchmark 201. + + Fidelities: + - epoch: int + The loss / accuracy at `epoch`. Can be from 0 to 199. 
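To illustrate the encoding handled by config_to_structure above: with MAX_NODES = 4 the configuration space consists of six categorical hyperparameters, one per cell edge 'i<-j', each selecting one of the five operations. The operation choices below are one arbitrary example, not a recommended architecture:

```python
# Illustrative configuration dict for the cell search space (MAX_NODES = 4).
config = {
    '1<-0': 'nor_conv_3x3',
    '2<-0': 'skip_connect', '2<-1': 'none',
    '3<-0': 'avg_pool_3x3', '3<-1': 'nor_conv_1x1', '3<-2': 'skip_connect',
}

# config_to_structure(config).tostr() turns this into the architecture string
# used as the lookup key in the benchmark data, here:
# |nor_conv_3x3~0|+|skip_connect~0|none~1|+|avg_pool_3x3~0|nor_conv_1x1~1|skip_connect~2|
```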
+ + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + seed = seed if seed is not None else np.random.randint(1, 100000) + fidel_space = CS.ConfigurationSpace(seed=seed) + + fidel_space.add_hyperparameters([ + CS.UniformIntegerHyperparameter('epoch', lower=0, upper=199, default_value=199) + ]) + + return fidel_space + + @staticmethod + def get_meta_information() -> Dict: + """ Returns the meta information for the benchmark """ + return {'name': 'NAS-Bench-201', + 'references': ['Xuanyi Dong, Yi Yang', + 'NAS-Bench-201: Extending the Scope of Reproducible Neural Architecture Search', + 'https://openreview.net/forum?id=HJxyZkBKDr', + 'https://github.com/D-X-Y/AutoDL-Projects'], + } + + class _Structure: + def __init__(self, genotype): + assert isinstance(genotype, list) or isinstance(genotype, tuple), 'invalid class of genotype : {:}'.format( + type(genotype)) + self.node_num = len(genotype) + 1 + self.nodes = [] + self.node_N = [] + for idx, node_info in enumerate(genotype): + assert isinstance(node_info, list) or isinstance(node_info, + tuple), 'invalid class of node_info : {:}'.format( + type(node_info)) + assert len(node_info) >= 1, 'invalid length : {:}'.format(len(node_info)) + for node_in in node_info: + assert isinstance(node_in, list) or isinstance(node_in, + tuple), 'invalid class of in-node : {:}'.format( + type(node_in)) + assert len(node_in) == 2 and node_in[1] <= idx, 'invalid in-node : {:}'.format(node_in) + self.node_N.append(len(node_info)) + self.nodes.append(tuple(deepcopy(node_info))) + + def tostr(self): + strings = [] + for node_info in self.nodes: + string = '|'.join([x[0] + '~{:}'.format(x[1]) for x in node_info]) + string = '|{:}|'.format(string) + strings.append(string) + return '+'.join(strings) + + def __repr__(self): + return ( + '{name}({node_num} nodes with {node_info})'.format(name=self.__class__.__name__, node_info=self.tostr(), + **self.__dict__)) + + def __len__(self): + return len(self.nodes) + 1 + + def __getitem__(self, index): + return self.nodes[index] + + +class Cifar10NasBench201Benchmark(NasBench201BaseBenchmark): + + def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(Cifar10NasBench201Benchmark, self).__init__(dataset='cifar10', rng=rng, **kwargs) + + +class Cifar10ValidNasBench201Benchmark(NasBench201BaseBenchmark): + + def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(Cifar10ValidNasBench201Benchmark, self).__init__(dataset='cifar10-valid', rng=rng, **kwargs) + + +class Cifar100NasBench201Benchmark(NasBench201BaseBenchmark): + + def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(Cifar100NasBench201Benchmark, self).__init__(dataset='cifar100', rng=rng, **kwargs) + + +class ImageNetNasBench201Benchmark(NasBench201BaseBenchmark): + + def __init__(self, rng: Union[np.random.RandomState, int, None] = None, **kwargs): + super(ImageNetNasBench201Benchmark, self).__init__(dataset='ImageNet16-120', rng=rng, **kwargs) diff --git a/hpolib/container/benchmarks/nas/nasbench_201.py b/hpolib/container/benchmarks/nas/nasbench_201.py index 94fdd645..befc7473 100644 --- a/hpolib/container/benchmarks/nas/nasbench_201.py +++ b/hpolib/container/benchmarks/nas/nasbench_201.py @@ -8,7 +8,6 @@ class Cifar10NasBench201Benchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): - kwargs['data_path'] = 
'/home/data/NAS-Bench-201-v1_1-096897_cifar10.pth' kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'Cifar10NasBench201Benchmark') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_201') super(Cifar10NasBench201Benchmark, self).__init__(**kwargs) @@ -16,7 +15,6 @@ def __init__(self, **kwargs): class Cifar10ValidNasBench201Benchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): - kwargs['data_path'] = '/home/data/NAS-Bench-201-v1_1-096897_cifar10-valid.pth' kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'Cifar10ValidNasBench201Benchmark') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_201') super(Cifar10ValidNasBench201Benchmark, self).__init__(**kwargs) @@ -24,7 +22,6 @@ def __init__(self, **kwargs): class Cifar100NasBench201Benchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): - kwargs['data_path'] = '/home/data/NAS-Bench-201-v1_1-096897_cifar100.pth' kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'Cifar100NasBench201Benchmark') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_201') super(Cifar100NasBench201Benchmark, self).__init__(**kwargs) @@ -32,7 +29,6 @@ def __init__(self, **kwargs): class ImageNetNasBench201Benchmark(AbstractBenchmarkClient): def __init__(self, **kwargs): - kwargs['data_path'] = '/home/data/NAS-Bench-201-v1_1-096897_ImageNet16-120.pth' kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'ImageNetNasBench201Benchmark') kwargs['container_name'] = kwargs.get('container_name', 'nasbench_201') super(ImageNetNasBench201Benchmark, self).__init__(**kwargs) diff --git a/hpolib/container/client_abstract_benchmark.py b/hpolib/container/client_abstract_benchmark.py index 1cf9b874..d4de1e92 100644 --- a/hpolib/container/client_abstract_benchmark.py +++ b/hpolib/container/client_abstract_benchmark.py @@ -342,7 +342,11 @@ def get_configuration_space(self, seed: Union[int, None] = None) -> CS.Configura seed_dict = json.dumps(seed_dict, indent=None) logger.debug(f'Client: seed_dict {seed_dict}') json_str = self.benchmark.get_configuration_space(seed_dict) - return csjson.read(json_str) + + config_space = csjson.read(json_str) + if seed is not None: + config_space.seed(seed) + return config_space def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationSpace: """ diff --git a/hpolib/container/recipes/Singularity.template b/hpolib/container/recipes/Singularity.template index 06130997..702a7d0f 100644 --- a/hpolib/container/recipes/Singularity.template +++ b/hpolib/container/recipes/Singularity.template @@ -24,7 +24,7 @@ VERSION v0.0.1 && echo "Please never push a recipe that checks out any other branch than development or master" \ && git checkout development \ && echo "Here you can install extra requirements additional to singularity" \ - && pip install .[singularity] \ + && pip install . 
\ && echo "Please don't touch the following lines" && cd / \ && mkdir /var/lib/hpolib2/ \ diff --git a/hpolib/container/recipes/ml/Singularity.XGBoostBenchmark b/hpolib/container/recipes/ml/Singularity.XGBoostBenchmark index 67be25a4..c089fd50 100644 --- a/hpolib/container/recipes/ml/Singularity.XGBoostBenchmark +++ b/hpolib/container/recipes/ml/Singularity.XGBoostBenchmark @@ -12,7 +12,7 @@ VERSION v0.0.1 && git clone https://github.com/automl/HPOlib2.git \ && cd HPOlib2 \ && git checkout master \ - && pip install .[singularity,xgboost] \ + && pip install .[xgboost] \ && cd / \ && mkdir /var/lib/hpolib2/ \ && chmod -R 777 /var/lib/hpolib2/ diff --git a/hpolib/container/recipes/nas/Singularity.TabularBenchmarks b/hpolib/container/recipes/nas/Singularity.TabularBenchmarks index 7056ba16..64970ee0 100644 --- a/hpolib/container/recipes/nas/Singularity.TabularBenchmarks +++ b/hpolib/container/recipes/nas/Singularity.TabularBenchmarks @@ -19,7 +19,7 @@ VERSION v0.0.1 && git clone https://github.com/automl/HPOlib2.git \ && cd HPOlib2 \ && git checkout master \ - && pip install .[singularity,tabular_benchmarks] \ + && pip install .[tabular_benchmarks] \ && cd / \ && mkdir /var/lib/hpolib2/ \ && chmod -R 777 /var/lib/hpolib2/ diff --git a/hpolib/container/recipes/nas/Singularity.nasbench_101 b/hpolib/container/recipes/nas/Singularity.nasbench_101 index 967bb063..3033d0a4 100644 --- a/hpolib/container/recipes/nas/Singularity.nasbench_101 +++ b/hpolib/container/recipes/nas/Singularity.nasbench_101 @@ -19,7 +19,7 @@ VERSION v0.0.1 && git clone https://github.com/automl/HPOlib2.git \ && cd HPOlib2 \ && git checkout master \ - && pip install .[singularity,nasbench_101] \ + && pip install .[nasbench_101] \ && cd / \ && mkdir /var/lib/hpolib2/ \ && chmod -R 777 /var/lib/hpolib2/ diff --git a/hpolib/container/recipes/nas/Singularity.nasbench_201 b/hpolib/container/recipes/nas/Singularity.nasbench_201 index 5ce3644c..b01ff97a 100644 --- a/hpolib/container/recipes/nas/Singularity.nasbench_201 +++ b/hpolib/container/recipes/nas/Singularity.nasbench_201 @@ -5,45 +5,15 @@ From: python:3.7-slim MAINTAINER muelleph@cs.uni-freiburg.de VERSION v0.0.1 -%environment - export PYTHONPATH=${PYTHONPATH}:/home/AutoDL-Projects/lib/ - %post apt update -y apt install build-essential git wget curl -y - # Cifar10: https://drive.google.com/file/d/1VHxlF4SaS04tEkRZAslXJbkVTZJPN4hx/view?usp=sharing - # Cifar10-valid: https://drive.google.com/file/d/12GuhUvTGHNPGVRthprP7vHBL8Mcp-uGN/view?usp=sharing - # Cifar100: https://drive.google.com/file/d/1h6vke2_LjbtM7UJ2KEAUu6m0k8Uj6na7/view?usp=sharing - # ImageNet16-120: https://drive.google.com/file/d/1bI5-_2YEWOGA0Ug9VGkGD1ismrKZQNVS/view?usp=sharing - - cd /home \ - && mkdir data \ - && cd data \ - && curl -c ./cookie -s -L "https://drive.google.com/uc?export=download&id=1VHxlF4SaS04tEkRZAslXJbkVTZJPN4hx" > /dev/null \ - && curl -Lb ./cookie "http://drive.google.com/uc?export=download&confirm=`awk '/download/ {print $NF}' \ - ./cookie`&id=1VHxlF4SaS04tEkRZAslXJbkVTZJPN4hx" -o NAS-Bench-201-v1_1-096897_cifar10.pth \ - && rm ./cookie \ - && curl -c ./cookie -s -L "https://drive.google.com/uc?export=download&id=12GuhUvTGHNPGVRthprP7vHBL8Mcp-uGN" > /dev/null \ - && curl -Lb ./cookie "http://drive.google.com/uc?export=download&confirm=`awk '/download/ {print $NF}' \ - ./cookie`&id=12GuhUvTGHNPGVRthprP7vHBL8Mcp-uGN" -o NAS-Bench-201-v1_1-096897_cifar10-valid.pth \ - && rm ./cookie \ - && curl -c ./cookie -s -L 
"https://drive.google.com/uc?export=download&id=1h6vke2_LjbtM7UJ2KEAUu6m0k8Uj6na7" > /dev/null \ - && curl -Lb ./cookie "http://drive.google.com/uc?export=download&confirm=`awk '/download/ {print $NF}' \ - ./cookie`&id=1h6vke2_LjbtM7UJ2KEAUu6m0k8Uj6na7" -o NAS-Bench-201-v1_1-096897_cifar100.pth \ - && rm ./cookie \ - && curl -c ./cookie -s -L "https://drive.google.com/uc?export=download&id=1bI5-_2YEWOGA0Ug9VGkGD1ismrKZQNVS" > /dev/null \ - && curl -Lb ./cookie "http://drive.google.com/uc?export=download&confirm=`awk '/download/ {print $NF}' \ - ./cookie`&id=1bI5-_2YEWOGA0Ug9VGkGD1ismrKZQNVS" -o NAS-Bench-201-v1_1-096897_ImageNet16-120.pth \ - && rm ./cookie - cd /home \ - && git clone https://github.com/D-X-Y/AutoDL-Projects.git \ - && pip install git+https://github.com/D-X-Y/NAS-Bench-201 \ - && git clone https://github.com/PhMueller/HPOlib3.git \ + && git clone https://github.com/automl/HPOlib3.git \ && cd HPOlib3 \ - && git checkout singularity \ - && pip install .[singularity,nasbench_201] \ + && git checkout development \ + && pip install . \ && cd / \ && mkdir /var/lib/hpolib3/ \ && chmod -R 777 /var/lib/hpolib3/ diff --git a/hpolib/container/recipes/rl/Singularity.Cartpole b/hpolib/container/recipes/rl/Singularity.Cartpole index 9214b65a..569b76fb 100644 --- a/hpolib/container/recipes/rl/Singularity.Cartpole +++ b/hpolib/container/recipes/rl/Singularity.Cartpole @@ -14,7 +14,7 @@ VERSION v0.0.1 && git clone https://github.com/automl/HPOlib2.git \ && cd HPOlib2 \ && git checkout master \ - && pip install .[singularity,cartpole] \ + && pip install .[cartpole] \ && cd / \ && mkdir /var/lib/hpolib2/ \ && chmod -R 777 /var/lib/hpolib2/ diff --git a/hpolib/container/recipes/rl/Singularity.learnaBenchmark b/hpolib/container/recipes/rl/Singularity.learnaBenchmark index c3dd3f07..42fcb1ad 100644 --- a/hpolib/container/recipes/rl/Singularity.learnaBenchmark +++ b/hpolib/container/recipes/rl/Singularity.learnaBenchmark @@ -27,7 +27,7 @@ VERSION v0.0.1 && git clone https://github.com/automl/HPOlib2.git \ && cd HPOlib2 \ && git checkout master \ - && ../learna/thirdparty/miniconda/miniconda/envs/learna/bin/python -m pip install .[singularity] \ + && ../learna/thirdparty/miniconda/miniconda/envs/learna/bin/python -m pip install . \ && cd / \ && mkdir /var/lib/hpolib2/ \ && chmod -R 777 /var/lib/hpolib2/ diff --git a/hpolib/util/container_utils.py b/hpolib/util/container_utils.py new file mode 100644 index 00000000..7a554275 --- /dev/null +++ b/hpolib/util/container_utils.py @@ -0,0 +1,23 @@ +import os +import importlib + + +def __reload_module(): + """ + The env variable which enables the debug level is read in during the import of the client module. + Reloading the module, re-reads the env variable and therefore changes the level. + """ + import hpolib.container.client_abstract_benchmark as client + importlib.reload(client) + + +def enable_container_debug(): + """ Sets the environment variable "HPOLIB_DEBUG" to true. The container checks this variable and if set to true, + enables debugging on the container side. 
""" + os.environ['HPOLIB_DEBUG'] = 'true' + __reload_module() + + +def disable_container_debug(): + os.environ['HPOLIB_DEBUG'] = 'false' + __reload_module() diff --git a/hpolib/util/data_manager.py b/hpolib/util/data_manager.py index a9cfef47..c2ca2d4d 100644 --- a/hpolib/util/data_manager.py +++ b/hpolib/util/data_manager.py @@ -14,12 +14,19 @@ import logging import pickle import tarfile +from io import BytesIO from pathlib import Path -from typing import Tuple -from urllib.request import urlretrieve +from typing import Tuple, Dict +from urllib.request import urlretrieve, urlopen +from zipfile import ZipFile +from time import time import numpy as np -from scipy.io import loadmat + +try: + from oslo_concurrency import lockutils +except ImportError: + print("oslo_concurrency not installed, can't download datasets for nasbench201 (not needed for containers)") import hpolib @@ -251,7 +258,7 @@ def load(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, xs = [] ys = [] for j in range(5): - fh = open(self.__load_data(filename=f'data_batch_{j+1}'), "rb") + fh = open(self.__load_data(filename=f'data_batch_{j + 1}'), "rb") d = pickle.load(fh, encoding='latin1') fh.close() x = d['data'] @@ -381,8 +388,7 @@ def load(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, X -= X.mean(axis=1)[:, np.newaxis] X = X.reshape(data_shape) - return X_train, y_train[:, 0], X_valid, y_valid[:, 0], \ - X_test, y_test[:, 0] + return X_train, y_train[:, 0], X_valid, y_valid[:, 0], X_test, y_test[:, 0] def __load_data(self, filename_train: str, filename_test: str) -> Tuple[np.ndarray, np.ndarray, @@ -413,6 +419,7 @@ def __load_x_y(file_name): else: self.logger.debug(f"Load data {save_fl}") + from scipy.io import loadmat data = loadmat(save_fl) x = data['X'].T @@ -423,3 +430,93 @@ def __load_x_y(file_name): X_test, y_test = __load_x_y(filename_test) return X_train, y_train, X_test, y_test + + +class NASBench_201Data(DataManager): + """ Download the necessary files for the nasbench201 benchmark. The benchmark has a data file for every pair of + data set (cifar10, cifar10-valid, cifar100, ImageNet16-120) + seed (777,888,999) + metric (train_acc1es, train_times, train_losses, eval_acc1es, eval_times, eval_losses) + + Download for each data set the all corresponding data files. + The files should be hosted on automl.org. + + For more information about the metric, have a look in the benchmark docstrings. + """ + + def __init__(self, dataset: str): + """ + Init the NasbenchData Manager. 
+ + Parameters + ---------- + dataset : str + One of cifar10, cifar10-valid, cifar100, ImageNet16-120 + """ + assert dataset in ['cifar10', 'cifar10-valid', 'cifar100', 'ImageNet16-120'] + + super(NASBench_201Data, self).__init__() + + self.files = self.get_files_per_dataset(dataset) + self._save_dir = hpolib.config_file.data_dir / "nasbench_201" + self._url_source = 'https://www.automl.org/wp-content/uploads/2020/08/nasbench_201_data_v1.1.zip' + self.data = {} + + self.create_save_directory(self._save_dir) + + @staticmethod + def get_seeds_metrics(): + from itertools import product + seeds = [777, 888, 999] + metrics = NASBench_201Data.get_metrics() + return product(seeds, metrics) + + @staticmethod + def get_metrics(): + return ['train_acc1es', 'train_losses', 'train_times', + 'eval_acc1es', 'eval_times', 'eval_losses'] + + @staticmethod + def get_files_per_dataset(dataset): + seeds_metrics = NASBench_201Data.get_seeds_metrics() + files = [f'nb201_{dataset}_{seed}_{metric}.pkl' for seed, metric in seeds_metrics] + return files + + @lockutils.synchronized('not_thread_process_safe', external=True, + lock_path=f'{hpolib.config_file.cache_dir}/lock_nasbench_201_data', delay=0.5) + def _download(self): + # Check if data is already downloaded. If a single file is missing, we have to download the complete zip again. + # Use a file lock to ensure that no two processes try to download the same files at the same time. + file_is_missing = not all([(self._save_dir / 'data' / file).exists() for file in self.files]) + + if not file_is_missing: + self.logger.debug('NasBench201DataManager: Data already downloaded') + else: + self.logger.info(f'NasBench201DataManager: Start downloading data from {self._url_source} ' + f'to {self._save_dir}') + + with urlopen(self._url_source) as zip_archive: + with ZipFile(BytesIO(zip_archive.read())) as zip_file: + zip_file.extractall(self._save_dir) + + def _load(self) -> Dict: + """ Load the data from the file system """ + import pickle + data = {} + for (seed, metric_name), file in zip(NASBench_201Data.get_seeds_metrics(), self.files): + with (self._save_dir / 'data' / file).open('rb') as fh: + metric = pickle.load(fh) + data[(seed, metric_name)] = metric + + return data + + def load(self) -> Dict: + """ Loads data from data directory as defined in config_file.data_directory""" + self.logger.debug('NasBench201DataManager: Starting to load data') + t = time() + + self._download() + self.data = self._load() + self.logger.info(f'NasBench201DataManager: Data successfully loaded after {time() - t:.2f}') + + return self.data diff --git a/hpolib/util/openml_data_manager.py b/hpolib/util/openml_data_manager.py index f486556f..20b2b622 100644 --- a/hpolib/util/openml_data_manager.py +++ b/hpolib/util/openml_data_manager.py @@ -25,8 +25,7 @@ import hpolib -from hpolib.util.data_manager import HoldoutDataManager, \ - CrossvalidationDataManager +from hpolib.util.data_manager import HoldoutDataManager, CrossvalidationDataManager from hpolib.util.rng_helper import get_rng diff --git a/requirements.txt b/requirements.txt index 418cd5b8..73ae9818 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ scipy>=1.4.1 numpy>=1.18.1 -ConfigSpace>=0.4.12 \ No newline at end of file +ConfigSpace>=0.4.12 +Pyro4==4.80 +oslo.concurrency>=4.2.0 \ No newline at end of file diff --git a/tests/test_data_manager.py b/tests/test_data_manager.py new file mode 100644 index 00000000..2219935f --- /dev/null +++ b/tests/test_data_manager.py @@ -0,0 +1,59 @@ +import pytest +import hpolib 
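A short sketch of using the new data manager directly (the benchmark classes above instantiate it internally); the access pattern in the final comment mirrors objective_function:

```python
# Minimal sketch: the first call downloads and extracts the zip archive into
# <data_dir>/nasbench_201, later calls reuse the extracted pickle files.
from hpolib.util.data_manager import NASBench_201Data

dm = NASBench_201Data(dataset='cifar100')
data = dm.load()

# `data` is keyed by (seed, metric); per-architecture values are indexed by
# the architecture string and the 0-indexed epoch, e.g.
#   acc = data[(777, 'train_acc1es')][architecture_string][epoch]
print(len(data))  # 3 seeds x 6 metrics = 18 entries
```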
+from hpolib.util.data_manager import NASBench_201Data +import shutil +from multiprocessing import Pool + + +def test_nasbench_201_load_thread_safe(): + shutil.rmtree(hpolib.config_file.data_dir / "nasbench_201", ignore_errors=True) + function = lambda: NASBench_201Data(dataset='cifar100').load() + with Pool(3) as pool: + pool.map(function, []) + + +def test_nasbench_201_get_files(): + + files = NASBench_201Data.get_files_per_dataset(dataset='cifar10') + assert len(files) == 18 + assert all([file.startswith('nb201_cifar10') for file in files]) + + +def test_nasbench_201_get_metrics(): + + metrics = NASBench_201Data.get_metrics() + assert metrics == ['train_acc1es', 'train_losses', 'train_times', + 'eval_acc1es', 'eval_times', 'eval_losses'] + + +def test_nasbench_201_init(): + + data_manager = NASBench_201Data(dataset='cifar100') + assert len(data_manager.files) == 18 + assert all([file.startswith('nb201_cifar10') for file in data_manager.files]) + + with pytest.raises(AssertionError): + NASBench_201Data(dataset='Non_existing_dataset') + + assert data_manager._save_dir == hpolib.config_file.data_dir / "nasbench_201" + assert data_manager._save_dir.exists() + + +def test_nasbench_201_load(): + + shutil.rmtree(hpolib.config_file.data_dir / "nasbench_201", ignore_errors=True) + + data_manager = NASBench_201Data(dataset='cifar100') + data = data_manager.load() + + assert len(data) == len(list(NASBench_201Data.get_seeds_metrics())) + assert len(data) == 3 * len(NASBench_201Data.get_metrics()) + assert (hpolib.config_file.data_dir / "nasbench_201").exists() + assert len(list((hpolib.config_file.data_dir / "nasbench_201" / "data").glob('*.pkl'))) == 72 + assert not (hpolib.config_file.data_dir / "nasbench_201_data_v1.1.zip").exists() + + data_manager.data = None + + data_manager = NASBench_201Data(dataset='cifar100') + data = data_manager.load() + assert len(data) == 3 * len(NASBench_201Data.get_metrics()) diff --git a/tests/test_nasbench_201.py b/tests/test_nasbench_201.py new file mode 100644 index 00000000..3cdfb431 --- /dev/null +++ b/tests/test_nasbench_201.py @@ -0,0 +1,147 @@ +import logging +logging.basicConfig(level=logging.DEBUG) + +import pytest + +from hpolib.benchmarks.nas.nasbench_201 import ImageNetNasBench201Benchmark, Cifar100NasBench201Benchmark, \ + Cifar10ValidNasBench201Benchmark, Cifar10NasBench201Benchmark as Cifar10NasBench201BenchmarkContainer + +from hpolib.benchmarks.nas.nasbench_201 import Cifar10NasBench201Benchmark +from hpolib.util.container_utils import disable_container_debug, enable_container_debug + + +@pytest.fixture(scope='module') +def enable_debug(): + enable_container_debug() + yield + disable_container_debug() + + +def test_nasbench201_cifar10valid(enable_debug): + + b = Cifar10ValidNasBench201Benchmark(rng=0) + + cs = b.get_configuration_space(seed=0) + config = cs.sample_configuration() + fidelity = {'epoch': 199} + + result = b.objective_function(configuration=config, fidelity=fidelity, data_seed=(777, 888, 999)) + + assert result['function_value'] == pytest.approx(0.411, abs=0.1) + assert result['cost'] == pytest.approx(6650.88, abs=0.1) + assert result['info']['train_precision'] == result['function_value'] + assert result['info']['train_cost'] == result['cost'] + + +def test_nasbench201_cifar100(enable_debug): + b = Cifar100NasBench201Benchmark(rng=0) + + cs = b.get_configuration_space(seed=0) + config = cs.sample_configuration() + fidelity = {'epoch': 199} + + result = b.objective_function(configuration=config, fidelity=fidelity, data_seed=(777, 888, 
999)) + + assert result is not None + assert result['function_value'] == pytest.approx(7.8259, abs=0.1) + assert result['cost'] == pytest.approx(13301.76, abs=0.1) + assert result['info']['train_precision'] == result['function_value'] + assert result['info']['train_cost'] == result['cost'] + + +def test_nasbench201_Image(enable_debug): + b = ImageNetNasBench201Benchmark(rng=0) + + cs = b.get_configuration_space(seed=0) + config = cs.sample_configuration() + fidelity = {'epoch': 199} + + result = b.objective_function(configuration=config, fidelity=fidelity, data_seed=(777, 888, 999)) + + assert result is not None + assert result['function_value'] == pytest.approx(62.858, abs=0.1) + assert result['cost'] == pytest.approx(40357.56, abs=0.1) + assert result['info']['train_precision'] == result['function_value'] + assert result['info']['train_cost'] == result['cost'] + + +def test_nasbench201_cifar10_container(enable_debug): + b = Cifar10NasBench201BenchmarkContainer(rng=0) + + cs = b.get_configuration_space(seed=0) + config = cs.sample_configuration() + fidelity = {'epoch': 199} + + result = b.objective_function(configuration=config, fidelity=fidelity, data_seed=(777, 888, 999)) + + assert result is not None + assert result['function_value'] == pytest.approx(0.5019, abs=0.1) + assert result['cost'] == pytest.approx(13301.76, abs=0.1) + assert result['info']['train_precision'] == result['function_value'] + + +def test_nasbench201_cifar10(): + b = Cifar10NasBench201Benchmark(rng=0) + + assert b.data is not None + assert len(b.get_meta_information()) == 2 + + cs = b.get_configuration_space(seed=0) + config = cs.sample_configuration() + fidelity = {'epoch': 199} + + result = b.objective_function(configuration=config, fidelity=fidelity, data_seed=(777, 888, 999)) + + assert result is not None + assert result['function_value'] == pytest.approx(0.5019, abs=0.1) + assert result['cost'] == pytest.approx(13301.76, abs=0.1) + assert result['info']['train_precision'] == result['function_value'] + + result_test = b.objective_function_test(configuration=config, fidelity=fidelity) + assert result['info']['train_precision'] == result_test['info']['train_precision'] + assert result['info']['train_cost'] == result_test['info']['train_cost'] + assert result['info']['train_losses'] == result_test['info']['train_losses'] + assert result['info']['eval_precision'] == result_test['info']['eval_precision'] + assert result['info']['eval_losses'] == result_test['info']['eval_losses'] + assert result['info']['eval_cost'] == result_test['info']['eval_cost'] + + assert result_test['cost'] > result['cost'] + + result_lower = b.objective_function(configuration=config, fidelity={'epoch': 100}, + data_seed=(777, 888, 999)) + assert result['cost'] > result_lower['cost'] + + with pytest.raises(ValueError): + b.objective_function(configuration=config, fidelity={'epoch': 200}, data_seed=0.1) + + with pytest.raises(AssertionError): + b.objective_function(configuration=config, fidelity=fidelity, data_seed=0.1) + + with pytest.raises(AssertionError): + b.objective_function(configuration=config, fidelity=fidelity, data_seed=0) + + with pytest.raises(AssertionError): + b.objective_function(configuration=config, fidelity=fidelity, data_seed=[777, 881]) + + with pytest.raises(AssertionError): + b.objective_function(configuration=config, fidelity=fidelity, data_seed=(777, 881)) + + +def test_nasbench201_fidelity_space(): + fs = Cifar10NasBench201Benchmark(rng=0).get_fidelity_space() + assert len(fs.get_hyperparameters()) == 1 + + +def 
test_nasbench201_config(): + cs = Cifar10NasBench201Benchmark(rng=0).get_configuration_space(seed=0) + c = cs.sample_configuration() + func = Cifar10NasBench201Benchmark.config_to_structure_func(4) + struct = func(c) + + assert struct.__repr__() == '_Structure(4 nodes with |avg_pool_3x3~0|+|none~0|nor_conv_3x3~1|+' \ + '|nor_conv_3x3~0|nor_conv_3x3~1|skip_connect~2|)' + assert len(struct) == 4 + assert struct[0] == (('avg_pool_3x3', 0),) + + struct_str = struct.tostr() + assert struct_str == '|avg_pool_3x3~0|+|none~0|nor_conv_3x3~1|+|nor_conv_3x3~0|nor_conv_3x3~1|skip_connect~2|' diff --git a/tests/test_utils.py b/tests/test_utils.py index 141a0e8c..b33ca02b 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -50,3 +50,13 @@ def test_rng_helper_2(): old_rng = np.random.RandomState(123) rng = get_rng(None, old_rng) assert rng == old_rng + + +def test_debug_level(): + from hpolib.util.container_utils import enable_container_debug, disable_container_debug + import os + enable_container_debug() + assert os.environ['HPOLIB_DEBUG'] == 'true' + + disable_container_debug() + assert os.environ['HPOLIB_DEBUG'] == 'false' From 69cc2e937898715cb447b9ddcd40eccd929092b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philipp=20M=C3=BCller?= Date: Fri, 28 Aug 2020 14:33:33 +0200 Subject: [PATCH 4/6] Svm benchmark (#38) * ADD SVM Benchmark * ADD SVM Benchmark Container * PEP8 + FLAKE8 * [skip travis] Update Requirements for svm and xgb * [skip travis] Fix wrong dependency in recipe * [skip travis] Move the imputer into objective function in xgboost_benchmark.py and svm_benchmark.py - Move the imputer into objective function in xgboost_benchmark.py and svm_benchmark.py - Update svm references * [skip travis] Move the imputer to the pipeline - Move the imputer to the pipeline This requires however that not NaN value is in the categorical data columns. Impute them by using a unsued value (here: the smalles value in this column - 1) They have to be numerical, since all openml values are numerical and sklearn doesn't allow to mix scalar and strings. - Impute in a first step only the categorical then transfrom them to one hot - Use the mean imputer only for the continuous feature - Fix the same behaviiour in the xgboost benchmark * [skip travis] Flake + Pep8 * Tests * [skip travis] Fix error in imputer args * [skip travis] Update Test * Update in the client interface - Receiving the fidelity space did not set the seed. 
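For illustration, a minimal sketch of the impute-then-encode pipeline described in the commit message above, using toy data: the column layout, values, and category list are invented, and `sparse=False` assumes the scikit-learn versions this patch targets.

```python
# Minimal sketch (toy data, names invented) of the two-stage preprocessing described above.
# Stage 1 mean-imputes only the continuous columns and passes the categorical ones through;
# stage 2 one-hot encodes the categoricals and passes the continuous columns through.
# Categorical columns come first so the same boolean mask remains valid after stage 1,
# because ColumnTransformer concatenates its outputs in transformer order.
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

is_cat = np.array([True, False])          # one categorical column, one continuous column
X = np.array([[0.0, 1.5],
              [2.0, np.nan],              # continuous NaN -> replaced by the column mean
              [2.0, 3.5]])
categories = [np.array([0.0, 2.0])]       # every value the categorical column can take

preprocess = Pipeline([
    ('preprocess_impute', ColumnTransformer([
        ('categorical', 'passthrough', is_cat),
        ('continuous', SimpleImputer(strategy='mean'), ~is_cat)])),
    ('preprocess_one_hot', ColumnTransformer([
        ('categorical', OneHotEncoder(categories=categories, sparse=False), is_cat),
        ('continuous', 'passthrough', ~is_cat)])),
])
print(preprocess.fit_transform(X))
# [[1.  0.  1.5]
#  [0.  1.  2.5]
#  [0.  1.  3.5]]
```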
--- extra_requirements/svm.json | 3 + extra_requirements/xgboost.json | 2 +- hpolib/benchmarks/ml/svm_benchmark.py | 329 ++++++++++++++++++ hpolib/benchmarks/ml/xgboost_benchmark.py | 37 +- .../container/benchmarks/ml/svm_benchmark.py | 13 + hpolib/container/client_abstract_benchmark.py | 7 +- .../ml/Singularity.SupportVectorMachine | 21 ++ hpolib/util/openml_data_manager.py | 28 +- tests/test_openml_datamanager.py | 28 ++ tests/test_svm.py | 40 +++ 10 files changed, 487 insertions(+), 21 deletions(-) create mode 100644 extra_requirements/svm.json create mode 100644 hpolib/benchmarks/ml/svm_benchmark.py create mode 100644 hpolib/container/benchmarks/ml/svm_benchmark.py create mode 100644 hpolib/container/recipes/ml/Singularity.SupportVectorMachine create mode 100644 tests/test_openml_datamanager.py create mode 100644 tests/test_svm.py diff --git a/extra_requirements/svm.json b/extra_requirements/svm.json new file mode 100644 index 00000000..80ac0658 --- /dev/null +++ b/extra_requirements/svm.json @@ -0,0 +1,3 @@ +{ + "svm": ["pandas>=0.22.2,<0.24.2","openml==0.10.2","scikit-learn>=0.18.1"] +} \ No newline at end of file diff --git a/extra_requirements/xgboost.json b/extra_requirements/xgboost.json index dba5b0cf..1437baa9 100644 --- a/extra_requirements/xgboost.json +++ b/extra_requirements/xgboost.json @@ -1,3 +1,3 @@ { - "xgboost": ["xgboost==0.90","pandas>=0.22.2,<0.24.2","openml==0.10.2"] + "xgboost": ["xgboost==0.90","pandas>=0.22.2,<0.24.2","openml==0.10.2","scikit-learn>=0.18.1"] } \ No newline at end of file diff --git a/hpolib/benchmarks/ml/svm_benchmark.py b/hpolib/benchmarks/ml/svm_benchmark.py new file mode 100644 index 00000000..37f5c3c0 --- /dev/null +++ b/hpolib/benchmarks/ml/svm_benchmark.py @@ -0,0 +1,329 @@ +import time +from typing import Union, Tuple, Dict, List + +import ConfigSpace as CS +import numpy as np +from scipy import sparse +from sklearn import pipeline +from sklearn import svm +from sklearn.compose import ColumnTransformer +from sklearn.impute import SimpleImputer +from sklearn.metrics import accuracy_score, make_scorer +from sklearn.preprocessing import OneHotEncoder + +import hpolib.util.rng_helper as rng_helper +from hpolib.abstract_benchmark import AbstractBenchmark +from hpolib.util.openml_data_manager import OpenMLHoldoutDataManager + +__version__ = '0.0.1' + +import logging + +logger = logging.getLogger('SVMBenchmark') + + +class SupportVectorMachine(AbstractBenchmark): + """ + Hyperparameter optimization task to optimize the regularization + parameter C and the kernel parameter gamma of a support vector machine. + Both hyperparameters are optimized on a log scale in [-10, 10]. + The X_test data set is only used for a final offline evaluation of + a configuration. For that the validation and training data is + concatenated to form the whole training data set. 
+ """ + + def __init__(self, task_id: Union[int, None] = None, + rng: Union[np.random.RandomState, int, None] = None): + """ + Parameters + ---------- + task_id : int, None + rng : np.random.RandomState, int, None + """ + super(SupportVectorMachine, self).__init__(rng=rng) + + self.task_id = task_id + self.cache_size = 200 # Cache for the SVC in MB + self.accuracy_scorer = make_scorer(accuracy_score) + + self.X_train, self.y_train, self.X_valid, self.y_valid, self.X_test, self.y_test, variable_types = \ + self.get_data() + self.categorical_data = np.array([var_type == 'categorical' for var_type in variable_types]) + + # Sort data (Categorical + numerical) so that categorical and continous are not mixed. + categorical_idx = np.argwhere(self.categorical_data) + continuous_idx = np.argwhere(~self.categorical_data) + sorting = np.concatenate([categorical_idx, continuous_idx]).squeeze() + self.categorical_data = self.categorical_data[sorting] + self.X_train = self.X_train[:, sorting] + self.X_valid = self.X_valid[:, sorting] + self.X_test = self.X_test[:, sorting] + + nan_columns = np.all(np.isnan(self.X_train), axis=0) + self.categorical_data = self.categorical_data[~nan_columns] + self.X_train, self.X_valid, self.X_test, self.categories = \ + OpenMLHoldoutDataManager.replace_nans_in_cat_columns(self.X_train, self.X_valid, self.X_test, + is_categorical=self.categorical_data) + + self.train_idx = self.rng.choice(a=np.arange(len(self.X_train)), + size=len(self.X_train), + replace=False) + + # Similar to [Fast Bayesian Optimization of Machine Learning Hyperparameters on Large Datasets] + # (https://arxiv.org/pdf/1605.07079.pdf), + # use 10 time the number of classes as lower bound for the dataset fraction + n_classes = np.unique(self.y_train).shape[0] + self.lower_bound_train_size = int((10 * n_classes) / self.X_train.shape[0]) + + def get_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, List]: + """ Loads the data given a task or another source. """ + + assert self.task_id is not None, NotImplementedError('No task-id given. Please either specify a task-id or ' + 'overwrite the get_data method.') + + dm = OpenMLHoldoutDataManager(openml_task_id=self.task_id, rng=self.rng) + X_train, y_train, X_val, y_val, X_test, y_test = dm.load() + + return X_train, y_train, X_val, y_val, X_test, y_test, dm.variable_types + + def shuffle_data(self, rng=None): + """ Reshuffle the training data. If 'rng' is None, the training idx are shuffled according to the + class-random-state""" + random_state = rng_helper.get_rng(rng, self.rng) + random_state.shuffle(self.train_idx) + + @AbstractBenchmark._configuration_as_dict + @AbstractBenchmark._check_configuration + @AbstractBenchmark._check_fidelity + def objective_function(self, configuration: Union[Dict, CS.Configuration], + fidelity: Union[Dict, None] = None, + shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + """ + Trains a SVM model given a hyperparameter configuration and + evaluates the model on the validation set. + + Parameters + ---------- + configuration : Dict, CS.Configuration + Configuration for the SVM model + fidelity: Dict, None + Fidelity parameters for the SVM model, check get_fidelity_space(). Uses default (max) value if None. + shuffle : bool + If ``True``, shuffle the training idx. If no parameter ``rng`` is given, use the class random state. + Defaults to ``False``. + rng : np.random.RandomState, int, None, + Random seed for benchmark. 
By default the class level random seed. + + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. + kwargs + + Returns + ------- + Dict - + function_value : validation loss + cost : time to train and evaluate the model + info : Dict + train_loss : training loss + fidelity : used fidelities in this evaluation + """ + start_time = time.time() + + self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) + + if shuffle: + self.shuffle_data(self.rng) + + # Split of dataset subset + if self.lower_bound_train_size > fidelity['dataset_fraction']: + train_size = self.lower_bound_train_size + logger.warning(f'The given data set fraction is lower than the lower bound (10 * number of classes.) ' + f'Increase the fidelity from {fidelity["dataset_fraction"]:.2f} to ' + f'{self.lower_bound_train_size:.2f}') + else: + train_size = fidelity['dataset_fraction'] + + train_size = int(train_size * len(self.train_idx)) + train_idx = self.train_idx[:train_size] + + # Transform hyperparameters to linear scale + hp_c = np.exp(float(configuration['C'])) + hp_gamma = np.exp(float(configuration['gamma'])) + + # Train support vector machine + model = self.get_pipeline(hp_c, hp_gamma) + model.fit(self.X_train[train_idx], self.y_train[train_idx]) + + # Compute validation error + train_loss = 1 - self.accuracy_scorer(model, self.X_train[train_idx], self.y_train[train_idx]) + val_loss = 1 - self.accuracy_scorer(model, self.X_valid, self.y_valid) + + cost = time.time() - start_time + + return {'function_value': val_loss, + "cost": cost, + 'info': {'train_loss': train_loss, + 'fidelity': fidelity}} + + @AbstractBenchmark._configuration_as_dict + @AbstractBenchmark._check_configuration + @AbstractBenchmark._check_fidelity + def objective_function_test(self, configuration: Union[Dict, CS.Configuration], + fidelity: Union[Dict, None] = None, shuffle: bool = False, + rng: Union[np.random.RandomState, int, None] = None, **kwargs) -> Dict: + """ + Trains a SVM model with a given configuration on both the X_train + and validation data set and evaluates the model on the X_test data set. + + Parameters + ---------- + configuration : Dict, CS.Configuration + Configuration for the SVM Model + fidelity: Dict, None + Fidelity parameters, check get_fidelity_space(). Uses default (max) value if None. + rng : np.random.RandomState, int, None, + Random seed for benchmark. By default the class level random seed. + To prevent overfitting on a single seed, it is possible to pass a + parameter ``rng`` as 'int' or 'np.random.RandomState' to this function. + If this parameter is not given, the default random state is used. 
+ kwargs + + Returns + ------- + Dict - + function_value : X_test loss + cost : time to X_train and evaluate the model + info : Dict + train_valid_loss: Loss on the train+valid data set + fidelity : used fidelities in this evaluation + """ + assert np.isclose(fidelity['dataset_fraction'], 1), \ + f'Data set fraction must be 1 but was {fidelity["dataset_fraction"]}' + + self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) + + start_time = time.time() + + # Concatenate training and validation dataset + if isinstance(self.X_train, sparse.csr.csr_matrix) or isinstance(self.X_valid, sparse.csr.csr_matrix): + data = sparse.vstack((self.X_train, self.X_valid)) + else: + data = np.concatenate((self.X_train, self.X_valid)) + targets = np.concatenate((self.y_train, self.y_valid)) + + # Transform hyperparameters to linear scale + hp_c = np.exp(float(configuration['C'])) + hp_gamma = np.exp(float(configuration['gamma'])) + + model = self.get_pipeline(hp_c, hp_gamma) + model.fit(data, targets) + + # Compute validation error + train_valid_loss = 1 - self.accuracy_scorer(model, data, targets) + + # Compute test error + test_loss = 1 - self.accuracy_scorer(model, self.X_test, self.y_test) + + cost = time.time() - start_time + + return {'function_value': test_loss, + "cost": cost, + 'info': {'train_valid_loss': train_valid_loss, + 'fidelity': fidelity}} + + def get_pipeline(self, C: float, gamma: float) -> pipeline.Pipeline: + """ Create the scikit-learn (training-)pipeline """ + + model = pipeline.Pipeline([ + ('preprocess_impute', + ColumnTransformer([ + ("categorical", "passthrough", self.categorical_data), + ("continuous", SimpleImputer(strategy="mean"), ~self.categorical_data)])), + ('preprocess_one_hot', + ColumnTransformer([ + ("categorical", OneHotEncoder(categories=self.categories, sparse=False), self.categorical_data), + ("continuous", "passthrough", ~self.categorical_data)])), + ('svm', + svm.SVC(gamma=gamma, C=C, random_state=self.rng, cache_size=self.cache_size)) + ]) + return model + + @staticmethod + def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all parameters for + the SVM Model + + For a detailed explanation of the hyperparameters: + https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html + + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + + seed = seed if seed is not None else np.random.randint(1, 100000) + cs = CS.ConfigurationSpace(seed=seed) + + cs.add_hyperparameters([ + CS.UniformFloatHyperparameter('C', lower=-10., upper=10., default_value=0., log=False), + CS.UniformFloatHyperparameter('gamma', lower=-10., upper=10., default_value=1., log=False), + ]) + # cs.generate_all_continuous_from_bounds(SupportVectorMachine.get_meta_information()['bounds']) + return cs + + @staticmethod + def get_fidelity_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: + """ + Creates a ConfigSpace.ConfigurationSpace containing all fidelity parameters for + the SupportVector Benchmark + + Fidelities + ---------- + dataset_fraction: float - [0.1, 1] + fraction of training data set to use + + Parameters + ---------- + seed : int, None + Fixing the seed for the ConfigSpace.ConfigurationSpace + + Returns + ------- + ConfigSpace.ConfigurationSpace + """ + seed = seed if seed is not None else np.random.randint(1, 100000) + fidel_space = CS.ConfigurationSpace(seed=seed) 
+ + fidel_space.add_hyperparameters([ + CS.UniformFloatHyperparameter("dataset_fraction", lower=0.1, upper=1.0, default_value=1.0, log=False), + ]) + return fidel_space + + @staticmethod + def get_meta_information(): + """ Returns the meta information for the benchmark """ + return {'name': 'Support Vector Machine', + 'references': ["@InProceedings{pmlr-v54-klein17a", + "author = {Aaron Klein and Stefan Falkner and Simon Bartels and Philipp Hennig and " + "Frank Hutter}, " + "title = {{Fast Bayesian Optimization of Machine Learning Hyperparameters on " + "Large Datasets}}" + "pages = {528--536}, year = {2017}," + "editor = {Aarti Singh and Jerry Zhu}," + "volume = {54}," + "series = {Proceedings of Machine Learning Research}," + "address = {Fort Lauderdale, FL, USA}," + "month = {20--22 Apr}," + "publisher = {PMLR}," + "pdf = {http://proceedings.mlr.press/v54/klein17a/klein17a.pdf}, " + "url = {http://proceedings.mlr.press/v54/klein17a.html}, " + ] + } diff --git a/hpolib/benchmarks/ml/xgboost_benchmark.py b/hpolib/benchmarks/ml/xgboost_benchmark.py index b5b2a406..488087e3 100644 --- a/hpolib/benchmarks/ml/xgboost_benchmark.py +++ b/hpolib/benchmarks/ml/xgboost_benchmark.py @@ -37,7 +37,6 @@ def __init__(self, task_id: Union[int, None] = None, n_threads: int = 1, self.X_train, self.y_train, self.X_valid, self.y_valid, self.X_test, self.y_test, variable_types = \ self.get_data() - self.categorical_data = np.array([var_type == 'categorical' for var_type in variable_types]) # XGB needs sorted data. Data should be (Categorical + numerical) not mixed. @@ -52,15 +51,9 @@ def __init__(self, task_id: Union[int, None] = None, n_threads: int = 1, nan_columns = np.all(np.isnan(self.X_train), axis=0) self.categorical_data = self.categorical_data[~nan_columns] - mean_imputer = SimpleImputer(strategy='mean') - self.X_train = mean_imputer.fit_transform(self.X_train) - self.X_valid = mean_imputer.transform(self.X_valid) - self.X_test = mean_imputer.transform(self.X_test) - - # Determine all possible values per categorical feature - complete_data = np.concatenate([self.X_train, self.X_valid, self.X_test], axis=0) - self.categories = [np.unique(complete_data[:, i]) - for i in range(self.X_train.shape[1]) if self.categorical_data[i]] + self.X_train, self.X_valid, self.X_test, self.categories = \ + OpenMLHoldoutDataManager.replace_nans_in_cat_columns(self.X_train, self.X_valid, self.X_test, + is_categorical=self.categorical_data) # Determine the number of categories in the labels. # In case of binary classification ``self.num_class`` has to be 1 for xgboost. 
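The hunk above swaps the in-place mean imputer for `OpenMLHoldoutDataManager.replace_nans_in_cat_columns`. A rough, self-contained sketch of that helper's idea — replace categorical NaNs with `column minimum - 1` and list that replacement value among the known categories, so the one-hot encoder never meets an unseen value — follows; the numbers are invented and `sparse=False` again assumes the scikit-learn versions targeted here.

```python
# Toy sketch of the "replace categorical NaNs by (column minimum - 1)" trick, illustration only.
import numpy as np
from sklearn.preprocessing import OneHotEncoder

x_cat = np.array([[1.0], [5.0], [np.nan]])          # one categorical column with a missing entry
replace_with = np.nanmin(x_cat) - 1                 # -> 0.0, a value the column never uses
x_cat[np.isnan(x_cat)] = replace_with
categories = [np.array([replace_with, 1.0, 5.0])]   # replacement value is part of the category list

encoder = OneHotEncoder(categories=categories, sparse=False)
print(encoder.fit_transform(x_cat))
# [[0. 1. 0.]
#  [0. 0. 1.]
#  [1. 0. 0.]]
```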
@@ -184,17 +177,20 @@ def objective_function_test(self, configuration: Union[Dict, CS.Configuration], self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) start = time.time() + + # Impute potential nan values with the feature- + data = np.concatenate((self.X_train, self.X_valid)) + targets = np.concatenate((self.y_train, self.y_valid)) + model = self._get_pipeline(n_estimators=fidelity['n_estimators'], **configuration) - model.fit(X=np.concatenate((self.X_train, self.X_valid)), - y=np.concatenate((self.y_train, self.y_valid))) + model.fit(X=data, y=targets) test_loss = 1 - self.accuracy_scorer(model, self.X_test, self.y_test) cost = time.time() - start return {'function_value': test_loss, 'cost': cost, - 'info': {'fidelity': fidelity}, - } + 'info': {'fidelity': fidelity}} @staticmethod def get_configuration_space(seed: Union[int, None] = None) -> CS.ConfigurationSpace: @@ -262,12 +258,17 @@ def _get_pipeline(self, eta: float, min_child_weight: int, colsample_bytree: flo """ Create the scikit-learn (training-)pipeline """ objective = 'binary:logistic' if self.num_class <= 2 else 'multi:softmax' - clf = pipeline.Pipeline( - [('preprocess_one_hot', - ColumnTransformer([ + clf = pipeline.Pipeline([ + ('preprocess_impute', + ColumnTransformer([ + ("categorical", "passthrough", self.categorical_data), + ("continuous", SimpleImputer(strategy="mean"), ~self.categorical_data)])), + ('preprocess_one_hot', + ColumnTransformer([ ("categorical", OneHotEncoder(categories=self.categories, sparse=False), self.categorical_data), ("continuous", "passthrough", ~self.categorical_data)])), - ('xgb', xgb.XGBClassifier( + ('xgb', + xgb.XGBClassifier( learning_rate=eta, min_child_weight=min_child_weight, colsample_bytree=colsample_bytree, diff --git a/hpolib/container/benchmarks/ml/svm_benchmark.py b/hpolib/container/benchmarks/ml/svm_benchmark.py new file mode 100644 index 00000000..198b2f26 --- /dev/null +++ b/hpolib/container/benchmarks/ml/svm_benchmark.py @@ -0,0 +1,13 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + +""" Benchmark for the XGBoost Benchmark from hpolib/benchmarks/ml/xgboost_benchmark """ + +from hpolib.container.client_abstract_benchmark import AbstractBenchmarkClient + + +class SupportVectorMachine(AbstractBenchmarkClient): + def __init__(self, **kwargs): + kwargs['benchmark_name'] = kwargs.get('benchmark_name', 'SupportVectorMachine') + kwargs['container_name'] = kwargs.get('container_name', 'svm_benchmark') + super(SupportVectorMachine, self).__init__(**kwargs) diff --git a/hpolib/container/client_abstract_benchmark.py b/hpolib/container/client_abstract_benchmark.py index d4de1e92..00ba9307 100644 --- a/hpolib/container/client_abstract_benchmark.py +++ b/hpolib/container/client_abstract_benchmark.py @@ -367,7 +367,12 @@ def get_fidelity_space(self, seed: Union[int, None] = None) -> CS.ConfigurationS seed_dict = json.dumps(seed_dict, indent=None) logger.debug(f'Client: seed_dict {seed_dict}') json_str = self.benchmark.get_fidelity_space(seed_dict) - return csjson.read(json_str) + + fs = csjson.read(json_str) + if seed is not None: + fs.seed(seed) + + return fs def get_meta_information(self) -> Dict: """ Return the information about the benchmark. 
""" diff --git a/hpolib/container/recipes/ml/Singularity.SupportVectorMachine b/hpolib/container/recipes/ml/Singularity.SupportVectorMachine new file mode 100644 index 00000000..3ca6c876 --- /dev/null +++ b/hpolib/container/recipes/ml/Singularity.SupportVectorMachine @@ -0,0 +1,21 @@ +Bootstrap: docker +From: python:3.7-slim + +%labels +MAINTAINER muelleph@cs.uni-freiburg.de +VERSION v0.0.1 + +%post + apt update -y \ + && apt install build-essential git -y \ + && cd /home \ + && git clone https://github.com/automl/HPOlib2.git \ + && cd HPOlib2 \ + && git checkout master \ + && pip install .[svm] \ + && cd / \ + && mkdir /var/lib/hpolib2/ \ + && chmod -R 777 /var/lib/hpolib2/ + +%runscript + python -s /home/HPOlib2/hpolib/container/server_abstract_benchmark.py ml.svm_benchmark $@ diff --git a/hpolib/util/openml_data_manager.py b/hpolib/util/openml_data_manager.py index 20b2b622..1bc414d3 100644 --- a/hpolib/util/openml_data_manager.py +++ b/hpolib/util/openml_data_manager.py @@ -9,7 +9,7 @@ For Non-OpenML data sets please use the hpolib.util.data_manager. """ -from typing import Tuple, Union +from typing import Tuple, Union, List import numpy as np @@ -165,6 +165,32 @@ def load(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, return self.X_train, self.y_train, self.X_valid, self.y_valid, self.X_test, self.y_test + @staticmethod + def replace_nans_in_cat_columns(X_train: np.ndarray, X_valid: np.ndarray, X_test: np.ndarray, + is_categorical: Union[np.ndarray, List]) \ + -> Tuple[np.ndarray, np.ndarray, np.ndarray, List]: + """ Helper function to replace nan values in categorical features / columns by a non-used value. + Here: Min - 1. + """ + _cat_data = np.concatenate([X_train, X_valid, X_test], axis=0) + nan_index = np.isnan(_cat_data[:, is_categorical]) + categories = [np.unique(_cat_data[:, i][~nan_index[:, i]]) + for i in range(X_train.shape[1]) if is_categorical[i]] + replace_nans_with = np.nanmin(_cat_data[:, is_categorical], axis=0) - 1 + + categories = [np.concatenate([replace_value.flatten(), cat]) + for (replace_value, cat) in zip(replace_nans_with, categories)] + + def _find_and_replace(array, replace_nans_with, categorical_data): + nan_idx = np.where(np.isnan(array)) + array[nan_idx] = np.take(replace_nans_with, nan_idx[1]) + return array + + X_train[:, is_categorical] = _find_and_replace(X_train[:, is_categorical], replace_nans_with, is_categorical) + X_valid[:, is_categorical] = _find_and_replace(X_valid[:, is_categorical], replace_nans_with, is_categorical) + X_test[:, is_categorical] = _find_and_replace(X_test[:, is_categorical], replace_nans_with, is_categorical) + return X_train, X_valid, X_test, categories + class OpenMLCrossvalidationDataManager(CrossvalidationDataManager): """ Base class for loading cross-validation data set from OpenML. 
diff --git a/tests/test_openml_datamanager.py b/tests/test_openml_datamanager.py new file mode 100644 index 00000000..4d416e9d --- /dev/null +++ b/tests/test_openml_datamanager.py @@ -0,0 +1,28 @@ +from hpolib.util.openml_data_manager import OpenMLHoldoutDataManager +import numpy as np + + +def test_convert_nan_values_in_cat_columns(): + x = np.array([[1, np.nan, 3, 4], + [5, 6, 7, 8], + [np.nan, 10, 11, np.nan]]) + + is_cat = [True, True, False, False] + + x, _, _, categories = OpenMLHoldoutDataManager.replace_nans_in_cat_columns(x, x, x, is_cat) + + solution = np.array([[1., 5., 3., 4.], + [5., 6., 7., 8.], + [0., 10., 11., np.nan]]) + + solution_cat = np.array([[1., 5., 0.], + [5., 6., 10.]]) + + assert np.array_equiv(x[:, :3], solution[:, :3]) # unfortunately np.nan != np.nan :) + assert np.isnan(x[2, 3]) + + cats = np.array(categories).flatten() + cats.sort() + solution_cat = solution_cat.flatten() + solution_cat.sort() + assert np.array_equal(cats, solution_cat) diff --git a/tests/test_svm.py b/tests/test_svm.py new file mode 100644 index 00000000..6c072a7b --- /dev/null +++ b/tests/test_svm.py @@ -0,0 +1,40 @@ +import pytest + +from hpolib.container.benchmarks.ml.svm_benchmark import SupportVectorMachine +from hpolib.util.openml_data_manager import get_openmlcc18_taskids + +task_ids = get_openmlcc18_taskids() + +import logging +logging.basicConfig(level=logging.DEBUG) + + +def test_svm_init(): + benchmark = SupportVectorMachine(task_id=task_ids[0]) + + fs = benchmark.get_fidelity_space(seed=0) + fidelity = fs.sample_configuration().get_dictionary() + assert fidelity['dataset_fraction'] == pytest.approx(0.5939, abs=0.001) + + meta = benchmark.get_meta_information() + assert meta is not None + + cs = benchmark.get_configuration_space(seed=0) + config = cs.sample_configuration().get_dictionary() + assert config['C'] == pytest.approx(0.9762, abs=0.001) + assert config['gamma'] == pytest.approx(4.3037, abs=0.001) + + result = benchmark.objective_function(configuration=config, fidelity=fidelity) + assert result['function_value'] == pytest.approx(0.4837, abs=0.1) + assert result['cost'] == pytest.approx(0.323, abs=0.1) + + with pytest.raises(AssertionError): + result = benchmark.objective_function_test(configuration=config, fidelity=fidelity) + + result = benchmark.objective_function_test(configuration=config) + assert result['function_value'] == pytest.approx(0.4648, abs=0.1) + assert result['cost'] is not None + + +if __name__ == "__main__": + test_svm_init() From 71b5e756a21f021667142c92d28d9768af1d9807 Mon Sep 17 00:00:00 2001 From: Katharina Eggensperger Date: Mon, 31 Aug 2020 10:21:24 +0200 Subject: [PATCH 5/6] How to add a benchmark (#39) * Update howto * update changelog * update --- changelog.md | 3 ++ hpolib/HowToAddANewBenchmark.md | 52 +++++++++++++++++++++++---------- 2 files changed, 39 insertions(+), 16 deletions(-) diff --git a/changelog.md b/changelog.md index 2fdfec6d..41f88b92 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,8 @@ # 0.0.4 * improve test coverage + * update HowToAddANewBenchmark.md + * Add SVM benchmark working on OpenML data + * Add nasbench201 benchmark # 0.0.3 * improve forwarding exceptions in containerized benchmarks diff --git a/hpolib/HowToAddANewBenchmark.md b/hpolib/HowToAddANewBenchmark.md index ad19903d..6ae1c477 100644 --- a/hpolib/HowToAddANewBenchmark.md +++ b/hpolib/HowToAddANewBenchmark.md @@ -2,9 +2,8 @@ ## Create a local benchmark - 1. Clone Hpolib2, switch to the development branch and create your own branch, then install hpolib2. 
-Note that with `pip install .` - +Clone Hpolib2, switch to the development branch and create your own branch, then install hpolib2. +with `pip install .` ```bash git clone https://github.com/automl/HPOlib2.git cd HPOlib2 @@ -14,33 +13,54 @@ git checkout newBenchmark pip install . ``` +Then: 2. Implement your new benchmark `hpolib/benchmarks//` inheriting from the base class - `AbstractBenchmark` in `hpolib.abstract_benchmark` - - 3. Collect **all additional Python** and **non-Python** dependencies while doing this. + `AbstractBenchmark` in `hpolib.abstract_benchmark`. Your benchmark should implement `__init__()`, + `get_configuration_space()`, `get_fidelity_space()`, `objective_function()` and `objective_function_test()`. + A good example for this can be found in `hpolib/benchmarks/ml/xgboost_benchmark.py` + 2. If your benchmarks needs a dataset (e.g. for training a ml model), please also implement a DataManager, see e.g. + `hpolib/util/openml_data_manager.py` with a `load()` method that downloads data once and reuses it for further calls. + 3. Collect all **additional Python** and **non-Python** dependencies while doing this. Consider fixing the version of each dependency to maintain reproducibility. - 4. Add dependencies to PiPy in a new file to `/extra_requirements` - 5. Add the remaining dependencies or steps necessary to run your benchmark in the docstring of your benchmark class. + 4. Add dependencies to PIPy in a new file to `/extra_requirements` + 5. Add the remaining dependencies or steps necessary to run your benchmark in the docstring of your benchmark class + (see, e.g. `hpolib/benchmarks/nas/nasbench_101.py`). + 6. Verify that everything works with, e.g. +```python +from hpolib.benchmarks.. import +b = (, rng=1) +config = b.get_configuration_space(seed=1).sample_configuration() +result_dict = b.objective_function(configuration=config, rng=1) +print(result_dict) +``` + +**Note:** Ideally, your benchmark behaves deterministic given a seed! + +Now, you can create a PR marked as [WIP] and proceed with building a containerized version. ## Create a containerized benchmark 1. Create a container benchmark class in `hpolib/container/benchmarks//` inheriting from the - base class `AbstractBenchmarkClient` in `hpolib.container.client_abstract_benchmark` (note: this is just copy/paste from existing classes) + base class `AbstractBenchmarkClient` in `hpolib.container.client_abstract_benchmark`. + Note: this are just a few lines of code, see, e.g. `hpolib/container/benchmarks/ml/xgboost_benchmark.py`) 2. Copy `hpolib/container/recipes/Singularity.template` to `hpolib/container/recipes//name` - 3. Create a pull request marked as [WIP] + 3. Test your container locally (see below) + +Now, you can update your PR and let us know, so we can upload the container to Sylabs. Thanks. ## How to test your container locally - 1. `cd hpolib/container/benchmarks/recipes/` and change to following line in the recipe: + 1. `cd hpolib/container/benchmarks/recipes/` and change to following lines in the recipe: ```bash + && git clone https://github.com/automl/HPOlib2.git \ + && cd HPOlib2 \ && git checkout development \ ``` - to point to the branch where your pull request is, e.g. `newBenchmark` + to point to the branch/repo where your fork is on, e.g. `newBenchmark` 2. Run `sudo singularity build Singularity.` 3. 
Verify that everything works with ```python -from hpolib.container.benchmarks..new_benchmark import newBenchmark -b = newBenchmark(container_source="./", container_name="newBenchmark") +from hpolib.container.benchmarks.. import +b = (container_source="./", container_name="newBenchmark") res = b.objective_function(configuration=b.get_configuration_space(seed=1).sample_configuration()) -``` - 4. Finalize your pull request and let us know, so we can upload the container to Sylabs. Thanks. \ No newline at end of file +``` \ No newline at end of file From 56e38b401e83b7df519b0381840a7d8c1acb1878 Mon Sep 17 00:00:00 2001 From: Katharina Eggensperger Date: Mon, 31 Aug 2020 11:41:35 +0200 Subject: [PATCH 6/6] update to 0.0.4; mv nasbench201 to master (#40) * update to 0.0.4; mv nasbench201 to master * FIX * Update HowToAddANewBenchmark.md fix list of steps * Update HowToAddANewBenchmark.md update * Update HowToAddANewBenchmark.md fix style --- hpolib/HowToAddANewBenchmark.md | 29 +++++++++++-------- hpolib/__version__.py | 2 +- .../recipes/nas/Singularity.nasbench_201 | 2 +- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/hpolib/HowToAddANewBenchmark.md b/hpolib/HowToAddANewBenchmark.md index 6ae1c477..9b9384f4 100644 --- a/hpolib/HowToAddANewBenchmark.md +++ b/hpolib/HowToAddANewBenchmark.md @@ -14,18 +14,20 @@ pip install . ``` Then: - 2. Implement your new benchmark `hpolib/benchmarks//` inheriting from the base class + + 1. Implement your new benchmark `hpolib/benchmarks//` inheriting from the base class `AbstractBenchmark` in `hpolib.abstract_benchmark`. Your benchmark should implement `__init__()`, `get_configuration_space()`, `get_fidelity_space()`, `objective_function()` and `objective_function_test()`. - A good example for this can be found in `hpolib/benchmarks/ml/xgboost_benchmark.py` - 2. If your benchmarks needs a dataset (e.g. for training a ml model), please also implement a DataManager, see e.g. + A good example for this can be found in `hpolib/benchmarks/ml/xgboost_benchmark.py` + 3. If your benchmarks needs a dataset (e.g. for training a ml model), please also implement a DataManager, see e.g. `hpolib/util/openml_data_manager.py` with a `load()` method that downloads data once and reuses it for further calls. - 3. Collect all **additional Python** and **non-Python** dependencies while doing this. + 4. Collect all **additional Python** and **non-Python** dependencies while doing this. Consider fixing the version of each dependency to maintain reproducibility. - 4. Add dependencies to PIPy in a new file to `/extra_requirements` - 5. Add the remaining dependencies or steps necessary to run your benchmark in the docstring of your benchmark class + 5. Add dependencies to PIPy in a new file to `/extra_requirements` + 6. Add the remaining dependencies or steps necessary to run your benchmark in the docstring of your benchmark class (see, e.g. `hpolib/benchmarks/nas/nasbench_101.py`). - 6. Verify that everything works with, e.g. + 7. Verify that everything works with, e.g. + ```python from hpolib.benchmarks.. import b = (, rng=1) @@ -44,6 +46,7 @@ Now, you can create a PR marked as [WIP] and proceed with building a containeriz base class `AbstractBenchmarkClient` in `hpolib.container.client_abstract_benchmark`. Note: this are just a few lines of code, see, e.g. `hpolib/container/benchmarks/ml/xgboost_benchmark.py`) 2. Copy `hpolib/container/recipes/Singularity.template` to `hpolib/container/recipes//name` + 3. 
Modify the recipe and add your **additional Python** and **non-Python** dependencies collected above. 3. Test your container locally (see below) Now, you can update your PR and let us know, so we can upload the container to Sylabs. Thanks. @@ -55,12 +58,14 @@ Now, you can update your PR and let us know, so we can upload the container to S && git clone https://github.com/automl/HPOlib2.git \ && cd HPOlib2 \ && git checkout development \ -``` - to point to the branch/repo where your fork is on, e.g. `newBenchmark` + ``` + to point to the branch/repo where your fork is on, e.g. `newBenchmark`. + 2. Run `sudo singularity build Singularity.` - 3. Verify that everything works with - ```python + 3. Verify that everything works with: + +```python from hpolib.container.benchmarks.. import b = (container_source="./", container_name="newBenchmark") res = b.objective_function(configuration=b.get_configuration_space(seed=1).sample_configuration()) -``` \ No newline at end of file +``` diff --git a/hpolib/__version__.py b/hpolib/__version__.py index 3d8c7a09..156d6f9a 100644 --- a/hpolib/__version__.py +++ b/hpolib/__version__.py @@ -1 +1 @@ -__version__ = '0.0.4dev' +__version__ = '0.0.4' diff --git a/hpolib/container/recipes/nas/Singularity.nasbench_201 b/hpolib/container/recipes/nas/Singularity.nasbench_201 index b01ff97a..55767424 100644 --- a/hpolib/container/recipes/nas/Singularity.nasbench_201 +++ b/hpolib/container/recipes/nas/Singularity.nasbench_201 @@ -12,7 +12,7 @@ VERSION v0.0.1 cd /home \ && git clone https://github.com/automl/HPOlib3.git \ && cd HPOlib3 \ - && git checkout development \ + && git checkout master \ && pip install . \ && cd / \ && mkdir /var/lib/hpolib3/ \