diff --git a/CITATION.cff b/CITATION.cff
index 39264389b..4973dd3dd 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -2,16 +2,20 @@ cff-version: 1.2.0
 message: "If you use this software, please cite it using these metadata."
 authors:
+  - affiliation: "Center for Advanced Systems Understanding (CASUS), Helmholtz-Zentrum Dresden-Rossendorf e.V. (HZDR)"
+    family-names: Cangi
+    given-names: Attila
+    orcid: https://orcid.org/0000-0001-9162-262X
+  - affiliation: "Sandia National Laboratories (SNL)"
+    family-names: Rajamanickam
+    given-names: Sivasankaran
+    orcid: https://orcid.org/0000-0002-5854-409X
   - affiliation: "Center for Advanced Systems Understanding (CASUS), Helmholtz-Zentrum Dresden-Rossendorf e.V. (HZDR)"
     family-names: Brzoza
     given-names: Bartosz
   - affiliation: "Center for Advanced Systems Understanding (CASUS), Helmholtz-Zentrum Dresden-Rossendorf e.V. (HZDR)"
     family-names: Callow
     given-names: Timothy J.
-  - affiliation: "Center for Advanced Systems Understanding (CASUS), Helmholtz-Zentrum Dresden-Rossendorf e.V. (HZDR)"
-    family-names: Cangi
-    given-names: Attila
-    orcid: https://orcid.org/0000-0001-9162-262X
   - affiliation: "Oak Ridge National Laboratory (ORNL)"
     family-names: Ellis
     given-names: J. Austin
@@ -54,10 +58,6 @@ authors:
   - affiliation: "Center for Advanced Systems Understanding (CASUS), Helmholtz-Zentrum Dresden-Rossendorf e.V. (HZDR)"
     family-names: Pöschel
     given-names: Franz
-  - affiliation: "Sandia National Laboratories (SNL)"
-    family-names: Rajamanickam
-    given-names: Sivasankaran
-    orcid: https://orcid.org/0000-0002-5854-409X
   - affiliation: "Nvidia Corporation"
     family-names: Romero
     given-names: Josh
diff --git a/docs/source/img/logos/mala_favicon.png b/docs/source/img/logos/mala_favicon.png
index c882ab8dd..5559ddd62 100644
Binary files a/docs/source/img/logos/mala_favicon.png and b/docs/source/img/logos/mala_favicon.png differ
diff --git a/docs/source/img/logos/mala_horizontal.png b/docs/source/img/logos/mala_horizontal.png
index 10b540c7b..eeb848dee 100644
Binary files a/docs/source/img/logos/mala_horizontal.png and b/docs/source/img/logos/mala_horizontal.png differ
diff --git a/docs/source/img/logos/mala_horizontal_white.png b/docs/source/img/logos/mala_horizontal_white.png
index a2db66938..3e53651f8 100644
Binary files a/docs/source/img/logos/mala_horizontal_white.png and b/docs/source/img/logos/mala_horizontal_white.png differ
diff --git a/docs/source/img/logos/mala_vertical.png b/docs/source/img/logos/mala_vertical.png
index ba015ec47..0c2ebcee9 100644
Binary files a/docs/source/img/logos/mala_vertical.png and b/docs/source/img/logos/mala_vertical.png differ
diff --git a/mala/common/parameters.py b/mala/common/parameters.py
index 314ea44ff..c6c67e9cd 100644
--- a/mala/common/parameters.py
+++ b/mala/common/parameters.py
@@ -6,8 +6,10 @@ import pickle
 from time import sleep

+horovod_available = False
 try:
     import horovod.torch as hvd
+    horovod_available = True
 except ModuleNotFoundError:
     pass
 import numpy as np
@@ -732,7 +734,7 @@ def __init__(self):
         self.use_mixed_precision = False
         self.use_graphs = False
         self.training_report_frequency = 1000
-        self.profiler_range = [1000, 2000]
+        self.profiler_range = None  # [1000, 2000]

     def _update_horovod(self, new_horovod):
         super(ParametersRunning, self)._update_horovod(new_horovod)
@@ -1257,19 +1259,25 @@ def use_horovod(self):
     @use_horovod.setter
     def use_horovod(self, value):
-        if value:
-            hvd.init()
+        if value is False:
+            self._use_horovod = False
+        else:
+            if horovod_available:
+                hvd.init()
+                # Invalidate, will be updated in setter.
+                set_horovod_status(value)
+                self.device = None
+                self._use_horovod = value
+                self.network._update_horovod(self.use_horovod)
+                self.descriptors._update_horovod(self.use_horovod)
+                self.targets._update_horovod(self.use_horovod)
+                self.data._update_horovod(self.use_horovod)
+                self.running._update_horovod(self.use_horovod)
+                self.hyperparameters._update_horovod(self.use_horovod)
+            else:
+                parallel_warn("Horovod requested, but not installed. "
+                              "MALA will operate without Horovod.")

-        # Invalidate, will be updated in setter.
-        set_horovod_status(value)
-        self.device = None
-        self._use_horovod = value
-        self.network._update_horovod(self.use_horovod)
-        self.descriptors._update_horovod(self.use_horovod)
-        self.targets._update_horovod(self.use_horovod)
-        self.data._update_horovod(self.use_horovod)
-        self.running._update_horovod(self.use_horovod)
-        self.hyperparameters._update_horovod(self.use_horovod)

     @property
     def device(self):
diff --git a/mala/network/tester.py b/mala/network/tester.py
index 7c61b5c25..f7a9e7373 100644
--- a/mala/network/tester.py
+++ b/mala/network/tester.py
@@ -210,7 +210,7 @@ def __calculate_observable_error(self, snapshot_number, observable,
             target_calculator.read_from_array(predicted_target)
             predicted = target_calculator.band_energy
             return [actual, predicted,
-                    target_calculator.total_energy_dft_calculation]
+                    target_calculator.band_energy_dft_calculation]

         elif observable == "number_of_electrons":
             target_calculator = self.data.target_calculator
diff --git a/mala/network/trainer.py b/mala/network/trainer.py
index 98dc291b8..86d601ac0 100644
--- a/mala/network/trainer.py
+++ b/mala/network/trainer.py
@@ -279,17 +279,18 @@ def train_network(self):
                 self.data.training_data_sets[0].shuffle()

             if self.parameters._configuration["gpu"]:
-                torch.cuda.synchronize()
+                torch.cuda.synchronize(self.parameters._configuration["device"])
                 tsample = time.time()
                 t0 = time.time()
                 batchid = 0
                 for loader in self.training_data_loaders:
                     for (inputs, outputs) in loader:
-                        if batchid == self.parameters.profiler_range[0]:
-                            torch.cuda.profiler.start()
-                        if batchid == self.parameters.profiler_range[1]:
-                            torch.cuda.profiler.stop()
+                        if self.parameters.profiler_range is not None:
+                            if batchid == self.parameters.profiler_range[0]:
+                                torch.cuda.profiler.start()
+                            if batchid == self.parameters.profiler_range[1]:
+                                torch.cuda.profiler.stop()

                         torch.cuda.nvtx.range_push(f"step {batchid}")
@@ -309,7 +310,7 @@ def train_network(self):
                         training_loss_sum += loss

                         if batchid != 0 and (batchid + 1) % self.parameters.training_report_frequency == 0:
-                            torch.cuda.synchronize()
+                            torch.cuda.synchronize(self.parameters._configuration["device"])
                             sample_time = time.time() - tsample
                             avg_sample_time = sample_time / self.parameters.training_report_frequency
                             avg_sample_tput = self.parameters.training_report_frequency * inputs.shape[0] / sample_time
@@ -319,14 +320,14 @@ def train_network(self):
                                      min_verbosity=2)
                             tsample = time.time()
                         batchid += 1
-                torch.cuda.synchronize()
+                torch.cuda.synchronize(self.parameters._configuration["device"])
                 t1 = time.time()
                 printout(f"training time: {t1 - t0}", min_verbosity=2)

                 training_loss = training_loss_sum.item() / batchid

                 # Calculate the validation loss. and output it.
-                torch.cuda.synchronize()
+                torch.cuda.synchronize(self.parameters._configuration["device"])
             else:
                 batchid = 0
                 for loader in self.training_data_loaders:
@@ -375,14 +376,14 @@ def train_network(self):
                     self.tensor_board.close()

             if self.parameters._configuration["gpu"]:
-                torch.cuda.synchronize()
+                torch.cuda.synchronize(self.parameters._configuration["device"])

             # Mix the DataSets up (this function only does something
             # in the lazy loading case).
             if self.parameters.use_shuffling_for_samplers:
                 self.data.mix_datasets()

             if self.parameters._configuration["gpu"]:
-                torch.cuda.synchronize()
+                torch.cuda.synchronize(self.parameters._configuration["device"])

             # If a scheduler is used, update it.
             if self.scheduler is not None:
@@ -636,8 +637,8 @@ def __process_mini_batch(self, network, input_data, target_data):
         if self.parameters._configuration["gpu"]:
             if self.parameters.use_graphs and self.train_graph is None:
                 printout("Capturing CUDA graph for training.", min_verbosity=2)
-                s = torch.cuda.Stream()
-                s.wait_stream(torch.cuda.current_stream())
+                s = torch.cuda.Stream(self.parameters._configuration["device"])
+                s.wait_stream(torch.cuda.current_stream(self.parameters._configuration["device"]))
                 # Warmup for graphs
                 with torch.cuda.stream(s):
                     for _ in range(20):
@@ -651,7 +652,7 @@ def __process_mini_batch(self, network, input_data, target_data):
                             self.gradscaler.scale(loss).backward()
                         else:
                             loss.backward()
-                torch.cuda.current_stream().wait_stream(s)
+                torch.cuda.current_stream(self.parameters._configuration["device"]).wait_stream(s)

                 # Create static entry point tensors to graph
                 self.static_input_data = torch.empty_like(input_data)
@@ -742,7 +743,7 @@ def __validate_network(self, network, data_set_type, validation_type):
        with torch.no_grad():
            if self.parameters._configuration["gpu"]:
                report_freq = self.parameters.training_report_frequency
-                torch.cuda.synchronize()
+                torch.cuda.synchronize(self.parameters._configuration["device"])
                tsample = time.time()
                batchid = 0
                for loader in data_loaders:
@@ -754,15 +755,15 @@ def __validate_network(self, network, data_set_type, validation_type):
                        if self.parameters.use_graphs and self.validation_graph is None:
                            printout("Capturing CUDA graph for validation.", min_verbosity=2)
-                            s = torch.cuda.Stream()
-                            s.wait_stream(torch.cuda.current_stream())
+                            s = torch.cuda.Stream(self.parameters._configuration["device"])
+                            s.wait_stream(torch.cuda.current_stream(self.parameters._configuration["device"]))
                            # Warmup for graphs
                            with torch.cuda.stream(s):
                                for _ in range(20):
                                    with torch.cuda.amp.autocast(enabled=self.parameters.use_mixed_precision):
                                        prediction = network(x)
                                        loss = network.calculate_loss(prediction, y)
-                            torch.cuda.current_stream().wait_stream(s)
+                            torch.cuda.current_stream(self.parameters._configuration["device"]).wait_stream(s)

                            # Create static entry point tensors to graph
                            self.static_input_validation = torch.empty_like(x)
@@ -786,7 +787,7 @@ def __validate_network(self, network, data_set_type, validation_type):
                                loss = network.calculate_loss(prediction, y)
                            validation_loss_sum += loss
                        if batchid != 0 and (batchid + 1) % report_freq == 0:
-                            torch.cuda.synchronize()
+                            torch.cuda.synchronize(self.parameters._configuration["device"])
                            sample_time = time.time() - tsample
                            avg_sample_time = sample_time / report_freq
                            avg_sample_tput = report_freq * x.shape[0] / sample_time
@@ -796,7 +797,7 @@ def __validate_network(self, network, data_set_type, validation_type):
                                     min_verbosity=2)
                            tsample = time.time()
                        batchid += 1
-                torch.cuda.synchronize()
+                torch.cuda.synchronize(self.parameters._configuration["device"])
            else:
                batchid = 0
                for loader in data_loaders: