diff --git a/CITATION.cff b/CITATION.cff
index 39264389b..4973dd3dd 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -2,16 +2,20 @@ cff-version: 1.2.0
 message: "If you use this software, please cite it using these metadata."
 authors:
+  - affiliation: "Center for Advanced Systems Understanding (CASUS), Helmholtz-Zentrum Dresden-Rossendorf e.V. (HZDR)"
+    family-names: Cangi
+    given-names: Attila
+    orcid: https://orcid.org/0000-0001-9162-262X
+  - affiliation: "Sandia National Laboratories (SNL)"
+    family-names: Rajamanickam
+    given-names: Sivasankaran
+    orcid: https://orcid.org/0000-0002-5854-409X
   - affiliation: "Center for Advanced Systems Understanding (CASUS), Helmholtz-Zentrum Dresden-Rossendorf e.V. (HZDR)"
     family-names: Brzoza
     given-names: Bartosz
   - affiliation: "Center for Advanced Systems Understanding (CASUS), Helmholtz-Zentrum Dresden-Rossendorf e.V. (HZDR)"
     family-names: Callow
     given-names: Timothy J.
-  - affiliation: "Center for Advanced Systems Understanding (CASUS), Helmholtz-Zentrum Dresden-Rossendorf e.V. (HZDR)"
-    family-names: Cangi
-    given-names: Attila
-    orcid: https://orcid.org/0000-0001-9162-262X
   - affiliation: "Oak Ridge National Laboratory (ORNL)"
     family-names: Ellis
     given-names: J. Austin
@@ -54,10 +58,6 @@ authors:
   - affiliation: "Center for Advanced Systems Understanding (CASUS), Helmholtz-Zentrum Dresden-Rossendorf e.V. (HZDR)"
     family-names: Pöschel
     given-names: Franz
-  - affiliation: "Sandia National Laboratories (SNL)"
-    family-names: Rajamanickam
-    given-names: Sivasankaran
-    orcid: https://orcid.org/0000-0002-5854-409X
   - affiliation: "Nvidia Corporation"
     family-names: Romero
     given-names: Josh
diff --git a/docs/source/img/logos/mala_favicon.png b/docs/source/img/logos/mala_favicon.png
index c882ab8dd..5559ddd62 100644
Binary files a/docs/source/img/logos/mala_favicon.png and b/docs/source/img/logos/mala_favicon.png differ
diff --git a/docs/source/img/logos/mala_horizontal.png b/docs/source/img/logos/mala_horizontal.png
index 10b540c7b..eeb848dee 100644
Binary files a/docs/source/img/logos/mala_horizontal.png and b/docs/source/img/logos/mala_horizontal.png differ
diff --git a/docs/source/img/logos/mala_horizontal_white.png b/docs/source/img/logos/mala_horizontal_white.png
index a2db66938..3e53651f8 100644
Binary files a/docs/source/img/logos/mala_horizontal_white.png and b/docs/source/img/logos/mala_horizontal_white.png differ
diff --git a/docs/source/img/logos/mala_vertical.png b/docs/source/img/logos/mala_vertical.png
index ba015ec47..0c2ebcee9 100644
Binary files a/docs/source/img/logos/mala_vertical.png and b/docs/source/img/logos/mala_vertical.png differ
diff --git a/mala/common/parameters.py b/mala/common/parameters.py
index 314ea44ff..c6c67e9cd 100644
--- a/mala/common/parameters.py
+++ b/mala/common/parameters.py
@@ -6,8 +6,10 @@ import pickle
 from time import sleep

+horovod_available = False
 try:
     import horovod.torch as hvd
+    horovod_available = True
 except ModuleNotFoundError:
     pass
 import numpy as np
@@ -732,7 +734,7 @@ def __init__(self):
         self.use_mixed_precision = False
         self.use_graphs = False
         self.training_report_frequency = 1000
-        self.profiler_range = [1000, 2000]
+        self.profiler_range = None  # [1000, 2000]

     def _update_horovod(self, new_horovod):
         super(ParametersRunning, self)._update_horovod(new_horovod)
@@ -1257,19 +1259,25 @@ def use_horovod(self):
     @use_horovod.setter
     def use_horovod(self, value):
-        if value:
-            hvd.init()
+        if value is False:
+            self._use_horovod = False
+        else:
+            if horovod_available:
+                hvd.init()
+                # Invalidate, will be updated in setter.
+                set_horovod_status(value)
+                self.device = None
+                self._use_horovod = value
+                self.network._update_horovod(self.use_horovod)
+                self.descriptors._update_horovod(self.use_horovod)
+                self.targets._update_horovod(self.use_horovod)
+                self.data._update_horovod(self.use_horovod)
+                self.running._update_horovod(self.use_horovod)
+                self.hyperparameters._update_horovod(self.use_horovod)
+            else:
+                parallel_warn("Horovod requested, but not installed. "
+                              "MALA will operate without Horovod.")

-        # Invalidate, will be updated in setter.
-        set_horovod_status(value)
-        self.device = None
-        self._use_horovod = value
-        self.network._update_horovod(self.use_horovod)
-        self.descriptors._update_horovod(self.use_horovod)
-        self.targets._update_horovod(self.use_horovod)
-        self.data._update_horovod(self.use_horovod)
-        self.running._update_horovod(self.use_horovod)
-        self.hyperparameters._update_horovod(self.use_horovod)

     @property
     def device(self):
diff --git a/mala/network/tester.py b/mala/network/tester.py
index 7c61b5c25..f7a9e7373 100644
--- a/mala/network/tester.py
+++ b/mala/network/tester.py
@@ -210,7 +210,7 @@ def __calculate_observable_error(self, snapshot_number, observable,
             target_calculator.read_from_array(predicted_target)
             predicted = target_calculator.band_energy
             return [actual, predicted,
-                    target_calculator.total_energy_dft_calculation]
+                    target_calculator.band_energy_dft_calculation]

         elif observable == "number_of_electrons":
             target_calculator = self.data.target_calculator
diff --git a/mala/network/trainer.py b/mala/network/trainer.py
index 98dc291b8..86d601ac0 100644
--- a/mala/network/trainer.py
+++ b/mala/network/trainer.py
@@ -279,17 +279,18 @@ def train_network(self):
                 self.data.training_data_sets[0].shuffle()

             if self.parameters._configuration["gpu"]:
-                torch.cuda.synchronize()
+                torch.cuda.synchronize(self.parameters._configuration["device"])
                 tsample = time.time()
                 t0 = time.time()
                 batchid = 0
                 for loader in self.training_data_loaders:
                     for (inputs, outputs) in loader:
-                        if batchid == self.parameters.profiler_range[0]:
-                            torch.cuda.profiler.start()
-                        if batchid == self.parameters.profiler_range[1]:
-                            torch.cuda.profiler.stop()
+                        if self.parameters.profiler_range is not None:
+                            if batchid == self.parameters.profiler_range[0]:
+                                torch.cuda.profiler.start()
+                            if batchid == self.parameters.profiler_range[1]:
+                                torch.cuda.profiler.stop()

                         torch.cuda.nvtx.range_push(f"step {batchid}")
@@ -309,7 +310,7 @@ def train_network(self):
                         training_loss_sum += loss

                         if batchid != 0 and (batchid + 1) % self.parameters.training_report_frequency == 0:
-                            torch.cuda.synchronize()
+                            torch.cuda.synchronize(self.parameters._configuration["device"])
                             sample_time = time.time() - tsample
                             avg_sample_time = sample_time / self.parameters.training_report_frequency
                             avg_sample_tput = self.parameters.training_report_frequency * inputs.shape[0] / sample_time
@@ -319,14 +320,14 @@ def train_network(self):
                                      min_verbosity=2)
                             tsample = time.time()
                         batchid += 1
-                torch.cuda.synchronize()
+                torch.cuda.synchronize(self.parameters._configuration["device"])
                 t1 = time.time()
                 printout(f"training time: {t1 - t0}", min_verbosity=2)

                 training_loss = training_loss_sum.item() / batchid

                 # Calculate the validation loss. and output it.
-                torch.cuda.synchronize()
+                torch.cuda.synchronize(self.parameters._configuration["device"])
             else:
                 batchid = 0
                 for loader in self.training_data_loaders:
@@ -375,14 +376,14 @@ def train_network(self):
                     self.tensor_board.close()

             if self.parameters._configuration["gpu"]:
-                torch.cuda.synchronize()
+                torch.cuda.synchronize(self.parameters._configuration["device"])

             # Mix the DataSets up (this function only does something
             # in the lazy loading case).
             if self.parameters.use_shuffling_for_samplers:
                 self.data.mix_datasets()

             if self.parameters._configuration["gpu"]:
-                torch.cuda.synchronize()
+                torch.cuda.synchronize(self.parameters._configuration["device"])

             # If a scheduler is used, update it.
             if self.scheduler is not None:
@@ -636,8 +637,8 @@ def __process_mini_batch(self, network, input_data, target_data):
         if self.parameters._configuration["gpu"]:
             if self.parameters.use_graphs and self.train_graph is None:
                 printout("Capturing CUDA graph for training.", min_verbosity=2)
-                s = torch.cuda.Stream()
-                s.wait_stream(torch.cuda.current_stream())
+                s = torch.cuda.Stream(self.parameters._configuration["device"])
+                s.wait_stream(torch.cuda.current_stream(self.parameters._configuration["device"]))
                 # Warmup for graphs
                 with torch.cuda.stream(s):
                     for _ in range(20):
@@ -651,7 +652,7 @@ def __process_mini_batch(self, network, input_data, target_data):
                             self.gradscaler.scale(loss).backward()
                         else:
                             loss.backward()
-                torch.cuda.current_stream().wait_stream(s)
+                torch.cuda.current_stream(self.parameters._configuration["device"]).wait_stream(s)

                 # Create static entry point tensors to graph
                 self.static_input_data = torch.empty_like(input_data)
@@ -742,7 +743,7 @@ def __validate_network(self, network, data_set_type, validation_type):
        with torch.no_grad():
            if self.parameters._configuration["gpu"]:
                report_freq = self.parameters.training_report_frequency
-                torch.cuda.synchronize()
+                torch.cuda.synchronize(self.parameters._configuration["device"])
                tsample = time.time()
                batchid = 0
                for loader in data_loaders:
@@ -754,15 +755,15 @@ def __validate_network(self, network, data_set_type, validation_type):
                        if self.parameters.use_graphs and self.validation_graph is None:
                            printout("Capturing CUDA graph for validation.", min_verbosity=2)
-                            s = torch.cuda.Stream()
-                            s.wait_stream(torch.cuda.current_stream())
+                            s = torch.cuda.Stream(self.parameters._configuration["device"])
+                            s.wait_stream(torch.cuda.current_stream(self.parameters._configuration["device"]))
                            # Warmup for graphs
                            with torch.cuda.stream(s):
                                for _ in range(20):
                                    with torch.cuda.amp.autocast(enabled=self.parameters.use_mixed_precision):
                                        prediction = network(x)
                                        loss = network.calculate_loss(prediction, y)
-                            torch.cuda.current_stream().wait_stream(s)
+                            torch.cuda.current_stream(self.parameters._configuration["device"]).wait_stream(s)

                            # Create static entry point tensors to graph
                            self.static_input_validation = torch.empty_like(x)
@@ -786,7 +787,7 @@ def __validate_network(self, network, data_set_type, validation_type):
                                loss = network.calculate_loss(prediction, y)
                            validation_loss_sum += loss
                        if batchid != 0 and (batchid + 1) % report_freq == 0:
-                            torch.cuda.synchronize()
+                            torch.cuda.synchronize(self.parameters._configuration["device"])
                            sample_time = time.time() - tsample
                            avg_sample_time = sample_time / report_freq
                            avg_sample_tput = report_freq * x.shape[0] / sample_time
@@ -796,7 +797,7 @@ def __validate_network(self, network, data_set_type, validation_type):
                                     min_verbosity=2)
                            tsample = time.time()
                        batchid += 1
-                torch.cuda.synchronize()
+                torch.cuda.synchronize(self.parameters._configuration["device"])
            else:
                batchid = 0
                for loader in data_loaders: