Skip to content

Commit

Permalink
Merge pull request #499 from mala-project/develop
Browse files (browse the repository at this point in the history)
v1.2.1 - Minor Bugfixes
  • Loading branch information
RandomDefaultUser authored Feb 1, 2024
2 parents 04e4d2d + e4880d4 commit e55883f
Show file tree
Hide file tree
Showing 8 changed files with 50 additions and 41 deletions.
16 changes: 8 additions & 8 deletions CITATION.cff
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,20 @@
cff-version: 1.2.0
message: "If you use this software, please cite it using these metadata."
authors:
- affiliation: "Center for Advanced Systems Understanding (CASUS), Helmholtz-Zentrum Dresden-Rossendorf e.V. (HZDR)"
family-names: Cangi
given-names: Attila
orcid: https://orcid.org/0000-0001-9162-262X
- affiliation: "Sandia National Laboratories (SNL)"
family-names: Rajamanickam
given-names: Sivasankaran
orcid: https://orcid.org/0000-0002-5854-409X
- affiliation: "Center for Advanced Systems Understanding (CASUS), Helmholtz-Zentrum Dresden-Rossendorf e.V. (HZDR)"
family-names: Brzoza
given-names: Bartosz
- affiliation: "Center for Advanced Systems Understanding (CASUS), Helmholtz-Zentrum Dresden-Rossendorf e.V. (HZDR)"
family-names: Callow
given-names: Timothy J.
- affiliation: "Center for Advanced Systems Understanding (CASUS), Helmholtz-Zentrum Dresden-Rossendorf e.V. (HZDR)"
family-names: Cangi
given-names: Attila
orcid: https://orcid.org/0000-0001-9162-262X
- affiliation: "Oak Ridge National Laboratory (ORNL)"
family-names: Ellis
given-names: J. Austin
Expand Down Expand Up @@ -54,10 +58,6 @@ authors:
- affiliation: "Center for Advanced Systems Understanding (CASUS), Helmholtz-Zentrum Dresden-Rossendorf e.V. (HZDR)"
family-names: Pöschel
given-names: Franz
- affiliation: "Sandia National Laboratories (SNL)"
family-names: Rajamanickam
given-names: Sivasankaran
orcid: https://orcid.org/0000-0002-5854-409X
- affiliation: "Nvidia Corporation"
family-names: Romero
given-names: Josh
Expand Down
Binary file modified docs/source/img/logos/mala_favicon.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified docs/source/img/logos/mala_horizontal.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified docs/source/img/logos/mala_horizontal_white.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified docs/source/img/logos/mala_vertical.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
34 changes: 21 additions & 13 deletions mala/common/parameters.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@
import pickle
from time import sleep

horovod_available = False
try:
import horovod.torch as hvd
horovod_available = True
except ModuleNotFoundError:
pass
import numpy as np
Expand Down Expand Up @@ -732,7 +734,7 @@ def __init__(self):
self.use_mixed_precision = False
self.use_graphs = False
self.training_report_frequency = 1000
self.profiler_range = [1000, 2000]
self.profiler_range = None #[1000, 2000]

def _update_horovod(self, new_horovod):
super(ParametersRunning, self)._update_horovod(new_horovod)
Expand Down Expand Up @@ -1257,19 +1259,25 @@ def use_horovod(self):

@use_horovod.setter
def use_horovod(self, value):
if value:
hvd.init()
if value is False:
self._use_horovod = False
else:
if horovod_available:
hvd.init()
# Invalidate, will be updated in setter.
set_horovod_status(value)
self.device = None
self._use_horovod = value
self.network._update_horovod(self.use_horovod)
self.descriptors._update_horovod(self.use_horovod)
self.targets._update_horovod(self.use_horovod)
self.data._update_horovod(self.use_horovod)
self.running._update_horovod(self.use_horovod)
self.hyperparameters._update_horovod(self.use_horovod)
else:
parallel_warn("Horovod requested, but not installed found. "
"MALA will operate without horovod only.")

# Invalidate, will be updated in setter.
set_horovod_status(value)
self.device = None
self._use_horovod = value
self.network._update_horovod(self.use_horovod)
self.descriptors._update_horovod(self.use_horovod)
self.targets._update_horovod(self.use_horovod)
self.data._update_horovod(self.use_horovod)
self.running._update_horovod(self.use_horovod)
self.hyperparameters._update_horovod(self.use_horovod)

@property
def device(self):
Expand Down
2 changes: 1 addition & 1 deletion mala/network/tester.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ def __calculate_observable_error(self, snapshot_number, observable,
target_calculator.read_from_array(predicted_target)
predicted = target_calculator.band_energy
return [actual, predicted,
target_calculator.total_energy_dft_calculation]
target_calculator.band_energy_dft_calculation]

elif observable == "number_of_electrons":
target_calculator = self.data.target_calculator
Expand Down
39 changes: 20 additions & 19 deletions mala/network/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,17 +279,18 @@ def train_network(self):
self.data.training_data_sets[0].shuffle()

if self.parameters._configuration["gpu"]:
torch.cuda.synchronize()
torch.cuda.synchronize(self.parameters._configuration["device"])
tsample = time.time()
t0 = time.time()
batchid = 0
for loader in self.training_data_loaders:
for (inputs, outputs) in loader:

if batchid == self.parameters.profiler_range[0]:
torch.cuda.profiler.start()
if batchid == self.parameters.profiler_range[1]:
torch.cuda.profiler.stop()
if self.parameters.profiler_range is not None:
if batchid == self.parameters.profiler_range[0]:
torch.cuda.profiler.start()
if batchid == self.parameters.profiler_range[1]:
torch.cuda.profiler.stop()

torch.cuda.nvtx.range_push(f"step {batchid}")

Expand All @@ -309,7 +310,7 @@ def train_network(self):
training_loss_sum += loss

if batchid != 0 and (batchid + 1) % self.parameters.training_report_frequency == 0:
torch.cuda.synchronize()
torch.cuda.synchronize(self.parameters._configuration["device"])
sample_time = time.time() - tsample
avg_sample_time = sample_time / self.parameters.training_report_frequency
avg_sample_tput = self.parameters.training_report_frequency * inputs.shape[0] / sample_time
Expand All @@ -319,14 +320,14 @@ def train_network(self):
min_verbosity=2)
tsample = time.time()
batchid += 1
torch.cuda.synchronize()
torch.cuda.synchronize(self.parameters._configuration["device"])
t1 = time.time()
printout(f"training time: {t1 - t0}", min_verbosity=2)

training_loss = training_loss_sum.item() / batchid

# Calculate the validation loss and output it.
torch.cuda.synchronize()
torch.cuda.synchronize(self.parameters._configuration["device"])
else:
batchid = 0
for loader in self.training_data_loaders:
Expand Down Expand Up @@ -375,14 +376,14 @@ def train_network(self):
self.tensor_board.close()

if self.parameters._configuration["gpu"]:
torch.cuda.synchronize()
torch.cuda.synchronize(self.parameters._configuration["device"])

# Mix the DataSets up (this function only does something
# in the lazy loading case).
if self.parameters.use_shuffling_for_samplers:
self.data.mix_datasets()
if self.parameters._configuration["gpu"]:
torch.cuda.synchronize()
torch.cuda.synchronize(self.parameters._configuration["device"])

# If a scheduler is used, update it.
if self.scheduler is not None:
Expand Down Expand Up @@ -636,8 +637,8 @@ def __process_mini_batch(self, network, input_data, target_data):
if self.parameters._configuration["gpu"]:
if self.parameters.use_graphs and self.train_graph is None:
printout("Capturing CUDA graph for training.", min_verbosity=2)
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
s = torch.cuda.Stream(self.parameters._configuration["device"])
s.wait_stream(torch.cuda.current_stream(self.parameters._configuration["device"]))
# Warmup for graphs
with torch.cuda.stream(s):
for _ in range(20):
Expand All @@ -651,7 +652,7 @@ def __process_mini_batch(self, network, input_data, target_data):
self.gradscaler.scale(loss).backward()
else:
loss.backward()
torch.cuda.current_stream().wait_stream(s)
torch.cuda.current_stream(self.parameters._configuration["device"]).wait_stream(s)

# Create static entry point tensors to graph
self.static_input_data = torch.empty_like(input_data)
Expand Down Expand Up @@ -742,7 +743,7 @@ def __validate_network(self, network, data_set_type, validation_type):
with torch.no_grad():
if self.parameters._configuration["gpu"]:
report_freq = self.parameters.training_report_frequency
torch.cuda.synchronize()
torch.cuda.synchronize(self.parameters._configuration["device"])
tsample = time.time()
batchid = 0
for loader in data_loaders:
Expand All @@ -754,15 +755,15 @@ def __validate_network(self, network, data_set_type, validation_type):

if self.parameters.use_graphs and self.validation_graph is None:
printout("Capturing CUDA graph for validation.", min_verbosity=2)
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
s = torch.cuda.Stream(self.parameters._configuration["device"])
s.wait_stream(torch.cuda.current_stream(self.parameters._configuration["device"]))
# Warmup for graphs
with torch.cuda.stream(s):
for _ in range(20):
with torch.cuda.amp.autocast(enabled=self.parameters.use_mixed_precision):
prediction = network(x)
loss = network.calculate_loss(prediction, y)
torch.cuda.current_stream().wait_stream(s)
torch.cuda.current_stream(self.parameters._configuration["device"]).wait_stream(s)

# Create static entry point tensors to graph
self.static_input_validation = torch.empty_like(x)
Expand All @@ -786,7 +787,7 @@ def __validate_network(self, network, data_set_type, validation_type):
loss = network.calculate_loss(prediction, y)
validation_loss_sum += loss
if batchid != 0 and (batchid + 1) % report_freq == 0:
torch.cuda.synchronize()
torch.cuda.synchronize(self.parameters._configuration["device"])
sample_time = time.time() - tsample
avg_sample_time = sample_time / report_freq
avg_sample_tput = report_freq * x.shape[0] / sample_time
Expand All @@ -796,7 +797,7 @@ def __validate_network(self, network, data_set_type, validation_type):
min_verbosity=2)
tsample = time.time()
batchid += 1
torch.cuda.synchronize()
torch.cuda.synchronize(self.parameters._configuration["device"])
else:
batchid = 0
for loader in data_loaders:
Expand Down

0 comments on commit e55883f

Please sign in to comment.