From c0e3a6135d08659cdfaed88e932636a32d25daa5 Mon Sep 17 00:00:00 2001 From: Charles Gaydon Date: Tue, 19 Dec 2023 16:32:27 +0100 Subject: [PATCH 1/4] Configuration for 2x3 GPUs using mixed precision --- ...et_base_run_FR-2x3GPUs-MixedPrecision.yaml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100755 configs/experiment/RandLaNet_base_run_FR-2x3GPUs-MixedPrecision.yaml diff --git a/configs/experiment/RandLaNet_base_run_FR-2x3GPUs-MixedPrecision.yaml b/configs/experiment/RandLaNet_base_run_FR-2x3GPUs-MixedPrecision.yaml new file mode 100755 index 00000000..5065a335 --- /dev/null +++ b/configs/experiment/RandLaNet_base_run_FR-2x3GPUs-MixedPrecision.yaml @@ -0,0 +1,19 @@ +# @package _global_ +defaults: + - RandLaNet_base_run_FR.yaml + +logger: + comet: + experiment_name: "RandLaNet_base_run_FR-2x3GPUs-MixedPrecision" + + +# 2 nodes x 3 GPUs - No gradient accumulation. +# This is equivalent to training with 2 GPUs with gradients accumulated 3 times. +# Consider trying precision=bf16 once version of pytorch/pytorch-geometric/pytorch-scatter are updated. 
+trainer: + strategy: ddp_find_unused_parameters_false + accelerator: gpu + num_nodes: 2 + gpus: 3 + accumulate_grad_batches: 1 + precision: 16 From bab3d172135eca6603f3b484562159a42ef6e946 Mon Sep 17 00:00:00 2001 From: Charles Gaydon Date: Wed, 20 Dec 2023 10:09:48 +0100 Subject: [PATCH 2/4] Update comments in config of multi gpu --- configs/experiment/RandLaNet_base_run_FR-MultiGPU.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/experiment/RandLaNet_base_run_FR-MultiGPU.yaml b/configs/experiment/RandLaNet_base_run_FR-MultiGPU.yaml index 5a9e8727..f5664212 100755 --- a/configs/experiment/RandLaNet_base_run_FR-MultiGPU.yaml +++ b/configs/experiment/RandLaNet_base_run_FR-MultiGPU.yaml @@ -4,11 +4,11 @@ defaults: logger: comet: - experiment_name: "Pyg RandLaNet - FR Data - 2xGPUs" + experiment_name: "RandLaNet_base_run_FR-2xGPUs" trainer: strategy: ddp_find_unused_parameters_false - # Replace by gpu to simulate multi-gpus training. + # Replace with cpu to simulate multi-CPU training. 
accelerator: gpu num_processes: 2 gpus: 2 From 743850fb691217bcbfdda42452f69a48352b28be Mon Sep 17 00:00:00 2001 From: Charles Gaydon Date: Wed, 20 Dec 2023 10:11:30 +0100 Subject: [PATCH 3/4] Remove mention of MixedPrecision since it does not bring any speed improvement --- ...ixedPrecision.yaml => RandLaNet_base_run_FR-2x3GPUs.yaml} | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) rename configs/experiment/{RandLaNet_base_run_FR-2x3GPUs-MixedPrecision.yaml => RandLaNet_base_run_FR-2x3GPUs.yaml} (64%) diff --git a/configs/experiment/RandLaNet_base_run_FR-2x3GPUs-MixedPrecision.yaml b/configs/experiment/RandLaNet_base_run_FR-2x3GPUs.yaml similarity index 64% rename from configs/experiment/RandLaNet_base_run_FR-2x3GPUs-MixedPrecision.yaml rename to configs/experiment/RandLaNet_base_run_FR-2x3GPUs.yaml index 5065a335..888ec903 100755 --- a/configs/experiment/RandLaNet_base_run_FR-2x3GPUs-MixedPrecision.yaml +++ b/configs/experiment/RandLaNet_base_run_FR-2x3GPUs.yaml @@ -4,16 +4,15 @@ defaults: logger: comet: - experiment_name: "RandLaNet_base_run_FR-2x3GPUs-MixedPrecision" + experiment_name: "RandLaNet_base_run_FR-2x3GPUs" # 2 nodes x 3 GPUs - No gradient accumulation. # This is equivalent to training with 2 GPUs with gradients accumulated 3 times. -# Consider trying precision=bf16 once version of pytorch/pytorch-geometric/pytorch-scatter are updated. +# Setting precision=16 did not bring any speed improvement for Lidar HD data with the RandLa-Net model. 
trainer: strategy: ddp_find_unused_parameters_false accelerator: gpu num_nodes: 2 gpus: 3 accumulate_grad_batches: 1 - precision: 16 From f52d11c7dff4721cc1f2d3b9f2bc3b39cc006d89 Mon Sep 17 00:00:00 2001 From: Charles Gaydon <11660435+CharlesGaydon@users.noreply.github.com> Date: Tue, 2 Jan 2024 17:42:29 +0100 Subject: [PATCH 4/4] Update RandLaNet_base_run_FR-2x3GPUs.yaml Prefer using trainer.devices to trainer.gpus --- configs/experiment/RandLaNet_base_run_FR-2x3GPUs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/experiment/RandLaNet_base_run_FR-2x3GPUs.yaml b/configs/experiment/RandLaNet_base_run_FR-2x3GPUs.yaml index 888ec903..03c8511e 100755 --- a/configs/experiment/RandLaNet_base_run_FR-2x3GPUs.yaml +++ b/configs/experiment/RandLaNet_base_run_FR-2x3GPUs.yaml @@ -14,5 +14,5 @@ trainer: strategy: ddp_find_unused_parameters_false accelerator: gpu num_nodes: 2 - gpus: 3 + devices: 3 accumulate_grad_batches: 1