From bb146c333c535b03e863ef8588ff5071f894381e Mon Sep 17 00:00:00 2001 From: Liran Bachar Date: Mon, 20 May 2024 15:50:20 +0300 Subject: [PATCH] CPUAdam fp16 and bf16 support (#5409) Hi. Please review the following changes I added support for BF16 to cpu adam. BF16, FP16 and float are supported at compilation time. the correct template is called at runtime according to input params dtype. --------- Co-authored-by: Olatunji Ruwase --- csrc/adagrad/cpu_adagrad.cpp | 207 +++++++---------- csrc/adam/cpu_adam.cpp | 3 - csrc/adam/cpu_adam_impl.cpp | 254 ++++++++------------- csrc/common/custom_cuda_kernel.cu | 44 ---- csrc/includes/cpu_adagrad.h | 150 +++--------- csrc/includes/cpu_adam.h | 173 +++----------- csrc/includes/cpu_lion.h | 162 +++---------- csrc/includes/custom_cuda_layers.h | 3 - csrc/includes/simd.h | 136 +++++++++-- csrc/lion/cpu_lion.cpp | 3 - csrc/lion/cpu_lion_impl.cpp | 211 +++++++---------- deepspeed/ops/adagrad/cpu_adagrad.py | 17 +- deepspeed/ops/adam/cpu_adam.py | 23 +- deepspeed/ops/lion/cpu_lion.py | 20 +- op_builder/builder.py | 1 + op_builder/cpu/builder.py | 6 +- op_builder/cpu_adagrad.py | 20 +- op_builder/cpu_adam.py | 21 +- op_builder/cpu_lion.py | 25 +- op_builder/hpu/builder.py | 6 +- op_builder/hpu/cpu_adam.py | 5 - tests/perf/adam_test1.py | 4 +- tests/unit/common.py | 8 +- tests/unit/ops/adagrad/test_cpu_adagrad.py | 4 +- tests/unit/ops/adam/test_cpu_adam.py | 33 +-- tests/unit/ops/adam/test_hybrid_adam.py | 6 +- tests/unit/ops/lion/test_cpu_lion.py | 6 +- 27 files changed, 530 insertions(+), 1021 deletions(-) delete mode 100644 csrc/common/custom_cuda_kernel.cu diff --git a/csrc/adagrad/cpu_adagrad.cpp b/csrc/adagrad/cpu_adagrad.cpp index 563255176500..5790e79e2bc2 100644 --- a/csrc/adagrad/cpu_adagrad.cpp +++ b/csrc/adagrad/cpu_adagrad.cpp @@ -5,55 +5,38 @@ #include "cpu_adagrad.h" #include +#include #include +#include #include #include #include -#if defined(__ENABLE_CUDA__) -#include -#include "cublas_v2.h" -#include "cuda.h" -#include "curand.h" -#include "custom_cuda_layers.h" -#endif +using namespace std::string_literals; static std::unordered_map> s_optimizers; // C++ interface -void Adagrad_Optimizer::Step_1(float* _params, - float* grads, - float* _exp_avg_sq, - size_t _param_size, - ds_half_precision_t* dev_params, - bool half_precision) +template +void Adagrad_Optimizer::Step_1(ds_params_percision_t* _params, + ds_params_percision_t* grads, + ds_state_precision_t* _exp_avg_sq, + size_t _param_size) { size_t rounded_size = 0; #if defined(__AVX512__) or defined(__AVX256__) - Step_AVX<1>( - &rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision); + Step_AVX<1>(&rounded_size, _params, grads, _exp_avg_sq, _param_size); #endif if (_param_size > rounded_size) { float step_size = -1 * _alpha; - ds_half_precision_t* grads_cast_h; - ds_half_precision_t* params_cast_h; - if (half_precision) { - grads_cast_h = reinterpret_cast(grads); - params_cast_h = reinterpret_cast(_params); - } for (size_t t = rounded_size; t < _param_size; t += TILE) { size_t copy_size = TILE; if ((t + TILE) > _param_size) copy_size = _param_size - t; size_t offset = copy_size + t; -#if defined(__ENABLE_CUDA__) - if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); } -#elif defined(__ENABLE_CANN__) - if ((t / TILE) >= 2) { aclrtSynchronizeStream(_streams[_buf_index].stream()); } -#endif #pragma omp parallel for for (size_t k = t; k < offset; k++) { - float grad = half_precision ? 
(float)grads_cast_h[k] : grads[k]; - float param = half_precision ? (float)params_cast_h[k] : _params[k]; + float grad = (float)grads[k]; + float param = (float)_params[k]; float momentum = grads[k]; float variance = _exp_avg_sq[k]; if (_weight_decay > 0) { grad = param * _weight_decay + grad; } @@ -64,58 +47,30 @@ void Adagrad_Optimizer::Step_1(float* _params, grad += _eps; grad = momentum / grad; param = grad * step_size + param; -#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__) - if (dev_params) _doubled_buffer[_buf_index][k - t] = param; -#endif - if (half_precision) - params_cast_h[k] = (ds_half_precision_t)param; - else - _params[k] = param; + _params[k] = param; // STORE UPDATE TERM TO GRAD'S MEMORY grads[k] = grad * step_size; _exp_avg_sq[k] = variance; } -#if defined(__ENABLE_CUDA__) - if (dev_params) { - launch_param_update( - _doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]); - _buf_index = !_buf_index; - } -#elif defined(__ENABLE_CANN__) - if (dev_params) { - size_t memcpy_size = copy_size * sizeof(_doubled_buffer[_buf_index][0]); - aclrtMemcpy(dev_params + t, - memcpy_size, - _doubled_buffer[_buf_index], - memcpy_size, - aclrtMemcpyKind::ACL_MEMCPY_HOST_TO_DEVICE); - - _buf_index = !_buf_index; - } -#endif } } } -void Adagrad_Optimizer::Step_4(float* _params, - float* grads, - float* _exp_avg_sq, - size_t _param_size, - ds_half_precision_t* dev_params, - bool half_precision) +template +void Adagrad_Optimizer::Step_4(ds_params_percision_t* _params, + ds_params_percision_t* grads, + ds_state_precision_t* _exp_avg_sq, + size_t _param_size) { size_t rounded_size = 0; #if defined(__AVX512__) or defined(__AVX256__) - Step_AVX<4>( - &rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision); + Step_AVX<4>(&rounded_size, _params, grads, _exp_avg_sq, _param_size); #endif if (_param_size > rounded_size) Step_1((_params + rounded_size), (grads + rounded_size), (_exp_avg_sq + rounded_size), - (_param_size - rounded_size), - (dev_params != nullptr ? (dev_params + rounded_size) : dev_params), - half_precision); + (_param_size - rounded_size)); } int create_adagrad_optimizer(int optimizer_id, @@ -149,25 +104,77 @@ int create_adagrad_optimizer(int optimizer_id, return 0; } -void Adagrad_Optimizer::Step_8(float* _params, - float* grads, - float* _exp_avg_sq, - size_t _param_size, - ds_half_precision_t* dev_params, - bool half_precision) +template +void Adagrad_Optimizer::Step_8(ds_params_percision_t* _params, + ds_params_percision_t* grads, + ds_state_precision_t* _exp_avg_sq, + size_t _param_size) { size_t rounded_size = 0; #if defined(__AVX512__) or defined(__AVX256__) - Step_AVX<8>( - &rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision); + Step_AVX<8>(&rounded_size, _params, grads, _exp_avg_sq, _param_size); #endif if (_param_size > rounded_size) Step_4((_params + rounded_size), (grads + rounded_size), (_exp_avg_sq + rounded_size), - (_param_size - rounded_size), - (dev_params != nullptr ? 
(dev_params + rounded_size) : dev_params), - half_precision); + (_param_size - rounded_size)); +} + +template +void step_invoker(std::shared_ptr opt, + void* _params, + void* grads, + void* _exp_avg_sq, + size_t _param_size) +{ + opt->Step_8((ds_params_percision_t*)(_params), + (ds_params_percision_t*)(grads), + (ds_state_precision_t*)(_exp_avg_sq), + _param_size); +} + +std::map, + std::function, void*, void*, void*, size_t)>> + invokers; + +// Fill map with template functions for each type +template +void create_invoker() +{ + invokers[std::tuple(c10::CppTypeToScalarType(), + c10::CppTypeToScalarType())] = + step_invoker; +} +struct InvokerInitializer { + InvokerInitializer() + { + create_invoker(); + create_invoker(); + create_invoker(); + create_invoker(); + create_invoker(); + } +} _invoker_initializer; + +void invoke(std::shared_ptr opt, + torch::Tensor& params, + torch::Tensor& grads, + torch::Tensor& exp_avg_sq, + size_t param_size) +{ + c10::ScalarType params_type = at::typeMetaToScalarType(params.options().dtype()); + c10::ScalarType state_type = at::typeMetaToScalarType(exp_avg_sq.options().dtype()); + + auto it = invokers.find(std::tuple(params_type, state_type)); + if (it == invokers.end()) { + throw std::runtime_error("Adagrad optimizer with param type "s + + c10::toString(params_type) + " and state type "s + + c10::toString(state_type) + + " is not supported on current hardware"s); + } + + it->second(opt, params.data_ptr(), grads.data_ptr(), exp_avg_sq.data_ptr(), param_size); } int ds_adagrad_step(int optimizer_id, @@ -183,58 +190,13 @@ int ds_adagrad_step(int optimizer_id, auto grads_c = grads.contiguous(); auto exp_avg_sq_c = exp_avg_sq.contiguous(); - float* params_ptr = (float*)params_c.data_ptr(); - float* grads_ptr = (float*)grads_c.data_ptr(); - float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr(); - std::shared_ptr opt = std::static_pointer_cast(s_optimizers[optimizer_id]); opt->IncrementStep(step); opt->update_state(lr, epsilon, weight_decay); - opt->Step_8(params_ptr, grads_ptr, exp_avg_sq_ptr, params_c.numel()); -#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__) - opt->SynchronizeStreams(); -#endif - return 0; -} + invoke(opt, params_c, grads_c, exp_avg_sq_c, params_c.numel()); -int ds_adagrad_step_plus_copy(int optimizer_id, - size_t step, - float lr, - float epsilon, - float weight_decay, - torch::Tensor& params, - torch::Tensor& grads, - torch::Tensor& exp_avg_sq, - torch::Tensor& gpu_params) -{ -#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__) - auto params_c = params.contiguous(); - auto gpu_params_c = gpu_params.contiguous(); - auto exp_avg_sq_c = exp_avg_sq.contiguous(); - auto grads_c = grads.contiguous(); - - float* params_ptr = (float*)params_c.data_ptr(); - float* grads_ptr = (float*)grads_c.data_ptr(); - ds_half_precision_t* gpu_params_ptr = (ds_half_precision_t*)gpu_params_c.data_ptr(); - float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr(); - - std::shared_ptr opt = - std::static_pointer_cast(s_optimizers[optimizer_id]); - opt->IncrementStep(step); - opt->update_state(lr, epsilon, weight_decay); - opt->Step_8(params_ptr, - grads_ptr, - exp_avg_sq_ptr, - params_c.numel(), - gpu_params_ptr, - (params.options().dtype() == at::kHalf)); - - opt->SynchronizeStreams(); -#else - assert(false); -#endif return 0; } @@ -248,9 +210,6 @@ int destroy_adagrad_optimizer(int optimizer_id) PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("adagrad_update", &ds_adagrad_step, "DeepSpeed CPU Adagrad update (C++)"); - m.def("adagrad_update_copy", - 
&ds_adagrad_step_plus_copy, - "DeepSpeed CPU Adagrad update and param copy (C++)"); m.def("create_adagrad", &create_adagrad_optimizer, "DeepSpeed CPU Adagrad (C++)"); m.def("destroy_adagrad", &destroy_adagrad_optimizer, "DeepSpeed CPU Adagrad destroy (C++)"); } diff --git a/csrc/adam/cpu_adam.cpp b/csrc/adam/cpu_adam.cpp index 96809827f3e1..263c443cb4d4 100644 --- a/csrc/adam/cpu_adam.cpp +++ b/csrc/adam/cpu_adam.cpp @@ -8,9 +8,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("adam_update", &ds_adam_step, "DeepSpeed CPU Adam update (C++)"); - m.def("adam_update_copy", - &ds_adam_step_plus_copy, - "DeepSpeed CPU Adam update and param copy (C++)"); m.def("create_adam", &create_adam_optimizer, "DeepSpeed CPU Adam (C++)"); m.def("destroy_adam", &destroy_adam_optimizer, "DeepSpeed CPU Adam destroy (C++)"); } diff --git a/csrc/adam/cpu_adam_impl.cpp b/csrc/adam/cpu_adam_impl.cpp index 9a4a8d956519..15d4e74d69d5 100644 --- a/csrc/adam/cpu_adam_impl.cpp +++ b/csrc/adam/cpu_adam_impl.cpp @@ -5,42 +5,29 @@ #include #include +#include #include +#include #include #include #include #include "cpu_adam.h" -#if defined(__ENABLE_CUDA__) -#include -#include "cublas_v2.h" -#include "cuda.h" -#include "curand.h" -#include "custom_cuda_layers.h" -#endif - +using namespace std::string_literals; static std::unordered_map> s_optimizers; // C++ interface -void Adam_Optimizer::Step_1(float* _params, - float* grads, - float* _exp_avg, - float* _exp_avg_sq, - size_t _param_size, - ds_half_precision_t* dev_params, - bool half_precision) +template +void Adam_Optimizer::Step_1(ds_params_percision_t* _params, + ds_params_percision_t* grads, + ds_state_precision_t* _exp_avg, + ds_state_precision_t* _exp_avg_sq, + size_t _param_size) { size_t rounded_size = 0; #if defined(__AVX512__) or defined(__AVX256__) - Step_AVX<1>(&rounded_size, - _params, - grads, - _exp_avg, - _exp_avg_sq, - _param_size, - dev_params, - half_precision); + Step_AVX<1>(&rounded_size, _params, grads, _exp_avg, _exp_avg_sq, _param_size); #endif if (_param_size > rounded_size) { float betta1_minus1 = 1 - _betta1; @@ -48,26 +35,15 @@ void Adam_Optimizer::Step_1(float* _params, float step_size = -1 * _alpha / _bias_correction1; float w_decay = -1 * _alpha * _weight_decay; - ds_half_precision_t* grads_cast_h; - ds_half_precision_t* params_cast_h; - if (half_precision) { - grads_cast_h = reinterpret_cast(grads); - params_cast_h = reinterpret_cast(_params); - } for (size_t t = rounded_size; t < _param_size; t += TILE) { size_t copy_size = TILE; if ((t + TILE) > _param_size) copy_size = _param_size - t; size_t offset = copy_size + t; -#if defined(__ENABLE_CUDA__) - if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); } -#elif defined(__ENABLE_CANN__) - if ((t / TILE) >= 2) { aclrtSynchronizeStream(_streams[_buf_index].stream()); } -#endif #pragma omp parallel for for (size_t k = t; k < offset; k++) { - float grad = half_precision ? (float)grads_cast_h[k] : grads[k]; - float param = half_precision ? 
(float)params_cast_h[k] : _params[k]; + float grad = (float)grads[k]; + float param = (float)_params[k]; float momentum = _exp_avg[k]; float variance = _exp_avg_sq[k]; if (_weight_decay > 0 && !_adamw_mode) { grad = param * _weight_decay + grad; } @@ -83,66 +59,31 @@ void Adam_Optimizer::Step_1(float* _params, grad = momentum / grad; if (_weight_decay > 0 && _adamw_mode) { param += w_decay * param; } param = grad * step_size + param; -#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__) - if (dev_params) _doubled_buffer[_buf_index][k - t] = param; -#endif - if (half_precision) - params_cast_h[k] = (ds_half_precision_t)param; - else - _params[k] = param; + _params[k] = param; _exp_avg[k] = momentum; _exp_avg_sq[k] = variance; } -#if defined(__ENABLE_CUDA__) - if (dev_params) { - launch_param_update( - _doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]); - - _buf_index = !_buf_index; - } -#elif defined(__ENABLE_CANN__) - if (dev_params) { - size_t memcpy_size = copy_size * sizeof(_doubled_buffer[_buf_index][0]); - aclrtMemcpy(dev_params + t, - memcpy_size, - _doubled_buffer[_buf_index], - memcpy_size, - aclrtMemcpyKind::ACL_MEMCPY_HOST_TO_DEVICE); - - _buf_index = !_buf_index; - } -#endif } } } -void Adam_Optimizer::Step_4(float* _params, - float* grads, - float* _exp_avg, - float* _exp_avg_sq, - size_t _param_size, - ds_half_precision_t* dev_params, - bool half_precision) +template +void Adam_Optimizer::Step_4(ds_params_percision_t* _params, + ds_params_percision_t* grads, + ds_state_precision_t* _exp_avg, + ds_state_precision_t* _exp_avg_sq, + size_t _param_size) { size_t rounded_size = 0; #if defined(__AVX512__) or defined(__AVX256__) - Step_AVX<4>(&rounded_size, - _params, - grads, - _exp_avg, - _exp_avg_sq, - _param_size, - dev_params, - half_precision); + Step_AVX<4>(&rounded_size, _params, grads, _exp_avg, _exp_avg_sq, _param_size); #endif if (_param_size > rounded_size) Step_1((_params + rounded_size), (grads + rounded_size), (_exp_avg + rounded_size), (_exp_avg_sq + rounded_size), - (_param_size - rounded_size), - (dev_params != nullptr ? (dev_params + rounded_size) : dev_params), - half_precision); + (_param_size - rounded_size)); } int create_adam_optimizer(int optimizer_id, @@ -185,33 +126,86 @@ int create_adam_optimizer(int optimizer_id, return 0; } -void Adam_Optimizer::Step_8(float* _params, - float* grads, - float* _exp_avg, - float* _exp_avg_sq, - size_t _param_size, - ds_half_precision_t* dev_params, - bool half_precision) +template +void Adam_Optimizer::Step_8(ds_params_percision_t* _params, + ds_params_percision_t* grads, + ds_state_precision_t* _exp_avg, + ds_state_precision_t* _exp_avg_sq, + size_t _param_size) { size_t rounded_size = 0; #if defined(__AVX512__) or defined(__AVX256__) - Step_AVX<8>(&rounded_size, - _params, - grads, - _exp_avg, - _exp_avg_sq, - _param_size, - dev_params, - half_precision); + Step_AVX<8>(&rounded_size, _params, grads, _exp_avg, _exp_avg_sq, _param_size); #endif if (_param_size > rounded_size) Step_4((_params + rounded_size), (grads + rounded_size), (_exp_avg + rounded_size), (_exp_avg_sq + rounded_size), - (_param_size - rounded_size), - (dev_params != nullptr ? 
(dev_params + rounded_size) : dev_params), - half_precision); + (_param_size - rounded_size)); +} + +template +void step_invoker(std::shared_ptr opt, + void* _params, + void* grads, + void* _exp_avg, + void* _exp_avg_sq, + size_t _param_size) +{ + opt->Step_8((ds_params_percision_t*)(_params), + (ds_params_percision_t*)(grads), + (ds_state_precision_t*)(_exp_avg), + (ds_state_precision_t*)(_exp_avg_sq), + _param_size); +} + +std::map, + std::function, void*, void*, void*, void*, size_t)>> + invokers; + +// Fill map with template functions for each type +template +void create_invoker() +{ + invokers[std::tuple(c10::CppTypeToScalarType(), + c10::CppTypeToScalarType())] = + step_invoker; +} +struct InvokerInitializer { + InvokerInitializer() + { + create_invoker(); + create_invoker(); + create_invoker(); + create_invoker(); + create_invoker(); + } +} _invoker_initializer; + +void invoke(std::shared_ptr opt, + torch::Tensor& params, + torch::Tensor& grads, + torch::Tensor& exp_avg, + torch::Tensor& exp_avg_sq, + size_t param_size) +{ + c10::ScalarType params_type = at::typeMetaToScalarType(params.options().dtype()); + c10::ScalarType state_type = at::typeMetaToScalarType(exp_avg.options().dtype()); + + auto it = invokers.find(std::tuple(params_type, state_type)); + if (it == invokers.end()) { + throw std::runtime_error("Adam optimizer with param type "s + c10::toString(params_type) + + " and state type "s + c10::toString(state_type) + + " is not supported on current hardware"s); + } + + it->second(opt, + params.data_ptr(), + grads.data_ptr(), + exp_avg.data_ptr(), + exp_avg_sq.data_ptr(), + param_size); } int ds_adam_step(int optimizer_id, @@ -232,75 +226,13 @@ int ds_adam_step(int optimizer_id, auto exp_avg_c = exp_avg.contiguous(); auto exp_avg_sq_c = exp_avg_sq.contiguous(); - // assert(params.options().dtype() == grads.options().dtype()); - - float* params_ptr = (float*)params_c.data_ptr(); - float* grads_ptr = (float*)grads_c.data_ptr(); - float* exp_avg_ptr = (float*)exp_avg_c.data_ptr(); - float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr(); - std::shared_ptr opt = std::static_pointer_cast(s_optimizers[optimizer_id]); opt->IncrementStep(step, beta1, beta2); opt->update_state(lr, epsilon, weight_decay, bias_correction); - opt->Step_8(params_ptr, - grads_ptr, - exp_avg_ptr, - exp_avg_sq_ptr, - params_c.numel(), - nullptr, - (params.options().dtype() == at::kHalf)); + invoke(opt, params_c, grads_c, exp_avg_c, exp_avg_sq_c, params_c.numel()); -#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__) - opt->SynchronizeStreams(); -#endif - return 0; -} - -int ds_adam_step_plus_copy(int optimizer_id, - size_t step, - float lr, - float beta1, - float beta2, - float epsilon, - float weight_decay, - bool bias_correction, - torch::Tensor& params, - torch::Tensor& grads, - torch::Tensor& exp_avg, - torch::Tensor& exp_avg_sq, - torch::Tensor& device_params) -{ -#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__) - auto params_c = params.contiguous(); - auto device_params_c = device_params.contiguous(); - auto exp_avg_c = exp_avg.contiguous(); - auto exp_avg_sq_c = exp_avg_sq.contiguous(); - auto grads_c = grads.contiguous(); - - float* params_ptr = (float*)params_c.data_ptr(); - float* grads_ptr = (float*)grads_c.data_ptr(); - ds_half_precision_t* device_params_ptr = (ds_half_precision_t*)device_params_c.data_ptr(); - float* exp_avg_ptr = (float*)exp_avg_c.data_ptr(); - float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr(); - - std::shared_ptr opt = - 
std::static_pointer_cast(s_optimizers[optimizer_id]); - opt->IncrementStep(step, beta1, beta2); - opt->update_state(lr, epsilon, weight_decay, bias_correction); - opt->Step_8(params_ptr, - grads_ptr, - exp_avg_ptr, - exp_avg_sq_ptr, - params_c.numel(), - device_params_ptr, - (params.options().dtype() == at::kHalf)); - - opt->SynchronizeStreams(); -#else - assert(false); -#endif return 0; } diff --git a/csrc/common/custom_cuda_kernel.cu b/csrc/common/custom_cuda_kernel.cu deleted file mode 100644 index f46bf303125c..000000000000 --- a/csrc/common/custom_cuda_kernel.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// SPDX-License-Identifier: Apache-2.0 - -// DeepSpeed Team - -#include "custom_cuda_layers.h" - -__global__ void param_update_kernel(const float* input, __half* output, int size) -{ - int id = blockIdx.x * blockDim.x + threadIdx.x; - - if (id < size) { output[id] = (__half)input[id]; } -} - -void launch_param_update(const float* input, __half* output, int size, cudaStream_t stream) -{ - int threads = 1024; - - dim3 grid_dim((size - 1) / threads + 1); - dim3 block_dim(threads); - - param_update_kernel<<>>(input, output, size); -} - -__global__ void param_update_kernel_half(const float* input, __half* output, int size) -{ - int id = blockIdx.x * blockDim.x + threadIdx.x; - __half2* output_cast = reinterpret_cast<__half2*>(output); - if (id < size) { - float input_f = input[id]; - __half2* input_h = reinterpret_cast<__half2*>(&input_f); - output_cast[id] = *input_h; - } -} - -void launch_param_update_half(const float* input, __half* output, int size, cudaStream_t stream) -{ - int threads = 1024; - size /= 2; - dim3 grid_dim((size - 1) / threads + 1); - dim3 block_dim(threads); - - param_update_kernel_half<<>>(input, output, size); -} diff --git a/csrc/includes/cpu_adagrad.h b/csrc/includes/cpu_adagrad.h index e60984d64b76..c06d3a6b35e9 100644 --- a/csrc/includes/cpu_adagrad.h +++ b/csrc/includes/cpu_adagrad.h @@ -9,84 +9,35 @@ // https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c #include +#include #include #include "simd.h" -#if defined(__ENABLE_CUDA__) -#include -#include -#include "cuda.h" -#include "custom_cuda_layers.h" -typedef __half ds_half_precision_t; -#elif defined(__ENABLE_CANN__) -#include "acl/acl.h" -#include "torch_npu/csrc/core/npu/NPUStream.h" -typedef c10::Half ds_half_precision_t; -#else -typedef unsigned short ds_half_precision_t; -#endif - -#define STEP(SPAN) \ - void Step_##SPAN(float* _params, \ - float* grads, \ - float* _exp_avg_sq, \ - size_t _param_size, \ - ds_half_precision_t* dev_param = nullptr, \ - bool half_precision = false); +#define STEP(SPAN) \ + template \ + void Step_##SPAN(ds_params_percision_t* _params, \ + ds_params_percision_t* grads, \ + ds_state_precision_t* _exp_avg_sq, \ + size_t _param_size); class Adagrad_Optimizer { public: Adagrad_Optimizer(float alpha = 1e-2, float eps = 1e-8, float weight_decay = 0) : _alpha(alpha), _eps(eps), _weight_decay(weight_decay) { -#if defined(__ENABLE_CUDA__) - cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float)); - cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float)); - - _streams[0] = TrainingContext::Instance().GetCurrentStream(); - _streams[1] = TrainingContext::Instance().GetNewStream(); - _buf_index = false; -#elif defined(__ENABLE_CANN__) - aclrtMallocHost((void**)_doubled_buffer, TILE * sizeof(float)); - aclrtMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float)); - - _buf_index = false; -#endif - } - 
~Adagrad_Optimizer() - { -#if defined(__ENABLE_CUDA__) - cudaFreeHost(_doubled_buffer[0]); - cudaFreeHost(_doubled_buffer[1]); -#elif defined(__ENABLE_CANN__) - aclrtFreeHost(_doubled_buffer[0]); - aclrtFreeHost(_doubled_buffer[1]); -#endif } + ~Adagrad_Optimizer() {} #if defined(__AVX512__) or defined(__AVX256__) - template + template void Step_AVX(size_t* rounded_size, - float* _params, - float* grads, - float* _exp_avg_sq, - size_t param_size, - ds_half_precision_t* dev_param = nullptr, - bool half_precision = false); + ds_params_percision_t* _params, + ds_params_percision_t* grads, + ds_state_precision_t* _exp_avg_sq, + size_t param_size); #endif STEP(1) STEP(4) STEP(8) -#if defined(__ENABLE_CUDA__) - inline void SynchronizeStreams() - { - for (int i = 0; i < 2; i++) cudaStreamSynchronize(_streams[i]); - } -#elif defined(__ENABLE_CANN__) - inline void SynchronizeStreams() - { - for (int i = 0; i < 2; i++) aclrtSynchronizeStream(_streams[i].stream()); - } -#endif inline void IncrementStep(size_t step) { _step++; @@ -107,29 +58,22 @@ class Adagrad_Optimizer { float _betta1_t; float _betta2_t; size_t _step; - -#if defined(__ENABLE_CUDA__) - bool _buf_index; - float* _doubled_buffer[2]; - cudaStream_t _streams[2]; -#elif defined(__ENABLE_CANN__) - float* _doubled_buffer[2]; - c10_npu::NPUStream _streams[2] = {c10_npu::getCurrentNPUStream(), - c10_npu::getNPUStreamFromPool()}; - bool _buf_index; -#endif }; #if defined(__AVX512__) or defined(__AVX256__) -template +template void Adagrad_Optimizer::Step_AVX(size_t* rounded_size, - float* _params, - float* grads, - float* _exp_avg_sq, - size_t _param_size, - ds_half_precision_t* dev_params, - bool half_precision) + ds_params_percision_t* _params, + ds_params_percision_t* grads, + ds_state_precision_t* _exp_avg_sq, + size_t _param_size) { +#if !defined(__AVX512__) + if (std::is_same_v || + std::is_same_v) { + return; + } +#endif size_t new_rounded_size = 0; AVX_Data eps_4; eps_4.data = SIMD_SET(_eps); @@ -145,24 +89,19 @@ void Adagrad_Optimizer::Step_AVX(size_t* rounded_size, size_t copy_size = TILE; if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t; size_t offset = copy_size + t; -#if defined(__ENABLE_CUDA__) - if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); } -#elif defined(__ENABLE_CANN__) - if ((t / TILE) >= 2) { aclrtSynchronizeStream(_streams[_buf_index].stream()); } -#endif #pragma omp parallel for for (size_t i = t; i < offset; i += SIMD_WIDTH * span) { AVX_Data grad_4[span]; - simd_load(grad_4, grads + i, half_precision); + simd_load(grad_4, grads + i); AVX_Data momentum_4[span]; - simd_load(momentum_4, grads + i, false); + simd_load(momentum_4, grads + i); AVX_Data variance_4[span]; - simd_load(variance_4, _exp_avg_sq + i, false); + simd_load(variance_4, _exp_avg_sq + i); AVX_Data param_4[span]; - simd_load(param_4, _params + i, half_precision); + simd_load(param_4, _params + i); if (_weight_decay > 0) { simd_fma(grad_4, param_4, weight_decay4, grad_4); } @@ -172,38 +111,9 @@ void Adagrad_Optimizer::Step_AVX(size_t* rounded_size, simd_div(grad_4, momentum_4, grad_4); simd_fma(param_4, grad_4, step_size_4, param_4); - simd_store(_params + i, param_4, half_precision); -#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__) - if (dev_params) { - simd_store(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision); - } -#endif - simd_store(_exp_avg_sq + i, variance_4, false); - } -#if defined(__ENABLE_CUDA__) - if (dev_params) { - if (half_precision) - launch_param_update_half( - 
_doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]); - else - launch_param_update( - _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]); - - _buf_index = !_buf_index; + simd_store(_params + i, param_4); + simd_store(_exp_avg_sq + i, variance_4); } -#elif defined(__ENABLE_CANN__) - if (dev_params) { - size_t memcpy_size = copy_size * sizeof(_doubled_buffer[_buf_index][0]); - if (half_precision) memcpy_size /= 2; - aclrtMemcpy(dev_params + t, - memcpy_size, - _doubled_buffer[_buf_index], - memcpy_size, - aclrtMemcpyKind::ACL_MEMCPY_HOST_TO_DEVICE); - - _buf_index = !_buf_index; - } -#endif } *rounded_size = new_rounded_size; } diff --git a/csrc/includes/cpu_adam.h b/csrc/includes/cpu_adam.h index b1a104b2571d..faf99020aee5 100644 --- a/csrc/includes/cpu_adam.h +++ b/csrc/includes/cpu_adam.h @@ -13,29 +13,13 @@ #include #include "simd.h" -#if defined(__ENABLE_CUDA__) -#include -#include -#include "cuda.h" -#include "custom_cuda_layers.h" -typedef __half ds_half_precision_t; -#elif defined(__ENABLE_CANN__) -#include "acl/acl.h" -#include "torch_npu/csrc/core/npu/NPUStream.h" -typedef c10::Half ds_half_precision_t; -#else -#include -typedef unsigned short ds_half_precision_t; -#endif - -#define STEP(SPAN) \ - void Step_##SPAN(float* _params, \ - float* grads, \ - float* _exp_avg, \ - float* _exp_avg_sq, \ - size_t _param_size, \ - ds_half_precision_t* dev_param = nullptr, \ - bool half_precision = false); +#define STEP(SPAN) \ + template \ + void Step_##SPAN(ds_params_percision_t* _params, \ + ds_params_percision_t* grads, \ + ds_state_precision_t* _exp_avg, \ + ds_state_precision_t* _exp_avg_sq, \ + size_t _param_size); class Adam_Optimizer { public: @@ -55,56 +39,21 @@ class Adam_Optimizer { _step(0), _adamw_mode(adamw_mode) { -#if defined(__ENABLE_CUDA__) - cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float)); - cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float)); - - _streams[0] = TrainingContext::Instance().GetCurrentStream(); - _streams[1] = TrainingContext::Instance().GetNewStream(); - _buf_index = false; -#elif defined(__ENABLE_CANN__) - aclrtMallocHost((void**)_doubled_buffer, TILE * sizeof(float)); - aclrtMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float)); - - _buf_index = false; -#endif - } - ~Adam_Optimizer() - { -#if defined(__ENABLE_CUDA__) - cudaFreeHost(_doubled_buffer[0]); - cudaFreeHost(_doubled_buffer[1]); -#elif defined(__ENABLE_CANN__) - aclrtFreeHost(_doubled_buffer[0]); - aclrtFreeHost(_doubled_buffer[1]); -#endif } + ~Adam_Optimizer() {} #if defined(__AVX512__) or defined(__AVX256__) - template + template void Step_AVX(size_t* rounded_size, - float* _params, - float* grads, - float* _exp_avg, - float* _exp_avg_sq, - size_t param_size, - ds_half_precision_t* dev_param = nullptr, - bool half_precision = false); + ds_params_percision_t* _params, + ds_params_percision_t* grads, + ds_state_precision_t* _exp_avg, + ds_state_precision_t* _exp_avg_sq, + size_t param_size); #endif STEP(1) STEP(4) STEP(8) -#if defined(__ENABLE_CUDA__) - inline void SynchronizeStreams() - { - for (int i = 0; i < 2; i++) cudaStreamSynchronize(_streams[i]); - } -#elif defined(__ENABLE_CANN__) - inline void SynchronizeStreams() - { - for (int i = 0; i < 2; i++) aclrtSynchronizeStream(_streams[i].stream()); - } -#endif inline void IncrementStep(size_t step, float beta1, float beta2) { if (beta1 != _betta1 || beta2 != _betta2) { @@ -154,32 +103,24 @@ class Adam_Optimizer { float _bias_correction2; bool 
_adamw_mode; - -#if defined(__ENABLE_CUDA__) - float* _doubled_buffer[2]; - cudaStream_t _streams[2]; - bool _buf_index; -#elif defined(__ENABLE_CANN__) - float* _doubled_buffer[2]; - c10_npu::NPUStream _streams[2] = {c10_npu::getCurrentNPUStream(), - c10_npu::getNPUStreamFromPool()}; - bool _buf_index; -#endif }; #if defined(__AVX512__) or defined(__AVX256__) -template +template void Adam_Optimizer::Step_AVX(size_t* rounded_size, - float* _params, - float* grads, - float* _exp_avg, - float* _exp_avg_sq, - size_t _param_size, - ds_half_precision_t* dev_params, - bool half_precision) + ds_params_percision_t* _params, + ds_params_percision_t* grads, + ds_state_precision_t* _exp_avg, + ds_state_precision_t* _exp_avg_sq, + size_t _param_size) { +#if !defined(__AVX512__) + if (std::is_same_v || + std::is_same_v) { + return; + } +#endif size_t new_rounded_size = 0; - int rshft = half_precision ? 1 : 0; AVX_Data betta1_4; betta1_4.data = SIMD_SET(_betta1); @@ -212,24 +153,19 @@ void Adam_Optimizer::Step_AVX(size_t* rounded_size, size_t copy_size = TILE; if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t; size_t offset = copy_size + t; -#if defined(__ENABLE_CUDA__) - if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); } -#elif defined(__ENABLE_CANN__) - if ((t / TILE) >= 2) { aclrtSynchronizeStream(_streams[_buf_index].stream()); } -#endif #pragma omp parallel for for (size_t i = t; i < offset; i += SIMD_WIDTH * span) { AVX_Data grad_4[span]; - simd_load(grad_4, grads + (i >> rshft), half_precision); + simd_load(grad_4, grads + i); AVX_Data momentum_4[span]; - simd_load(momentum_4, _exp_avg + i, false); + simd_load(momentum_4, _exp_avg + i); AVX_Data variance_4[span]; - simd_load(variance_4, _exp_avg_sq + i, false); + simd_load(variance_4, _exp_avg_sq + i); AVX_Data param_4[span]; - simd_load(param_4, _params + (i >> rshft), half_precision); + simd_load(param_4, _params + i); if (_weight_decay > 0 && !_adamw_mode) { simd_fma(grad_4, param_4, weight_decay4, grad_4); @@ -250,39 +186,10 @@ void Adam_Optimizer::Step_AVX(size_t* rounded_size, simd_fma(param_4, grad_4, step_size_4, param_4); - simd_store(_params + (i >> rshft), param_4, half_precision); -#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__) - if (dev_params) { - simd_store(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision); - } -#endif - simd_store(_exp_avg + i, momentum_4, false); - simd_store(_exp_avg_sq + i, variance_4, false); + simd_store(_params + i, param_4); + simd_store(_exp_avg + i, momentum_4); + simd_store(_exp_avg_sq + i, variance_4); } -#if defined(__ENABLE_CUDA__) - if (dev_params) { - if (half_precision) - launch_param_update_half( - _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]); - else - launch_param_update( - _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]); - - _buf_index = !_buf_index; - } -#elif defined(__ENABLE_CANN__) - if (dev_params) { - size_t memcpy_size = copy_size * sizeof(_doubled_buffer[_buf_index][0]); - if (half_precision) memcpy_size /= 2; - aclrtMemcpy(dev_params + t, - memcpy_size, - _doubled_buffer[_buf_index], - memcpy_size, - aclrtMemcpyKind::ACL_MEMCPY_HOST_TO_DEVICE); - - _buf_index = !_buf_index; - } -#endif } *rounded_size = new_rounded_size; } @@ -310,18 +217,4 @@ int ds_adam_step(int optimizer_id, torch::Tensor& exp_avg, torch::Tensor& exp_avg_sq); -int ds_adam_step_plus_copy(int optimizer_id, - size_t step, - float lr, - float beta1, - float beta2, - float epsilon, - float 
weight_decay, - bool bias_correction, - torch::Tensor& params, - torch::Tensor& grads, - torch::Tensor& exp_avg, - torch::Tensor& exp_avg_sq, - torch::Tensor& gpu_params); - int destroy_adam_optimizer(int optimizer_id); diff --git a/csrc/includes/cpu_lion.h b/csrc/includes/cpu_lion.h index 34c29eec47db..62b304923222 100644 --- a/csrc/includes/cpu_lion.h +++ b/csrc/includes/cpu_lion.h @@ -13,28 +13,12 @@ #include #include "simd.h" -#if defined(__ENABLE_CUDA__) -#include -#include -#include "cuda.h" -#include "custom_cuda_layers.h" -typedef __half ds_half_precision_t; -#elif defined(__ENABLE_CANN__) -#include "acl/acl.h" -#include "torch_npu/csrc/core/npu/NPUStream.h" -typedef c10::Half ds_half_precision_t; -#else -#include -typedef unsigned short ds_half_precision_t; -#endif - -#define STEP(SPAN) \ - void Step_##SPAN(float* _params, \ - float* grads, \ - float* _exp_avg, \ - size_t _param_size, \ - ds_half_precision_t* dev_param = nullptr, \ - bool half_precision = false); +#define STEP(SPAN) \ + template \ + void Step_##SPAN(ds_params_percision_t* _params, \ + ds_params_percision_t* grads, \ + ds_state_precision_t* _exp_avg, \ + size_t _param_size); class Lion_Optimizer { public: @@ -44,55 +28,21 @@ class Lion_Optimizer { float weight_decay = 0) : _alpha(alpha), _betta1(betta1), _betta2(betta2), _weight_decay(weight_decay), _step(0) { -#if defined(__ENABLE_CUDA__) - cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float)); - cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float)); - - _streams[0] = TrainingContext::Instance().GetCurrentStream(); - _streams[1] = TrainingContext::Instance().GetNewStream(); - _buf_index = false; -#elif defined(__ENABLE_CANN__) - aclrtMallocHost((void**)_doubled_buffer, TILE * sizeof(float)); - aclrtMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float)); - - _buf_index = false; -#endif - } - ~Lion_Optimizer() - { -#if defined(__ENABLE_CUDA__) - cudaFreeHost(_doubled_buffer[0]); - cudaFreeHost(_doubled_buffer[1]); -#elif defined(__ENABLE_CANN__) - aclrtFreeHost(_doubled_buffer[0]); - aclrtFreeHost(_doubled_buffer[1]); -#endif } + ~Lion_Optimizer() {} #if defined(__AVX512__) or defined(__AVX256__) - template + template void Step_AVX(size_t* rounded_size, - float* _params, - float* grads, - float* _exp_avg, - size_t param_size, - ds_half_precision_t* dev_param = nullptr, - bool half_precision = false); + ds_params_percision_t* _params, + ds_params_percision_t* grads, + ds_state_precision_t* _exp_avg, + size_t param_size); #endif STEP(1) STEP(4) STEP(8) -#if defined(__ENABLE_CUDA__) - inline void SynchronizeStreams() - { - for (int i = 0; i < 2; i++) cudaStreamSynchronize(_streams[i]); - } -#elif defined(__ENABLE_CANN__) - inline void SynchronizeStreams() - { - for (int i = 0; i < 2; i++) aclrtSynchronizeStream(_streams[i].stream()); - } -#endif + inline void IncrementStep(size_t step, float beta1, float beta2) { _step++; @@ -114,31 +64,23 @@ class Lion_Optimizer { float _betta2; float _weight_decay; size_t _step; - -#if defined(__ENABLE_CUDA__) - float* _doubled_buffer[2]; - cudaStream_t _streams[2]; - bool _buf_index; -#elif defined(__ENABLE_CANN__) - float* _doubled_buffer[2]; - c10_npu::NPUStream _streams[2] = {c10_npu::getCurrentNPUStream(), - c10_npu::getNPUStreamFromPool()}; - bool _buf_index; -#endif }; #if defined(__AVX512__) or defined(__AVX256__) -template +template void Lion_Optimizer::Step_AVX(size_t* rounded_size, - float* _params, - float* grads, - float* _exp_avg, - size_t _param_size, - ds_half_precision_t* dev_params, - 
bool half_precision) + ds_params_percision_t* _params, + ds_params_percision_t* grads, + ds_state_precision_t* _exp_avg, + size_t _param_size) { +#if !defined(__AVX512__) + if (std::is_same_v || + std::is_same_v) { + return; + } +#endif size_t new_rounded_size = 0; - int rshft = half_precision ? 1 : 0; constexpr float neg1 = -1.0f; AVX_Data neg1_4; @@ -169,21 +111,17 @@ void Lion_Optimizer::Step_AVX(size_t* rounded_size, size_t copy_size = TILE; if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t; size_t offset = copy_size + t; -#if defined(__ENABLE_CUDA__) - if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); } -#elif defined(__ENABLE_CANN__) - if ((t / TILE) >= 2) { aclrtSynchronizeStream(_streams[_buf_index].stream()); } -#endif + #pragma omp parallel for for (size_t i = t; i < offset; i += SIMD_WIDTH * span) { AVX_Data grad_4[span]; - simd_load(grad_4, grads + (i >> rshft), half_precision); + simd_load(grad_4, grads + i); AVX_Data momentum_4[span]; - simd_load(momentum_4, _exp_avg + i, false); + simd_load(momentum_4, _exp_avg + i); AVX_Data param_4[span]; - simd_load(param_4, _params + (i >> rshft), half_precision); + simd_load(param_4, _params + i); AVX_Data tmp_4[span]; @@ -201,38 +139,9 @@ void Lion_Optimizer::Step_AVX(size_t* rounded_size, simd_mul(momentum_4, momentum_4, betta2_4); simd_fma(momentum_4, grad_4, betta2_minus1_4, momentum_4); - simd_store(_params + (i >> rshft), param_4, half_precision); -#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__) - if (dev_params) { - simd_store(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision); - } -#endif - simd_store(_exp_avg + i, momentum_4, false); - } -#if defined(__ENABLE_CUDA__) - if (dev_params) { - if (half_precision) - launch_param_update_half( - _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]); - else - launch_param_update( - _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]); - - _buf_index = !_buf_index; + simd_store(_params + i, param_4); + simd_store(_exp_avg + i, momentum_4); } -#elif defined(__ENABLE_CANN__) - if (dev_params) { - size_t memcpy_size = copy_size * sizeof(_doubled_buffer[_buf_index][0]); - if (half_precision) memcpy_size /= 2; - aclrtMemcpy(dev_params + t, - memcpy_size, - _doubled_buffer[_buf_index], - memcpy_size, - aclrtMemcpyKind::ACL_MEMCPY_HOST_TO_DEVICE); - - _buf_index = !_buf_index; - } -#endif } *rounded_size = new_rounded_size; } @@ -255,15 +164,4 @@ int ds_lion_step(int optimizer_id, torch::Tensor& grads, torch::Tensor& exp_avg); -int ds_lion_step_plus_copy(int optimizer_id, - size_t step, - float lr, - float beta1, - float beta2, - float weight_decay, - torch::Tensor& params, - torch::Tensor& grads, - torch::Tensor& exp_avg, - torch::Tensor& gpu_params); - int destroy_lion_optimizer(int optimizer_id); diff --git a/csrc/includes/custom_cuda_layers.h b/csrc/includes/custom_cuda_layers.h index 265eb7b12444..21f19749d4cf 100644 --- a/csrc/includes/custom_cuda_layers.h +++ b/csrc/includes/custom_cuda_layers.h @@ -272,9 +272,6 @@ void launch_fuse_transpose_bias_kernel(const T* inp, int cols, cudaStream_t stream); -void launch_param_update(const float* input, __half* output, int size, cudaStream_t stream); -void launch_param_update_half(const float* input, __half* output, int size, cudaStream_t stream); - void launch_token_sort(int32_t* indices, int layers, int batch_size, diff --git a/csrc/includes/simd.h b/csrc/includes/simd.h index 59237b0261c1..73e41216a652 100644 --- a/csrc/includes/simd.h +++ 
b/csrc/includes/simd.h @@ -13,6 +13,19 @@ #define TILE (128 * 1024 * 1024) #if defined(__AVX512__) or defined(__AVX256__) +template +inline T readAs(const void* src) +{ + T res; + std::memcpy(&res, src, sizeof(T)); + return res; +} +template +inline void writeAs(void* dst, const T& val) +{ + std::memcpy(dst, &val, sizeof(T)); +} + #define ROUND_DOWN(size, step) ((size) & ~((step)-1)) #if defined(__AVX512__) @@ -30,11 +43,52 @@ #define SIMD_XOR(x, y) _mm512_xor_ps(x, y) #define SIMD_WIDTH 16 -#define SIMD_LOAD2(x, h) \ - ((h) ? _mm512_cvtph_ps(_mm256_castps_si256(_mm256_loadu_ps(x))) : _mm512_loadu_ps(x)) -#define SIMD_STORE2(x, d, h) \ - ((h) ? _mm256_store_ps(x, _mm256_castsi256_ps(_mm512_cvtps_ph(d, _MM_FROUND_TO_NEAREST_INT))) \ - : _mm512_storeu_ps(x, d)) +static __m512 load_16_bf16_as_f32(const void* data) +{ + __m256i a = readAs<__m256i>(data); // use memcpy to avoid aliasing + __m512i b = _mm512_cvtepu16_epi32(a); // convert 8 u16 to 8 u32 + __m512i c = _mm512_slli_epi32(b, 16); // logical shift left of all u32 by + // 16 bits (representing bf16->f32) + return readAs<__m512>(&c); // use memcpy to avoid aliasing +} + +static void store_16_f32_as_bf16_nearest(__m512 v, void* data) +{ + __m512i u32 = readAs<__m512i>(&v); + + // flow assuming non-nan: + + // uint32_t rounding_bias = ((U32 >> 16) & 1) + UINT32_C(0x7FFF); + __m512i b = _mm512_srli_epi32(u32, 16); + __m512i lsb_mask = _mm512_set1_epi32(0x00000001); + __m512i c = _mm512_and_si512(b, lsb_mask); + __m512i bias_constant = _mm512_set1_epi32(0x00007fff); + __m512i rounding_bias = _mm512_add_epi32(c, bias_constant); + + // uint16_t res = static_cast((U32 + rounding_bias) >> 16); + __m512i d = _mm512_add_epi32(u32, rounding_bias); + __m512i e = _mm512_srli_epi32(d, 16); + __m256i non_nan_res = _mm512_cvtusepi32_epi16(e); + + // handle nan (exp is all 1s and mantissa != 0) + // if ((x & 0x7fffffffU) > 0x7f800000U) + __m512i mask_out_sign = _mm512_set1_epi32(0x7fffffff); + __m512i non_sign_bits = _mm512_and_si512(u32, mask_out_sign); + __m512i nan_threshold = _mm512_set1_epi32(0x7f800000); + __mmask16 nan_mask = _mm512_cmp_epi32_mask(non_sign_bits, nan_threshold, _MM_CMPINT_GT); + + // mix in results with nans as needed + __m256i nans = _mm256_set1_epi16(0x7fc0); + __m256i res = _mm256_mask_mov_epi16(non_nan_res, nan_mask, nans); + + writeAs(data, res); +} +#define SIMD_LOAD_BF16(x) load_16_bf16_as_f32(x) +#define SIMD_STORE_BF16(x, d) store_16_f32_as_bf16_nearest(d, x) + +#define SIMD_LOAD_FP16(x) _mm512_cvtph_ps(_mm256_castps_si256(_mm256_loadu_ps(x))) +#define SIMD_STORE_FP16(x, d) \ + _mm256_store_ps(x, _mm256_castsi256_ps(_mm512_cvtps_ph(d, _MM_FROUND_TO_NEAREST_INT))) #define INTV __m256i #elif defined(__AVX256__) @@ -52,11 +106,11 @@ #define SIMD_XOR(x, y) _mm256_xor_ps(x, y) #define SIMD_WIDTH 8 -#define SIMD_LOAD2(x, h) \ - ((h) ? _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)(x))) : _mm256_loadu_ps(x)) -#define SIMD_STORE2(x, d, h) \ - ((h) ? 
_mm_store_ps(x, _mm_castsi128_ps(_mm256_cvtps_ph(d, _MM_FROUND_TO_NEAREST_INT))) \ - : _mm256_storeu_ps(x, d)) +#define SIMD_LOAD_BF16(x) static_assert(false && "AVX256 does not support BFloat16") +#define SIMD_STORE_BF16(x, d) static_assert(false && "AVX256 does not support BFloat16") +#define SIMD_LOAD_FP16(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)x)) +#define SIMD_STORE_FP16(x, d) \ + _mm_store_ps(x, _mm_castsi128_ps(_mm256_cvtps_ph(d, _MM_FROUND_TO_NEAREST_INT))) #define INTV __m128i #endif @@ -70,20 +124,66 @@ union AVX_Data { // float data_f[16]; }; -template -inline void simd_store(float* dst, AVX_Data* src, bool half_precision) +template +inline typename std::enable_if_t, void> simd_store(T* dst, + AVX_Data* src) { - size_t width = (half_precision ? SIMD_WIDTH / 2 : SIMD_WIDTH); + size_t width = SIMD_WIDTH; #pragma unroll - for (size_t i = 0; i < span; ++i) { SIMD_STORE2(dst + width * i, src[i].data, half_precision); } + for (size_t i = 0; i < span; ++i) { SIMD_STORE_FP16((float*)(dst + width * i), src[i].data); } } -template -inline void simd_load(AVX_Data* dst, float* src, bool half_precision) + +template +inline typename std::enable_if_t, void> simd_store(T* dst, + AVX_Data* src) { - size_t width = (half_precision ? SIMD_WIDTH / 2 : SIMD_WIDTH); +#ifdef __AVX512__ + size_t width = SIMD_WIDTH; #pragma unroll - for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_LOAD2(src + width * i, half_precision); } + for (size_t i = 0; i < span; ++i) { SIMD_STORE_BF16((float*)(dst + width * i), src[i].data); } +#else + throw std::runtime_error("AVX512 required for BFloat16"); +#endif +} + +template +inline typename std::enable_if_t, void> simd_store(T* dst, AVX_Data* src) +{ + size_t width = SIMD_WIDTH; +#pragma unroll + for (size_t i = 0; i < span; ++i) { SIMD_STORE(dst + width * i, src[i].data); } } + +template +inline typename std::enable_if_t, void> simd_load(AVX_Data* dst, + T* src) +{ + size_t width = SIMD_WIDTH; +#pragma unroll + for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_LOAD_FP16((float*)(src + width * i)); } +} + +template +inline typename std::enable_if_t, void> simd_load(AVX_Data* dst, + T* src) +{ +#ifdef __AVX512__ + size_t width = SIMD_WIDTH; +#pragma unroll + for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_LOAD_BF16((float*)(src + width * i)); } +#else + throw std::runtime_error("AVX512 required for BFloat16"); +#endif +} + +template +inline typename std::enable_if_t, void> simd_load(AVX_Data* dst, T* src) +{ + size_t width = SIMD_WIDTH; +#pragma unroll + for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_LOAD(src + width * i); } +} + template inline void simd_fma(AVX_Data* dst, AVX_Data* src_m_l, AVX_Data src_m_r, AVX_Data* src_a) { diff --git a/csrc/lion/cpu_lion.cpp b/csrc/lion/cpu_lion.cpp index a0562eac9c4a..c5cf3e9e9235 100644 --- a/csrc/lion/cpu_lion.cpp +++ b/csrc/lion/cpu_lion.cpp @@ -8,9 +8,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("lion_update", &ds_lion_step, "DeepSpeed CPU Lion update (C++)"); - m.def("lion_update_copy", - &ds_lion_step_plus_copy, - "DeepSpeed CPU Lion update and param copy (C++)"); m.def("create_lion", &create_lion_optimizer, "DeepSpeed CPU Lion (C++)"); m.def("destroy_lion", &destroy_lion_optimizer, "DeepSpeed CPU Lion destroy (C++)"); } diff --git a/csrc/lion/cpu_lion_impl.cpp b/csrc/lion/cpu_lion_impl.cpp index 28314cf5b6e1..85896ba86e19 100644 --- a/csrc/lion/cpu_lion_impl.cpp +++ b/csrc/lion/cpu_lion_impl.cpp @@ -6,34 +6,28 @@ #include #include #include +#include #include +#include #include 
#include #include #include "cpu_lion.h" -#if defined(__ENABLE_CUDA__) -#include -#include "cublas_v2.h" -#include "cuda.h" -#include "curand.h" -#include "custom_cuda_layers.h" -#endif - +using namespace std::string_literals; static std::unordered_map> s_optimizers; // C++ interface -void Lion_Optimizer::Step_1(float* _params, - float* grads, - float* _exp_avg, - size_t _param_size, - ds_half_precision_t* dev_params, - bool half_precision) +template +void Lion_Optimizer::Step_1(ds_params_percision_t* _params, + ds_params_percision_t* grads, + ds_state_precision_t* _exp_avg, + size_t _param_size) { size_t rounded_size = 0; #if defined(__AVX512__) or defined(__AVX256__) - Step_AVX<1>(&rounded_size, _params, grads, _exp_avg, _param_size, dev_params, half_precision); + Step_AVX<1>(&rounded_size, _params, grads, _exp_avg, _param_size); #endif if (_param_size > rounded_size) { float betta1_minus1 = 1 - _betta1; @@ -41,26 +35,15 @@ void Lion_Optimizer::Step_1(float* _params, float alpha = _alpha; float after_decay = 1 - alpha * _weight_decay; - ds_half_precision_t* grads_cast_h; - ds_half_precision_t* params_cast_h; - if (half_precision) { - grads_cast_h = reinterpret_cast(grads); - params_cast_h = reinterpret_cast(_params); - } for (size_t t = rounded_size; t < _param_size; t += TILE) { size_t copy_size = TILE; if ((t + TILE) > _param_size) copy_size = _param_size - t; size_t offset = copy_size + t; -#if defined(__ENABLE_CUDA__) - if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); } -#elif defined(__ENABLE_CANN__) - if ((t / TILE) >= 2) { aclrtSynchronizeStream(_streams[_buf_index].stream()); } -#endif #pragma omp parallel for for (size_t k = t; k < offset; k++) { - float grad = half_precision ? (float)grads_cast_h[k] : grads[k]; - float param = half_precision ? 
(float)params_cast_h[k] : _params[k]; + float grad = (float)grads[k]; + float param = (float)_params[k]; float momentum = _exp_avg[k]; float tmp = momentum * _betta1; tmp = grad * betta1_minus1 + tmp; @@ -74,56 +57,28 @@ void Lion_Optimizer::Step_1(float* _params, } momentum = momentum * _betta2; momentum = grad * betta2_minus1 + momentum; -#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__) - if (dev_params) _doubled_buffer[_buf_index][k - t] = param; -#endif - if (half_precision) - params_cast_h[k] = (ds_half_precision_t)param; - else - _params[k] = param; + _params[k] = param; _exp_avg[k] = momentum; } -#if defined(__ENABLE_CUDA__) - if (dev_params) { - launch_param_update( - _doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]); - - _buf_index = !_buf_index; - } -#elif defined(__ENABLE_CANN__) - if (dev_params) { - size_t memcpy_size = copy_size * sizeof(_doubled_buffer[_buf_index][0]); - aclrtMemcpy(dev_params + t, - memcpy_size, - _doubled_buffer[_buf_index], - memcpy_size, - aclrtMemcpyKind::ACL_MEMCPY_HOST_TO_DEVICE); - - _buf_index = !_buf_index; - } -#endif } } } -void Lion_Optimizer::Step_4(float* _params, - float* grads, - float* _exp_avg, - size_t _param_size, - ds_half_precision_t* dev_params, - bool half_precision) +template +void Lion_Optimizer::Step_4(ds_params_percision_t* _params, + ds_params_percision_t* grads, + ds_state_precision_t* _exp_avg, + size_t _param_size) { size_t rounded_size = 0; #if defined(__AVX512__) or defined(__AVX256__) - Step_AVX<4>(&rounded_size, _params, grads, _exp_avg, _param_size, dev_params, half_precision); + Step_AVX<4>(&rounded_size, _params, grads, _exp_avg, _param_size); #endif if (_param_size > rounded_size) Step_1((_params + rounded_size), (grads + rounded_size), (_exp_avg + rounded_size), - (_param_size - rounded_size), - (dev_params != nullptr ? (dev_params + rounded_size) : dev_params), - half_precision); + (_param_size - rounded_size)); } int create_lion_optimizer(int optimizer_id, @@ -162,24 +117,76 @@ int create_lion_optimizer(int optimizer_id, return 0; } -void Lion_Optimizer::Step_8(float* _params, - float* grads, - float* _exp_avg, - size_t _param_size, - ds_half_precision_t* dev_params, - bool half_precision) +template +void Lion_Optimizer::Step_8(ds_params_percision_t* _params, + ds_params_percision_t* grads, + ds_state_precision_t* _exp_avg, + size_t _param_size) { size_t rounded_size = 0; #if defined(__AVX512__) or defined(__AVX256__) - Step_AVX<8>(&rounded_size, _params, grads, _exp_avg, _param_size, dev_params, half_precision); + Step_AVX<8>(&rounded_size, _params, grads, _exp_avg, _param_size); #endif if (_param_size > rounded_size) Step_4((_params + rounded_size), (grads + rounded_size), (_exp_avg + rounded_size), - (_param_size - rounded_size), - (dev_params != nullptr ? 
(dev_params + rounded_size) : dev_params), - half_precision); + (_param_size - rounded_size)); +} + +template +void step_invoker(std::shared_ptr opt, + void* _params, + void* grads, + void* _exp_avg, + size_t _param_size) +{ + opt->Step_8((ds_params_percision_t*)(_params), + (ds_params_percision_t*)(grads), + (ds_state_precision_t*)(_exp_avg), + _param_size); +} + +std::map, + std::function, void*, void*, void*, size_t)>> + invokers; + +// Fill map with template functions for each type +template +void create_invoker() +{ + invokers[std::tuple(c10::CppTypeToScalarType(), + c10::CppTypeToScalarType())] = + step_invoker; +} +struct InvokerInitializer { + InvokerInitializer() + { + create_invoker(); + create_invoker(); + create_invoker(); + create_invoker(); + create_invoker(); + } +} _invoker_initializer; + +void invoke(std::shared_ptr opt, + torch::Tensor& params, + torch::Tensor& grads, + torch::Tensor& exp_avg, + size_t param_size) +{ + c10::ScalarType params_type = at::typeMetaToScalarType(params.options().dtype()); + c10::ScalarType state_type = at::typeMetaToScalarType(exp_avg.options().dtype()); + + auto it = invokers.find(std::tuple(params_type, state_type)); + if (it == invokers.end()) { + throw std::runtime_error("Lion optimizer with param type "s + c10::toString(params_type) + + " and state type "s + c10::toString(state_type) + + " is not supported on current hardware"s); + } + + it->second(opt, params.data_ptr(), grads.data_ptr(), exp_avg.data_ptr(), param_size); } int ds_lion_step(int optimizer_id, @@ -196,67 +203,13 @@ int ds_lion_step(int optimizer_id, auto grads_c = grads.contiguous(); auto exp_avg_c = exp_avg.contiguous(); - // assert(params.options().dtype() == grads.options().dtype()); - - float* params_ptr = (float*)params_c.data_ptr(); - float* grads_ptr = (float*)grads_c.data_ptr(); - float* exp_avg_ptr = (float*)exp_avg_c.data_ptr(); - std::shared_ptr opt = std::static_pointer_cast(s_optimizers[optimizer_id]); opt->IncrementStep(step, beta1, beta2); opt->update_state(lr, weight_decay); - opt->Step_8(params_ptr, - grads_ptr, - exp_avg_ptr, - params_c.numel(), - nullptr, - (params.options().dtype() == at::kHalf)); + invoke(opt, params_c, grads_c, exp_avg_c, params_c.numel()); -#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__) - opt->SynchronizeStreams(); -#endif - return 0; -} - -int ds_lion_step_plus_copy(int optimizer_id, - size_t step, - float lr, - float beta1, - float beta2, - float weight_decay, - torch::Tensor& params, - torch::Tensor& grads, - torch::Tensor& exp_avg, - torch::Tensor& gpu_params) -{ -#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__) - auto params_c = params.contiguous(); - auto gpu_params_c = gpu_params.contiguous(); - auto exp_avg_c = exp_avg.contiguous(); - auto grads_c = grads.contiguous(); - - float* params_ptr = (float*)params_c.data_ptr(); - float* grads_ptr = (float*)grads_c.data_ptr(); - ds_half_precision_t* gpu_params_ptr = (ds_half_precision_t*)gpu_params_c.data_ptr(); - float* exp_avg_ptr = (float*)exp_avg_c.data_ptr(); - - std::shared_ptr opt = - std::static_pointer_cast(s_optimizers[optimizer_id]); - opt->IncrementStep(step, beta1, beta2); - opt->update_state(lr, weight_decay); - opt->Step_8(params_ptr, - grads_ptr, - exp_avg_ptr, - params_c.numel(), - gpu_params_ptr, - (params.options().dtype() == at::kHalf)); - - opt->SynchronizeStreams(); -#else - assert(false); -#endif return 0; } diff --git a/deepspeed/ops/adagrad/cpu_adagrad.py b/deepspeed/ops/adagrad/cpu_adagrad.py index c356a52777f2..dbde6d95f652 100755 --- 
a/deepspeed/ops/adagrad/cpu_adagrad.py +++ b/deepspeed/ops/adagrad/cpu_adagrad.py @@ -34,7 +34,7 @@ def __setstate__(self, state): group.setdefault('amsgrad', False) @torch.no_grad() - def step(self, closure=None, fp16_param_groups=None): + def step(self, closure=None): """Update the model parameters. .. note:: @@ -46,8 +46,6 @@ def step(self, closure=None, fp16_param_groups=None): Args: closure (callable, optional): closure to compute the loss. Defaults to ``None``. - fp16_param_groups: FP16 GPU parameters to update. Performing the - copy here reduces communication time. Defaults to ``None``. Returns: loss: if ``closure`` is provided. Otherwise ``None``. @@ -94,16 +92,7 @@ def step(self, closure=None, fp16_param_groups=None): sparse_exp_avg_sq.values()) p[sparse_param.indices()] = sparse_param.values() state['exp_avg_sq'][sparse_exp_avg_sq.indices()] = sparse_exp_avg_sq.values() - if fp16_param_groups is not None: - fp16_param_groups[group_id][param_id][sparse_param.indices()] = sparse_param.values() else: - if fp16_param_groups is not None: - self.ds_opt_adagrad.adagrad_update_copy(self.opt_id, state['step'], group['lr'], group['eps'], - group['weight_decay'], p.data, p.grad.data, - state['exp_avg_sq'], - fp16_param_groups[group_id][param_id].data) - else: - self.ds_opt_adagrad.adagrad_update(self.opt_id, state['step'], group['lr'], group['eps'], - group['weight_decay'], p.data, p.grad.data, - state['exp_avg_sq']) + self.ds_opt_adagrad.adagrad_update(self.opt_id, state['step'], group['lr'], group['eps'], + group['weight_decay'], p.data, p.grad.data, state['exp_avg_sq']) return loss diff --git a/deepspeed/ops/adam/cpu_adam.py b/deepspeed/ops/adam/cpu_adam.py index 10b8c15f970b..e0a72a494257 100755 --- a/deepspeed/ops/adam/cpu_adam.py +++ b/deepspeed/ops/adam/cpu_adam.py @@ -107,7 +107,7 @@ def __setstate__(self, state): group.setdefault('amsgrad', False) @torch.no_grad() - def step(self, closure=None, fp16_param_groups=None): + def step(self, closure=None): """Update the model parameters. .. note:: @@ -119,8 +119,6 @@ def step(self, closure=None, fp16_param_groups=None): Args: closure (callable, optional): closure to compute the loss. Defaults to ``None``. - fp16_param_groups: FP16 GPU parameters to update. Performing the - copy here reduces communication time. Defaults to ``None``. Returns: loss: if ``closure`` is provided. Otherwise ``None``. 
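Note: with fp16_param_groups removed, precision handling is driven entirely by the dtypes of the tensors handed to adam_update; the C++ invoker table then selects the matching Step_8 instantiation at runtime. A minimal usage sketch under that assumption (the shapes, learning rate and loss below are illustrative, and it presumes the cpu_adam extension builds with BF16 support on the target machine):

import torch
from deepspeed.ops.adam import DeepSpeedCPUAdam

# bf16 parameters resident on CPU; the wrapper creates the optimizer state
# tensors, and the (param dtype, state dtype) pair picks the C++ kernel.
params = [torch.nn.Parameter(torch.randn(1024, dtype=torch.bfloat16))]
opt = DeepSpeedCPUAdam(params, lr=1e-3)

loss = params[0].float().pow(2).sum()
loss.backward()
opt.step()  # no fp16_param_groups argument; dtype dispatch happens inside adam_update
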
@@ -134,13 +132,6 @@ def step(self, closure=None, fp16_param_groups=None): # intended device for step device = torch.device('cpu') - # converting the fp16 params to a group of parameter - if type(fp16_param_groups) is list: - if type(fp16_param_groups[0]) is not list: - fp16_param_groups = [fp16_param_groups] - elif fp16_param_groups is not None: - fp16_param_groups = [[fp16_param_groups]] - for group_id, group in enumerate(self.param_groups): for param_id, p in enumerate(group['params']): @@ -169,13 +160,7 @@ def step(self, closure=None, fp16_param_groups=None): state['step'] += 1 beta1, beta2 = group['betas'] - if fp16_param_groups is not None: - self.ds_opt_adam.adam_update_copy(self.opt_id, state['step'], group['lr'], beta1, beta2, - group['eps'], group['weight_decay'], group['bias_correction'], - p.data, p.grad.data, state['exp_avg'], state['exp_avg_sq'], - fp16_param_groups[group_id][param_id].data) - else: - self.ds_opt_adam.adam_update(self.opt_id, state['step'], group['lr'], beta1, beta2, group['eps'], - group['weight_decay'], group['bias_correction'], p.data, p.grad.data, - state['exp_avg'], state['exp_avg_sq']) + self.ds_opt_adam.adam_update(self.opt_id, state['step'], group['lr'], beta1, beta2, group['eps'], + group['weight_decay'], group['bias_correction'], p.data, p.grad.data, + state['exp_avg'], state['exp_avg_sq']) return loss diff --git a/deepspeed/ops/lion/cpu_lion.py b/deepspeed/ops/lion/cpu_lion.py index a91a00643873..03342a3fcd34 100755 --- a/deepspeed/ops/lion/cpu_lion.py +++ b/deepspeed/ops/lion/cpu_lion.py @@ -69,7 +69,7 @@ def __setstate__(self, state): group.setdefault('amsgrad', False) @torch.no_grad() - def step(self, closure=None, fp16_param_groups=None): + def step(self, closure=None): """Update the model parameters. .. note:: @@ -81,8 +81,6 @@ def step(self, closure=None, fp16_param_groups=None): Args: closure (callable, optional): closure to compute the loss. Defaults to ``None``. - fp16_param_groups: FP16 GPU parameters to update. Performing the - copy here reduces communication time. Defaults to ``None``. Returns: loss: if ``closure`` is provided. Otherwise ``None``. 
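(Illustrative aside, not part of the patch.) With fp16_param_groups gone, step() takes only an optional closure; if a caller keeps a device-side low-precision mirror of the parameters, refreshing it is now the caller's job. A call-site sketch, assuming the cpu_lion op builds and fp16 is usable on the host CPU; gpu_mirror is a hypothetical device copy, not something the optimizer manages:

import torch
from deepspeed.ops.lion import DeepSpeedCPULion

param = torch.nn.Parameter(torch.randn(64, dtype=torch.half, device='cpu'))
opt = DeepSpeedCPULion([param], lr=1e-4)

param.grad = torch.randn_like(param)
opt.step()    # was: opt.step(fp16_param_groups=[gpu_mirror])
# gpu_mirror.data.copy_(param.data, non_blocking=True)   # caller-side refresh, if such a mirror exists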
@@ -96,13 +94,6 @@ def step(self, closure=None, fp16_param_groups=None): # intended device for step device = torch.device('cpu') - # converting the fp16 params to a group of parameter - if type(fp16_param_groups) is list: - if type(fp16_param_groups[0]) is not list: - fp16_param_groups = [fp16_param_groups] - elif fp16_param_groups is not None: - fp16_param_groups = [[fp16_param_groups]] - for group_id, group in enumerate(self.param_groups): for param_id, p in enumerate(group['params']): @@ -131,11 +122,6 @@ def step(self, closure=None, fp16_param_groups=None): state['step'] += 1 beta1, beta2 = group['betas'] - if fp16_param_groups is not None: - self.ds_opt_lion.lion_update_copy(self.opt_id, state['step'], group['lr'], beta1, beta2, - group['weight_decay'], p.data, p.grad.data, state['exp_avg'], - fp16_param_groups[group_id][param_id].data) - else: - self.ds_opt_lion.lion_update(self.opt_id, state['step'], group['lr'], beta1, beta2, - group['weight_decay'], p.data, p.grad.data, state['exp_avg']) + self.ds_opt_lion.lion_update(self.opt_id, state['step'], group['lr'], beta1, beta2, + group['weight_decay'], p.data, p.grad.data, state['exp_avg']) return loss diff --git a/op_builder/builder.py b/op_builder/builder.py index 18c130221b0e..4c4978c29575 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -545,6 +545,7 @@ def jit_load(self, verbose=True): nvcc_args.append("-DBF16_AVAILABLE") nvcc_args.append("-U__CUDA_NO_BFLOAT16_OPERATORS__") nvcc_args.append("-U__CUDA_NO_BFLOAT162_OPERATORS__") + nvcc_args.append("-U__CUDA_NO_BFLOAT16_CONVERSIONS__") if self.is_rocm_pytorch(): cxx_args.append("-D__HIP_PLATFORM_AMD__=1") diff --git a/op_builder/cpu/builder.py b/op_builder/cpu/builder.py index d2bc8eacfa25..d881842ad0b1 100644 --- a/op_builder/cpu/builder.py +++ b/op_builder/cpu/builder.py @@ -30,7 +30,11 @@ def builder(self): return cpp_ext def cxx_args(self): - return ['-O3', '-g', '-Wno-reorder'] + args = ['-O3', '-g', '-Wno-reorder'] + CPU_ARCH = self.cpu_arch() + SIMD_WIDTH = self.simd_width() + args += [CPU_ARCH, '-fopenmp', SIMD_WIDTH] + return args def libraries_args(self): return [] diff --git a/op_builder/cpu_adagrad.py b/op_builder/cpu_adagrad.py index d3f163f7464a..c05f71488950 100644 --- a/op_builder/cpu_adagrad.py +++ b/op_builder/cpu_adagrad.py @@ -3,7 +3,6 @@ # DeepSpeed Team -import os from .builder import TorchCPUOpBuilder @@ -18,26 +17,11 @@ def absolute_name(self): return f'deepspeed.ops.adagrad.{self.NAME}_op' def sources(self): - if self.build_for_cpu: - return ['csrc/adagrad/cpu_adagrad.cpp'] - - return ['csrc/adagrad/cpu_adagrad.cpp', 'csrc/common/custom_cuda_kernel.cu'] + return ['csrc/adagrad/cpu_adagrad.cpp'] def libraries_args(self): args = super().libraries_args() - if self.build_for_cpu: - return args - - if not self.is_rocm_pytorch(): - args += ['curand'] return args def include_paths(self): - import torch - if self.build_for_cpu: - CUDA_INCLUDE = [] - elif not self.is_rocm_pytorch(): - CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")] - else: - CUDA_INCLUDE = [] - return ['csrc/includes'] + CUDA_INCLUDE + return ['csrc/includes'] diff --git a/op_builder/cpu_adam.py b/op_builder/cpu_adam.py index 7c34c4ce43a1..7f4c0847a8c4 100644 --- a/op_builder/cpu_adam.py +++ b/op_builder/cpu_adam.py @@ -3,7 +3,6 @@ # DeepSpeed Team -import os from .builder import TorchCPUOpBuilder @@ -18,27 +17,11 @@ def absolute_name(self): return f'deepspeed.ops.adam.{self.NAME}_op' def sources(self): - if self.build_for_cpu: - return 
['csrc/adam/cpu_adam.cpp', 'csrc/adam/cpu_adam_impl.cpp'] - - return ['csrc/adam/cpu_adam.cpp', 'csrc/adam/cpu_adam_impl.cpp', 'csrc/common/custom_cuda_kernel.cu'] + return ['csrc/adam/cpu_adam.cpp', 'csrc/adam/cpu_adam_impl.cpp'] def libraries_args(self): args = super().libraries_args() - if self.build_for_cpu: - return args - - if not self.is_rocm_pytorch(): - args += ['curand'] - return args def include_paths(self): - import torch - if self.build_for_cpu: - CUDA_INCLUDE = [] - elif not self.is_rocm_pytorch(): - CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")] - else: - CUDA_INCLUDE = [] - return ['csrc/includes'] + CUDA_INCLUDE + return ['csrc/includes'] diff --git a/op_builder/cpu_lion.py b/op_builder/cpu_lion.py index 5c16d10ebb44..9a60d99773b3 100644 --- a/op_builder/cpu_lion.py +++ b/op_builder/cpu_lion.py @@ -3,7 +3,6 @@ # DeepSpeed Team -import os from .builder import TorchCPUOpBuilder @@ -18,31 +17,11 @@ def absolute_name(self): return f'deepspeed.ops.lion.{self.NAME}_op' def sources(self): - if self.build_for_cpu: - return ['csrc/lion/cpu_lion.cpp', 'csrc/lion/cpu_lion_impl.cpp'] - - return ['csrc/lion/cpu_lion.cpp', 'csrc/lion/cpu_lion_impl.cpp', 'csrc/common/custom_cuda_kernel.cu'] + return ['csrc/lion/cpu_lion.cpp', 'csrc/lion/cpu_lion_impl.cpp'] def libraries_args(self): args = super().libraries_args() - if self.build_for_cpu: - return args - - if not self.is_rocm_pytorch(): - args += ['curand'] - return args def include_paths(self): - import torch - if self.build_for_cpu: - CUDA_INCLUDE = [] - elif not self.is_rocm_pytorch(): - CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")] - else: - CUDA_INCLUDE = [ - os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include"), - os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include", "rocrand"), - os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include", "hiprand"), - ] - return ['csrc/includes'] + CUDA_INCLUDE + return ['csrc/includes'] diff --git a/op_builder/hpu/builder.py b/op_builder/hpu/builder.py index 3c86128fffd6..c176a586ba49 100644 --- a/op_builder/hpu/builder.py +++ b/op_builder/hpu/builder.py @@ -31,7 +31,11 @@ def builder(self): return cpp_ext def cxx_args(self): - return ['-O3', '-g', '-Wno-reorder'] + args = ['-O3', '-g', '-Wno-reorder'] + CPU_ARCH = self.cpu_arch() + SIMD_WIDTH = self.simd_width() + args += [CPU_ARCH, '-fopenmp', SIMD_WIDTH] + return args def libraries_args(self): return [] diff --git a/op_builder/hpu/cpu_adam.py b/op_builder/hpu/cpu_adam.py index 2f3b7aefe705..58eea2698ebb 100644 --- a/op_builder/hpu/cpu_adam.py +++ b/op_builder/hpu/cpu_adam.py @@ -20,11 +20,6 @@ def absolute_name(self): def sources(self): return ['csrc/adam/cpu_adam.cpp', 'csrc/adam/cpu_adam_impl.cpp'] - def cxx_args(self): - args = super().cxx_args() - args += ['-DENABLE_BFLOAT16'] - return args - def libraries_args(self): args = super().libraries_args() return args diff --git a/tests/perf/adam_test1.py b/tests/perf/adam_test1.py index b35477afb4fe..bde1d53e5179 100755 --- a/tests/perf/adam_test1.py +++ b/tests/perf/adam_test1.py @@ -6,12 +6,10 @@ import torch from deepspeed.ops.adam import DeepSpeedCPUAdam import time -from deepspeed.accelerator import get_accelerator device = 'cpu' model_size = 1 * 1024**3 param = torch.nn.Parameter(torch.ones(model_size, device=device)) -param_fp16 = torch.nn.Parameter(torch.ones(model_size, dtype=torch.half, device=get_accelerator().device_name(0))) optimizer = DeepSpeedCPUAdam([param]) #torch.set_num_threads(128) @@ -19,7 +17,7 
@@ avg = 0 for i in range(100): start = time.time() - optimizer.step(fp16_param_groups=[param_fp16]) + optimizer.step() stop = time.time() avg += (stop - start) param.grad = torch.ones(model_size, device=device) * 2 diff --git a/tests/unit/common.py b/tests/unit/common.py index a2593e703aef..58bb26ca18b4 100644 --- a/tests/unit/common.py +++ b/tests/unit/common.py @@ -82,8 +82,12 @@ def set_accelerator_visible(): if match: num_accelerators += 1 elif get_accelerator().device_name() == 'hpu': - hl_smi = subprocess.check_output(['hl-smi', "-L"]) - num_accelerators = re.findall(r"Module ID\s+:\s+(\d+)", hl_smi.decode()) + try: + hl_smi = subprocess.check_output(['hl-smi', "-L"]) + num_accelerators = re.findall(r"Module ID\s+:\s+(\d+)", hl_smi.decode()) + except FileNotFoundError: + sim_list = subprocess.check_output(['ls', '-1', '/dev/accel']) + num_accelerators = re.findall(r"accel(\d+)", sim_list.decode()) num_accelerators = sorted(num_accelerators, key=int) os.environ["HABANA_VISIBLE_MODULES"] = ",".join(num_accelerators) elif get_accelerator().device_name() == 'npu': diff --git a/tests/unit/ops/adagrad/test_cpu_adagrad.py b/tests/unit/ops/adagrad/test_cpu_adagrad.py index 99e934e2efda..0c675ecd6a85 100644 --- a/tests/unit/ops/adagrad/test_cpu_adagrad.py +++ b/tests/unit/ops/adagrad/test_cpu_adagrad.py @@ -18,8 +18,8 @@ def check_equal(first, second, atol=1e-2, verbose=False): - x = first.detach().numpy() - y = second.detach().numpy() + x = first.detach().float().numpy() + y = second.detach().float().numpy() if verbose: print("x = {}".format(x.flatten())) print("y = {}".format(y.flatten())) diff --git a/tests/unit/ops/adam/test_cpu_adam.py b/tests/unit/ops/adam/test_cpu_adam.py index 785cf786acc3..851485440428 100644 --- a/tests/unit/ops/adam/test_cpu_adam.py +++ b/tests/unit/ops/adam/test_cpu_adam.py @@ -21,8 +21,8 @@ def check_equal(first, second, atol=1e-2, verbose=False): - x = first.detach().numpy() - y = second.detach().numpy() + x = first.detach().float().numpy() + y = second.detach().float().numpy() print("ATOL", atol) if verbose: print("x = {}".format(x.flatten())) @@ -43,7 +43,7 @@ def _compare_optimizers(model_size, param1, optimizer1, param2, optimizer2): check_equal(param1.float().norm(), param2.float().cpu().norm(), atol=tolerance, verbose=True) -@pytest.mark.parametrize('dtype', [torch.half, torch.float], ids=["fp16", "fp32"]) +@pytest.mark.parametrize('dtype', [torch.half, torch.bfloat16, torch.float], ids=["fp16", "bf16", "fp32"]) @pytest.mark.parametrize('model_size', [ (64), @@ -65,6 +65,9 @@ class TestCPUAdam(DistributedTest): @pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedAdamBuilder.NAME], reason="FusedAdam is not compatible") def test_fused_adam_equal(self, dtype, model_size): + if dtype not in get_accelerator().supported_dtypes(): + pytest.skip(f"dtype {dtype} not supported in current accelerator") + if ("amd" in pytest.cpu_vendor) and (dtype == torch.half): pytest.skip("cpu-adam with half precision not supported on AMD CPUs") @@ -91,6 +94,8 @@ def test_fused_adam_equal(self, dtype, model_size): def test_torch_adamw_equal(self, dtype, model_size): if get_accelerator().is_available(): + if dtype == torch.half: + pytest.skip("torch.optim.AdamW with half precision inf/nan output.") if ("amd" in pytest.cpu_vendor) and (dtype == torch.half): pytest.skip("cpu-adam with half precision not supported on AMD CPUs") ref_param_device = get_accelerator().device_name() @@ -99,20 +104,20 @@ def test_torch_adamw_equal(self, dtype, model_size): 
pytest.skip("torch.optim.AdamW with half precision only supported in CUDA environments.") ref_param_device = 'cpu' - from deepspeed.ops.adam import DeepSpeedCPUAdam + from deepspeed.ops.adam import DeepSpeedCPUAdam - cpu_data = torch.randn(model_size, device='cpu').to(dtype) - cpu_param = torch.nn.Parameter(cpu_data) - ref_param = torch.nn.Parameter(cpu_data.to(ref_param_device)) + cpu_data = torch.randn(model_size, device='cpu').to(dtype) + cpu_param = torch.nn.Parameter(cpu_data) + ref_param = torch.nn.Parameter(cpu_data.to(ref_param_device)) - cpu_optimizer = DeepSpeedCPUAdam([cpu_param]) - ref_optimizer = torch.optim.AdamW([ref_param]) + cpu_optimizer = DeepSpeedCPUAdam([cpu_param]) + ref_optimizer = torch.optim.AdamW([ref_param]) - _compare_optimizers(model_size=model_size, - param1=cpu_param, - optimizer1=cpu_optimizer, - param2=ref_param, - optimizer2=ref_optimizer) + _compare_optimizers(model_size=model_size, + param1=cpu_param, + optimizer1=cpu_optimizer, + param2=ref_param, + optimizer2=ref_optimizer) class TestCPUAdamGPUError(DistributedTest): diff --git a/tests/unit/ops/adam/test_hybrid_adam.py b/tests/unit/ops/adam/test_hybrid_adam.py index 9003e02588c1..652090d5b9d5 100644 --- a/tests/unit/ops/adam/test_hybrid_adam.py +++ b/tests/unit/ops/adam/test_hybrid_adam.py @@ -22,8 +22,8 @@ def check_equal(first, second, atol=1e-2, verbose=False): - x = first.detach().numpy() - y = second.detach().numpy() + x = first.detach().float().numpy() + y = second.detach().float().numpy() print("ATOL", atol) if verbose: print("x = {}".format(x.flatten())) @@ -32,7 +32,7 @@ def check_equal(first, second, atol=1e-2, verbose=False): np.testing.assert_allclose(x, y, err_msg="param-update mismatch!", atol=atol) -@pytest.mark.parametrize('dtype', [torch.half, torch.float], ids=["fp16", "fp32"]) +@pytest.mark.parametrize('dtype', [torch.half, torch.bfloat16, torch.float], ids=["fp16", "bf16", "fp32"]) @pytest.mark.parametrize('model_size', [8, 16]) class TestHybridAdam(DistributedTest): world_size = 1 diff --git a/tests/unit/ops/lion/test_cpu_lion.py b/tests/unit/ops/lion/test_cpu_lion.py index 7d40a98f35b9..dce027e286fb 100644 --- a/tests/unit/ops/lion/test_cpu_lion.py +++ b/tests/unit/ops/lion/test_cpu_lion.py @@ -18,8 +18,8 @@ def check_equal(first, second, atol=1e-2, verbose=False): - x = first.detach().numpy() - y = second.detach().numpy() + x = first.detach().float().numpy() + y = second.detach().float().numpy() print("ATOL", atol) if verbose: print("x = {}".format(x.flatten())) @@ -40,7 +40,7 @@ def _compare_optimizers(model_size, param1, optimizer1, param2, optimizer2): check_equal(param1.float().norm(), param2.float().cpu().norm(), atol=tolerance, verbose=True) -@pytest.mark.parametrize('dtype', [torch.half, torch.float], ids=["fp16", "fp32"]) +@pytest.mark.parametrize('dtype', [torch.half, torch.bfloat16, torch.float], ids=["fp16", "bf16", "fp32"]) @pytest.mark.parametrize('model_size', [ (64),
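(Illustrative aside, not part of the patch.) The test helpers above now go through .float() before .numpy() because NumPy has no bfloat16 dtype: converting a bf16 tensor directly raises, while the upcast keeps a single comparison path for fp16, bf16 and fp32.

import torch

t = torch.ones(4, dtype=torch.bfloat16)
# t.detach().numpy()              # raises, since NumPy does not support bfloat16
x = t.detach().float().numpy()    # upcast first, then compare as fp32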