From bcab56ab4aea151647b3e157779c79e82bdb530c Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Wed, 20 Dec 2023 02:28:36 -0500 Subject: [PATCH 01/19] dropout --- include/flexflow/ops/dropout.h | 7 + .../flexflow/ops/kernels/dropout_kernels.h | 16 ++- src/ops/dropout.cc | 38 ++++-- src/ops/kernels/dropout_kernels.cpp | 126 ++++++++++++++---- src/ops/kernels/dropout_kernels.cu | 12 +- 5 files changed, 151 insertions(+), 48 deletions(-) diff --git a/include/flexflow/ops/dropout.h b/include/flexflow/ops/dropout.h index 37304bdada..b8033c98ba 100644 --- a/include/flexflow/ops/dropout.h +++ b/include/flexflow/ops/dropout.h @@ -5,6 +5,13 @@ #include "flexflow/node.h" #include "flexflow/operator.h" #include "flexflow/ops/dropout_params.h" +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include +#include +#elif defined(FF_USE_HIP_ROCM) +#include +#include +#endif namespace FlexFlow { diff --git a/include/flexflow/ops/kernels/dropout_kernels.h b/include/flexflow/ops/kernels/dropout_kernels.h index 421974fbaa..b2201dd34e 100644 --- a/include/flexflow/ops/kernels/dropout_kernels.h +++ b/include/flexflow/ops/kernels/dropout_kernels.h @@ -5,6 +5,7 @@ #include "flexflow/fftype.h" #include "flexflow/op_meta.h" #include "flexflow/ops/dropout.h" +#include "flexflow/accessor.h" namespace FlexFlow { @@ -17,33 +18,40 @@ class DropoutMeta : public OpMeta { ~DropoutMeta(void); Realm::RegionInstance reserveInst; #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) + curandState *state; cudnnTensorDescriptor_t inputTensor, outputTensor; cudnnDropoutDescriptor_t dropoutDesc; #else miopenTensorDescriptor_t inputTensor, outputTensor; miopenDropoutDescriptor_t dropoutDesc; + hiprandState *state; #endif void *reserveSpace, *dropoutStates; size_t reserveSpaceSize, dropoutStateSize; + size_t num_elements; + long long seed; + float rate; }; namespace Kernels { namespace Dropout { void forward_kernel_wrapper(DropoutMeta *m, - float const *input_ptr, - float *output_ptr); + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); void backward_kernel_wrapper(DropoutMeta *m, - float const *output_grad_ptr, - float *input_grad_ptr); + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); namespace Internal { void forward_kernel(DropoutMeta *m, float const *input_ptr, float *output_ptr, + size_t num_elements, ffStream_t stream); void backward_kernel(DropoutMeta *m, float const *output_grad_ptr, float *input_grad_ptr, + size_t num_elements, ffStream_t stream); } // namespace Internal } // namespace Dropout diff --git a/src/ops/dropout.cc b/src/ops/dropout.cc index 55f6730827..2ebfaff539 100644 --- a/src/ops/dropout.cc +++ b/src/ops/dropout.cc @@ -28,7 +28,7 @@ using PCG::Node; using namespace FlexFlow::Kernels::Dropout; -Tensor FFModel::dropout(const Tensor input, +Tensor FFModel::dropout(Tensor const input, float rate, unsigned long long seed, char const *name) { @@ -86,7 +86,7 @@ bool operator==(DropoutParams const &lhs, DropoutParams const &rhs) { } Dropout::Dropout(FFModel &model, - const ParallelTensor _input, + ParallelTensor const _input, float _rate, unsigned long long _seed, char const *name) @@ -111,12 +111,12 @@ Dropout::Dropout(FFModel &model, Dropout::Dropout(FFModel &model, Dropout const &other, - const ParallelTensor input) + ParallelTensor const input) : Dropout(model, input, other.rate, other.seed, other.name) {} Dropout::Dropout(FFModel &model, DropoutParams const ¶ms, - const ParallelTensor input, + ParallelTensor const input, char const *name) : 
Dropout(model, input, params.rate, params.seed, name) {} @@ -210,12 +210,12 @@ void Dropout::forward_task(Task const *task, assert(task->regions.size() == 2); // const Dropout* dropout = (const Dropout*) task->args; DropoutMeta *m = *((DropoutMeta **)task->local_args); - float const *input_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); - float *output_ptr = helperGetTensorPointerWO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - - forward_kernel_wrapper(m, input_ptr, output_ptr); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + forward_kernel_wrapper(m, input, output); } void Dropout::backward(FFModel const &ff) { @@ -264,7 +264,13 @@ void Dropout::backward_task(Task const *task, float const *output_grad_ptr = helperGetTensorPointerRO( regions[1], task->regions[1], FID_DATA, ctx, runtime); - backward_kernel_wrapper(m, output_grad_ptr, input_grad_ptr); + + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + backward_kernel_wrapper(m, output_grad, input_grad); } void Dropout::serialize(Legion::Serializer &sez) const { @@ -304,30 +310,36 @@ bool Dropout::measure_operator_cost(Simulator *sim, sim->free_all(); float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); assert(input_ptr != NULL); + + GenericTensorAccessorR input_acc(m->input_type[0], sub_input.get_domain(), input_ptr); cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); assert(output_ptr != NULL); + + GenericTensorAccessorW output_acc(m->output_type[0], sub_input.get_domain(), output_ptr); cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); assert(m->profiling == false); std::function forward, backward; - forward = [&] { forward_kernel_wrapper(m, input_ptr, output_ptr); }; + forward = [&] { forward_kernel_wrapper(m, input_acc, output_acc); }; if (sim->computationMode == COMP_MODE_TRAINING) { float *input_grad_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); assert(input_grad_ptr != NULL); + GenericTensorAccessorW input_grad_acc(m->output_type[0], sub_input.get_domain(), input_grad_ptr); cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); float *output_grad_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); assert(output_grad_ptr != NULL); + GenericTensorAccessorR output_grad_acc(m->output_type[0], sub_input.get_domain(), output_grad_ptr); cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); backward = [&] { - backward_kernel_wrapper(m, output_grad_ptr, input_grad_ptr); + backward_kernel_wrapper(m, output_grad_acc, input_grad_acc); }; } diff --git a/src/ops/kernels/dropout_kernels.cpp b/src/ops/kernels/dropout_kernels.cpp index b0dd4c644e..c0d5748464 100644 --- a/src/ops/kernels/dropout_kernels.cpp +++ b/src/ops/kernels/dropout_kernels.cpp @@ -30,6 +30,11 @@ DropoutMeta::DropoutMeta(FFHandler handler, Domain const &output_domain) : OpMeta(handler) { profiling = 
dropout->profiling; + rate = dropout->rate; + seed = dropout->seed; + input_type[0] = dropout->data_type; + output_type[0] = dropout->data_type; + checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); checkCUDNN(miopenCreateDropoutDescriptor(&dropoutDesc)); @@ -78,20 +83,68 @@ DropoutMeta::~DropoutMeta(void) { namespace Kernels { namespace Dropout { +__global__ void dropout_forward_kernel(float p, + long long seed, + size_t num_elements, + float const *input_ptr, + float *output_ptr) { + CUDA_KERNEL_LOOP(i, num_elements) { + float scale = 1.0 / p; + hiprandStatePhilox4_32_10_t state; + hiprand_init(seed, i, 0, &state); + float rand = hiprand_uniform(&state); + if (input_ptr[i] < p) { + output_ptr[i] = 0; + } else { + output_ptr[i] = input_ptr[i] * scale; + } + } +} + +__global__ void dropout_backward_kernel(float p, + long long seed, + size_t num_elements, + float const *input_ptr, + float *output_ptr) { + CUDA_KERNEL_LOOP(i, num_elements) { + float scale = 1.0 / p; + hiprandStatePhilox4_32_10_t state; + hiprand_init(seed, i, 0, &state); + float rand = hiprand_uniform(&state); + if (input_ptr[i] < p) { + output_ptr[i] = 0; + } else { + output_ptr[i] = input_ptr[i] * scale; + } + } +} + void forward_kernel_wrapper(DropoutMeta *m, - float const *input_ptr, - float *output_ptr) { + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - Internal::forward_kernel(m, input_ptr, output_ptr, stream); + + Internal::forward_kernel(m, + input.get_float_ptr(), + output.get_float_ptr(), + input.domain.get_volume(), + stream); + + // printf("dropout %d\n", input.domain.get_volume()); + // assert(false); } void backward_kernel_wrapper(DropoutMeta *m, - float const *output_grad_ptr, - float *input_grad_ptr) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - Internal::backward_kernel(m, output_grad_ptr, input_grad_ptr, stream); + Internal::backward_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + output_grad.domain.get_volume(), + stream); } namespace Internal { @@ -99,35 +152,58 @@ namespace Internal { void forward_kernel(DropoutMeta *m, float const *input_ptr, float *output_ptr, + size_t num_elements, hipStream_t stream) { checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + int parallelism = num_elements; + hipLaunchKernelGGL(HIP_KERNEL_NAME(dropout_forward_kernel), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + m->seed, + m->rate, + num_elements, + input_ptr, + output_ptr); - checkCUDNN(miopenDropoutForward(m->handle.dnn, - m->dropoutDesc, - m->inputTensor /* not used */, - m->inputTensor, - input_ptr, - m->outputTensor, - output_ptr, - m->reserveSpace, - m->reserveSpaceSize)); + // checkCUDNN(miopenDropoutForward(m->handle.dnn, + // m->dropoutDesc, + // m->inputTensor /* not used */, + // m->inputTensor, + // input_ptr, + // m->outputTensor, + // output_ptr, + // m->reserveSpace, + // m->reserveSpaceSize)); } void backward_kernel(DropoutMeta *m, float const *output_grad_ptr, float *input_grad_ptr, + size_t num_elements, hipStream_t stream) { checkCUDNN(miopenSetStream(m->handle.dnn, stream)); - - checkCUDNN(miopenDropoutBackward(m->handle.dnn, - m->dropoutDesc, - m->inputTensor /* not used */, - m->outputTensor, - output_grad_ptr, - m->inputTensor, - input_grad_ptr, - m->reserveSpace, - 
m->reserveSpaceSize)); + int parallelism = num_elements; + hipLaunchKernelGGL(HIP_KERNEL_NAME(dropout_backward_kernel), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + m->seed, + m->rate, + num_elements, + output_grad_ptr, + input_grad_ptr); + // checkCUDNN(miopenDropoutBackward(m->handle.dnn, + // m->dropoutDesc, + // m->inputTensor /* not used */, + // m->outputTensor, + // output_grad_ptr, + // m->inputTensor, + // input_grad_ptr, + // m->reserveSpace, + // m->reserveSpaceSize)); } } // namespace Internal diff --git a/src/ops/kernels/dropout_kernels.cu b/src/ops/kernels/dropout_kernels.cu index 4a76301fd6..39037cb37f 100644 --- a/src/ops/kernels/dropout_kernels.cu +++ b/src/ops/kernels/dropout_kernels.cu @@ -75,19 +75,19 @@ namespace Kernels { namespace Dropout { void forward_kernel_wrapper(DropoutMeta *m, - float const *input_ptr, - float *output_ptr) { + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - Internal::forward_kernel(m, input_ptr, output_ptr, stream); + Internal::forward_kernel(m, input.get_float_ptr(), output.get_float_ptr(), stream); } void backward_kernel_wrapper(DropoutMeta *m, - float const *output_grad_ptr, - float *input_grad_ptr) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - Internal::backward_kernel(m, output_grad_ptr, input_grad_ptr, stream); + Internal::backward_kernel(m, output_grad.get_float_ptr(), input_grad.get_float_ptr(), stream); } namespace Internal { From fa1fffcdc70425b099393a56bbf39c5c6a30e198 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Wed, 20 Dec 2023 16:37:54 -0500 Subject: [PATCH 02/19] optimizer --- include/flexflow/flexflow_c.h | 2 + include/flexflow/model.h | 2 + include/flexflow/optimizer.h | 23 ++++ python/flexflow/core/flexflow_cffi.py | 7 ++ src/c/flexflow_c.cc | 5 + src/runtime/model.cc | 19 +++ src/runtime/optimizer.cc | 167 ++++++++++++++++++++++++++ src/runtime/optimizer_kernel.cpp | 64 ++++++++++ 8 files changed, 289 insertions(+) diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index bde40de31b..2ddc8549fa 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -95,6 +95,8 @@ void flexflow_model_compute_metrics(flexflow_model_t handle); void flexflow_model_update(flexflow_model_t handle); +void flexflow_model_unified_update(flexflow_model_t handle); + void flexflow_model_compile(flexflow_model_t handle, enum LossType loss_type, int *metrics, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 5e7ad8025c..82a29c8182 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -151,6 +151,7 @@ enum TaskIDs { // Optimizer with NCCL SGD_UPD_NCCL_TASK_ID, ADAM_UPD_NCCL_TASK_ID, + ADAM_UNIFY_UPD_NCCL_TASK_ID, // Initializer GLOROT_INIT_TASK_ID, ZERO_INIT_TASK_ID, @@ -777,6 +778,7 @@ class FFModel { void get_metrics(); void backward(int seq_length = -1); void update(); + void unified_update(); bool apply_fusion(std::vector const &operators, std::vector &new_operators); Op *get_final_operator() const; diff --git a/include/flexflow/optimizer.h b/include/flexflow/optimizer.h index bab7e6e4ed..1101a35ae5 100644 --- a/include/flexflow/optimizer.h +++ b/include/flexflow/optimizer.h @@ -30,6 +30,7 @@ class Optimizer { virtual void init(void) = 0; virtual void next(void) = 0; virtual void update(const ParallelTensor p) = 0; + virtual void 
unified_update(std::vector parameters) = 0; FFModel const *model; }; @@ -43,6 +44,7 @@ class SGDOptimizer : public Optimizer { void init(void); void next(void); void update(const ParallelTensor p); + void unified_update(std::vector parameters); void set_weight_decay(double _weight_decay); static void ps_update_task(Legion::Task const *task, std::vector const ®ions, @@ -60,6 +62,11 @@ class SGDOptimizer : public Optimizer { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void + nccl_unified_update_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void nccl_update_task_gpu(SGDOptimizer const *op, OpMeta const *meta, float const *w_grad_ptr, @@ -85,6 +92,7 @@ class AdamOptimizer : public Optimizer { void init(void); void next(void); void update(const ParallelTensor p); + void unified_update(std::vector parameters); void set_weight_decay(double _weight_decay); static void ps_update_task(Legion::Task const *task, std::vector const ®ions, @@ -103,6 +111,11 @@ class AdamOptimizer : public Optimizer { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void + nccl_unified_update_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void nccl_update_task_gpu(AdamOptimizer const *op, OpMeta const *meta, float const *w_grad_ptr, @@ -110,10 +123,20 @@ class AdamOptimizer : public Optimizer { float *w_ptr, float *v_ptr, float *m_ptr); + static void nccl_unified_update_task_gpu(AdamOptimizer const *op, + OpMeta const *meta, + float const *w_grad_ptr[], + size_t *size, + float *w_ptr[], + float *v_ptr[], + float *m_ptr[]); #endif double alpha, beta1, beta2, weight_decay, epsilon; double alpha_t, beta1_t, beta2_t; std::map v_values, m_values; + size_t reservedWorkSpaceSize; + int parameters_num; + int processed_parameters_num; }; }; // namespace FlexFlow diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index c05fb96661..49ad5b3dc8 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -2018,6 +2018,13 @@ def update(self): :returns: None -- no returns. """ ffc.flexflow_model_update(self.handle) + + def unified_update(self): + """Update weights and biases of all layers. + + :returns: None -- no returns. + """ + ffc.flexflow_model_unified_update(self.handle) def compile(self, optimizer=None, loss_type=None, metrics=None, comp_mode=None): """Configure the model for trainting. 
FlexFlow uses lazy initialization, diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index e3c430fd92..1d8634f224 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -178,6 +178,11 @@ void flexflow_model_update(flexflow_model_t handle_) { handle->update(); } +void flexflow_model_unified_update(flexflow_model_t handle_) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + handle->unified_update(); +} + void flexflow_model_compile(flexflow_model_t handle_, enum LossType loss_type, int *metrics, diff --git a/src/runtime/model.cc b/src/runtime/model.cc index c52d5e7f0c..b16766291a 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -2474,6 +2474,10 @@ void FFModel::update() { } } +void FFModel::unified_update() { + optimizer->unified_update(parameters); +} + Op *FFModel::get_final_operator() const { int idx = operators.size() - 1; while (operators[idx]->op_type == OP_INPUT || @@ -5302,6 +5306,21 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(ADAM_UNIFY_UPD_NCCL_TASK_ID, "Adam unified NCCL Update"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Adam unified NCCL Update Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } #endif // Initializer { diff --git a/src/runtime/optimizer.cc b/src/runtime/optimizer.cc index c42a0c9aa6..42e227f022 100644 --- a/src/runtime/optimizer.cc +++ b/src/runtime/optimizer.cc @@ -333,6 +333,7 @@ void AdamOptimizer::init(void) { Context ctx = model->config.lg_ctx; Runtime *runtime = model->config.lg_hlr; Initializer *initializer = new ZeroInitializer(); + reservedWorkSpaceSize = 0; for (size_t i = 0; i < model->parameters.size(); i++) { ParallelTensor p = model->parameters[i]; Domain domain = @@ -381,6 +382,8 @@ void AdamOptimizer::update(const ParallelTensor p) { assert(v_values.find(p->region) != v_values.end()); assert(m_values.find(p->region) != m_values.end()); assert(p->owner_op != NULL); + reservedWorkSpaceSize += p->get_volume() * sizeof(float); + printf("update workspace: %d", reservedWorkSpaceSize); if (p->sync_type == ParameterSyncType::PS) { TaskLauncher launcher(ADAM_UPD_PS_TASK_ID, TaskArgument(this, sizeof(AdamOptimizer)), @@ -492,6 +495,105 @@ void AdamOptimizer::update(const ParallelTensor p) { } } +void AdamOptimizer::unified_update(std::vector parameters) { + Context ctx = model->config.lg_ctx; + Runtime *runtime = model->config.lg_hlr; + + + printf("update workspace: %d", reservedWorkSpaceSize); + + const ParallelTensor p0 = parameters.at(0); + ArgumentMap argmap; + Domain domain = runtime->get_index_space_domain(ctx, p0->parallel_is); + switch (domain.get_dim()) { +#define DIMFUNC(DIM) \ + case DIM: { \ + Rect rect = domain; \ + int idx = 0; \ + for (PointInRectIterator it(rect); it(); it++) { \ + OpMeta *mp = p0->owner_op->meta[idx++]; \ + argmap.set_point(*it, TaskArgument(&mp, sizeof(OpMeta *))); \ + } \ + break; \ + } + LEGION_FOREACH_N(DIMFUNC) +#undef DIMFUNC + default: + assert(false); + } + + int offset = 0; + + while(parameters_num < parameters.size()){ + for(int i = 0; i < parameters.size(); i++){ + const ParallelTensor p = parameters.at(i); + assert(v_values.find(p->region) != v_values.end()); + assert(m_values.find(p->region) != m_values.end()); + assert(p->owner_op != NULL); + if(reservedWorkSpaceSize + 
p->get_volume() * sizeof(float) >= model->handlers->workSpaceSize){ + break; + } + reservedWorkSpaceSize += p->get_volume() * sizeof(float); + parameters_num += 1; + assert (p->sync_type == ParameterSyncType::NCCL); + assert(p->parallel_is != IndexSpace::NO_SPACE); + } + assert(parameters_num <= parameters.size()); + + //launch a unified task + for(int j = 0; j < parameters_num; j++){ + this->next(); + const ParallelTensor p = parameters.at(processed_parameters_num + j); + IndexLauncher launcher(ADAM_UNIFY_UPD_NCCL_TASK_ID, + p0->parallel_is, + TaskArgument(this, sizeof(AdamOptimizer)), + argmap, + Predicate::TRUE_PRED, + false /*must_epoch*/, + 0 /*mapper_id*/, + p0->machine_view.hash()); + // regions[0]: region_grad + launcher.add_region_requirement(RegionRequirement(p->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + p->region_grad)); + launcher.add_field(offset, FID_DATA); + // regions[1]: region + launcher.add_region_requirement(RegionRequirement( + p->part, 0 /*projection id*/, READ_WRITE, EXCLUSIVE, p->region)); + launcher.add_field(offset + 1, FID_DATA); + // regions[2]: w_region + launcher.add_region_requirement( + RegionRequirement(v_values[p->region]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + v_values[p->region]->region)); + launcher.add_field(offset + 2, FID_DATA); + // regions[3]: m_region + launcher.add_region_requirement( + RegionRequirement(m_values[p->region]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + m_values[p->region]->region)); + launcher.add_field(offset + 3, FID_DATA); + offset += 4; + launcher.concurrent = true; + FutureMap fm = runtime->execute_index_space(ctx, launcher); + // runtime->execute_must_epoch(ctx, must_epoch_launcher); + runtime->issue_execution_fence(ctx); + reservedWorkSpaceSize = 0; + offset = 0; + processed_parameters_num += parameters_num; + } + printf("offset: %d\n", offset); + + } + +} + void AdamOptimizer::ps_update_task(Task const *task, std::vector const ®ions, Context ctx, @@ -605,6 +707,71 @@ void AdamOptimizer::nccl_update_task(Task const *task, nccl_update_task_gpu(op, meta, w_grad_ptr, size, w_ptr, v_ptr, m_ptr); } + + +void AdamOptimizer::nccl_unified_update_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + // assert(regions.size() == 4); + // assert(task->regions.size() == 4); + AdamOptimizer const *op = (AdamOptimizer *)task->args; + OpMeta const *meta = *((OpMeta **)task->local_args); + // FFHandler handler = *((FFHandler*) task->local_args); + Domain domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + + printf("parameters_num: %d\n", op->parameters_num); + float const *w_grad_ptr = NULL; + float *w_ptr = NULL, *v_ptr = NULL, *m_ptr = NULL; + size_t size = 0; + + for(int i = 0; i < op->parameters_num; i++){ + + switch (domain.get_dim()) { +#define DIMFUNC(DIM) \ + case DIM: { \ + TensorAccessorR accWGrad( \ + regions[0], task->regions[0], FID_DATA, ctx, runtime); \ + TensorAccessorW accW(regions[1], \ + task->regions[1], \ + FID_DATA, \ + ctx, \ + runtime, \ + true /*readOutput*/); \ + TensorAccessorW accV(regions[2], \ + task->regions[2], \ + FID_DATA, \ + ctx, \ + runtime, \ + true /*readOutput*/); \ + TensorAccessorW accM(regions[3], \ + task->regions[3], \ + FID_DATA, \ + ctx, \ + runtime, \ + true /*readOutput*/); \ + size = accW.rect.volume(); \ + assert(accWGrad.rect == accW.rect); \ + assert(accWGrad.rect == accV.rect); \ + assert(accWGrad.rect == accM.rect); \ + w_grad_ptr = accWGrad.ptr; \ + w_ptr = 
accW.ptr; \ + v_ptr = accV.ptr; \ + m_ptr = accM.ptr; \ + break; \ + } + LEGION_FOREACH_N(DIMFUNC) +#undef DIMFUNC + default: { + // Unsupported dims + assert(false); + } + } + } + + nccl_unified_update_task_gpu(op, meta, w_grad_ptr, size, w_ptr, v_ptr, m_ptr); +} #endif }; // namespace FlexFlow diff --git a/src/runtime/optimizer_kernel.cpp b/src/runtime/optimizer_kernel.cpp index e71adc87a8..039b27c654 100644 --- a/src/runtime/optimizer_kernel.cpp +++ b/src/runtime/optimizer_kernel.cpp @@ -204,6 +204,7 @@ __host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op, m_ptr, v_ptr, w_ptr); + // checkCUDA(hipDeviceSynchronize()); } @@ -245,6 +246,69 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, w_ptr); // checkCUDA(hipDeviceSynchronize()); } + +__host__ void AdamOptimizer::nccl_unified_update_task_gpu(AdamOptimizer const *op, + OpMeta const *meta, + float const *w_grad_ptr[], + size_t *size, + float *w_ptr[], + float *v_ptr[], + float *m_ptr[]) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(op->reservedWorkSpaceSize < meta->handle.workSpaceSize); + + void *workSpace_ptr = static_cast(meta->handle.workSpace); + + for(int i = 0; i < op->parameters_num; i++){ + checkCUDA(hipMemcpyAsync(workSpace_ptr, + w_grad_ptr[i], + size[i] * sizeof(float), + hipMemcpyDeviceToDevice, + stream)); + workSpace_ptr += size[i] * sizeof(float); + } + + //do allreduce once + checkNCCL(ncclAllReduce(meta->handle.workSpace, + (float *)meta->handle.workSpace, + op->reservedWorkSpaceSize / sizeof(float), + ncclFloat, + ncclSum, + meta->handle.ncclComm, + stream)); + + workSpace_ptr = static_cast(meta->handle.workSpace); + float alpha_t = op->alpha_t; + float beta1_t = op->beta1_t; + float beta2_t = op->beta2_t; + for(int i = 0; i < op->parameters_num; i++){ + // update + hipLaunchKernelGGL(HIP_KERNEL_NAME(adam_update), + GET_BLOCKS(size), + CUDA_NUM_THREADS, + 0, + stream, + size[i], + alpha_t, + op->beta1, + op->beta2, + op->weight_decay, + op->epsilon, + static_cast(workSpace_ptr), + m_ptr[i], + v_ptr[i], + w_ptr[i]); + workSpace_ptr += size[i] * sizeof(float); + + //update + beta1_t *= beta1; + beta2_t *= beta2; + alpha_t = alpha * sqrt(1 - beta2_t) / (1 - beta1_t); + } + + // checkCUDA(hipDeviceSynchronize()); +} #endif }; // namespace FlexFlow \ No newline at end of file From 40d830c8b9c67e66f25bfe17ecc6f407ef179865 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Wed, 20 Dec 2023 19:09:18 -0500 Subject: [PATCH 03/19] opt --- include/flexflow/optimizer.h | 12 ++-- src/runtime/optimizer.cc | 111 +++++++++++++------------------ src/runtime/optimizer_kernel.cpp | 33 +++++---- 3 files changed, 72 insertions(+), 84 deletions(-) diff --git a/include/flexflow/optimizer.h b/include/flexflow/optimizer.h index 1101a35ae5..201408868b 100644 --- a/include/flexflow/optimizer.h +++ b/include/flexflow/optimizer.h @@ -30,7 +30,7 @@ class Optimizer { virtual void init(void) = 0; virtual void next(void) = 0; virtual void update(const ParallelTensor p) = 0; - virtual void unified_update(std::vector parameters) = 0; + virtual void unified_update(std::vector const parameters) = 0; FFModel const *model; }; @@ -44,7 +44,7 @@ class SGDOptimizer : public Optimizer { void init(void); void next(void); void update(const ParallelTensor p); - void unified_update(std::vector parameters); + void unified_update(std::vector const parameters); void set_weight_decay(double _weight_decay); static void ps_update_task(Legion::Task const *task, std::vector const ®ions, @@ -92,7 +92,7 @@ 
class AdamOptimizer : public Optimizer { void init(void); void next(void); void update(const ParallelTensor p); - void unified_update(std::vector parameters); + void unified_update(std::vector const parameters); void set_weight_decay(double _weight_decay); static void ps_update_task(Legion::Task const *task, std::vector const ®ions, @@ -134,9 +134,9 @@ class AdamOptimizer : public Optimizer { double alpha, beta1, beta2, weight_decay, epsilon; double alpha_t, beta1_t, beta2_t; std::map v_values, m_values; - size_t reservedWorkSpaceSize; - int parameters_num; - int processed_parameters_num; + size_t reservedWorkSpaceSize = 0; + int parameters_num = 0; + int processed_parameters_num = 0; }; }; // namespace FlexFlow diff --git a/src/runtime/optimizer.cc b/src/runtime/optimizer.cc index 42e227f022..fc30ae12cc 100644 --- a/src/runtime/optimizer.cc +++ b/src/runtime/optimizer.cc @@ -383,7 +383,6 @@ void AdamOptimizer::update(const ParallelTensor p) { assert(m_values.find(p->region) != m_values.end()); assert(p->owner_op != NULL); reservedWorkSpaceSize += p->get_volume() * sizeof(float); - printf("update workspace: %d", reservedWorkSpaceSize); if (p->sync_type == ParameterSyncType::PS) { TaskLauncher launcher(ADAM_UPD_PS_TASK_ID, TaskArgument(this, sizeof(AdamOptimizer)), @@ -495,13 +494,13 @@ void AdamOptimizer::update(const ParallelTensor p) { } } -void AdamOptimizer::unified_update(std::vector parameters) { +void SGDOptimizer::unified_update(std::vector const parameters) { + //todo +} + +void AdamOptimizer::unified_update(std::vector const parameters) { Context ctx = model->config.lg_ctx; Runtime *runtime = model->config.lg_hlr; - - - printf("update workspace: %d", reservedWorkSpaceSize); - const ParallelTensor p0 = parameters.at(0); ArgumentMap argmap; Domain domain = runtime->get_index_space_domain(ctx, p0->parallel_is); @@ -523,8 +522,10 @@ void AdamOptimizer::unified_update(std::vector parameters) { } int offset = 0; + printf("param size: %d, %d\n", parameters.size(), parameters_num); - while(parameters_num < parameters.size()){ + while(processed_parameters_num < parameters.size()){ + for(int i = 0; i < parameters.size(); i++){ const ParallelTensor p = parameters.at(i); assert(v_values.find(p->region) != v_values.end()); @@ -538,13 +539,11 @@ void AdamOptimizer::unified_update(std::vector parameters) { assert (p->sync_type == ParameterSyncType::NCCL); assert(p->parallel_is != IndexSpace::NO_SPACE); } + + printf("parameters_num: %d %d, %d\n", parameters_num, reservedWorkSpaceSize, model->handlers->workSpaceSize); assert(parameters_num <= parameters.size()); - //launch a unified task - for(int j = 0; j < parameters_num; j++){ - this->next(); - const ParallelTensor p = parameters.at(processed_parameters_num + j); - IndexLauncher launcher(ADAM_UNIFY_UPD_NCCL_TASK_ID, + IndexLauncher launcher(ADAM_UNIFY_UPD_NCCL_TASK_ID, p0->parallel_is, TaskArgument(this, sizeof(AdamOptimizer)), argmap, @@ -552,6 +551,10 @@ void AdamOptimizer::unified_update(std::vector parameters) { false /*must_epoch*/, 0 /*mapper_id*/, p0->machine_view.hash()); + //launch a unified task + for(int j = 0; j < parameters_num; j++){ + const ParallelTensor p = parameters.at(processed_parameters_num + j); + // regions[0]: region_grad launcher.add_region_requirement(RegionRequirement(p->part_grad, 0 /*projection id*/, @@ -580,16 +583,19 @@ void AdamOptimizer::unified_update(std::vector parameters) { m_values[p->region]->region)); launcher.add_field(offset + 3, FID_DATA); offset += 4; - launcher.concurrent = true; - FutureMap fm = 
runtime->execute_index_space(ctx, launcher); - // runtime->execute_must_epoch(ctx, must_epoch_launcher); - runtime->issue_execution_fence(ctx); - reservedWorkSpaceSize = 0; - offset = 0; - processed_parameters_num += parameters_num; } - printf("offset: %d\n", offset); + //update alpha, beta + for(int i = 0; i < parameters_num; i++){ + this->next(); + } + launcher.concurrent = true; + FutureMap fm = runtime->execute_index_space(ctx, launcher); + // runtime->execute_must_epoch(ctx, must_epoch_launcher); + runtime->issue_execution_fence(ctx); + reservedWorkSpaceSize = 0; + offset = 0; + processed_parameters_num += parameters_num; } } @@ -721,55 +727,28 @@ void AdamOptimizer::nccl_unified_update_task(Task const *task, Domain domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); - printf("parameters_num: %d\n", op->parameters_num); - float const *w_grad_ptr = NULL; - float *w_ptr = NULL, *v_ptr = NULL, *m_ptr = NULL; - size_t size = 0; + float const *w_grad_ptr[op->parameters_num]; + float *w_ptr[op->parameters_num], *v_ptr[op->parameters_num], *m_ptr[op->parameters_num]; + size_t size[op->parameters_num]; + int offset = 0; + printf("parameters_num: %d\n", op->parameters_num); + for(int i = 0; i < op->parameters_num; i++){ - - switch (domain.get_dim()) { -#define DIMFUNC(DIM) \ - case DIM: { \ - TensorAccessorR accWGrad( \ - regions[0], task->regions[0], FID_DATA, ctx, runtime); \ - TensorAccessorW accW(regions[1], \ - task->regions[1], \ - FID_DATA, \ - ctx, \ - runtime, \ - true /*readOutput*/); \ - TensorAccessorW accV(regions[2], \ - task->regions[2], \ - FID_DATA, \ - ctx, \ - runtime, \ - true /*readOutput*/); \ - TensorAccessorW accM(regions[3], \ - task->regions[3], \ - FID_DATA, \ - ctx, \ - runtime, \ - true /*readOutput*/); \ - size = accW.rect.volume(); \ - assert(accWGrad.rect == accW.rect); \ - assert(accWGrad.rect == accV.rect); \ - assert(accWGrad.rect == accM.rect); \ - w_grad_ptr = accWGrad.ptr; \ - w_ptr = accW.ptr; \ - v_ptr = accV.ptr; \ - m_ptr = accM.ptr; \ - break; \ + GenericTensorAccessorR accWGrad = helperGetGenericTensorAccessorRO(DataType::DT_FLOAT, regions[offset], task->regions[offset], FID_DATA, ctx, runtime); + GenericTensorAccessorW accW = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, regions[offset+1], task->regions[offset+1], FID_DATA, ctx, runtime); + GenericTensorAccessorW accV = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, regions[offset+2], task->regions[offset+2], FID_DATA, ctx, runtime); + GenericTensorAccessorW accM = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, regions[offset+3], task->regions[offset+3], FID_DATA, ctx, runtime); + offset += 4; + + size[i] = accW.domain.get_volume(); + // assert(accWGrad.rect == accW.rect); + // assert(accWGrad.rect == accV.rect); + // assert(accWGrad.rect == accM.rect); + w_ptr[i] = accW.get_float_ptr(); + v_ptr[i] = accV.get_float_ptr(); + m_ptr[i] = accM.get_float_ptr(); } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: { - // Unsupported dims - assert(false); - } - } - } - nccl_unified_update_task_gpu(op, meta, w_grad_ptr, size, w_ptr, v_ptr, m_ptr); } #endif diff --git a/src/runtime/optimizer_kernel.cpp b/src/runtime/optimizer_kernel.cpp index 039b27c654..5e7e613bc7 100644 --- a/src/runtime/optimizer_kernel.cpp +++ b/src/runtime/optimizer_kernel.cpp @@ -254,19 +254,27 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu(AdamOptimizer const *o float *w_ptr[], float *v_ptr[], float *m_ptr[]) { + hipStream_t stream; 
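  // Workspace layout assumed by this unified update:
  //   handle.workSpace = [ grad(param 0) | grad(param 1) | ... | grad(param N-1) ]
  // The per-parameter offsets below walk that packed layout: a single
  // ncclAllReduce sums the whole concatenated buffer across ranks, and each
  // parameter's adam_update then reads its slice at the matching byte offset
  // while alpha_t, beta1_t and beta2_t are advanced locally, once per parameter.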
checkCUDA(get_legion_stream(&stream)); assert(op->reservedWorkSpaceSize < meta->handle.workSpaceSize); - void *workSpace_ptr = static_cast(meta->handle.workSpace); + void *workSpace_ptr = meta->handle.workSpace; for(int i = 0; i < op->parameters_num; i++){ - checkCUDA(hipMemcpyAsync(workSpace_ptr, - w_grad_ptr[i], - size[i] * sizeof(float), - hipMemcpyDeviceToDevice, - stream)); - workSpace_ptr += size[i] * sizeof(float); + // hipMemcpyAsync(static_cast(workSpace_ptr), + // w_grad_ptr[i], + // size[i] * sizeof(float), + // hipMemcpyDeviceToDevice, + // stream); + // hipError_t error = hipGetLastError(); + // if(error != hipSuccess) + // { + // // print the CUDA error message and exit + // printf("CUDA error: %s\n", hipGetErrorString(error)); + // } + + workSpace_ptr = static_cast(workSpace_ptr) + size[i] * sizeof(float); } //do allreduce once @@ -284,8 +292,9 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu(AdamOptimizer const *o float beta2_t = op->beta2_t; for(int i = 0; i < op->parameters_num; i++){ // update + std::cout<<"update"<<"\n"; hipLaunchKernelGGL(HIP_KERNEL_NAME(adam_update), - GET_BLOCKS(size), + GET_BLOCKS(size[i]), CUDA_NUM_THREADS, 0, stream, @@ -299,12 +308,12 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu(AdamOptimizer const *o m_ptr[i], v_ptr[i], w_ptr[i]); - workSpace_ptr += size[i] * sizeof(float); + workSpace_ptr = static_cast(workSpace_ptr) + size[i] * sizeof(float); //update - beta1_t *= beta1; - beta2_t *= beta2; - alpha_t = alpha * sqrt(1 - beta2_t) / (1 - beta1_t); + beta1_t *= op->beta1; + beta2_t *= op->beta2; + alpha_t = op->alpha * sqrt(1 - beta2_t) / (1 - beta1_t); } // checkCUDA(hipDeviceSynchronize()); From e8255265235c041e477119fc68015c0213c90d77 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Wed, 20 Dec 2023 20:46:51 -0500 Subject: [PATCH 04/19] fix --- src/runtime/optimizer.cc | 14 ++++++++++---- src/runtime/optimizer_kernel.cpp | 19 ++++++------------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/src/runtime/optimizer.cc b/src/runtime/optimizer.cc index fc30ae12cc..cf23cc39a5 100644 --- a/src/runtime/optimizer.cc +++ b/src/runtime/optimizer.cc @@ -522,7 +522,7 @@ void AdamOptimizer::unified_update(std::vector const parameters) } int offset = 0; - printf("param size: %d, %d\n", parameters.size(), parameters_num); + // printf("param size: %d, %d\n", parameters.size(), parameters_num); while(processed_parameters_num < parameters.size()){ @@ -540,7 +540,7 @@ void AdamOptimizer::unified_update(std::vector const parameters) assert(p->parallel_is != IndexSpace::NO_SPACE); } - printf("parameters_num: %d %d, %d\n", parameters_num, reservedWorkSpaceSize, model->handlers->workSpaceSize); + // printf("parameters_num: %d %d, %d\n", parameters_num, reservedWorkSpaceSize, model->handlers->workSpaceSize); assert(parameters_num <= parameters.size()); IndexLauncher launcher(ADAM_UNIFY_UPD_NCCL_TASK_ID, @@ -729,7 +729,12 @@ void AdamOptimizer::nccl_unified_update_task(Task const *task, float const *w_grad_ptr[op->parameters_num]; float *w_ptr[op->parameters_num], *v_ptr[op->parameters_num], *m_ptr[op->parameters_num]; - size_t size[op->parameters_num]; + + hipMalloc(w_grad_ptr, sizeof(float*) * op->parameters_num); + hipMalloc(w_ptr, sizeof(float*) * op->parameters_num); + hipMalloc(v_ptr, sizeof(float*) * op->parameters_num); + hipMalloc(m_ptr, sizeof(float*) * op->parameters_num); + size_t *size = new size_t[op->parameters_num]; int offset = 0; printf("parameters_num: %d\n", op->parameters_num); @@ -741,10 +746,11 @@ void 
AdamOptimizer::nccl_unified_update_task(Task const *task, GenericTensorAccessorW accM = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, regions[offset+3], task->regions[offset+3], FID_DATA, ctx, runtime); offset += 4; - size[i] = accW.domain.get_volume(); + size[i] = accWGrad.domain.get_volume(); // assert(accWGrad.rect == accW.rect); // assert(accWGrad.rect == accV.rect); // assert(accWGrad.rect == accM.rect); + w_grad_ptr[i] = accWGrad.get_float_ptr(); w_ptr[i] = accW.get_float_ptr(); v_ptr[i] = accV.get_float_ptr(); m_ptr[i] = accM.get_float_ptr(); diff --git a/src/runtime/optimizer_kernel.cpp b/src/runtime/optimizer_kernel.cpp index 5e7e613bc7..a0e3180185 100644 --- a/src/runtime/optimizer_kernel.cpp +++ b/src/runtime/optimizer_kernel.cpp @@ -262,18 +262,11 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu(AdamOptimizer const *o void *workSpace_ptr = meta->handle.workSpace; for(int i = 0; i < op->parameters_num; i++){ - // hipMemcpyAsync(static_cast(workSpace_ptr), - // w_grad_ptr[i], - // size[i] * sizeof(float), - // hipMemcpyDeviceToDevice, - // stream); - // hipError_t error = hipGetLastError(); - // if(error != hipSuccess) - // { - // // print the CUDA error message and exit - // printf("CUDA error: %s\n", hipGetErrorString(error)); - // } - + hipMemcpyAsync(workSpace_ptr, + w_grad_ptr[i], + size[i] * sizeof(float), + hipMemcpyDeviceToDevice, + stream); workSpace_ptr = static_cast(workSpace_ptr) + size[i] * sizeof(float); } @@ -292,7 +285,7 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu(AdamOptimizer const *o float beta2_t = op->beta2_t; for(int i = 0; i < op->parameters_num; i++){ // update - std::cout<<"update"<<"\n"; + // printf("update %d\n", i); hipLaunchKernelGGL(HIP_KERNEL_NAME(adam_update), GET_BLOCKS(size[i]), CUDA_NUM_THREADS, From d2bdb15a4f360c7e72e6d42b1a29e07fba24dcad Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Thu, 11 Jan 2024 23:38:06 -0500 Subject: [PATCH 05/19] fix --- include/flexflow/optimizer.h | 9 +-- src/ops/fused.cu | 8 +-- src/ops/kernels/dropout_kernels.cu | 97 ++++++++++++++++++++++++------ src/runtime/optimizer.cc | 41 +++++++------ src/runtime/optimizer_kernel.cpp | 93 ++++++++++++++-------------- src/runtime/optimizer_kernel.cu | 62 +++++++++++++++++++ 6 files changed, 216 insertions(+), 94 deletions(-) diff --git a/include/flexflow/optimizer.h b/include/flexflow/optimizer.h index 201408868b..2b710d1e7d 100644 --- a/include/flexflow/optimizer.h +++ b/include/flexflow/optimizer.h @@ -18,6 +18,7 @@ #include "flexflow/parallel_tensor.h" #include "legion.h" +#include "accessor.h" namespace FlexFlow { @@ -125,11 +126,11 @@ class AdamOptimizer : public Optimizer { float *m_ptr); static void nccl_unified_update_task_gpu(AdamOptimizer const *op, OpMeta const *meta, - float const *w_grad_ptr[], + GenericTensorAccessorR *accWGrads, size_t *size, - float *w_ptr[], - float *v_ptr[], - float *m_ptr[]); + GenericTensorAccessorW *accWs, + GenericTensorAccessorW *accVs, + GenericTensorAccessorW *accMs); #endif double alpha, beta1, beta2, weight_decay, epsilon; double alpha_t, beta1_t, beta2_t; diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 62262c89af..e43ce51f9c 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -216,9 +216,7 @@ __host__ void FusedOp::forward_task(Task const *task, assert(fused->op_num_outputs[op] == 1); DropoutMeta *m = (DropoutMeta *)metas->meta[op]; Kernels::Dropout::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); + m, my_input_accessor[0], 
my_output_accessor[0]); break; } case OP_LINEAR: { @@ -819,9 +817,7 @@ __host__ void FusedOp::backward_task(Task const *task, assert(fused->op_num_outputs[op] == 1); DropoutMeta *m = (DropoutMeta *)metas->meta[op]; Kernels::Dropout::backward_kernel_wrapper( - m, - my_output_grad_accessor[0].get_float_ptr(), - my_input_grad_accessor[0].get_float_ptr()); + m, my_output_grad_accessor[0], my_input_grad_accessor[0]); break; } case OP_EW_ADD: diff --git a/src/ops/kernels/dropout_kernels.cu b/src/ops/kernels/dropout_kernels.cu index 39037cb37f..80ea2de941 100644 --- a/src/ops/kernels/dropout_kernels.cu +++ b/src/ops/kernels/dropout_kernels.cu @@ -74,12 +74,52 @@ DropoutMeta::~DropoutMeta(void) { namespace Kernels { namespace Dropout { +__global__ void dropout_forward_kernel(float p, + long long seed, + size_t num_elements, + float const *input_ptr, + float *output_ptr) { + CUDA_KERNEL_LOOP(i, num_elements) { + float scale = 1.0 / p; + curandStatePhilox4_32_10_t state; + curand_init(seed, i, 0, &state); + float rand = curand_uniform(&state); + if (input_ptr[i] < p) { + output_ptr[i] = 0; + } else { + output_ptr[i] = input_ptr[i] * scale; + } + } +} + +__global__ void dropout_backward_kernel(float p, + long long seed, + size_t num_elements, + float const *input_ptr, + float *output_ptr) { + CUDA_KERNEL_LOOP(i, num_elements) { + float scale = 1.0 / p; + curandStatePhilox4_32_10_t state; + curand_init(seed, i, 0, &state); + float rand = curand_uniform(&state); + if (input_ptr[i] < p) { + output_ptr[i] = 0; + } else { + output_ptr[i] = input_ptr[i] * scale; + } + } +} + void forward_kernel_wrapper(DropoutMeta *m, - GenericTensorAccessorR const &input, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - Internal::forward_kernel(m, input.get_float_ptr(), output.get_float_ptr(), stream); + Internal::forward_kernel(m, + input.get_float_ptr(), + output.get_float_ptr(), + input.domain.get_volume(), + stream); } void backward_kernel_wrapper(DropoutMeta *m, @@ -87,7 +127,11 @@ void backward_kernel_wrapper(DropoutMeta *m, GenericTensorAccessorW const &input_grad) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - Internal::backward_kernel(m, output_grad.get_float_ptr(), input_grad.get_float_ptr(), stream); + Internal::backward_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + output_grad.domain.get_volume(), + stream); } namespace Internal { @@ -95,33 +139,48 @@ namespace Internal { void forward_kernel(DropoutMeta *m, float const *input_ptr, float *output_ptr, + size_t num_elements, cudaStream_t stream) { checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - checkCUDNN(cudnnDropoutForward(m->handle.dnn, - m->dropoutDesc, - m->inputTensor, - input_ptr, - m->outputTensor, - output_ptr, - m->reserveSpace, - m->reserveSpaceSize)); + int parallelism = num_elements; + dropout_forward_kernel<<>>( + m->seed, m->rate, num_elements, input_ptr, output_ptr); + + // checkCUDNN(cudnnDropoutForward(m->handle.dnn, + // m->dropoutDesc, + // m->inputTensor, + // input_ptr, + // m->outputTensor, + // output_ptr, + // m->reserveSpace, + // m->reserveSpaceSize)); } void backward_kernel(DropoutMeta *m, float const *output_grad_ptr, float *input_grad_ptr, + size_t num_elements, cudaStream_t stream) { checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + int parallelism = num_elements; + dropout_backward_kernel<<>>( + m->seed, m->rate, num_elements, output_grad_ptr, input_grad_ptr); - 
checkCUDNN(cudnnDropoutBackward(m->handle.dnn, - m->dropoutDesc, - m->outputTensor, - output_grad_ptr, - m->inputTensor, - input_grad_ptr, - m->reserveSpace, - m->reserveSpaceSize)); + // checkCUDNN(cudnnDropoutBackward(m->handle.dnn, + // m->dropoutDesc, + // m->outputTensor, + // output_grad_ptr, + // m->inputTensor, + // input_grad_ptr, + // m->reserveSpace, + // m->reserveSpaceSize)); } } // namespace Internal diff --git a/src/runtime/optimizer.cc b/src/runtime/optimizer.cc index cf23cc39a5..3bca298b01 100644 --- a/src/runtime/optimizer.cc +++ b/src/runtime/optimizer.cc @@ -727,35 +727,36 @@ void AdamOptimizer::nccl_unified_update_task(Task const *task, Domain domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); - float const *w_grad_ptr[op->parameters_num]; - float *w_ptr[op->parameters_num], *v_ptr[op->parameters_num], *m_ptr[op->parameters_num]; - - hipMalloc(w_grad_ptr, sizeof(float*) * op->parameters_num); - hipMalloc(w_ptr, sizeof(float*) * op->parameters_num); - hipMalloc(v_ptr, sizeof(float*) * op->parameters_num); - hipMalloc(m_ptr, sizeof(float*) * op->parameters_num); + // float const *w_grad_ptr[op->parameters_num]; + // float *w_ptr[op->parameters_num], *v_ptr[op->parameters_num], *m_ptr[op->parameters_num]; + + // hipMalloc(w_grad_ptr, sizeof(float*) * op->parameters_num); + // hipMalloc(w_ptr, sizeof(float*) * op->parameters_num); + // hipMalloc(v_ptr, sizeof(float*) * op->parameters_num); + // hipMalloc(m_ptr, sizeof(float*) * op->parameters_num); + GenericTensorAccessorR accWGrads[op->parameters_num]; + GenericTensorAccessorW accWs[op->parameters_num]; + GenericTensorAccessorW accVs[op->parameters_num]; + GenericTensorAccessorW accMs[op->parameters_num]; size_t *size = new size_t[op->parameters_num]; int offset = 0; printf("parameters_num: %d\n", op->parameters_num); for(int i = 0; i < op->parameters_num; i++){ - GenericTensorAccessorR accWGrad = helperGetGenericTensorAccessorRO(DataType::DT_FLOAT, regions[offset], task->regions[offset], FID_DATA, ctx, runtime); - GenericTensorAccessorW accW = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, regions[offset+1], task->regions[offset+1], FID_DATA, ctx, runtime); - GenericTensorAccessorW accV = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, regions[offset+2], task->regions[offset+2], FID_DATA, ctx, runtime); - GenericTensorAccessorW accM = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, regions[offset+3], task->regions[offset+3], FID_DATA, ctx, runtime); + accWGrads[i] = helperGetGenericTensorAccessorRO(DataType::DT_FLOAT, regions[offset], task->regions[offset], FID_DATA, ctx, runtime); + accWs[i] = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, regions[offset+1], task->regions[offset+1], FID_DATA, ctx, runtime); + accVs[i] = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, regions[offset+2], task->regions[offset+2], FID_DATA, ctx, runtime); + accMs[i] = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, regions[offset+3], task->regions[offset+3], FID_DATA, ctx, runtime); offset += 4; - size[i] = accWGrad.domain.get_volume(); - // assert(accWGrad.rect == accW.rect); - // assert(accWGrad.rect == accV.rect); - // assert(accWGrad.rect == accM.rect); - w_grad_ptr[i] = accWGrad.get_float_ptr(); - w_ptr[i] = accW.get_float_ptr(); - v_ptr[i] = accV.get_float_ptr(); - m_ptr[i] = accM.get_float_ptr(); + size[i] = accWGrads[i].domain.get_volume(); + // w_grad_ptr[i] = accWGrad.get_float_ptr(); + // w_ptr[i] = accW.get_float_ptr(); + // v_ptr[i] = accV.get_float_ptr(); 
+ // m_ptr[i] = accM.get_float_ptr(); } - nccl_unified_update_task_gpu(op, meta, w_grad_ptr, size, w_ptr, v_ptr, m_ptr); + nccl_unified_update_task_gpu(op, meta, accWGrads, size, accWs, accVs, accMs); } #endif diff --git a/src/runtime/optimizer_kernel.cpp b/src/runtime/optimizer_kernel.cpp index a0e3180185..0b7b459d9b 100644 --- a/src/runtime/optimizer_kernel.cpp +++ b/src/runtime/optimizer_kernel.cpp @@ -204,7 +204,7 @@ __host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op, m_ptr, v_ptr, w_ptr); - + // checkCUDA(hipDeviceSynchronize()); } @@ -247,68 +247,71 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, // checkCUDA(hipDeviceSynchronize()); } -__host__ void AdamOptimizer::nccl_unified_update_task_gpu(AdamOptimizer const *op, - OpMeta const *meta, - float const *w_grad_ptr[], - size_t *size, - float *w_ptr[], - float *v_ptr[], - float *m_ptr[]) { +__host__ void AdamOptimizer::nccl_unified_update_task_gpu( + AdamOptimizer const *op, + OpMeta const *meta, + GenericTensorAccessorR *accWGrads, + size_t *size, + GenericTensorAccessorW *accWs, + GenericTensorAccessorW *accVs, + GenericTensorAccessorW *accMs) { hipStream_t stream; - checkCUDA(get_legion_stream(&stream)); + checkCUDA(get_legion_stream(&stream)); assert(op->reservedWorkSpaceSize < meta->handle.workSpaceSize); void *workSpace_ptr = meta->handle.workSpace; - for(int i = 0; i < op->parameters_num; i++){ + for (int i = 0; i < op->parameters_num; i++) { hipMemcpyAsync(workSpace_ptr, - w_grad_ptr[i], - size[i] * sizeof(float), - hipMemcpyDeviceToDevice, - stream); - workSpace_ptr = static_cast(workSpace_ptr) + size[i] * sizeof(float); + accWGrads[i].get_float_ptr(), + size[i] * sizeof(float), + hipMemcpyDeviceToDevice, + stream); + workSpace_ptr = + static_cast(workSpace_ptr) + size[i] * sizeof(float); } - //do allreduce once + // do allreduce once checkNCCL(ncclAllReduce(meta->handle.workSpace, - (float *)meta->handle.workSpace, - op->reservedWorkSpaceSize / sizeof(float), - ncclFloat, - ncclSum, - meta->handle.ncclComm, - stream)); + (float *)meta->handle.workSpace, + op->reservedWorkSpaceSize / sizeof(float), + ncclFloat, + ncclSum, + meta->handle.ncclComm, + stream)); workSpace_ptr = static_cast(meta->handle.workSpace); float alpha_t = op->alpha_t; float beta1_t = op->beta1_t; float beta2_t = op->beta2_t; - for(int i = 0; i < op->parameters_num; i++){ + for (int i = 0; i < op->parameters_num; i++) { // update // printf("update %d\n", i); - hipLaunchKernelGGL(HIP_KERNEL_NAME(adam_update), - GET_BLOCKS(size[i]), - CUDA_NUM_THREADS, - 0, - stream, - size[i], - alpha_t, - op->beta1, - op->beta2, - op->weight_decay, - op->epsilon, - static_cast(workSpace_ptr), - m_ptr[i], - v_ptr[i], - w_ptr[i]); - workSpace_ptr = static_cast(workSpace_ptr) + size[i] * sizeof(float); + hipLaunchKernelGGL(HIP_KERNEL_NAME(adam_update), + GET_BLOCKS(size[i]), + CUDA_NUM_THREADS, + 0, + stream, + size[i], + alpha_t, + op->beta1, + op->beta2, + op->weight_decay, + op->epsilon, + static_cast(workSpace_ptr), + accMs[i].get_float_ptr(), + accVs[i].get_float_ptr(), + accWs[i].get_float_ptr()); + workSpace_ptr = + static_cast(workSpace_ptr) + size[i] * sizeof(float); + + // update + beta1_t *= op->beta1; + beta2_t *= op->beta2; + alpha_t = op->alpha * sqrt(1 - beta2_t) / (1 - beta1_t); + } - //update - beta1_t *= op->beta1; - beta2_t *= op->beta2; - alpha_t = op->alpha * sqrt(1 - beta2_t) / (1 - beta1_t); - } - // checkCUDA(hipDeviceSynchronize()); } #endif diff --git a/src/runtime/optimizer_kernel.cu 
b/src/runtime/optimizer_kernel.cu index 5f654fbb5b..d613ae76a4 100644 --- a/src/runtime/optimizer_kernel.cu +++ b/src/runtime/optimizer_kernel.cu @@ -216,6 +216,68 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, w_ptr); // checkCUDA(cudaDeviceSynchronize()); } + +__host__ void AdamOptimizer::nccl_unified_update_task_gpu( + AdamOptimizer const *op, + OpMeta const *meta, + GenericTensorAccessorR *accWGrads, + size_t *size, + GenericTensorAccessorW *accWs, + GenericTensorAccessorW *accVs, + GenericTensorAccessorW *accMs) { + + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(op->reservedWorkSpaceSize < meta->handle.workSpaceSize); + + void *workSpace_ptr = meta->handle.workSpace; + + for (int i = 0; i < op->parameters_num; i++) { + cudaMemcpyAsync(workSpace_ptr, + accWGrads[i].get_float_ptr(), + size[i] * sizeof(float), + cudaMemcpyDeviceToDevice, + stream); + workSpace_ptr = + static_cast(workSpace_ptr) + size[i] * sizeof(float); + } + + // do allreduce once + checkNCCL(ncclAllReduce(meta->handle.workSpace, + (float *)meta->handle.workSpace, + op->reservedWorkSpaceSize / sizeof(float), + ncclFloat, + ncclSum, + meta->handle.ncclComm, + stream)); + + workSpace_ptr = static_cast(meta->handle.workSpace); + float alpha_t = op->alpha_t; + float beta1_t = op->beta1_t; + float beta2_t = op->beta2_t; + for (int i = 0; i < op->parameters_num; i++) { + // update + // printf("update %d\n", i); + adam_update<<>>( + size[i], + alpha_t, + op->beta1, + op->beta2, + op->weight_decay, + op->epsilon, + static_cast(workSpace_ptr), + accMs[i].get_float_ptr(), + accVs[i].get_float_ptr(), + accWs[i].get_float_ptr()); + workSpace_ptr = + static_cast(workSpace_ptr) + size[i] * sizeof(float); + + // update + beta1_t *= op->beta1; + beta2_t *= op->beta2; + alpha_t = op->alpha * sqrt(1 - beta2_t) / (1 - beta1_t); + } +} #endif }; // namespace FlexFlow From 3b9e1c6a6391acd178dda095ee59f586cd757287 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Fri, 12 Jan 2024 12:25:15 -0500 Subject: [PATCH 06/19] . 
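(For orientation between the optimizer commits: the following is a minimal, single-process C++ sketch of the bookkeeping that the unified NCCL update from patches 02-05 performs each step. `Param`, `fake_allreduce_sum` and the plain loops are illustrative standard C++ only, not FlexFlow or NCCL API; the real path stages gradients into `handle.workSpace` with cudaMemcpyAsync/hipMemcpyAsync, reduces them once with ncclAllReduce on a Legion stream, and then launches the existing `adam_update` kernel per slice, so the moment arithmetic below follows the usual Adam recipe and may differ in detail from that kernel.)

#include <cmath>
#include <cstddef>
#include <vector>

// Illustrative host-side stand-ins; the real code works on device pointers.
struct Param {
  std::vector<float> w, grad, m, v;
};

// Single-rank stand-in for ncclAllReduce(sum) over the packed buffer.
static void fake_allreduce_sum(std::vector<float> &buf) { (void)buf; }

void unified_adam_update(std::vector<Param> &params,
                         double alpha, double beta1, double beta2,
                         double weight_decay, double epsilon,
                         double alpha_t, double beta1_t, double beta2_t) {
  // 1) Pack every parameter's gradient back-to-back into one workspace buffer.
  std::vector<float> workspace;
  for (auto const &p : params) {
    workspace.insert(workspace.end(), p.grad.begin(), p.grad.end());
  }

  // 2) One collective over the whole concatenated buffer.
  fake_allreduce_sum(workspace);

  // 3) Walk the buffer again: Adam step per parameter at its offset,
  //    advancing the bias-correction terms once per parameter.
  std::size_t offset = 0;
  for (auto &p : params) {
    for (std::size_t i = 0; i < p.w.size(); ++i) {
      double g = workspace[offset + i] + weight_decay * p.w[i];
      p.m[i] = beta1 * p.m[i] + (1 - beta1) * g;
      p.v[i] = beta2 * p.v[i] + (1 - beta2) * g * g;
      p.w[i] -= alpha_t * p.m[i] / (std::sqrt(p.v[i]) + epsilon);
    }
    offset += p.w.size();
    beta1_t *= beta1;
    beta2_t *= beta2;
    alpha_t = alpha * std::sqrt(1 - beta2_t) / (1 - beta1_t);
  }
}

int main() {
  std::vector<Param> params(2);
  for (auto &p : params) {
    p.w.assign(4, 1.0f);
    p.grad.assign(4, 0.1f);
    p.m.assign(4, 0.0f);
    p.v.assign(4, 0.0f);
  }
  unified_adam_update(params, 0.001, 0.9, 0.999, 0.0, 1e-8, 0.001, 0.9, 0.999);
  return 0;
}

The point of the batching is that one collective call covers every parameter that fits in the workspace, instead of one ncclAllReduce launch per ParallelTensor as in AdamOptimizer::update.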
--- include/flexflow/optimizer.h | 1 - src/runtime/optimizer.cc | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/include/flexflow/optimizer.h b/include/flexflow/optimizer.h index 2b710d1e7d..401fffb351 100644 --- a/include/flexflow/optimizer.h +++ b/include/flexflow/optimizer.h @@ -137,7 +137,6 @@ class AdamOptimizer : public Optimizer { std::map v_values, m_values; size_t reservedWorkSpaceSize = 0; int parameters_num = 0; - int processed_parameters_num = 0; }; }; // namespace FlexFlow diff --git a/src/runtime/optimizer.cc b/src/runtime/optimizer.cc index 3bca298b01..b5e475c466 100644 --- a/src/runtime/optimizer.cc +++ b/src/runtime/optimizer.cc @@ -522,6 +522,8 @@ void AdamOptimizer::unified_update(std::vector const parameters) } int offset = 0; + int processed_parameters_num = 0; + parameters_num = 0; // printf("param size: %d, %d\n", parameters.size(), parameters_num); while(processed_parameters_num < parameters.size()){ From 9f8bb9ee1e613689cb3675ab3def2930348f638c Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Fri, 12 Jan 2024 13:17:47 -0500 Subject: [PATCH 07/19] fix --- src/ops/kernels/dropout_kernels.cu | 37 ++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/src/ops/kernels/dropout_kernels.cu b/src/ops/kernels/dropout_kernels.cu index 80ea2de941..c5b1a384df 100644 --- a/src/ops/kernels/dropout_kernels.cu +++ b/src/ops/kernels/dropout_kernels.cu @@ -29,6 +29,10 @@ DropoutMeta::DropoutMeta(FFHandler handler, Domain const &output_domain) : OpMeta(handler) { profiling = dropout->profiling; + rate = dropout->rate; + seed = dropout->seed; + input_type[0] = dropout->data_type; + output_type[0] = dropout->data_type; checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); checkCUDNN(cudnnCreateDropoutDescriptor(&dropoutDesc)); @@ -115,11 +119,28 @@ void forward_kernel_wrapper(DropoutMeta *m, GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + Internal::forward_kernel(m, input.get_float_ptr(), output.get_float_ptr(), input.domain.get_volume(), stream); + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf(" [dropout] forward time = %.2lfms\n", elapsed); + } } void backward_kernel_wrapper(DropoutMeta *m, @@ -127,11 +148,27 @@ void backward_kernel_wrapper(DropoutMeta *m, GenericTensorAccessorW const &input_grad) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } Internal::backward_kernel(m, output_grad.get_float_ptr(), input_grad.get_float_ptr(), output_grad.domain.get_volume(), stream); + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf(" [dropout] backward time = %.2lfms\n", elapsed); + } } namespace Internal { From fb91122170c2e81c3580fc582193f3e3afa19590 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Fri, 12 Jan 2024 13:19:18 -0500 Subject: [PATCH 08/19] remove 
print --- src/runtime/optimizer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/optimizer.cc b/src/runtime/optimizer.cc index b5e475c466..03d26b6a64 100644 --- a/src/runtime/optimizer.cc +++ b/src/runtime/optimizer.cc @@ -743,7 +743,7 @@ void AdamOptimizer::nccl_unified_update_task(Task const *task, size_t *size = new size_t[op->parameters_num]; int offset = 0; - printf("parameters_num: %d\n", op->parameters_num); + // printf("parameters_num: %d\n", op->parameters_num); for(int i = 0; i < op->parameters_num; i++){ accWGrads[i] = helperGetGenericTensorAccessorRO(DataType::DT_FLOAT, regions[offset], task->regions[offset], FID_DATA, ctx, runtime); From c162d4cb03d03978fec5090fe8075d2fa402c432 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Thu, 18 Jan 2024 21:21:58 -0500 Subject: [PATCH 09/19] fix hip --- src/ops/fused.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index cb2b81fbcf..d1de787fb4 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -205,9 +205,7 @@ __host__ void FusedOp::forward_task(Task const *task, assert(fused->op_num_outputs[op] == 1); DropoutMeta *m = (DropoutMeta *)metas->meta[op]; Kernels::Dropout::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); + m, my_input_accessor[0], my_output_accessor[0]); break; } case OP_LINEAR: { From ea793172725e6b7f7a5df10d037764bec95aa98c Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Fri, 19 Jan 2024 09:55:14 -0500 Subject: [PATCH 10/19] fix multinodes --- src/runtime/optimizer.cc | 145 +++++++++++++++++++++++---------------- 1 file changed, 84 insertions(+), 61 deletions(-) diff --git a/src/runtime/optimizer.cc b/src/runtime/optimizer.cc index 03d26b6a64..74d665a592 100644 --- a/src/runtime/optimizer.cc +++ b/src/runtime/optimizer.cc @@ -523,27 +523,29 @@ void AdamOptimizer::unified_update(std::vector const parameters) int offset = 0; int processed_parameters_num = 0; - parameters_num = 0; // printf("param size: %d, %d\n", parameters.size(), parameters_num); while(processed_parameters_num < parameters.size()){ + parameters_num = 0; for(int i = 0; i < parameters.size(); i++){ const ParallelTensor p = parameters.at(i); assert(v_values.find(p->region) != v_values.end()); assert(m_values.find(p->region) != m_values.end()); assert(p->owner_op != NULL); - if(reservedWorkSpaceSize + p->get_volume() * sizeof(float) >= model->handlers->workSpaceSize){ - break; + if (reservedWorkSpaceSize + p->get_volume() * sizeof(float) >= + model->handlers->workSpaceSize) { + break; } reservedWorkSpaceSize += p->get_volume() * sizeof(float); parameters_num += 1; - assert (p->sync_type == ParameterSyncType::NCCL); + assert(p->sync_type == ParameterSyncType::NCCL); assert(p->parallel_is != IndexSpace::NO_SPACE); } - // printf("parameters_num: %d %d, %d\n", parameters_num, reservedWorkSpaceSize, model->handlers->workSpaceSize); - assert(parameters_num <= parameters.size()); + // printf("parameters_num: %d %d, %d\n", parameters_num, + // reservedWorkSpaceSize, model->handlers->workSpaceSize); + assert(processed_parameters_num <= parameters.size()); IndexLauncher launcher(ADAM_UNIFY_UPD_NCCL_TASK_ID, p0->parallel_is, @@ -553,42 +555,42 @@ void AdamOptimizer::unified_update(std::vector const parameters) false /*must_epoch*/, 0 /*mapper_id*/, p0->machine_view.hash()); - //launch a unified task - for(int j = 0; j < parameters_num; j++){ + // launch a unified task + for (int j = 0; j < parameters_num; j++) { const 
ParallelTensor p = parameters.at(processed_parameters_num + j); - // regions[0]: region_grad - launcher.add_region_requirement(RegionRequirement(p->part_grad, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - p->region_grad)); - launcher.add_field(offset, FID_DATA); - // regions[1]: region - launcher.add_region_requirement(RegionRequirement( - p->part, 0 /*projection id*/, READ_WRITE, EXCLUSIVE, p->region)); - launcher.add_field(offset + 1, FID_DATA); - // regions[2]: w_region - launcher.add_region_requirement( - RegionRequirement(v_values[p->region]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - v_values[p->region]->region)); - launcher.add_field(offset + 2, FID_DATA); - // regions[3]: m_region - launcher.add_region_requirement( - RegionRequirement(m_values[p->region]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - m_values[p->region]->region)); - launcher.add_field(offset + 3, FID_DATA); - offset += 4; + // regions[0]: region_grad + launcher.add_region_requirement(RegionRequirement(p->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + p->region_grad)); + launcher.add_field(offset, FID_DATA); + // regions[1]: region + launcher.add_region_requirement(RegionRequirement( + p->part, 0 /*projection id*/, READ_WRITE, EXCLUSIVE, p->region)); + launcher.add_field(offset + 1, FID_DATA); + // regions[2]: w_region + launcher.add_region_requirement( + RegionRequirement(v_values[p->region]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + v_values[p->region]->region)); + launcher.add_field(offset + 2, FID_DATA); + // regions[3]: m_region + launcher.add_region_requirement( + RegionRequirement(m_values[p->region]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + m_values[p->region]->region)); + launcher.add_field(offset + 3, FID_DATA); + offset += 4; } - - //update alpha, beta - for(int i = 0; i < parameters_num; i++){ + + // update alpha, beta + for (int i = 0; i < parameters_num; i++) { this->next(); } launcher.concurrent = true; @@ -597,9 +599,9 @@ void AdamOptimizer::unified_update(std::vector const parameters) runtime->issue_execution_fence(ctx); reservedWorkSpaceSize = 0; offset = 0; - processed_parameters_num += parameters_num; + processed_parameters_num += parameters_num; } - + parameters_num = 0; } void AdamOptimizer::ps_update_task(Task const *task, @@ -716,11 +718,11 @@ void AdamOptimizer::nccl_update_task(Task const *task, nccl_update_task_gpu(op, meta, w_grad_ptr, size, w_ptr, v_ptr, m_ptr); } - -void AdamOptimizer::nccl_unified_update_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { +void AdamOptimizer::nccl_unified_update_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { // assert(regions.size() == 4); // assert(task->regions.size() == 4); AdamOptimizer const *op = (AdamOptimizer *)task->args; @@ -730,7 +732,8 @@ void AdamOptimizer::nccl_unified_update_task(Task const *task, ctx, task->regions[1].region.get_index_space()); // float const *w_grad_ptr[op->parameters_num]; - // float *w_ptr[op->parameters_num], *v_ptr[op->parameters_num], *m_ptr[op->parameters_num]; + // float *w_ptr[op->parameters_num], *v_ptr[op->parameters_num], + // *m_ptr[op->parameters_num]; // hipMalloc(w_grad_ptr, sizeof(float*) * op->parameters_num); // hipMalloc(w_ptr, sizeof(float*) * op->parameters_num); @@ -743,20 +746,40 @@ void AdamOptimizer::nccl_unified_update_task(Task const *task, size_t *size = new size_t[op->parameters_num]; int offset = 0; - // printf("parameters_num: %d\n", 
op->parameters_num); - - for(int i = 0; i < op->parameters_num; i++){ - accWGrads[i] = helperGetGenericTensorAccessorRO(DataType::DT_FLOAT, regions[offset], task->regions[offset], FID_DATA, ctx, runtime); - accWs[i] = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, regions[offset+1], task->regions[offset+1], FID_DATA, ctx, runtime); - accVs[i] = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, regions[offset+2], task->regions[offset+2], FID_DATA, ctx, runtime); - accMs[i] = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, regions[offset+3], task->regions[offset+3], FID_DATA, ctx, runtime); - offset += 4; - - size[i] = accWGrads[i].domain.get_volume(); - // w_grad_ptr[i] = accWGrad.get_float_ptr(); - // w_ptr[i] = accW.get_float_ptr(); - // v_ptr[i] = accV.get_float_ptr(); - // m_ptr[i] = accM.get_float_ptr(); + // printf("parameters_num: %d\n", op->parameters_num); + + for (int i = 0; i < op->parameters_num; i++) { + accWGrads[i] = helperGetGenericTensorAccessorRO(DataType::DT_FLOAT, + regions[offset], + task->regions[offset], + FID_DATA, + ctx, + runtime); + accWs[i] = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, + regions[offset + 1], + task->regions[offset + 1], + FID_DATA, + ctx, + runtime); + accVs[i] = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, + regions[offset + 2], + task->regions[offset + 2], + FID_DATA, + ctx, + runtime); + accMs[i] = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, + regions[offset + 3], + task->regions[offset + 3], + FID_DATA, + ctx, + runtime); + offset += 4; + + size[i] = accWGrads[i].domain.get_volume(); + // w_grad_ptr[i] = accWGrad.get_float_ptr(); + // w_ptr[i] = accW.get_float_ptr(); + // v_ptr[i] = accV.get_float_ptr(); + // m_ptr[i] = accM.get_float_ptr(); } nccl_unified_update_task_gpu(op, meta, accWGrads, size, accWs, accVs, accMs); } From 58d84eda1395e6d8cb56bfcaadde0e59278f04aa Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Fri, 19 Jan 2024 12:50:37 -0500 Subject: [PATCH 11/19] fix --- src/runtime/optimizer.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/runtime/optimizer.cc b/src/runtime/optimizer.cc index 74d665a592..c2e433d2e5 100644 --- a/src/runtime/optimizer.cc +++ b/src/runtime/optimizer.cc @@ -523,12 +523,12 @@ void AdamOptimizer::unified_update(std::vector const parameters) int offset = 0; int processed_parameters_num = 0; - // printf("param size: %d, %d\n", parameters.size(), parameters_num); + // printf("param size: %d\n", parameters.size()); while(processed_parameters_num < parameters.size()){ parameters_num = 0; - for(int i = 0; i < parameters.size(); i++){ + for(int i = processed_parameters_num; i < parameters.size(); i++){ const ParallelTensor p = parameters.at(i); assert(v_values.find(p->region) != v_values.end()); assert(m_values.find(p->region) != m_values.end()); @@ -543,8 +543,8 @@ void AdamOptimizer::unified_update(std::vector const parameters) assert(p->parallel_is != IndexSpace::NO_SPACE); } - // printf("parameters_num: %d %d, %d\n", parameters_num, - // reservedWorkSpaceSize, model->handlers->workSpaceSize); + printf("parameters_num: %d %zu, %zu, %d\n", parameters_num, + reservedWorkSpaceSize, model->handlers->workSpaceSize, parameters.size()); assert(processed_parameters_num <= parameters.size()); IndexLauncher launcher(ADAM_UNIFY_UPD_NCCL_TASK_ID, From 01c9d4c560bf57188811156fefc1db0b2f013644 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Thu, 25 Jan 2024 18:27:36 -0500 Subject: [PATCH 12/19] fix --- src/runtime/optimizer.cc | 14 +++++++++----- 1 file changed, 
9 insertions(+), 5 deletions(-) diff --git a/src/runtime/optimizer.cc b/src/runtime/optimizer.cc index c2e433d2e5..a683d234e3 100644 --- a/src/runtime/optimizer.cc +++ b/src/runtime/optimizer.cc @@ -525,16 +525,19 @@ void AdamOptimizer::unified_update(std::vector const parameters) int processed_parameters_num = 0; // printf("param size: %d\n", parameters.size()); - while(processed_parameters_num < parameters.size()){ + size_t workSpaceSize = model->handlers->workSpaceSize * + model->config.workersPerNode * model->config.numNodes; + + while (processed_parameters_num < parameters.size()) { parameters_num = 0; - + for(int i = processed_parameters_num; i < parameters.size(); i++){ const ParallelTensor p = parameters.at(i); assert(v_values.find(p->region) != v_values.end()); assert(m_values.find(p->region) != m_values.end()); assert(p->owner_op != NULL); if (reservedWorkSpaceSize + p->get_volume() * sizeof(float) >= - model->handlers->workSpaceSize) { + workSpaceSize) { break; } reservedWorkSpaceSize += p->get_volume() * sizeof(float); @@ -543,8 +546,9 @@ void AdamOptimizer::unified_update(std::vector const parameters) assert(p->parallel_is != IndexSpace::NO_SPACE); } - printf("parameters_num: %d %zu, %zu, %d\n", parameters_num, - reservedWorkSpaceSize, model->handlers->workSpaceSize, parameters.size()); + // printf("parameters_num: %d %zu, %zu, %d\n", parameters_num, + // reservedWorkSpaceSize, model->handlers->workSpaceSize, + // parameters.size()); assert(processed_parameters_num <= parameters.size()); IndexLauncher launcher(ADAM_UNIFY_UPD_NCCL_TASK_ID, From a31f8e9bbd27e799abab712d73b202dac87ed2b3 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Thu, 25 Jan 2024 23:39:37 -0500 Subject: [PATCH 13/19] fix --- src/runtime/optimizer_kernel.cpp | 4 ++-- src/runtime/optimizer_kernel.cu | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/runtime/optimizer_kernel.cpp b/src/runtime/optimizer_kernel.cpp index 0b7b459d9b..373eb3fe7a 100644 --- a/src/runtime/optimizer_kernel.cpp +++ b/src/runtime/optimizer_kernel.cpp @@ -258,7 +258,7 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu( hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - assert(op->reservedWorkSpaceSize < meta->handle.workSpaceSize); + // assert(op->reservedWorkSpaceSize < meta->handle.workSpaceSize); void *workSpace_ptr = meta->handle.workSpace; @@ -275,7 +275,7 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu( // do allreduce once checkNCCL(ncclAllReduce(meta->handle.workSpace, (float *)meta->handle.workSpace, - op->reservedWorkSpaceSize / sizeof(float), + meta->handle.workSpaceSize, ncclFloat, ncclSum, meta->handle.ncclComm, diff --git a/src/runtime/optimizer_kernel.cu b/src/runtime/optimizer_kernel.cu index d613ae76a4..a57281a5ec 100644 --- a/src/runtime/optimizer_kernel.cu +++ b/src/runtime/optimizer_kernel.cu @@ -228,7 +228,7 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu( cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - assert(op->reservedWorkSpaceSize < meta->handle.workSpaceSize); + // assert(op->reservedWorkSpaceSize < meta->handle.workSpaceSize); void *workSpace_ptr = meta->handle.workSpace; @@ -245,7 +245,7 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu( // do allreduce once checkNCCL(ncclAllReduce(meta->handle.workSpace, (float *)meta->handle.workSpace, - op->reservedWorkSpaceSize / sizeof(float), + meta->handle.workSpaceSize, ncclFloat, ncclSum, meta->handle.ncclComm, From 9141c46a64617de893d986b694f1202841e86b27 Mon Sep 17 00:00:00 
2001 From: xinhaoc Date: Thu, 1 Feb 2024 23:23:14 -0500 Subject: [PATCH 14/19] tp --- examples/python/pytorch/mt5/mt5_ff.py | 1 + include/flexflow/ffconst.h | 2 + include/flexflow/model.h | 4 + include/flexflow/operator_params.h | 2 + include/flexflow/parallel_ops/allreduce.h | 57 ++++ .../flexflow/parallel_ops/allreduce_params.h | 22 ++ .../parallel_ops/kernels/allreduce_kernels.h | 31 ++ python/flexflow/core/flexflow_cffi.py | 4 +- src/ops/fused.cc | 2 +- src/ops/fused.cpp | 17 ++ src/ops/fused.cu | 17 ++ src/parallel_ops/allreduce.cc | 279 ++++++++++++++++++ .../kernels/allreduce_kernels.cpp | 58 ++++ src/parallel_ops/kernels/allreduce_kernels.cu | 56 ++++ src/runtime/ffconst_utils.cc | 2 + src/runtime/graph.cc | 19 ++ src/runtime/model.cc | 60 +++- src/runtime/operator_params.cc | 3 + src/runtime/optimizer.cc | 4 +- src/runtime/substitution.cc | 7 + 20 files changed, 635 insertions(+), 12 deletions(-) create mode 100644 include/flexflow/parallel_ops/allreduce.h create mode 100644 include/flexflow/parallel_ops/allreduce_params.h create mode 100644 include/flexflow/parallel_ops/kernels/allreduce_kernels.h create mode 100644 src/parallel_ops/allreduce.cc create mode 100644 src/parallel_ops/kernels/allreduce_kernels.cpp create mode 100644 src/parallel_ops/kernels/allreduce_kernels.cu diff --git a/examples/python/pytorch/mt5/mt5_ff.py b/examples/python/pytorch/mt5/mt5_ff.py index c2868e9d1e..5dff7415d3 100644 --- a/examples/python/pytorch/mt5/mt5_ff.py +++ b/examples/python/pytorch/mt5/mt5_ff.py @@ -122,6 +122,7 @@ def top_level_task(): input_names = ["input_ids", "attention_mask"] print("Tracing the model...") + print(batch_size) hf_model = PyTorchModel( model, is_hf_model=True, input_names=input_names, batch_size=batch_size, seq_length=seq_length, diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 5658e2923d..060983b020 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -157,6 +157,7 @@ enum OperatorType { OP_REPLICATE, OP_REDUCTION, OP_PIPELINE, + OP_ALLREDUCE, OP_FUSED_PARALLEL, OP_INVALID, }; @@ -189,6 +190,7 @@ enum PMParameter { PM_COMBINE_DEGREE, // Combine PM_REDUCTION_DIM, // Reduction PM_REDUCTION_DEGREE, // Reduction + PM_ALLREDUCE_DIM, // AllReduce PM_SOFTMAX_DIM, // Softmax PM_NUM_HEADS, // MultiHeadAttention PM_INVALID, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 82a29c8182..3f45de3d79 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -191,6 +191,10 @@ enum TaskIDs { PIPELINE_INIT_TASK_ID, PIPELINE_FWD_TASK_ID, PIPELINE_BWD_TASK_ID, + ALLREDUCE_INIT_TASK_ID, + ALLREDUCE_INF_TASK_ID, + ALLREDUCE_FWD_TASK_ID, + ALLREDUCE_BWD_TASK_ID, FUSED_PARALLELOP_INIT_TASK_ID, FUSED_PARALLELOP_FWD_TASK_ID, FUSED_PARALLELOP_BWD_TASK_ID, diff --git a/include/flexflow/operator_params.h b/include/flexflow/operator_params.h index 24c84a85ed..84653ac9ca 100644 --- a/include/flexflow/operator_params.h +++ b/include/flexflow/operator_params.h @@ -7,6 +7,7 @@ #include "flexflow/ops/batch_matmul_params.h" #include "flexflow/ops/cast_params.h" #include "flexflow/ops/concat_params.h" +#include "flexflow/parallel_ops/allreduce_params.h" #include "flexflow/ops/conv_2d_params.h" #include "flexflow/ops/dropout_params.h" #include "flexflow/ops/element_binary_params.h" @@ -62,6 +63,7 @@ using OperatorParameters = mp::variant; tl::optional get_op_parameters(Op const *op); diff --git a/include/flexflow/parallel_ops/allreduce.h b/include/flexflow/parallel_ops/allreduce.h new file mode 100644 index 
0000000000..a28d4cef9e --- /dev/null +++ b/include/flexflow/parallel_ops/allreduce.h @@ -0,0 +1,57 @@ +#ifndef _FLEXFLOW_ALLREDUCE_H +#define _FLEXFLOW_ALLREDUCE_H + +#include "flexflow/layer.h" +#include "flexflow/node.h" +#include "flexflow/op_meta.h" +#include "flexflow/operator.h" +#include "flexflow/parallel_ops/allreduce_params.h" +#include "parallel_op.h" + +namespace FlexFlow { + +class AllReduce : public ParallelOp { +public: + using Params = AllReduceParams; + using Input = ParallelTensor; + + AllReduce(FFModel &model, + const ParallelTensor input, + int allreduce_legion_dim, + char const *name = NULL); + AllReduce(FFModel &model, + Params const ¶ms, + Input const input, + char const *name = nullptr); + void create_input_partition(FFModel &model) override; + void init(FFModel const &) override; + void forward(FFModel const &) override; + void backward(FFModel const &) override; + bool get_int_parameter(PMParameter, int *) const override; + bool append_parallel_op_info( + std::vector ¶llel_ops) const override; + static OpMeta *init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void forward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + bool measure_operator_cost(Simulator *sim, + MachineView const &pc, + CostMetrics &cost_metrics) const override; + + Params get_params() const; + +public: + int allreduce_dim; +}; + +}; // namespace FlexFlow + +#endif // _FLEXFLOW_ALLREDUCE_H diff --git a/include/flexflow/parallel_ops/allreduce_params.h b/include/flexflow/parallel_ops/allreduce_params.h new file mode 100644 index 0000000000..a0daac8f9a --- /dev/null +++ b/include/flexflow/parallel_ops/allreduce_params.h @@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_ALLREDUCE_PARAMS_H +#define _FLEXFLOW_ALLREDUCE_PARAMS_H + +namespace FlexFlow { + +struct AllReduceParams { + int allreduce_legion_dim; + char name[MAX_OPNAME]; + bool is_valid(ParallelTensorShape const &) const; +}; +bool operator==(AllReduceParams const &, AllReduceParams const &); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::AllReduceParams const &) const; +}; +} // namespace std + +#endif // _FLEXFLOW_ALLREDUCE_PARAMS_H diff --git a/include/flexflow/parallel_ops/kernels/allreduce_kernels.h b/include/flexflow/parallel_ops/kernels/allreduce_kernels.h new file mode 100644 index 0000000000..02a5026fcf --- /dev/null +++ b/include/flexflow/parallel_ops/kernels/allreduce_kernels.h @@ -0,0 +1,31 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_ALLREDUCE_KERNELS_H +#define _FLEXFLOW_OPS_KERNELS_ALLREDUCE_KERNELS_H + +#include "flexflow/device.h" +#include "flexflow/fftype.h" +#include "flexflow/op_meta.h" +#include "flexflow/parallel_ops/allreduce.h" + +namespace FlexFlow { + +class AllReduceMeta : public OpMeta { +public: + AllReduceMeta(FFHandler handle, AllReduce const *reduct); +}; + +namespace Kernels { +namespace AllReduce { + +void forward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void backward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad); + +} // namespace AllReduce +} // namespace Kernels +} // namespace FlexFlow + +#endif // _FLEXFLOW_OPS_KERNELS_ALLREDUCE_KERNELS_H diff 
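A note on the params header above: FFModel deduplicates parallel-op nodes through get_or_create_node, which keys a cache on the input shape plus the params struct, so AllReduceParams needs operator== and a std::hash specialization over the fields that define the op's structure; the name field is not part of the comparison. The sketch below restates that caching pattern with simplified stand-in types (illustrative only, not the FlexFlow graph code):

#include <cstddef>
#include <functional>
#include <unordered_map>

struct AllReduceParamsSketch {
  int allreduce_legion_dim; // the only field that affects structural identity
};
bool operator==(AllReduceParamsSketch const &a, AllReduceParamsSketch const &b) {
  return a.allreduce_legion_dim == b.allreduce_legion_dim;
}
struct AllReduceParamsHash {
  size_t operator()(AllReduceParamsSketch const &p) const {
    return std::hash<int>()(p.allreduce_legion_dim);
  }
};
// get_or_create_node-style cache: structurally identical requests map to one node.
std::unordered_map<AllReduceParamsSketch, int, AllReduceParamsHash> node_cache;
int get_or_create_allreduce_node(AllReduceParamsSketch const &params) {
  auto it = node_cache.find(params);
  if (it != node_cache.end()) {
    return it->second;
  }
  int node_id = static_cast<int>(node_cache.size());
  node_cache.emplace(params, node_id);
  return node_id;
}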
--git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 49ad5b3dc8..c39b58e4b8 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -2124,9 +2124,9 @@ def fit(self, x=None, y=None, batch_size=None, epochs=1): for d in dataloaders: d.next_batch(self) self.forward() - self.zero_gradients() + # self.zero_gradients() self.backward() - self.update() + self.unified_update() self._ffconfig.end_trace(self._tracing_id) def eval(self, x=None, y=None, batch_size=None): diff --git a/src/ops/fused.cc b/src/ops/fused.cc index d0c895c53d..b241ff1587 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -129,7 +129,7 @@ bool FusedOp::add_operator(FFModel &model, Op *op) { // op->name, op_config)); // Cannot fuse parallel operators since they have different paralel_is // in forward and backward - assert(!op->is_parallel_op()); + assert(!op->is_parallel_op() || op->op_type == OP_ALLREDUCE); // Currently don't consider nested fusion if (op->op_type == OP_FUSED) { return false; diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index d1de787fb4..9da93f0c65 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -31,6 +31,7 @@ #include "flexflow/ops/kernels/reshape_kernels.h" #include "flexflow/ops/kernels/softmax_kernels.h" #include "flexflow/ops/kernels/transpose_kernels.h" +#include "flexflow/parallel_ops/kernels/allreduce_kernels.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" #include "flexflow/utils/hip_helper.h" @@ -420,6 +421,14 @@ __host__ void FusedOp::forward_task(Task const *task, } break; } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::forward_kernel_wrapper( + m, my_input_accessor[0], my_output_accessor[0]); + break; + } case OP_RESHAPE: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); @@ -957,6 +966,14 @@ __host__ void FusedOp::backward_task(Task const *task, } break; } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::backward_kernel_wrapper( + m, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } case OP_TRANSPOSE: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); diff --git a/src/ops/fused.cu b/src/ops/fused.cu index e43ce51f9c..b78447ba41 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -33,6 +33,7 @@ #include "flexflow/ops/kernels/reshape_kernels.h" #include "flexflow/ops/kernels/softmax_kernels.h" #include "flexflow/ops/kernels/transpose_kernels.h" +#include "flexflow/parallel_ops/kernels/allreduce_kernels.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/utils/cuda_helper.h" @@ -460,6 +461,14 @@ __host__ void FusedOp::forward_task(Task const *task, } break; } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::forward_kernel_wrapper( + m, my_input_accessor[0], my_output_accessor[0]); + break; + } case OP_TRANSPOSE: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); @@ -1002,6 +1011,14 @@ __host__ void FusedOp::backward_task(Task const *task, } break; } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + 
assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::backward_kernel_wrapper( + m, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } case OP_TRANSPOSE: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc new file mode 100644 index 0000000000..1bbc43ca0d --- /dev/null +++ b/src/parallel_ops/allreduce.cc @@ -0,0 +1,279 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/parallel_ops/allreduce.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/model.h" +#include "flexflow/parallel_ops/kernels/allreduce_kernels.h" +#include "flexflow/utils/hash_utils.h" + +namespace FlexFlow { +// declare Legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::Future; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::LogicalPartition; +using Legion::LogicalRegion; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; + +using namespace FlexFlow::Kernels::AllReduce; + +/* Params */ +bool operator==(AllReduceParams const &lhs, AllReduceParams const &rhs) { + return lhs.allreduce_legion_dim == rhs.allreduce_legion_dim; +} + +bool AllReduceParams::is_valid(ParallelTensorShape const &input) const { + return input.is_valid(); +} + +AllReduceParams AllReduce::get_params() const { + AllReduceParams params; + params.allreduce_legion_dim = this->allreduce_dim; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } + return params; +} + +AllReduce::AllReduce(FFModel &model, + const ParallelTensor _input, + int _allreduce_legion_dim, + char const *name) + : ParallelOp(model, OP_ALLREDUCE, name, _input), + allreduce_dim(_allreduce_legion_dim) { + int numdim = _input->num_dims; + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + dims[i] = _input->dims[i]; + } + assert(dims[allreduce_dim].degree > 1); + // ParallelTensorBase::update_parallel_ids(numdim, dims); + outputs[0] = model.create_parallel_tensor_legion_ordering( + numdim, dims, _input->data_type, this); +} + +AllReduce::AllReduce(FFModel &model, + AllReduceParams const ¶ms, + ParallelTensor const input, + char const *name) + : AllReduce(model, input, params.allreduce_legion_dim, params.name) {} + +void AllReduce::create_input_partition(FFModel &ff) { + // Do nothing + return; +} + + +OpMeta *AllReduce::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + AllReduce *ar = (AllReduce *)task->args; + FFHandler handle = *((FFHandler const *)task->local_args); + AllReduceMeta *meta = new 
AllReduceMeta(handle, ar); + meta->input_type[0] = ar->inputs[0]->data_type; + meta->output_type[0] = ar->outputs[0]->data_type; + assert(meta->input_type[0] == meta->output_type[0]); + return meta; +} + +void AllReduce::init(FFModel const &ff) { + ArgumentMap argmap; + parallel_is = outputs[0]->parallel_is; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(ALLREDUCE_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(AllReduce)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + +void AllReduce::forward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = outputs[0]->parallel_is; + assert(numOutputs == 1); + assert(numInputs == 1); + set_argumentmap_for_forward(ff, argmap); + IndexLauncher launcher(ALLREDUCE_FWD_TASK_ID, + outputs[0]->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +void AllReduce::backward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + IndexLauncher launcher(ALLREDUCE_BWD_TASK_ID, + inputs[0]->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + inputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +bool AllReduce::measure_operator_cost(Simulator *sim, + MachineView const &pc, + CostMetrics &cost_metrics) const { + cost_metrics = CostMetrics(); + cost_metrics.forward_time = 0.0f; + cost_metrics.backward_time = 0.0f; + + cost_metrics.sync_time = 0; + cost_metrics.inputs_memory = 0; + cost_metrics.outputs_memory = 0; + cost_metrics.weights_memory = 0; + return true; +} + +bool AllReduce::get_int_parameter(PMParameter para, int *value) const { + switch (para) { + case PM_ALLREDUCE_DIM: + *value = allreduce_dim; + return true; + default: + return Op::get_int_parameter(para, value); + } +} + +bool AllReduce::append_parallel_op_info( + std::vector 
¶llel_ops) const { + ParallelOpInfo ret; + ret.op_type = op_type; + ret.parallel_dim = allreduce_dim; + ret.parallel_degree = -1; // AllReduce does not affect parallel degree + parallel_ops.push_back(ret); + return true; +} + +/*static*/ +void AllReduce::forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input.data_type == output.data_type); + forward_kernel_wrapper(m, input, output); +} + +void AllReduce::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); + + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input_grad.data_type == output_grad.data_type); + backward_kernel_wrapper(m, input_grad, output_grad); +} + +}; // namespace FlexFlow + +namespace std { +size_t hash::operator()( + FlexFlow::AllReduceParams const ¶ms) const { + size_t key = 0; + hash_combine(key, params.allreduce_legion_dim); + return key; +} + +} // namespace std diff --git a/src/parallel_ops/kernels/allreduce_kernels.cpp b/src/parallel_ops/kernels/allreduce_kernels.cpp new file mode 100644 index 0000000000..0aea27107d --- /dev/null +++ b/src/parallel_ops/kernels/allreduce_kernels.cpp @@ -0,0 +1,58 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
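The forward task above ultimately reduces to a single NCCL collective. As a reference point, the stand-alone helper below (an illustrative assumption, not part of the patch) shows the essential call shape: the third argument of ncclAllReduce is an element count, not a byte count, and NCCL accepts sendbuff == recvbuff for an in-place reduction, which is what the unified optimizer update relies on when it reduces its packed workSpace buffer.

#include <cuda_runtime.h>
#include <nccl.h>

// Sum num_elements floats across every rank in comm; src may equal dst (in place).
ncclResult_t allreduce_sum_f32(float const *src,
                               float *dst,
                               size_t num_elements,
                               ncclComm_t comm,
                               cudaStream_t stream) {
  return ncclAllReduce(src, dst, num_elements, ncclFloat, ncclSum, comm, stream);
}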
+ */ + +#include "flexflow/parallel_ops/kernels/allreduce_kernels.h" +#include "flexflow/utils/hip_helper.h" +#include + +namespace FlexFlow { + +AllReduceMeta::AllReduceMeta(FFHandler handle, AllReduce const *reduct) + : OpMeta(handle) {} + +namespace Kernels { +namespace AllReduce { + +void forward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input.data_type == output.data_type); + assert(input.domain == output.domain); + size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; +#ifdef FF_USE_NCCL + // ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); + checkNCCL(ncclAllReduce(input.ptr, + output.ptr, + input.domain.get_volume(), + ncclFloat, + ncclSum, + m->handle.ncclComm, + stream)); +#else + assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); +#endif +} + +void backward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + assert(false && "To be implemented"); +} + +} // namespace AllReduce +} // namespace Kernels +} // namespace FlexFlow diff --git a/src/parallel_ops/kernels/allreduce_kernels.cu b/src/parallel_ops/kernels/allreduce_kernels.cu new file mode 100644 index 0000000000..bce970cbdc --- /dev/null +++ b/src/parallel_ops/kernels/allreduce_kernels.cu @@ -0,0 +1,56 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
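Both kernel wrappers in this commit hardcode ncclFloat and leave the ff_to_nccl_datatype() call commented out. For context, a sketch of what such a mapping helper could look like follows; the enum here is a self-contained stand-in for FlexFlow's DataType, and the cases are an assumption rather than code from this patch.

#include <nccl.h>

// Stand-in for the DataType values declared in flexflow/ffconst.h.
enum DataTypeSketch { DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64 };

// Hypothetical helper: map a FlexFlow-style data type to the matching NCCL type.
inline ncclDataType_t ff_to_nccl_datatype_sketch(DataTypeSketch type) {
  switch (type) {
    case DT_FLOAT:
      return ncclFloat;
    case DT_DOUBLE:
      return ncclDouble;
    case DT_INT32:
      return ncclInt32;
    case DT_INT64:
      return ncclInt64;
    default:
      return ncclFloat; // same fallback the wrappers above assume implicitly
  }
}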
+ */ + +#include "flexflow/parallel_ops/kernels/allreduce_kernels.h" +#include "flexflow/utils/cuda_helper.h" + +namespace FlexFlow { + +AllReduceMeta::AllReduceMeta(FFHandler handle, AllReduce const *reduct) + : OpMeta(handle) {} + +namespace Kernels { +namespace AllReduce { + +void forward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input.data_type == output.data_type); + assert(input.domain == output.domain); +#ifdef FF_USE_NCCL + // ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); + checkNCCL(ncclAllReduce(input.ptr, + output.ptr, + input.domain.get_volume(), + ncclFloat, + ncclSum, + m->handle.ncclComm, + stream)); +#else + assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); +#endif +} + +void backward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + assert(false && "To be implemented"); +} + +} // namespace AllReduce +} // namespace Kernels +} // namespace FlexFlow diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index 7ab9201113..e2debfa2d5 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -168,6 +168,8 @@ std::string get_operator_type_name(OperatorType type) { return "Replicate"; case OP_REDUCTION: return "Reduction"; + case OP_ALLREDUCE: + return "AllReduce"; case OP_PIPELINE: return "Pipeline"; case OP_FUSED_PARALLEL: diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 7447125197..8be3877b48 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -39,6 +39,7 @@ #include "flexflow/ops/topk.h" #include "flexflow/ops/transpose.h" #include "flexflow/parallel_ops/combine.h" +#include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/fused_parallel_op.h" #include "flexflow/parallel_ops/partition.h" #include "flexflow/parallel_ops/reduction.h" @@ -2295,6 +2296,13 @@ GraphOptimalViewSerialized sez.serialize(reduction->reduction_dim); sez.serialize(reduction->reduction_degree); break; + } + case OP_ALLREDUCE: { + AllReduce *allreduce = (AllReduce *)op; + sez.serialize(allreduce->allreduce_dim); + sez.serialize(strlen(allreduce->name)); + sez.serialize(allreduce->name, strlen(allreduce->name)); + break; } case OP_COMBINE: { Combine *combine = (Combine *)op; @@ -2704,6 +2712,17 @@ void FFModel::deserialize_graph_optimal_view( {reduction_dim, reduction_degree}); break; } + case OP_ALLREDUCE: { + assert(num_inputs == 1); + int allreduce_dim; + dez.deserialize(allreduce_dim); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); + node = get_or_create_node(inputs[0], {allreduce_dim}); + break; + } case OP_FUSED_PARALLEL: { assert(num_inputs == 1); std::vector parallel_ops; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index b16766291a..3807e85391 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -53,6 +53,7 @@ #include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" #include "flexflow/parallel_ops/partition.h" +#include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/reduction.h" #include "flexflow/parallel_ops/replicate.h" #include "flexflow/substitution.h" @@ -2508,9 +2509,10 @@ bool FFModel::apply_fusion(std::vector const &operators, operators[l]->op_type == OP_WEIGHT) { continue; } - // don't 
fuse parallel op since they have different parallel_is in - // forward/backward - if (operators[l]->is_parallel_op()) { + // don't fuse parallel op except allReduce since they have different + // parallel_is in forward/backward + if (operators[l]->is_parallel_op() && + operators[l]->op_type != OP_ALLREDUCE) { continue; } size_t start = 0; @@ -2553,9 +2555,10 @@ bool FFModel::apply_fusion(std::vector const &operators, operators[i]->op_type == OP_WEIGHT) { continue; } - // don't fuse parallel op since they have different parallel_is in - // forward/backward - if (operators[i]->is_parallel_op()) { + // don't fuse parallel op except allReduce since they have different + // parallel_is in forward/backward + if (operators[i]->is_parallel_op() && + operators[i]->op_type != OP_ALLREDUCE) { continue; } fused_op = new FusedOp(*this, operators[i]); @@ -3491,7 +3494,7 @@ struct DefaultConfig { const static bool profiling = false; constexpr static float learningRate = 0.01f; constexpr static float weightDecay = 0.0001f; - const static size_t workSpaceSize = (size_t)1 * 1024 * 1024 * 1024; // 2GB + const static size_t workSpaceSize = (size_t)2 * 1024 * 1024 * 1024; // 2GB const static int numNodes = 1; const static int workersPerNode = 0; const static int cpusPerNode = 0; @@ -5214,6 +5217,49 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + // AllReduce + { + TaskVariantRegistrar registrar(ALLREDUCE_INIT_TASK_ID, "AllReduce Init"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "AllReduce init Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(ALLREDUCE_FWD_TASK_ID, "AllReduce Forward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "AllReduce Forward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(ALLREDUCE_BWD_TASK_ID, "AllReduce Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "AllReduce Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } // FusedParallelOp { TaskVariantRegistrar registrar(FUSED_PARALLELOP_FWD_TASK_ID, diff --git a/src/runtime/operator_params.cc b/src/runtime/operator_params.cc index 41dd37dec7..322d7840fb 100644 --- a/src/runtime/operator_params.cc +++ b/src/runtime/operator_params.cc @@ -28,6 +28,7 @@ #include "flexflow/ops/topk.h" #include "flexflow/ops/transpose.h" #include "flexflow/parallel_ops/combine.h" +#include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/fused_parallel_op.h" #include "flexflow/parallel_ops/partition.h" #include "flexflow/parallel_ops/reduction.h" @@ -94,6 +95,8 @@ tl::optional get_op_parameters(Op const *op) { return ((Reduction *)op)->get_params(); case OP_COMBINE: return ((Combine *)op)->get_params(); + case OP_ALLREDUCE: + return ((AllReduce *)op)->get_params(); case OP_FUSED_PARALLEL: return ((FusedParallelOp 
*)op)->get_params(); case OP_TRANSPOSE: diff --git a/src/runtime/optimizer.cc b/src/runtime/optimizer.cc index a683d234e3..2b90bc3f44 100644 --- a/src/runtime/optimizer.cc +++ b/src/runtime/optimizer.cc @@ -536,10 +536,10 @@ void AdamOptimizer::unified_update(std::vector const parameters) assert(v_values.find(p->region) != v_values.end()); assert(m_values.find(p->region) != m_values.end()); assert(p->owner_op != NULL); - if (reservedWorkSpaceSize + p->get_volume() * sizeof(float) >= - workSpaceSize) { + if (reservedWorkSpaceSize + p->get_volume() * sizeof(float) >= workSpaceSize) { break; } + printf("continueee\n"); reservedWorkSpaceSize += p->get_volume() * sizeof(float); parameters_num += 1; assert(p->sync_type == ParameterSyncType::NCCL); diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index e3adfec5b7..4f44a3a574 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -34,6 +34,7 @@ #include "flexflow/ops/split.h" #include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" +#include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/partition.h" #include "flexflow/parallel_ops/reduction.h" #include "flexflow/parallel_ops/replicate.h" @@ -3743,6 +3744,12 @@ bool FFModel::convert_graph_to_operators( reduction->reduction_degree); break; } + case OP_ALLREDUCE: { + assert(inList.size() == 1); + AllReduce *allreduce = (AllReduce *)node.ptr; + new_op = new AllReduce(*this, inputs[0], allreduce->allreduce_dim); + break; + } case OP_FUSED_PARALLEL: { assert(inList.size() == 1); FusedParallelOp *fused = (FusedParallelOp *)node.ptr; From 81852893af7b866e42b1a834670535f459309758 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Thu, 15 Feb 2024 18:09:44 -0500 Subject: [PATCH 15/19] timer --- src/runtime/optimizer_kernel.cu | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/runtime/optimizer_kernel.cu b/src/runtime/optimizer_kernel.cu index a57281a5ec..ee8fc8dcd4 100644 --- a/src/runtime/optimizer_kernel.cu +++ b/src/runtime/optimizer_kernel.cu @@ -230,6 +230,15 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu( checkCUDA(get_legion_stream(&stream)); // assert(op->reservedWorkSpaceSize < meta->handle.workSpaceSize); + cudaEvent_t t_start, t_start1, t_start2, t_end; + cudaEventCreate(&t_start); + cudaEventCreate(&t_start1); + cudaEventCreate(&t_start2); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + cudaEventRecord(t_start1, stream); + cudaEventRecord(t_start2, stream); + void *workSpace_ptr = meta->handle.workSpace; for (int i = 0; i < op->parameters_num; i++) { @@ -242,6 +251,13 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu( static_cast(workSpace_ptr) + size[i] * sizeof(float); } + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start1, t_end)); + cudaEventDestroy(t_start1); + printf("[optimizer] data copy time = %.2lfms\n", elapsed); + // do allreduce once checkNCCL(ncclAllReduce(meta->handle.workSpace, (float *)meta->handle.workSpace, @@ -250,6 +266,12 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu( ncclSum, meta->handle.ncclComm, stream)); + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start2, t_end)); + cudaEventDestroy(t_start2); + printf("[optimizer] allreduce time = %.2lfms\n", elapsed); workSpace_ptr = 
static_cast(meta->handle.workSpace); float alpha_t = op->alpha_t; @@ -277,6 +299,14 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu( beta2_t *= op->beta2; alpha_t = op->alpha * sqrt(1 - beta2_t) / (1 - beta1_t); } + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[optimizer] total time = %.2lfms\n", elapsed); + assert(false); } #endif From d958805a310d83084a2d7c5221dddf011f03cd87 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Fri, 16 Feb 2024 00:58:27 -0500 Subject: [PATCH 16/19] rmv --- src/runtime/optimizer_kernel.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/src/runtime/optimizer_kernel.cu b/src/runtime/optimizer_kernel.cu index ee8fc8dcd4..37c6e55ede 100644 --- a/src/runtime/optimizer_kernel.cu +++ b/src/runtime/optimizer_kernel.cu @@ -306,7 +306,6 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu( cudaEventDestroy(t_start); cudaEventDestroy(t_end); printf("[optimizer] total time = %.2lfms\n", elapsed); - assert(false); } #endif From 38dfd87af234227f2e00604acff908c96e755ef4 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Thu, 22 Feb 2024 13:42:09 -0500 Subject: [PATCH 17/19] fix tp --- include/flexflow/config.h | 3 + include/flexflow/model.h | 6 + include/flexflow/ops/element_binary.h | 5 + include/flexflow/ops/element_binary_params.h | 1 + include/flexflow/utils/cuda_helper.h | 3 + python/flexflow/core/flexflow_cffi.py | 6 +- src/dataloader/dataloader.cc | 2 +- src/dataloader/dataloader.cu | 15 +- src/ops/element_binary.cc | 22 ++ src/ops/embedding.cc | 12 +- src/ops/layer_norm.cc | 4 +- src/ops/linear.cc | 20 +- src/ops/reshape.cc | 82 ++++- src/parallel_ops/allreduce.cc | 1 + src/parallel_ops/kernels/allreduce_kernels.cu | 19 +- src/runtime/cuda_helper.cu | 17 + src/runtime/graph.cc | 32 +- src/runtime/model.cc | 322 +++++++++++++----- src/runtime/optimizer.cc | 1 - src/runtime/optimizer_kernel.cu | 1 - src/runtime/parallel_tensor.cc | 18 +- 21 files changed, 484 insertions(+), 108 deletions(-) diff --git a/include/flexflow/config.h b/include/flexflow/config.h index d1fe6231da..b6a27a4f2a 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -124,6 +124,7 @@ class FFConfig { size_t workSpaceSize; Legion::Context lg_ctx; Legion::Runtime *lg_hlr; + Legion::IndexSpaceT<1> all_gpu_task_is; Legion::FieldSpace field_space; bool syntheticInput, profiling, perform_fusion; size_t simulator_work_space_size; @@ -137,6 +138,8 @@ class FFConfig { bool enable_parameter_parallel; bool enable_attribute_parallel; bool enable_inplace_optimizations; + int data_parallelism_degree; + int tensor_parallelism_degree; // Control Tensor Op Math Conversion bool allow_tensor_op_math_conversion; std::string dataset_path; diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 3f45de3d79..fe73e6a0e3 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -278,6 +278,7 @@ class Split; class TopK; class Transpose; class Combine; +class AllReduce; class Repartition; class Reduction; class Replicate; @@ -834,6 +835,8 @@ class FFModel { Legion::IndexSpace get_task_is(Legion::Domain const &domain) const; Legion::IndexSpace get_task_is(ParallelConfig const &pc) const; Legion::IndexSpace get_task_is(MachineView const &view) const; + bool is_transformer_block(int layer_idx) const; + bool is_mlp_block(int layer_idx) const; void create_operators_from_layers(); Op 
*create_operator_from_layer(Layer *layer, std::vector const &inputs); @@ -860,6 +863,7 @@ class FFModel { int metrics_input; ParallelTensor parallel_label_tensor; Tensor label_tensor; + int num_inputs = 0; std::vector layers; std::vector operators; @@ -929,6 +933,8 @@ class FFModel { Replicate *>, std::unordered_map, Reduction *>, + std::unordered_map, + AllReduce *>, std::unordered_map, Combine *>, std::unordered_map, diff --git a/include/flexflow/ops/element_binary.h b/include/flexflow/ops/element_binary.h index f83ce20609..677ff23ce2 100644 --- a/include/flexflow/ops/element_binary.h +++ b/include/flexflow/ops/element_binary.h @@ -53,6 +53,11 @@ class ElementBinary : public Op { bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; + void serialize(Legion::Serializer &) const override; + static PCG::Node deserialize(FFModel &ff, + Legion::Deserializer &d, + ParallelTensor inputs[], + int num_inputs); Params get_params() const; public: diff --git a/include/flexflow/ops/element_binary_params.h b/include/flexflow/ops/element_binary_params.h index 5aa20e25a5..c70e1b597a 100644 --- a/include/flexflow/ops/element_binary_params.h +++ b/include/flexflow/ops/element_binary_params.h @@ -8,6 +8,7 @@ namespace FlexFlow { struct ElementBinaryParams { OperatorType type; + bool inplace_a; bool is_valid( std::pair const &) const; diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index a4b2be0a66..c56ab473c0 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -82,6 +82,9 @@ __global__ void assign_kernel(DT *ptr, Legion::coord_t size, DT value); template __global__ void copy_kernel(DT *dst, const DT *src, Legion::coord_t size); +template +__global__ void copy_kernel_with_replicate(DT *dst, const DT *src, Legion::coord_t origin_size, Legion::coord_t size); + template __global__ void add_kernel(T *data_ptr, T const *grad_ptr, size_t size); diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index c39b58e4b8..750838d829 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -2079,11 +2079,11 @@ def load_bert_pretrained(self, checkpoint=None): layer = self._layers[i] if (layer.name + "_weight") in weights_dict: print('weight: ' + layer.name) - weight = layer.get_parameter_by_id(0); + weight = layer.get_parameter_by_id(0) weight.set_tensor(self, weights_dict[layer.name + "_weight"]) if (layer.name + "_bias") in weights_dict: print('bias: ' + layer.name) - bias = layer.get_parameter_by_id(1); + bias = layer.get_parameter_by_id(1) bias.set_tensor(self, weights_dict[layer.name + "_bias"]) def fit(self, x=None, y=None, batch_size=None, epochs=1): """Trains the model for a fixed number of epochs (iterations on a dataset). 
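The copy_kernel_with_replicate declaration added to cuda_helper.h above is consumed by the dataloader change in the hunks that follow: when the batch tensor's leading replica dimension has degree greater than one, every replica's slice of the device buffer is filled from the same staged batch. The kernel body is not shown in this patch; the sketch below captures the semantics implied by the call site (src provides origin_size elements, dst holds size of them), using long long where the real declaration uses Legion::coord_t.

// Sketch of the assumed semantics: tile the first origin_size elements of src
// across all size elements of dst, so each replica sees the same batch data.
template <typename DT>
__global__ void copy_kernel_with_replicate_sketch(DT *dst,
                                                  DT const *src,
                                                  long long origin_size,
                                                  long long size) {
  for (long long i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
       i += (long long)blockDim.x * gridDim.x) {
    dst[i] = src[i % origin_size];
  }
}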
@@ -2126,7 +2126,7 @@ def fit(self, x=None, y=None, batch_size=None, epochs=1): self.forward() # self.zero_gradients() self.backward() - self.unified_update() + self.update() self._ffconfig.end_trace(self._tracing_id) def eval(self, x=None, y=None, batch_size=None): diff --git a/src/dataloader/dataloader.cc b/src/dataloader/dataloader.cc index 441a088194..614482e8b1 100644 --- a/src/dataloader/dataloader.cc +++ b/src/dataloader/dataloader.cc @@ -97,7 +97,7 @@ SingleDataLoader::SingleDataLoader(FFModel &ff, datatype = datatype_; // Currently assume that the leading dim of input is a replica dim of degree 1 assert(input->dims[input->num_dims - 1].is_replica_dim); - assert(input->dims[input->num_dims - 1].size == 1); + // assert(input->dims[input->num_dims - 1].size == 1); batch_input = input; ParallelDim dims[MAX_TENSOR_DIM]; diff --git a/src/dataloader/dataloader.cu b/src/dataloader/dataloader.cu index c2994d00a2..5462532d76 100644 --- a/src/dataloader/dataloader.cu +++ b/src/dataloader/dataloader.cu @@ -40,10 +40,13 @@ void SingleDataLoader::load_input(Task const *task, int num_dims = full_input_domain.get_dim(); assert(num_dims + 1 == batch_input_domain.get_dim()); // assert the leading replica dim has a degree of one - assert(batch_input_domain.hi()[num_dims] == - batch_input_domain.lo()[num_dims]); + // assert(batch_input_domain.hi()[num_dims] == + // batch_input_domain.lo()[num_dims]); coord_t batch_size = batch_input_domain.hi()[num_dims - 1] - batch_input_domain.lo()[num_dims - 1] + 1; + + coord_t replicate_num = + batch_input_domain.hi()[num_dims] - batch_input_domain.lo()[num_dims] + 1; coord_t num_elements_per_batch = batch_input_domain.get_volume() / batch_size; // FIXME: currently assume continous indices assert(batch_size == meta->num_samples); @@ -60,11 +63,15 @@ void SingleDataLoader::load_input(Task const *task, // printf("ptr(%p, %p), idx0 %d nb_elements_per_batch %d, batch_size %d, // %d\n", acc_full_input.ptr, input_zc, start_idx, num_elements_per_batch, // batch_size, start_idx * num_elements_per_batch); - copy_kernel
+ assert(batch_input_domain.get_volume() % replicate_num == 0); + copy_kernel_with_replicate
<<>>(batch_input_ptr, input_zc, batch_input_domain.get_volume()); + stream>>>(batch_input_ptr, + input_zc, + batch_input_domain.get_volume() / replicate_num, + batch_input_domain.get_volume()); checkCUDA(cudaDeviceSynchronize()); } diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index 12895cfd98..84c3f8ba93 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -802,10 +802,32 @@ bool ElementBinary::measure_operator_cost(Simulator *sim, return true; } +void ElementBinary::serialize(Legion::Serializer &sez) const { + sez.serialize(this->op_type); + sez.serialize(this->inplace_a); +} + +using PCG::Node; +/*static*/ +Node ElementBinary::deserialize(FFModel &ff, + Legion::Deserializer &dez, + ParallelTensor inputs[], + int num_inputs) { + assert(num_inputs == 2); + OperatorType op_type; + bool inplace_a; + dez.deserialize(op_type); + dez.deserialize(inplace_a); + ElementBinaryParams params; + params.type = op_type; + params.inplace_a = inplace_a; + return ff.get_or_create_node({inputs[0], inputs[1]}, params); +} ElementBinaryParams ElementBinary::get_params() const { ElementBinaryParams params; params.type = this->op_type; + params.inplace_a = this->inplace_a; return params; } diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc index 18bf7e324e..8df6324460 100644 --- a/src/ops/embedding.cc +++ b/src/ops/embedding.cc @@ -148,23 +148,27 @@ int Embedding::output_size(ParallelDim output_dims[MAX_TENSOR_DIM]) { int const OUT_CHANNELS = Output::OUT_CHANNELS; if (aggr == AGGR_MODE_NONE) { int num_dims = input->num_dims + 1; - for (int i = 1; i < num_dims; i++) { + for (int i = 1; i < num_dims - 1; i++) { output_dims[i] = input->dims[i - 1]; } assert(OUT_CHANNELS == 0); output_dims[OUT_CHANNELS].size = this->out_channels; output_dims[OUT_CHANNELS].degree = 1; output_dims[OUT_CHANNELS].parallel_idx = -1; + // Copy replica dim + output_dims[num_dims - 1] = input->dims[input->num_dims - 1]; return num_dims; } else { int num_dims = input->num_dims; - for (int i = 1; i < num_dims; i++) { + for (int i = 1; i < num_dims - 1; i++) { output_dims[i] = input->dims[i]; } assert(OUT_CHANNELS == 0); output_dims[OUT_CHANNELS].size = this->out_channels; output_dims[OUT_CHANNELS].degree = 1; output_dims[OUT_CHANNELS].parallel_idx = -1; + // Copy replica dim + output_dims[num_dims - 1] = input->dims[input->num_dims - 1]; return num_dims; } // const int REPLICA = this->output_vocab_size_replica_dim(); @@ -179,13 +183,13 @@ int Embedding::weight_size(ParallelDim weight_dims[MAX_TENSOR_DIM]) { weight_dims[Weight::VOCAB_SIZE].size = this->num_entries; weight_dims[Weight::VOCAB_SIZE].degree = 1; weight_dims[Weight::VOCAB_SIZE].parallel_idx = -1; - for (int i = 2; i < input->num_dims; i++) { + for (int i = 2; i < input->num_dims + 1; i++) { weight_dims[i].size = input->dims[i - 1].degree; weight_dims[i].degree = weight_dims[i].size; weight_dims[i].parallel_idx = input->dims[i - 1].parallel_idx; weight_dims[i].is_replica_dim = true; } - return input->num_dims; + return input->num_dims + 1; } void Embedding::register_output_mappings() { diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index a6130bb425..ccbd7c2dd6 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -226,10 +226,10 @@ LayerNorm::LayerNorm(FFModel &model, dims[i] = inputs[0]->dims[i]; } assert(numInputs == 1); - dims[num_dims].degree = inputs[0]->dims[inputs[0]->num_dims - 2].degree; + dims[num_dims].degree = inputs[0]->dims[inputs[0]->num_dims - 1].degree; dims[num_dims].size = 
dims[num_dims].degree; dims[num_dims].parallel_idx = - inputs[0]->dims[inputs[0]->num_dims - 2].parallel_idx; + inputs[0]->dims[inputs[0]->num_dims - 1].parallel_idx; dims[num_dims].is_replica_dim = true; num_dims += 1; diff --git a/src/ops/linear.cc b/src/ops/linear.cc index d5257b9c3e..20cc193107 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -190,6 +190,23 @@ Linear::Linear(FFModel &model, params.construct_mappings(*this->parallel_dims_mapping, input_shape); params.solve_dims(input_shape, output_shape, kernel_shape, bias_shape); + kernel_shape.dims[0].size = this->in_channels; + bias_shape.dims[0].degree = _input->dims[_input->num_dims - 1].degree; + bias_shape.dims[0].parallel_idx = + _input->dims[_input->num_dims - 1].parallel_idx; + bias_shape.dims[1].size = bias_shape.dims[1].degree = 1; + bias_shape.dims[1].parallel_idx = -1; + bias_shape.dims[bias_shape.num_dims - 1].size = + bias_shape.dims[bias_shape.num_dims - 1].degree = 1; + for (int i = 0; i < input_shape.num_dims - 1; i++) { + if (_input->dims[i].degree > 1) { + bias_shape.dims[bias_shape.num_dims - 1].size *= _input->dims[i].degree; + bias_shape.dims[bias_shape.num_dims - 1].degree *= _input->dims[i].degree; + bias_shape.dims[bias_shape.num_dims - 1].parallel_idx = + _input->dims[i].parallel_idx; + } + } + if (allocate_weights) { Initializer *kernel_initializer = new GlorotUniform(std::rand() /*seed*/); @@ -220,7 +237,6 @@ Linear::Linear(FFModel &model, outputs[0] = model.create_parallel_tensor_legion_ordering( output_shape.num_dims, output_shape.dims, _data_type, this); - assert(check_output_input_weight_parallel_dims(allocate_weights)); } void Linear::init(FFModel const &ff) { @@ -433,7 +449,7 @@ void Linear::forward_task_with_dim(Task const *task, int out_dim = acc_output.rect.hi[0] - acc_output.rect.lo[0] + 1; int batch_size = acc_output.rect.volume() / out_dim; assert(acc_output.rect.volume() == static_cast(out_dim * batch_size)); - assert(acc_input.rect.volume() == static_cast(in_dim * batch_size)); + // assert(acc_input.rect.volume() == static_cast(in_dim * batch_size)); assert(acc_kernel.rect.volume() == static_cast(in_dim * out_dim)); float const *acc_bias_ptr = NULL; if (m->use_bias) { diff --git a/src/ops/reshape.cc b/src/ops/reshape.cc index 2b8a60bf21..07797bd223 100644 --- a/src/ops/reshape.cc +++ b/src/ops/reshape.cc @@ -80,9 +80,14 @@ Op *Reshape::create_operator_from_layer( return new Reshape(model, layer->layer_guid, inputs[0], shape, layer->name); } +bool match_pattern(std::vector const &_shape) { + return (_shape.size() == 4 && _shape[1] == 1 && _shape[2] == 1 && + _shape[3] == 512); +} + Reshape::Reshape(FFModel &model, LayerID const &_layer_guid, - const ParallelTensor input, + ParallelTensor const input, std::vector const &_shape, char const *name) : Op(model, @@ -106,19 +111,64 @@ Reshape::Reshape(FFModel &model, if (input->dims[i].is_replica_dim) { num_replica_dims++; } + // std::cout << "reshape input size: " << input->dims[i].size + // << ", parallelidx: " << input->dims[i].parallel_idx << ". 
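
The ElementBinary::serialize / deserialize pair added in element_binary.cc above forms an ordering contract: deserialize must read exactly the fields serialize wrote, in the same order, and get_params has to carry every serialized field (hence the added inplace_a). Below is a minimal standalone sketch of that contract; the ByteStream type is only a stand-in for Legion::Serializer/Deserializer, and the struct and function names are illustrative, not FlexFlow API.

#include <cassert>
#include <cstring>
#include <vector>

// Stand-in for Legion::Serializer / Legion::Deserializer.
struct ByteStream {
  std::vector<char> buf;
  size_t pos = 0;
  template <typename T>
  void serialize(T const &v) {
    char const *p = reinterpret_cast<char const *>(&v);
    buf.insert(buf.end(), p, p + sizeof(T));
  }
  template <typename T>
  void deserialize(T &v) {
    std::memcpy(&v, buf.data() + pos, sizeof(T));
    pos += sizeof(T);
  }
};

// Illustrative params struct mirroring ElementBinaryParams.
struct BinaryParams {
  int op_type;
  bool inplace_a;
};

void write_params(ByteStream &s, BinaryParams const &p) {
  s.serialize(p.op_type);   // written first ...
  s.serialize(p.inplace_a); // ... written second
}

BinaryParams read_params(ByteStream &s) {
  BinaryParams p;
  s.deserialize(p.op_type);   // ... so it is read first
  s.deserialize(p.inplace_a); // ... and second, in the same order
  return p;
}

int main() {
  ByteStream s;
  write_params(s, BinaryParams{3, true});
  BinaryParams p = read_params(s);
  assert(p.op_type == 3 && p.inplace_a == true);
  return 0;
}
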
degree: " << input->dims[i].degree + // << "is replicate dim: " << input->dims[i].is_replica_dim << + // "\n"; } + + // assert(false); // assert that all replica dims are leading dims for (int i = 0; i < num_replica_dims; i++) { assert(input->dims[input->num_dims - 1 - i].is_replica_dim); } int numdim = (int)_shape.size(); ParallelDim dims[MAX_TENSOR_DIM]; - for (int i = 0; i < numdim; i++) { - dims[i].size = _shape[numdim - 1 - i]; - dims[i].degree = 1; - dims[i].parallel_idx = -1; - dims[i].is_replica_dim = false; - } + + + bool expanded = numdim >= input->num_dims; + bool aggregation = numdim < input->num_dims - 1; + + for (int i = 0; i < numdim; i++) { + if (expanded && i < numdim - 1 && + _shape[i] * _shape[i + 1] == input->dims[numdim - i - 2].size) { + dims[numdim - i - 1].size = _shape[i]; + dims[numdim - i - 1].degree = input->dims[numdim - i - 2].degree; + dims[numdim - i - 1].parallel_idx = + input->dims[numdim - i - 2].parallel_idx; + dims[numdim - i - 1].is_replica_dim = + input->dims[numdim - i - 2].is_replica_dim; + std::cout << "expand dim i:" << i << ", " << dims[numdim - i - 1].degree + << ", " << dims[numdim - i - 1].size << "\n"; + } else if (aggregation && + (_shape[i] == input->dims[input->num_dims - 2 - i].size * + input->dims[input->num_dims - 3 - i].size)) { + // inherit + dims[numdim - i - 1].size = _shape[i]; + dims[numdim - i - 1].degree = + input->dims[input->num_dims - 2 - i].degree; + dims[numdim - i - 1].parallel_idx = + input->dims[input->num_dims - 2 - i].parallel_idx; + dims[numdim - i - 1].is_replica_dim = + input->dims[input->num_dims - 2 - i].is_replica_dim; + // std::cout << "agree i: " << i <<", " << _shape[i] << "\n"; + } else { + dims[numdim - i - 1].size = _shape[i]; + dims[numdim - i - 1].degree = 1; + dims[numdim - i - 1].parallel_idx = -1; + dims[numdim - i - 1].is_replica_dim = false; + } + } + + + + + // for (int i = 0; i < numdim; i++) { + // dims[i].size = _shape[numdim - 1 - i]; + // dims[i].degree = 1; + // dims[i].parallel_idx = -1; + // dims[i].is_replica_dim = false; + // } // copy all replica dims for (int i = 0; i < num_replica_dims; i++) { dims[i + numdim] = input->dims[input->num_dims - 1 - i]; @@ -131,6 +181,24 @@ Reshape::Reshape(FFModel &model, } dims[numdim - 1 - i] = input->dims[input->num_dims - 1 - i]; } + + //TODO temporary fix for input to attention QK, fix it after fuse the attention block + if(match_pattern(_shape) && model.config.tensor_parallelism_degree > 1){ + //number of heads + + dims[2].size = 12; + dims[2].degree = model.config.tensor_parallelism_degree; + dims[2].parallel_idx = 0; + dims[2].is_replica_dim = true; + + dims[4].size = 1; + dims[4].degree = 1; + dims[4].parallel_idx = -1; + dims[4].is_replica_dim = false; + + } + + outputs[0] = model.create_parallel_tensor_legion_ordering( numdim, dims, input->data_type, this); assert(outputs[0]->get_volume() == inputs[0]->get_volume()); diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc index 1bbc43ca0d..7052bb3ed5 100644 --- a/src/parallel_ops/allreduce.cc +++ b/src/parallel_ops/allreduce.cc @@ -173,6 +173,7 @@ void AllReduce::backward(FFModel const &ff) { Runtime *runtime = ff.config.lg_hlr; assert(numOutputs == 1); assert(numInputs == 1); + set_argumentmap_for_backward(ff, argmap); IndexLauncher launcher(ALLREDUCE_BWD_TASK_ID, inputs[0]->parallel_is, TaskArgument(NULL, 0), diff --git a/src/parallel_ops/kernels/allreduce_kernels.cu b/src/parallel_ops/kernels/allreduce_kernels.cu index bce970cbdc..1e932d2b12 100644 --- 
a/src/parallel_ops/kernels/allreduce_kernels.cu +++ b/src/parallel_ops/kernels/allreduce_kernels.cu @@ -48,7 +48,24 @@ void forward_kernel_wrapper(AllReduceMeta const *m, void backward_kernel_wrapper(AllReduceMeta const *m, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output_grad) { - assert(false && "To be implemented"); + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input_grad.data_type == output_grad.data_type); + assert(input_grad.domain == output_grad.domain); +#ifdef FF_USE_NCCL + // ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); + // std::cout <<"input volume: " << input.domain.get_volume() << "\n"; + // print_tensor((float*)input.ptr, 32, "input ptr"); + checkNCCL(ncclAllReduce(output_grad.ptr, + input_grad.ptr, + output_grad.domain.get_volume(), + ncclFloat, + ncclSum, + m->handle.ncclComm, + stream)); +#else + assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); +#endif } } // namespace AllReduce diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index 3236545cb9..a4a58e60fc 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -61,6 +61,15 @@ __global__ void copy_kernel(DT *dst, const DT *src, coord_t size) { dst[i] = src[i]; } } +template +__global__ void copy_kernel_with_replicate(DT *dst, + const DT *src, + coord_t origin_size, + coord_t size) { + CUDA_KERNEL_LOOP(i, size) { + dst[i] = src[i % origin_size]; + } +} template __global__ void reluBackward(DT *grad_ptr, const DT *output, size_t n) { @@ -410,6 +419,14 @@ template __global__ void template __global__ void copy_kernel(float *dst, float const *src, coord_t size); +template __global__ void copy_kernel_with_replicate(float *dst, + float const *src, + coord_t origin_size, + coord_t size); +template __global__ void copy_kernel_with_replicate( + int32_t *dst, int32_t const *src, coord_t origin_size, coord_t size); +template __global__ void copy_kernel_with_replicate( + int64_t *dst, int64_t const *src, coord_t origin_size, coord_t size); template __global__ void copy_kernel(int32_t *dst, int32_t const *src, coord_t size); template __global__ void diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 8be3877b48..762c5911d6 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -1928,9 +1928,37 @@ std::pair, std::unordered_map> model->config.numNodes * model->config.workersPerNode; data_parallel_view.stride[0] = 1; data_parallel_view.start_device_id = 0; + // Currently assume a 1D machine view is needed + assert(model->config.data_parallelism_degree == 1 || + model->config.tensor_parallelism_degree == 1); + int degree = model->config.data_parallelism_degree * + model->config.tensor_parallelism_degree; for (auto const &node : curr_best_graph->inEdges) { - curr_optimal_views[node.first] = data_parallel_view; - } + Op const *op = node.first.ptr; + MachineView mv; + mv.device_type = MachineView::GPU; + mv.ndims = 1; + int total_parallel_degree = 1; + for (int i = 0; i < op->outputs[0]->num_dims; i++) { + total_parallel_degree *= op->outputs[0]->dims[i].degree; + } + mv.dim[0] = total_parallel_degree; + mv.stride[0] = 1; + mv.start_device_id = 0; + // std::cout << mv.start_device_id + degree - 1 << "\n"; + // std::cout << model->config.numNodes << "\n"; + // std::cout << model->config.workersPerNode << "\n"; + // assert(false); + assert(mv.start_device_id + degree - 1 < + model->config.numNodes * model->config.workersPerNode); + curr_optimal_views[node.first] = mv; + for (int i = 
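
copy_kernel_with_replicate, added to cuda_helper.cu above and instantiated for float, int32_t, and int64_t, tiles a smaller source buffer across a larger destination so that every replica reads the same values: dst[i] = src[i % origin_size]. The following standalone demo shows the same pattern with the grid-stride loop written out explicitly instead of the CUDA_KERNEL_LOOP macro; the buffer sizes and launch configuration are illustrative only.

#include <cstdio>
#include <cuda_runtime.h>

// dst holds `size` elements, src only `origin_size`; each destination index
// wraps onto the source with a modulo, so every replica sees the same data.
__global__ void replicate_copy(float *dst, float const *src,
                               size_t origin_size, size_t size) {
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
       i += (size_t)gridDim.x * blockDim.x) {
    dst[i] = src[i % origin_size]; // replica r reads src[i - r * origin_size]
  }
}

int main() {
  size_t const origin = 4, replicas = 3, total = origin * replicas;
  float h_src[origin] = {0.f, 1.f, 2.f, 3.f};
  float h_dst[total];
  float *d_src, *d_dst;
  cudaMalloc(&d_src, origin * sizeof(float));
  cudaMalloc(&d_dst, total * sizeof(float));
  cudaMemcpy(d_src, h_src, origin * sizeof(float), cudaMemcpyHostToDevice);
  replicate_copy<<<1, 128>>>(d_dst, d_src, origin, total);
  cudaMemcpy(h_dst, d_dst, total * sizeof(float), cudaMemcpyDeviceToHost);
  for (size_t i = 0; i < total; i++) {
    printf("%.0f ", h_dst[i]); // prints 0 1 2 3 repeated three times
  }
  printf("\n");
  cudaFree(d_src);
  cudaFree(d_dst);
  return 0;
}
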
0; i < node.first.ptr->numOutputs; i++) { + assert(node.first.ptr->outputs[i]->is_valid_machine_view(mv)); + } + } + // for (auto const &node : curr_best_graph->inEdges) { + // curr_optimal_views[node.first] = data_parallel_view; + // } return std::make_pair(std::move(curr_best_graph), curr_optimal_views); } diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 3807e85391..6feddcd03c 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -50,10 +50,10 @@ #include "flexflow/ops/split.h" #include "flexflow/ops/topk.h" #include "flexflow/ops/transpose.h" +#include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" #include "flexflow/parallel_ops/partition.h" -#include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/reduction.h" #include "flexflow/parallel_ops/replicate.h" #include "flexflow/substitution.h" @@ -79,10 +79,10 @@ Op::Op(FFModel &model, int numWeights, bool allocate_weights, int numOutputs, - const ParallelTensor input1, - const ParallelTensor input2, - const ParallelTensor input3, - const ParallelTensor input4) + ParallelTensor const input1, + ParallelTensor const input2, + ParallelTensor const input3, + ParallelTensor const input4) : Op(model, otype, dtype, @@ -102,10 +102,10 @@ Op::Op(FFModel &model, int _numInputs, int _numWeights, int _numOutputs, - const ParallelTensor _input1, - const ParallelTensor _input2, - const ParallelTensor _input3, - const ParallelTensor _input4) + ParallelTensor const _input1, + ParallelTensor const _input2, + ParallelTensor const _input3, + ParallelTensor const _input4) : op_type(_otype), data_type(_dtype), op_guid(model.op_global_guid++), numInputs(_numInputs), numWeights(_numWeights), numOutputs(_numOutputs), profiling(model.config.profiling) { @@ -587,8 +587,13 @@ ncclComm_t Op::init_nccl_comms_task(Task const *task, ncclComm_t ncclComm; fprintf(stderr, "Before ncclCommInitRank\n"); checkNCCL(ncclCommInitRank(&ncclComm, allRanks, ncclId, myRank)); - fprintf(stderr, "After ncclCommInitRank ncclComm(%p) allRanks(%d) myRank(%d) ncclId(%p)\n", - ncclComm, allRanks, myRank, ncclId); + fprintf(stderr, + "After ncclCommInitRank ncclComm(%p) allRanks(%d) myRank(%d) " + "ncclId(%p)\n", + ncclComm, + allRanks, + myRank, + ncclId); return ncclComm; } #endif @@ -783,9 +788,9 @@ void Op::register_output_parallel_dims( operation); } -int Op::get_output_to_input_dim_mapping(const ParallelTensor output, +int Op::get_output_to_input_dim_mapping(ParallelTensor const output, int output_dim, - const ParallelTensor input) { + ParallelTensor const input) { int output_idx = -1, input_idx = -1; for (int i = 0; i < numOutputs; i++) { if (output == outputs[i]) { @@ -818,9 +823,9 @@ int Op::get_output_to_input_dim_mapping(const ParallelTensor output, return -1; } -int Op::get_output_to_weight_dim_mapping(const ParallelTensor output, +int Op::get_output_to_weight_dim_mapping(ParallelTensor const output, int output_dim, - const ParallelTensor weight) { + ParallelTensor const weight) { int output_idx = -1, weight_idx = -1; for (int i = 0; i < numOutputs; i++) { if (output == outputs[i]) { @@ -886,6 +891,9 @@ bool Op::check_output_input_weight_parallel_dims(bool allocate_weights) const { break; } + printf("other dim degree: %d, input dim degree %d\n", + other_dim.degree, + input_dim.degree); assert(other_dim.degree == input_dim.degree); assert(other_dim.parallel_idx == input_dim.parallel_idx); } @@ -895,18 +903,25 @@ bool 
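
Between ncclCommInitRank in Op::init_nccl_comms_task and the new ncclAllReduce call in the AllReduce backward wrapper above, the patch wires up the usual tensor-parallel gradient synchronization: each shard contributes its local gradient and receives the sum in place. The sketch below reproduces that reduction in a single process across two GPUs; it uses ncclCommInitAll for brevity (FlexFlow builds its communicators rank-by-rank per machine view), and the device count and gradient size are illustrative.

#include <cstdio>
#include <cuda_runtime.h>
#include <nccl.h>
#include <vector>

int main() {
  int const ndev = 2;     // illustrative: two tensor-parallel shards
  size_t const count = 4; // illustrative gradient size
  std::vector<ncclComm_t> comms(ndev);
  std::vector<float *> grads(ndev);
  std::vector<cudaStream_t> streams(ndev);

  // One communicator per device, all owned by this single process.
  ncclCommInitAll(comms.data(), ndev, nullptr);

  for (int d = 0; d < ndev; d++) {
    cudaSetDevice(d);
    cudaStreamCreate(&streams[d]);
    cudaMalloc(&grads[d], count * sizeof(float));
    std::vector<float> host(count, float(d + 1)); // shard d holds all (d+1)s
    cudaMemcpy(grads[d], host.data(), count * sizeof(float),
               cudaMemcpyHostToDevice);
  }

  // Sum gradients in place across shards, as the backward wrapper does.
  ncclGroupStart();
  for (int d = 0; d < ndev; d++) {
    cudaSetDevice(d);
    ncclAllReduce(grads[d], grads[d], count, ncclFloat, ncclSum, comms[d],
                  streams[d]);
  }
  ncclGroupEnd();

  for (int d = 0; d < ndev; d++) {
    cudaSetDevice(d);
    cudaStreamSynchronize(streams[d]);
    std::vector<float> host(count);
    cudaMemcpy(host.data(), grads[d], count * sizeof(float),
               cudaMemcpyDeviceToHost);
    printf("device %d: grad[0] = %.0f\n", d, host[0]); // 3 on both devices
    cudaFree(grads[d]);
    cudaStreamDestroy(streams[d]);
    ncclCommDestroy(comms[d]);
  }
  return 0;
}
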
Op::check_output_input_weight_parallel_dims(bool allocate_weights) const { bool Op::check_output_input_weight_same_parallel_is() const { assert(numOutputs > 0); IndexSpace parallel_is = outputs[0]->parallel_is; + std::cout << "output space: " + << ", " << parallel_is << "\n"; for (int i = 0; i < numOutputs; i++) { if (outputs[i]->parallel_is != parallel_is) { + std::cout << "output mismatch" + << "\n"; return false; } } for (int i = 0; i < numInputs; i++) { + std::cout << "input space: " << i << ", " << inputs[i]->parallel_is << "\n"; if (inputs[i]->parallel_is != parallel_is) { return false; } } for (int i = 0; i < numWeights; i++) { if (weights[i]->parallel_is != parallel_is) { + std::cout << "weight mismatch" + << "\n"; return false; } } @@ -948,7 +963,7 @@ void Op::set_argumentmap_for_init(FFModel const &ff, ArgumentMap &argmap) { for (PointInRectIterator it(rect); it(); it++) { \ FFHandler handle = ff.handlers[view.get_device_id(*it)]; \ if (ff.config.computationMode == COMP_MODE_TRAINING && \ - op_type == OP_WEIGHT) { \ + (op_type == OP_WEIGHT || op_type == OP_ALLREDUCE)) { \ ncclComm_t *nccl_comms = ff.find_nccl_comms(view); \ handle.ncclComm = nccl_comms[idx++]; \ } \ @@ -1192,9 +1207,11 @@ FFModel::FFModel(FFConfig &_config) //} ArgumentMap argmap; - Rect<1> task_rect(Point<1>(0), - Point<1>(config.workersPerNode * config.numNodes - 1)); - IndexSpaceT<1> task_is = runtime->create_index_space(ctx, task_rect); + // Rect<1> task_rect(Point<1>(0), + // Point<1>(config.workersPerNode * config.numNodes - 1)); + // IndexSpaceT<1> task_is = runtime->create_index_space(ctx, task_rect); + Domain domain = runtime->get_index_space_domain(ctx, config.all_gpu_task_is); + Rect<1> task_rect = domain; // int rank = 0; for (PointInRectIterator<1> it(task_rect); it(); it++) { @@ -1208,7 +1225,7 @@ FFModel::FFModel(FFConfig &_config) // Init CUDA library on each worker IndexLauncher initLauncher(FF_INIT_TASK_ID, - task_is, + config.all_gpu_task_is, TaskArgument(NULL, 0), argmap, Predicate::TRUE_PRED, @@ -1301,7 +1318,7 @@ Tensor FFModel::create_tensor(int numdim, } ParallelTensor FFModel::create_parallel_tensor(int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *op, int idx, @@ -1334,7 +1351,7 @@ Tensor FFModel::create_tensor_legion_ordering(int numdim, ParallelTensor FFModel::create_parallel_tensor_legion_ordering(int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *op, int idx, @@ -1384,7 +1401,7 @@ Tensor FFModel::create_tensor(int const dims[], } template -ParallelTensor FFModel::create_parallel_tensor(const ParallelDim dims[], +ParallelTensor FFModel::create_parallel_tensor(ParallelDim const dims[], DataType data_type, Op const *owner_op, int owner_idx, @@ -1465,7 +1482,7 @@ Parameter FFModel::create_weight(int numdim, } template -ParallelParameter FFModel::create_parallel_weight(const ParallelDim dims[], +ParallelParameter FFModel::create_parallel_weight(ParallelDim const dims[], DataType data_type, Op const *owner_op, bool create_grad, @@ -1495,7 +1512,7 @@ ParallelParameter FFModel::create_parallel_weight(const ParallelDim dims[], } ParallelParameter FFModel::create_parallel_weight(int numdim, - const ParallelDim dims[], + ParallelDim const dims[], DataType data_type, Op const *owner_op, bool create_grad, @@ -1515,7 +1532,7 @@ ParallelParameter FFModel::create_parallel_weight(int numdim, ParallelParameter FFModel::create_parallel_weight_legion_ordering( int numdim, - const ParallelDim dims[], + 
ParallelDim const dims[], DataType data_type, Op const *owner_op, bool create_grad, @@ -1720,7 +1737,7 @@ void FFModel::map_weight_with_dim(ParallelTensor weight, } bool FFModel::get_parallel_tensor_from_tensor( - const Tensor tensor, ParallelTensor ¶llel_tensor) const { + Tensor const tensor, ParallelTensor ¶llel_tensor) const { // check if tensor->parallel_tensor is already set if (tensor->parallel_tensor != nullptr) { parallel_tensor = tensor->parallel_tensor; @@ -1757,7 +1774,7 @@ bool FFModel::get_parallel_tensor_from_tensor( } void FFModel::create_disjoint_partition(int num_dims, - const ParallelDim dims[], + ParallelDim const dims[], IndexSpace const &part_is, LogicalRegion const ®ion, LogicalPartition &part) { @@ -1780,7 +1797,7 @@ void FFModel::create_disjoint_partition(int num_dims, template void FFModel::create_disjoint_partition_with_dim2( - const ParallelDim dims[], + ParallelDim const dims[], IndexSpaceT const &part_is, LogicalRegion const ®ion, LogicalPartition &part) { @@ -1813,7 +1830,7 @@ void FFModel::create_disjoint_partition_with_dim2( } void FFModel::create_aliased_partition(int num_dims, - const ParallelDim dims[], + ParallelDim const dims[], int aliased_dim, IndexSpace const &part_is, LogicalRegion const ®ion, @@ -1837,7 +1854,7 @@ void FFModel::create_aliased_partition(int num_dims, template void FFModel::create_aliased_partition_with_dim2( - const ParallelDim dims[], + ParallelDim const dims[], int aliased_dim, IndexSpaceT const &part_is, LogicalRegion const ®ion, @@ -1874,7 +1891,7 @@ void FFModel::create_aliased_partition_with_dim2( } template -void FFModel::create_disjoint_partition(const ParallelTensor tensor, +void FFModel::create_disjoint_partition(ParallelTensor const tensor, IndexSpaceT const &part_is, LogicalPartition &part_fwd, LogicalPartition &part_bwd) { @@ -1922,7 +1939,7 @@ void FFModel::create_disjoint_partition(const ParallelTensor tensor, template void FFModel::create_data_parallel_partition_with_diff_dims( - const ParallelTensor tensor, + ParallelTensor const tensor, IndexSpaceT const &part_is, LogicalPartition &part_fwd, LogicalPartition &part_bwd) { @@ -2304,7 +2321,7 @@ IndexSpace FFModel::get_task_is(ParallelConfig const &pc) const { return get_task_is(view); } -IndexSpace FFModel::get_or_create_task_is(const ParallelTensor tensor) { +IndexSpace FFModel::get_or_create_task_is(ParallelTensor const tensor) { MachineView view; view.ndims = 0; for (int i = 0; i < tensor->num_dims; i++) { @@ -2630,6 +2647,18 @@ Op *FFModel::create_operator_from_layer( dims[num_dims].degree = 1; dims[num_dims].parallel_idx = -1; dims[num_dims].is_replica_dim = true; + if (config.tensor_parallelism_degree > 1 && num_inputs != 1) { + dims[num_dims].size *= config.tensor_parallelism_degree; + dims[num_dims].degree *= config.tensor_parallelism_degree; + dims[num_dims].parallel_idx = 0; + } + //TODO temporary fix for input to attention QK, fix it after fuse the attention block + else if(config.tensor_parallelism_degree > 1){ + //n heads + dims[num_dims].size *= 12; + dims[num_dims].degree *= config.tensor_parallelism_degree; + dims[num_dims].parallel_idx = 0; + } // create_parallel_tensor adds an NoOp into operators ParallelTensor pt = create_parallel_tensor_legion_ordering(num_dims + 1, @@ -2643,19 +2672,14 @@ Op *FFModel::create_operator_from_layer( assert(tensor->parallel_tensor == nullptr); tensor->parallel_tensor = pt; // start from data parllel tensor - if (config.only_data_parallel && - config.numNodes * config.workersPerNode > 1) { - if (pt->dims[num_dims - 
1].size == 1) { - Replicate *repl = new Replicate( - *this, pt, num_dims, config.numNodes * config.workersPerNode); - repl->outputs[0]->dims[num_dims].is_replica_dim = true; - operators.push_back(repl); - } else { - Repartition *part = new Repartition( - *this, pt, num_dims - 1, config.numNodes * config.workersPerNode); - operators.push_back(part); - } - } + // if (config.only_data_parallel && + // config.computationMode == COMP_MODE_TRAINING) { + // Repartition *part = new Repartition( + // *this, pt, num_dims - 1, config.numNodes * + // config.workersPerNode); + // operators.push_back(part); + // } + num_inputs++; return operators[operators.size() - 1]; } case OP_MULTIHEAD_ATTENTION: { @@ -2798,9 +2822,42 @@ Op *FFModel::create_operator_from_layer( } } +bool FFModel::is_transformer_block(int layer_idx) const { + auto const &l = layers[layer_idx]; + if (l->op_type == OP_DROPOUT && layer_idx >= 4 && + layers[layer_idx - 1]->op_type == OP_LINEAR && + layers[layer_idx - 2]->op_type == OP_RESHAPE && + layers[layer_idx - 3]->op_type == OP_TRANSPOSE && + layers[layer_idx - 4]->op_type == OP_BATCHMATMUL) { + return true; + } + return false; +} +bool FFModel::is_mlp_block(int layer_idx) const { + auto const &l = layers[layer_idx]; + // standard opt relu + if (l->op_type == OP_LINEAR && layer_idx >= 2 && + layers[layer_idx - 1]->op_type == OP_RELU && + layers[layer_idx - 2]->op_type == OP_LINEAR) { + return true; + } + // mlp layer with relu embedded in first dense layer + if (l->op_type == OP_LINEAR && layer_idx >= 1 && + layers[layer_idx - 1]->op_type == OP_LINEAR) { + long long value; + layers[layer_idx - 1]->get_int_property("activation", value); + ActiMode activation = (ActiMode)value; + if (activation == AC_MODE_RELU) { + return true; + } + } + return false; +} + void FFModel::create_operators_from_layers() { - std::map tensors_to_parallel_tensors; - for (auto const &l : layers) { + std::map tensors_to_parallel_tensors; + for (int layer_idx = 0; layer_idx < layers.size(); layer_idx++) { + auto const &l = layers[layer_idx]; std::vector inputs; for (int i = 0; i < l->numInputs; i++) { // create new input tensors @@ -2808,7 +2865,50 @@ void FFModel::create_operators_from_layers() { tensors_to_parallel_tensors.end()); inputs.push_back(tensors_to_parallel_tensors[l->inputs[i]]); } - Op *op = create_operator_from_layer(l, inputs); + // Op *op = create_operator_from_layer(l, inputs); + Op *op = nullptr; + if (config.tensor_parallelism_degree > 1 && l->op_type == OP_LAYERNORM && + layer_idx == layers.size() - 6) { + std::vector partitioned_inputs; + Combine *comb = new Combine(*this, + inputs[0], + 3 /*inner most dim*/, + config.tensor_parallelism_degree); + partitioned_inputs.push_back(comb->outputs[0]); + operators.push_back(comb); + op = create_operator_from_layer(l, partitioned_inputs); + } else { + op = create_operator_from_layer(l, inputs); + } + + // add replicate operators after op if needed + if (config.tensor_parallelism_degree > 1 && l->op_type == OP_EMBEDDING) { + // assert(op->numOutputs == 1); + // Replicate *repl = new Replicate(*this, + // op->outputs[0], + // op->outputs[0]->num_dims - 1, + // config.tensor_parallelism_degree); + // operators.push_back(repl); + // op = repl; + } else if (config.tensor_parallelism_degree > 1 && + (is_transformer_block(layer_idx) || is_mlp_block(layer_idx) || + // llama mlp layer + (l->op_type == OP_LINEAR && layer_idx >= 2 && + layers[layer_idx - 1]->op_type == OP_GELU && + layers[layer_idx - 2]->op_type == OP_LINEAR) || + // LLAMA without 
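
is_transformer_block and is_mlp_block above locate the points at which an AllReduce has to be inserted purely by looking back over the recorded layer sequence for a known op-type pattern. Below is a standalone sketch of that look-back matching over a plain vector of op kinds; the enum values and patterns are illustrative and deliberately simplified (the real checks also inspect layer properties such as the fused activation).

#include <cassert>
#include <vector>

enum OpKind { LINEAR, RELU, GELU, DROPOUT, RESHAPE, TRANSPOSE, BATCHMATMUL };

// A block ends at index i when the trailing op types match a known pattern,
// e.g. LINEAR -> RELU -> LINEAR for a standard two-layer MLP.
bool ends_with(std::vector<OpKind> const &layers, size_t i,
               std::vector<OpKind> const &pattern) {
  if (i + 1 < pattern.size()) {
    return false; // not enough preceding layers to look back over
  }
  for (size_t k = 0; k < pattern.size(); k++) {
    if (layers[i - k] != pattern[pattern.size() - 1 - k]) {
      return false;
    }
  }
  return true;
}

bool is_mlp_block(std::vector<OpKind> const &layers, size_t i) {
  return ends_with(layers, i, {LINEAR, RELU, LINEAR});
}

int main() {
  std::vector<OpKind> net = {LINEAR, RELU, LINEAR, DROPOUT};
  assert(is_mlp_block(net, 2));  // ... LINEAR, RELU, LINEAR ends at index 2
  assert(!is_mlp_block(net, 3)); // DROPOUT does not close an MLP block
  return 0;
}
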
element-wise operator fusion + (l->op_type == OP_LINEAR && layer_idx >= 5 && + layers[layer_idx - 1]->op_type == OP_EW_MUL && + layers[layer_idx - 2]->op_type == OP_EW_MUL && + layers[layer_idx - 3]->op_type == OP_SIGMOID && + layers[layer_idx - 4]->op_type == OP_LINEAR && + layers[layer_idx - 5]->op_type == OP_LINEAR))) { + assert(op->numOutputs == 1); + AllReduce *allreduce = + new AllReduce(*this, op->outputs[0], op->outputs[0]->num_dims - 1); + operators.push_back(allreduce); + op = allreduce; + } assert(op->numOutputs == l->numOutputs); for (int i = 0; i < op->numOutputs; i++) { tensors_to_parallel_tensors[l->outputs[i]] = op->outputs[i]; @@ -2844,8 +2944,7 @@ void FFModel::compile(LossType loss_type, TaskLauncher launcher(GRAPH_OPTIMIZE_TASK_ID, TaskArgument(&model, sizeof(FFModel *))); Future future = runtime->execute_task(ctx, launcher); - ret = - future.get_result(); + ret = future.get_result(); } else { ret = PCG::Graph::graph_optimize_wrapper(this); } @@ -2981,6 +3080,60 @@ void FFModel::compile(LossType loss_type, } } + int degree = + config.data_parallelism_degree * config.tensor_parallelism_degree; + + for (int op_idx = 0; op_idx < operators.size(); op_idx++) { + Op const *op = operators[op_idx]; + // Skip weight operators + if (op->op_type == OP_WEIGHT) { + continue; + } + // Get machine views + std::vector machine_views; + for (int j = 0; j < config.data_parallelism_degree; j++) { + MachineView mv; + mv.device_type = MachineView::GPU; + mv.ndims = 1; + // mv.start_device_id = 0; + mv.stride[0] = 1; + int parallel_degree = 1; + for (int k = 0; k < op->outputs[0]->num_dims; k++) { + parallel_degree *= op->outputs[0]->dims[k].degree; + } + mv.dim[0] = parallel_degree; + mv.start_device_id = 0; + assert(mv == op->outputs[0]->machine_view); + machine_views.push_back(mv); + } + for (int i = 0; i < op->numOutputs; i++) { + ParallelTensor pt_base = op->outputs[i]; + + if (op->op_type == OP_REPLICATE) { + assert(op->numInputs == 1 && op->numOutputs == 1); + } + std::vector list; + bool found_parallel_tensor = false; + if (!found_parallel_tensor) { + for (int j = 0; j < config.data_parallelism_degree; j++) { + // Copy the metadata from pt_base to pt + ParallelTensor pt = new ParallelTensorBase(*pt_base); + pt->region = + runtime->create_logical_region(ctx, + pt_base->region.get_index_space(), + pt_base->region.get_field_space()); + pt->part = runtime->get_logical_partition( + ctx, pt->region, pt_base->part.get_index_partition()); + pt->machine_view = machine_views[j]; + Domain part_domain = + runtime->get_index_space_domain(ctx, pt_base->parallel_is); + assert(pt->machine_view.get_domain() == part_domain); + list.push_back(pt); + } + } + } + } + // Perform fusion optimizations if (config.perform_fusion) { fprintf(stderr, "Applying fusion optimizations during compilation...\n"); @@ -3488,34 +3641,34 @@ void FFIterationConfig::reset() { // Default Config Parameters struct DefaultConfig { - const static int epochs = 1; + static int const epochs = 1; // const static int iterations = 1; - const static int batchSize = 64; - const static bool profiling = false; + static int const batchSize = 64; + static bool const profiling = false; constexpr static float learningRate = 0.01f; constexpr static float weightDecay = 0.0001f; - const static size_t workSpaceSize = (size_t)2 * 1024 * 1024 * 1024; // 2GB - const static int numNodes = 1; - const static int workersPerNode = 0; - const static int cpusPerNode = 0; - const static size_t searchBudget = -1; - const static size_t simulatorWorkSpaceSize = 
+ static size_t const workSpaceSize = (size_t)2 * 1024 * 1024 * 1024; // 2GB + static int const numNodes = 1; + static int const workersPerNode = 0; + static int const cpusPerNode = 0; + static size_t const searchBudget = -1; + static size_t const simulatorWorkSpaceSize = (size_t)2 * 1024 * 1024 * 1024; // 2GB constexpr static float searchAlpha = 1.2f; - const static bool searchOverlapBackwardUpdate = false; - const static bool onlyDataParallel = false; - const static bool enableSampleParallel = true; - const static bool enableParameterParallel = false; - const static bool enableAttributeParallel = false; - const static bool enableInplaceOptimizations = false; - const static bool allowTensorOpMathConversion = false; - const static int machine_model_version = 0; - const static int simulator_segment_size = 16777216; // 16 MB - const static int simulator_max_num_segments = 1; - const static int base_optimize_threshold = 10; - const static bool enable_control_replication = true; + static bool const searchOverlapBackwardUpdate = false; + static bool const onlyDataParallel = false; + static bool const enableSampleParallel = true; + static bool const enableParameterParallel = false; + static bool const enableAttributeParallel = false; + static bool const enableInplaceOptimizations = false; + static bool const allowTensorOpMathConversion = false; + static int const machine_model_version = 0; + static int const simulator_segment_size = 16777216; // 16 MB + static int const simulator_max_num_segments = 1; + static int const base_optimize_threshold = 10; + static bool const enable_control_replication = true; // The default python data loader type is 2 to enable control replication - const static int python_data_loader_type = 2; + static int const python_data_loader_type = 2; }; FFConfig::FFConfig() { @@ -3535,6 +3688,8 @@ FFConfig::FFConfig() { search_overlap_backward_update = DefaultConfig::searchOverlapBackwardUpdate; computationMode = COMP_MODE_TRAINING; only_data_parallel = DefaultConfig::onlyDataParallel; + data_parallelism_degree = 1; + tensor_parallelism_degree = 1; enable_sample_parallel = DefaultConfig::enableSampleParallel; enable_parameter_parallel = DefaultConfig::enableParameterParallel; enable_attribute_parallel = DefaultConfig::enableAttributeParallel; @@ -3580,6 +3735,9 @@ FFConfig::FFConfig() { Runtime *runtime = Runtime::get_runtime(); lg_hlr = runtime; lg_ctx = Runtime::get_context(); + Rect<1> task_rect(Point<1>(0), Point<1>(workersPerNode * numNodes - 1)); + // Create an index space for tasks running on all GPUs + all_gpu_task_is = runtime->create_index_space(lg_ctx, task_rect); field_space = runtime->create_field_space(lg_ctx); } @@ -3640,6 +3798,16 @@ void FFConfig::parse_args(char **argv, int argc) { only_data_parallel = true; continue; } + // data parallelism degree + if (!strcmp(argv[i], "-data-parallelism-degree")) { + data_parallelism_degree = std::stoi(argv[++i]); + continue; + } + // tensor parallelism degree + if (!strcmp(argv[i], "-tensor-parallelism-degree")) { + tensor_parallelism_degree = std::stoi(argv[++i]); + continue; + } if ((!strcmp(argv[i], "--enable-parameter-parallel"))) { enable_parameter_parallel = true; continue; @@ -5353,11 +5521,13 @@ void register_flexflow_internal_tasks(Runtime *runtime, } } { - TaskVariantRegistrar registrar(ADAM_UNIFY_UPD_NCCL_TASK_ID, "Adam unified NCCL Update"); + TaskVariantRegistrar registrar(ADAM_UNIFY_UPD_NCCL_TASK_ID, + "Adam unified NCCL Update"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); 
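
The new -data-parallelism-degree and -tensor-parallelism-degree flags above follow the existing parse_args pattern: match the flag, consume the next argv entry with std::stoi, continue. The trimmed sketch below shows that parsing on its own; the -ll:gpu handling and the final sanity check (the combined degree has to fit on the available GPUs, mirroring the assertion in graph.cc) are illustrative simplifications, not the full FFConfig logic.

#include <cassert>
#include <cstring>
#include <string>

struct ParallelismConfig {
  int data_parallelism_degree = 1;   // defaults, as in FFConfig()
  int tensor_parallelism_degree = 1;
  int num_gpus = 1;                  // illustrative: numNodes * workersPerNode
};

void parse_args(ParallelismConfig &cfg, int argc, char **argv) {
  for (int i = 1; i < argc; i++) {
    if (!strcmp(argv[i], "-data-parallelism-degree")) {
      cfg.data_parallelism_degree = std::stoi(argv[++i]);
      continue;
    }
    if (!strcmp(argv[i], "-tensor-parallelism-degree")) {
      cfg.tensor_parallelism_degree = std::stoi(argv[++i]);
      continue;
    }
    if (!strcmp(argv[i], "-ll:gpu")) { // illustrative: GPUs per node
      cfg.num_gpus = std::stoi(argv[++i]);
      continue;
    }
  }
  // The combined degree must fit on the available devices.
  assert(cfg.data_parallelism_degree * cfg.tensor_parallelism_degree <=
         cfg.num_gpus);
}

int main() {
  ParallelismConfig cfg;
  char const *args[] = {"prog", "-ll:gpu", "4", "-tensor-parallelism-degree", "4"};
  parse_args(cfg, 5, const_cast<char **>(args));
  assert(cfg.data_parallelism_degree == 1 && cfg.tensor_parallelism_degree == 4);
  return 0;
}
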
registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( + Runtime::preregister_task_variant< + AdamOptimizer::nccl_unified_update_task>( registrar, "Adam unified NCCL Update Task"); } else { if (enable_control_replication) { diff --git a/src/runtime/optimizer.cc b/src/runtime/optimizer.cc index 2b90bc3f44..91a16e8db7 100644 --- a/src/runtime/optimizer.cc +++ b/src/runtime/optimizer.cc @@ -539,7 +539,6 @@ void AdamOptimizer::unified_update(std::vector const parameters) if (reservedWorkSpaceSize + p->get_volume() * sizeof(float) >= workSpaceSize) { break; } - printf("continueee\n"); reservedWorkSpaceSize += p->get_volume() * sizeof(float); parameters_num += 1; assert(p->sync_type == ParameterSyncType::NCCL); diff --git a/src/runtime/optimizer_kernel.cu b/src/runtime/optimizer_kernel.cu index 37c6e55ede..bfa57e9a8e 100644 --- a/src/runtime/optimizer_kernel.cu +++ b/src/runtime/optimizer_kernel.cu @@ -225,7 +225,6 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu( GenericTensorAccessorW *accWs, GenericTensorAccessorW *accVs, GenericTensorAccessorW *accMs) { - cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); // assert(op->reservedWorkSpaceSize < meta->handle.workSpaceSize); diff --git a/src/runtime/parallel_tensor.cc b/src/runtime/parallel_tensor.cc index 18318db3ce..b9f3dc89f7 100644 --- a/src/runtime/parallel_tensor.cc +++ b/src/runtime/parallel_tensor.cc @@ -660,14 +660,24 @@ bool ParallelTensorBase::set_tensor(FFModel const *ff, // TODO: check data type matches // TODO: Currently we use a task launch, change to index launch for NCCL // parameter - size_t volume = 1, num_replicas = 0; + size_t volume = 1, num_replicas = 1; if (sync_type == ParameterSyncType::NCCL) { - Domain domain = runtime->get_index_space_domain(ctx, parallel_is); - num_replicas = domain.get_volume(); + // Domain domain = runtime->get_index_space_domain(ctx, parallel_is); + // num_replicas = domain.get_volume(); + for (int i = 0; i < this->num_dims; i++) { + if (this->dims[i].is_replica_dim) { + num_replicas *= this->dims[i].size; + } + } } else if (sync_type == ParameterSyncType::PS) { num_replicas = 1; } else { - num_replicas = 1; + for (int i = 0; i < this->num_dims; i++) { + if (this->dims[i].is_replica_dim) { + num_replicas *= this->dims[i].size; + } + } + // num_replicas = 1; } for (size_t i = 0; i < dim_sizes.size(); i++) { volume = volume * dim_sizes[i]; From 355d4b493e926ab026f1c8f17b7a67dcaf224ab4 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Wed, 28 Feb 2024 23:46:35 -0500 Subject: [PATCH 18/19] try a fix --- src/runtime/optimizer_kernel.cu | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/runtime/optimizer_kernel.cu b/src/runtime/optimizer_kernel.cu index bfa57e9a8e..17adce94b8 100644 --- a/src/runtime/optimizer_kernel.cu +++ b/src/runtime/optimizer_kernel.cu @@ -238,7 +238,12 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu( cudaEventRecord(t_start1, stream); cudaEventRecord(t_start2, stream); - void *workSpace_ptr = meta->handle.workSpace; + void *allocate_ptr; + // = meta->handle.workSpace; + checkCUDA( + cudaMalloc(&allocate_ptr,meta->handle.workSpaceSize)); + + void *workSpace_ptr = allocate_ptr; for (int i = 0; i < op->parameters_num; i++) { cudaMemcpyAsync(workSpace_ptr, @@ -272,7 +277,8 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu( cudaEventDestroy(t_start2); printf("[optimizer] allreduce time = %.2lfms\n", elapsed); - workSpace_ptr = static_cast(meta->handle.workSpace); + // workSpace_ptr = 
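
The set_tensor change in parallel_tensor.cc above stops deriving the replica count from the launch-domain volume and instead multiplies the sizes of the dimensions flagged is_replica_dim, so only genuinely replicated dimensions contribute extra copies of the data. A small sketch of that computation follows; the Dim struct is an illustrative stand-in for ParallelDim.

#include <cassert>
#include <cstddef>
#include <vector>

// Illustrative stand-in for ParallelDim: a size plus a replica flag.
struct Dim {
  int size;
  bool is_replica_dim;
};

size_t num_replicas(std::vector<Dim> const &dims) {
  size_t replicas = 1; // start at 1 (not 0) so unreplicated tensors get one copy
  for (Dim const &d : dims) {
    if (d.is_replica_dim) {
      replicas *= d.size;
    }
  }
  return replicas;
}

int main() {
  // e.g. an [out, in] weight replicated across 4 tensor-parallel devices
  std::vector<Dim> dims = {{3072, false}, {768, false}, {4, true}};
  assert(num_replicas(dims) == 4);
  return 0;
}
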
static_cast(meta->handle.workSpace); + workSpace_ptr = static_cast(allocate_ptr); float alpha_t = op->alpha_t; float beta1_t = op->beta1_t; float beta2_t = op->beta2_t; @@ -304,6 +310,7 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu( checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); cudaEventDestroy(t_start); cudaEventDestroy(t_end); + checkCUDA(cudaFree(allocate_ptr)); printf("[optimizer] total time = %.2lfms\n", elapsed); } #endif From 8488ba023b21210d7e32e1760d6fbbc18afd3f8f Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Fri, 13 Sep 2024 10:14:15 -0400 Subject: [PATCH 19/19] fix hip --- include/flexflow/utils/cuda_helper.h | 5 ++++- include/flexflow/utils/hip_helper.h | 6 ++++++ src/dataloader/dataloader.cpp | 10 +++++++--- src/runtime/hip_helper.cpp | 20 ++++++++++++++++++++ 4 files changed, 37 insertions(+), 4 deletions(-) diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index c56ab473c0..d077995884 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -83,7 +83,10 @@ template __global__ void copy_kernel(DT *dst, const DT *src, Legion::coord_t size); template -__global__ void copy_kernel_with_replicate(DT *dst, const DT *src, Legion::coord_t origin_size, Legion::coord_t size); +__global__ void copy_kernel_with_replicate(DT *dst, + const DT *src, + Legion::coord_t origin_size, + Legion::coord_t size); template __global__ void add_kernel(T *data_ptr, T const *grad_ptr, size_t size); diff --git a/include/flexflow/utils/hip_helper.h b/include/flexflow/utils/hip_helper.h index 709e78f517..8c589305c2 100644 --- a/include/flexflow/utils/hip_helper.h +++ b/include/flexflow/utils/hip_helper.h @@ -82,6 +82,12 @@ __global__ void assign_kernel(DT *ptr, Legion::coord_t size, DT value); template __global__ void copy_kernel(DT *dst, const DT *src, Legion::coord_t size); +template +__global__ void copy_kernel_with_replicate(DT *dst, + const DT *src, + Legion::coord_t origin_size, + Legion::coord_t size); + template __global__ void add_kernel(T *data_ptr, T const *grad_ptr, size_t size); diff --git a/src/dataloader/dataloader.cpp b/src/dataloader/dataloader.cpp index 7d9ffc02b1..97668d705d 100644 --- a/src/dataloader/dataloader.cpp +++ b/src/dataloader/dataloader.cpp @@ -41,10 +41,12 @@ void SingleDataLoader::load_input(Task const *task, int num_dims = full_input_domain.get_dim(); assert(num_dims + 1 == batch_input_domain.get_dim()); // assert the leading replica dim has a degree of one - assert(batch_input_domain.hi()[num_dims] == - batch_input_domain.lo()[num_dims]); + // assert(batch_input_domain.hi()[num_dims] == + // batch_input_domain.lo()[num_dims]); coord_t batch_size = batch_input_domain.hi()[num_dims - 1] - batch_input_domain.lo()[num_dims - 1] + 1; + coord_t replicate_num = + batch_input_domain.hi()[num_dims] - batch_input_domain.lo()[num_dims] + 1; coord_t num_elements_per_batch = batch_input_domain.get_volume() / batch_size; // FIXME: currently assume continous indices assert(batch_size == meta->num_samples); @@ -61,13 +63,15 @@ void SingleDataLoader::load_input(Task const *task, // printf("ptr(%p, %p), idx0 %d nb_elements_per_batch %d, batch_size %d, // %d\n", acc_full_input.ptr, input_zc, start_idx, num_elements_per_batch, // batch_size, start_idx * num_elements_per_batch); - hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel
<DT>), + assert(batch_input_domain.get_volume() % replicate_num == 0); + hipLaunchKernelGGL(HIP_KERNEL_NAME(copy_kernel_with_replicate<DT>
), GET_BLOCKS(batch_input_domain.get_volume()), CUDA_NUM_THREADS, 0, stream, batch_input_ptr, input_zc, + batch_input_domain.get_volume() / replicate_num, batch_input_domain.get_volume()); checkCUDA(hipDeviceSynchronize()); } diff --git a/src/runtime/hip_helper.cpp b/src/runtime/hip_helper.cpp index ffdcf0dac1..8617cb2ef3 100644 --- a/src/runtime/hip_helper.cpp +++ b/src/runtime/hip_helper.cpp @@ -55,6 +55,16 @@ __global__ void copy_kernel(DT *dst, const DT *src, coord_t size) { } } +template +__global__ void copy_kernel_with_replicate(DT *dst, + const DT *src, + coord_t origin_size, + coord_t size) { + CUDA_KERNEL_LOOP(i, size) { + dst[i] = src[i % origin_size]; + } +} + template __global__ void reluBackward(DT *grad_ptr, const DT *output, size_t n) { CUDA_KERNEL_LOOP(i, n) { @@ -404,6 +414,16 @@ template __global__ void template __global__ void copy_kernel(float *dst, float const *src, coord_t size); + +template __global__ void copy_kernel_with_replicate(float *dst, + float const *src, + coord_t origin_size, + coord_t size); +template __global__ void copy_kernel_with_replicate( + int32_t *dst, int32_t const *src, coord_t origin_size, coord_t size); +template __global__ void copy_kernel_with_replicate( + int64_t *dst, int64_t const *src, coord_t origin_size, coord_t size); + template __global__ void copy_kernel(int32_t *dst, int32_t const *src, coord_t size); template __global__ void