Skip to content

Commit

Permalink
初步的Tensor并行
Browse files Browse the repository at this point in the history
  • Loading branch information
ztxz16 committed Aug 8, 2024
1 parent 3e9a49d commit aa577f7
Show file tree
Hide file tree
Showing 8 changed files with 578 additions and 2 deletions.
4 changes: 4 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,10 @@ if (USE_CUDA)
include_directories(include/devices/cuda)
#message(${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
set(FASTLLM_CUDA_SOURCES src/devices/cuda/cudadevice.cpp src/devices/cuda/cudadevicebatch.cpp src/devices/cuda/fastllm-cuda.cu)

include_directories(include/devices/multicuda)
set(FASTLLM_CUDA_SOURCES ${FASTLLM_CUDA_SOURCES} src/devices/multicuda/multicudadevice.cpp src/devices/multicuda/fastllm-multicuda.cu)

set(FASTLLM_LINKED_LIBS ${FASTLLM_LINKED_LIBS} cublas)
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
endif()
Expand Down
20 changes: 20 additions & 0 deletions include/devices/multicuda/fastllm-multicuda.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
//
// Created by huangyuyang on 8/2/24.
//
// Public entry points of the multi-GPU (tensor-parallel) CUDA backend.
//
// Fix: added an include guard for consistency with multicudadevice.h
// (repeated inclusion would otherwise re-declare everything below).

#ifndef FASTLLM_MULTICUDA_CUH
#define FASTLLM_MULTICUDA_CUH

#include "fastllm.h"

// Free memory of each visible CUDA device, in bytes.
// NOTE(review): presumably used to balance weight partitions across GPUs —
// confirm against the implementation in fastllm-multicuda.cu.
std::vector <long long> FastllmCudaGetFreeSizes();

#ifdef __cplusplus
extern "C" {
#endif

// NOTE(review): these signatures use C++ types (std::vector, fastllm::Data&),
// so extern "C" only suppresses name mangling; the header stays C++-only.

// Select the set of CUDA devices the multi-GPU backend will use.
void FastllmMultiCudaSetDevice(std::vector <int> ids);

// Tensor-parallel matmul entry points: output(n x k) = input(n x m) * weight + bias.
// Half variant takes FLOAT16 activations; the other takes FLOAT32. Both return
// true on success.
bool FastllmMultiCudaHalfMatMul(const fastllm::Data &input, fastllm::Data &weight, const fastllm::Data &bias, fastllm::Data &output, int n, int m, int k);
bool FastllmMultiCudaMatMul(const fastllm::Data &input, fastllm::Data &weight, const fastllm::Data &bias, fastllm::Data &output, int n, int m, int k);

#ifdef __cplusplus
}
#endif

#endif // FASTLLM_MULTICUDA_CUH
27 changes: 27 additions & 0 deletions include/devices/multicuda/multicudadevice.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
//
// Created by huangyuyang on 8/2/24.
//

#ifndef FASTLLM_MULTICUDADEVICE_H
#define FASTLLM_MULTICUDADEVICE_H

#include "device.h"

namespace fastllm {
    // Device backend that runs ops across multiple CUDA GPUs (tensor parallelism).
    // NOTE(review): inherits with class-default (private) access, apparently the
    // project convention for device classes; the executor converts instances to
    // BaseDevice* with a C-style cast — confirm this matches cudadevice.h.
    class MultiCudaDevice : BaseDevice {
    public:
        MultiCudaDevice ();

        bool Malloc (void **ret, size_t size); // Allocate a buffer of `size` bytes; result stored in *ret
        bool Free(void *ret); // Release a buffer previously returned by Malloc

        // Transfers between host memory and this device's memory.
        bool CopyDataToCPU(void *dst, void *src, size_t size);
        bool CopyDataFromCPU(void *dst, void *src, size_t size);
    };

    // Linear op that splits the matmul across several GPUs. Derives from the
    // single-GPU CudaLinearOp, replacing only Run; other hooks are inherited.
    class MultiCudaLinearOp : CudaLinearOp {
        void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
    };
}

#endif //FASTLLM_MULTICUDADEVICE_H
1 change: 0 additions & 1 deletion src/devices/cuda/fastllm-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -406,7 +406,6 @@ __global__ void FastllmCudaHalf2FloatKernel(half* a, float *b, int len) {
}

__global__ void FastllmCudaBF162FloatKernel(uint16_t* a, float *b, int len) {
return;
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < len) {
((uint32_t*)b)[idx] = a[idx] << 16;
Expand Down
438 changes: 438 additions & 0 deletions src/devices/multicuda/fastllm-multicuda.cu

Large diffs are not rendered by default.

82 changes: 82 additions & 0 deletions src/devices/multicuda/multicudadevice.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
//
// Created by huangyuyang on 8/2/24.
//

#include "devices/cpu/cpudevice.h"
#include "devices/cuda/cudadevice.h"
#include "devices/cuda/fastllm-cuda.cuh"
#include "devices/multicuda/multicudadevice.h"

#include "fastllm-multicuda.cuh"

#include "utils.h"

namespace fastllm {
MultiCudaDevice::MultiCudaDevice() {
    // Identifier the executor uses to route work to this backend.
    this->deviceType = "multicuda";

    // Only Linear is tensor-parallel for now; register its multi-GPU operator.
    auto *linearOp = new MultiCudaLinearOp();
    this->ops["Linear"] = (BaseOperator*) linearOp;
}

bool MultiCudaDevice::Malloc(void **ret, size_t size) {
    // Delegate to the shared CUDA allocator and hand the buffer back via *ret.
    void *buffer = FastllmCudaMalloc(size);
    *ret = buffer;
    return true;
}

bool MultiCudaDevice::Free(void *ret) {
    // Hand the buffer back to the shared CUDA allocator; always reports success.
    FastllmCudaFree(ret);
    return true;
}

bool MultiCudaDevice::CopyDataFromCPU(void *dst, void *src, size_t size) {
    // Host -> device transfer through the shared CUDA copy helper.
    FastllmCudaCopyFromHostToDevice(dst, src, size);
    return true;
}

bool MultiCudaDevice::CopyDataToCPU(void *dst, void *src, size_t size) {
    // Device -> host transfer through the shared CUDA copy helper.
    FastllmCudaCopyFromDeviceToHost(dst, src, size);
    return true;
}

// Execute a Linear (output = input * weight^T + bias) by dispatching on the
// input/weight dtypes to multi-GPU or single-GPU CUDA kernels.
// NOTE(review): "bias" is dereferenced unconditionally — assumes the executor
// always supplies a bias entry (possibly with empty dims); confirm against the
// single-GPU CudaLinearOp.
void MultiCudaLinearOp::Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams) {
    // auto st = std::chrono::system_clock::now();
    Data &input = *(datas.find("input")->second);
    Data &output = *(datas.find("output")->second);
    Data &weight = *(datas.find("weight")->second);
    Data &bias = *(datas.find("bias")->second);

    output.Allocate();
    // n: number of rows (all leading dims folded), m: input feature dim,
    // k: output feature dim.
    int n = input.Count(0) / input.dims.back();
    int m = input.dims.back();
    int k = output.dims.back();

    if (input.dataType == DataType::FLOAT16) {
        // FP16 activations: only these weight quantizations have a
        // tensor-parallel half kernel.
        if (weight.dataType == DataType::FLOAT16 ||
            weight.dataType == DataType::INT8 ||
            weight.dataType == DataType::INT4_NOZERO ||
            weight.dataType == DataType::INT4_GROUP) {
            FastllmMultiCudaHalfMatMul(input, weight, bias, output, n, m, k);
        } else {
            ErrorInFastLLM("Linear error: unsupport weight's dataType.\n");
        }
    } else if (input.dataType == DataType::FLOAT32) {
        if (weight.dataType == DataType::FLOAT32) {
            // FP32 x FP32 falls back to the (presumably single-device) kernel —
            // no multi-GPU variant is called here; verify this is intentional.
            FastllmCudaMatMulFloat32(input, weight, bias, output, n, m, k);
        } else if (weight.dataType == DataType::FLOAT16 ||
                    weight.dataType == DataType::INT8 ||
                    weight.dataType == DataType::INT4_NOZERO ||
                    weight.dataType == DataType::INT4_GROUP) {
            FastllmMultiCudaMatMul(input, weight, bias, output, n, m, k);
        } else if (weight.dataType == DataType::INT4) {
            // Plain INT4 (with zero point) also uses the single-device path.
            FastllmCudaMatMulFloatInt4(input, weight, bias, output, n, m, k);
        } else {
            ErrorInFastLLM("Linear error: unsupport weight's dataType.\n");
        }
    } else {
        ErrorInFastLLM("Linear error: unsupport input's dataType.\n");
    }
    // float spend = GetSpan(st, std::chrono::system_clock::now());
    // float gops = (float)n * m * k / spend / 1e9;
    // printf("n = %d, m = %d, k = %d, spend %f s, gops = %f\n", n, m, k, spend, gops);
}
}
6 changes: 6 additions & 0 deletions src/executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
#ifdef USE_CUDA
#include "devices/cuda/cudadevice.h"
#include "devices/cuda/fastllm-cuda.cuh"
#include "devices/multicuda/multicudadevice.h"
#include "devices/multicuda/fastllm-multicuda.cuh"
#endif

#ifdef USE_TFACC
Expand All @@ -22,6 +24,7 @@ namespace fastllm {
this->devices.clear();
#ifdef USE_CUDA
this->devices.push_back((BaseDevice*) new CudaDevice());
this->devices.push_back((BaseDevice*) new MultiCudaDevice());
#endif
#ifdef USE_TFACC
this->devices.push_back((BaseDevice*) new TfaccDevice());
Expand Down Expand Up @@ -96,6 +99,9 @@ namespace fastllm {
if (device->deviceType == "cuda" && device->deviceIds.size() > 0) {
FastllmCudaSetDevice(device->deviceIds[0]);
}
if (device->deviceType == "multicuda" && device->deviceIds.size() > 0) {
FastllmMultiCudaSetDevice(device->deviceIds);
}
#endif
for (auto &it: datas) {
if (intParams.find(it.first + "___batch") != intParams.end()) {
Expand Down
2 changes: 1 addition & 1 deletion src/fastllm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1102,7 +1102,7 @@ namespace fastllm {

void Data::ToDevice(void *device) {
BaseDevice *dev = (BaseDevice*)device;
if (dev->deviceType == "cuda") {
if (dev->deviceType == "cuda" || dev->deviceType == "multicuda") {
this->ToDevice(DataDevice::CUDA, dev->deviceIds);
} else {
this->ToDevice(DataDevice::CPU, dev->deviceIds);
Expand Down

0 comments on commit aa577f7

Please sign in to comment.