Skip to content

Commit

Permalink
初步的Tensor并行
Browse files Browse the repository at this point in the history
  • Loading branch information
ztxz16 committed Aug 8, 2024
1 parent 3e9a49d commit aa577f7
Show file tree
Hide file tree
Showing 8 changed files with 578 additions and 2 deletions.
4 changes: 4 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,10 @@ if (USE_CUDA)
include_directories(include/devices/cuda)
#message(${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
set(FASTLLM_CUDA_SOURCES src/devices/cuda/cudadevice.cpp src/devices/cuda/cudadevicebatch.cpp src/devices/cuda/fastllm-cuda.cu)

include_directories(include/devices/multicuda)
set(FASTLLM_CUDA_SOURCES ${FASTLLM_CUDA_SOURCES} src/devices/multicuda/multicudadevice.cpp src/devices/multicuda/fastllm-multicuda.cu)

set(FASTLLM_LINKED_LIBS ${FASTLLM_LINKED_LIBS} cublas)
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
endif()
Expand Down
20 changes: 20 additions & 0 deletions include/devices/multicuda/fastllm-multicuda.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
//
// Created by huangyuyang on 8/2/24.
//
// Public entry points of the multi-GPU (tensor-parallel) CUDA backend.
//
// Fix: added an include guard for consistency with multicudadevice.h
// (repeated inclusion would otherwise re-declare everything below).

#ifndef FASTLLM_MULTICUDA_CUH
#define FASTLLM_MULTICUDA_CUH

#include "fastllm.h"

// Free memory of each visible CUDA device, in bytes.
// NOTE(review): presumably used to balance weight partitions across GPUs —
// confirm against the implementation in fastllm-multicuda.cu.
std::vector <long long> FastllmCudaGetFreeSizes();

#ifdef __cplusplus
extern "C" {
#endif

// NOTE(review): these signatures use C++ types (std::vector, fastllm::Data&),
// so extern "C" only suppresses name mangling; the header stays C++-only.

// Select the set of CUDA devices the multi-GPU backend will use.
void FastllmMultiCudaSetDevice(std::vector <int> ids);

// Tensor-parallel matmul entry points: output(n x k) = input(n x m) * weight + bias.
// Half variant takes FLOAT16 activations; the other takes FLOAT32. Both return
// true on success.
bool FastllmMultiCudaHalfMatMul(const fastllm::Data &input, fastllm::Data &weight, const fastllm::Data &bias, fastllm::Data &output, int n, int m, int k);
bool FastllmMultiCudaMatMul(const fastllm::Data &input, fastllm::Data &weight, const fastllm::Data &bias, fastllm::Data &output, int n, int m, int k);

#ifdef __cplusplus
}
#endif

#endif // FASTLLM_MULTICUDA_CUH
27 changes: 27 additions & 0 deletions include/devices/multicuda/multicudadevice.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
//
// Created by huangyuyang on 8/2/24.
//

#ifndef FASTLLM_MULTICUDADEVICE_H
#define FASTLLM_MULTICUDADEVICE_H

#include "device.h"

namespace fastllm {
    // Device backend that runs ops across multiple CUDA GPUs (tensor parallelism).
    // NOTE(review): inherits with class-default (private) access, apparently the
    // project convention for device classes; the executor converts instances to
    // BaseDevice* with a C-style cast — confirm this matches cudadevice.h.
    class MultiCudaDevice : BaseDevice {
    public:
        MultiCudaDevice ();

        bool Malloc (void **ret, size_t size); // Allocate a buffer of `size` bytes; result stored in *ret
        bool Free(void *ret); // Release a buffer previously returned by Malloc

        // Transfers between host memory and this device's memory.
        bool CopyDataToCPU(void *dst, void *src, size_t size);
        bool CopyDataFromCPU(void *dst, void *src, size_t size);
    };

    // Linear op that splits the matmul across several GPUs. Derives from the
    // single-GPU CudaLinearOp, replacing only Run; other hooks are inherited.
    class MultiCudaLinearOp : CudaLinearOp {
        void Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams);
    };
}

#endif //FASTLLM_MULTICUDADEVICE_H
1 change: 0 additions & 1 deletion src/devices/cuda/fastllm-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -406,7 +406,6 @@ __global__ void FastllmCudaHalf2FloatKernel(half* a, float *b, int len) {
}

__global__ void FastllmCudaBF162FloatKernel(uint16_t* a, float *b, int len) {
return;
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < len) {
((uint32_t*)b)[idx] = a[idx] << 16;
Expand Down
438 changes: 438 additions & 0 deletions src/devices/multicuda/fastllm-multicuda.cu

Large diffs are not rendered by default.

82 changes: 82 additions & 0 deletions src/devices/multicuda/multicudadevice.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
//
// Created by huangyuyang on 8/2/24.
//

#include "devices/cpu/cpudevice.h"
#include "devices/cuda/cudadevice.h"
#include "devices/cuda/fastllm-cuda.cuh"
#include "devices/multicuda/multicudadevice.h"

#include "fastllm-multicuda.cuh"

#include "utils.h"

namespace fastllm {
MultiCudaDevice::MultiCudaDevice() {
    // Identifier the executor uses to route work to this backend.
    this->deviceType = "multicuda";

    // Only Linear is tensor-parallel for now; register its multi-GPU operator.
    auto *linearOp = new MultiCudaLinearOp();
    this->ops["Linear"] = (BaseOperator*) linearOp;
}

bool MultiCudaDevice::Malloc(void **ret, size_t size) {
    // Delegate to the shared CUDA allocator and hand the buffer back via *ret.
    void *buffer = FastllmCudaMalloc(size);
    *ret = buffer;
    return true;
}

bool MultiCudaDevice::Free(void *ret) {
    // Hand the buffer back to the shared CUDA allocator; always reports success.
    FastllmCudaFree(ret);
    return true;
}

bool MultiCudaDevice::CopyDataFromCPU(void *dst, void *src, size_t size) {
    // Host -> device transfer through the shared CUDA copy helper.
    FastllmCudaCopyFromHostToDevice(dst, src, size);
    return true;
}

bool MultiCudaDevice::CopyDataToCPU(void *dst, void *src, size_t size) {
    // Device -> host transfer through the shared CUDA copy helper.
    FastllmCudaCopyFromDeviceToHost(dst, src, size);
    return true;
}

// Execute a Linear (output = input * weight^T + bias) by dispatching on the
// input/weight dtypes to multi-GPU or single-GPU CUDA kernels.
// NOTE(review): "bias" is dereferenced unconditionally — assumes the executor
// always supplies a bias entry (possibly with empty dims); confirm against the
// single-GPU CudaLinearOp.
void MultiCudaLinearOp::Run(const std::string &opType, const DataDict &datas, const FloatDict &floatParams, const IntDict &intParams) {
    // auto st = std::chrono::system_clock::now();
    Data &input = *(datas.find("input")->second);
    Data &output = *(datas.find("output")->second);
    Data &weight = *(datas.find("weight")->second);
    Data &bias = *(datas.find("bias")->second);

    output.Allocate();
    // n: number of rows (all leading dims folded), m: input feature dim,
    // k: output feature dim.
    int n = input.Count(0) / input.dims.back();
    int m = input.dims.back();
    int k = output.dims.back();

    if (input.dataType == DataType::FLOAT16) {
        // FP16 activations: only these weight quantizations have a
        // tensor-parallel half kernel.
        if (weight.dataType == DataType::FLOAT16 ||
            weight.dataType == DataType::INT8 ||
            weight.dataType == DataType::INT4_NOZERO ||
            weight.dataType == DataType::INT4_GROUP) {
            FastllmMultiCudaHalfMatMul(input, weight, bias, output, n, m, k);
        } else {
            ErrorInFastLLM("Linear error: unsupport weight's dataType.\n");
        }
    } else if (input.dataType == DataType::FLOAT32) {
        if (weight.dataType == DataType::FLOAT32) {
            // FP32 x FP32 falls back to the (presumably single-device) kernel —
            // no multi-GPU variant is called here; verify this is intentional.
            FastllmCudaMatMulFloat32(input, weight, bias, output, n, m, k);
        } else if (weight.dataType == DataType::FLOAT16 ||
                    weight.dataType == DataType::INT8 ||
                    weight.dataType == DataType::INT4_NOZERO ||
                    weight.dataType == DataType::INT4_GROUP) {
            FastllmMultiCudaMatMul(input, weight, bias, output, n, m, k);
        } else if (weight.dataType == DataType::INT4) {
            // Plain INT4 (with zero point) also uses the single-device path.
            FastllmCudaMatMulFloatInt4(input, weight, bias, output, n, m, k);
        } else {
            ErrorInFastLLM("Linear error: unsupport weight's dataType.\n");
        }
    } else {
        ErrorInFastLLM("Linear error: unsupport input's dataType.\n");
    }
    // float spend = GetSpan(st, std::chrono::system_clock::now());
    // float gops = (float)n * m * k / spend / 1e9;
    // printf("n = %d, m = %d, k = %d, spend %f s, gops = %f\n", n, m, k, spend, gops);
}
}
6 changes: 6 additions & 0 deletions src/executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
#ifdef USE_CUDA
#include "devices/cuda/cudadevice.h"
#include "devices/cuda/fastllm-cuda.cuh"
#include "devices/multicuda/multicudadevice.h"
#include "devices/multicuda/fastllm-multicuda.cuh"
#endif

#ifdef USE_TFACC
Expand All @@ -22,6 +24,7 @@ namespace fastllm {
this->devices.clear();
#ifdef USE_CUDA
this->devices.push_back((BaseDevice*) new CudaDevice());
this->devices.push_back((BaseDevice*) new MultiCudaDevice());
#endif
#ifdef USE_TFACC
this->devices.push_back((BaseDevice*) new TfaccDevice());
Expand Down Expand Up @@ -96,6 +99,9 @@ namespace fastllm {
if (device->deviceType == "cuda" && device->deviceIds.size() > 0) {
FastllmCudaSetDevice(device->deviceIds[0]);
}
if (device->deviceType == "multicuda" && device->deviceIds.size() > 0) {
FastllmMultiCudaSetDevice(device->deviceIds);
}
#endif
for (auto &it: datas) {
if (intParams.find(it.first + "___batch") != intParams.end()) {
Expand Down
2 changes: 1 addition & 1 deletion src/fastllm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1102,7 +1102,7 @@ namespace fastllm {

void Data::ToDevice(void *device) {
BaseDevice *dev = (BaseDevice*)device;
if (dev->deviceType == "cuda") {
if (dev->deviceType == "cuda" || dev->deviceType == "multicuda") {
this->ToDevice(DataDevice::CUDA, dev->deviceIds);
} else {
this->ToDevice(DataDevice::CPU, dev->deviceIds);
Expand Down

0 comments on commit aa577f7

Please sign in to comment.