Skip to content

Commit

Permalink
[CUDA] Add functions to dump bfloat16 tensors (#19266)
Browse files Browse the repository at this point in the history
### Description
GroupQueryAttention added BFloat16 support in
#19095, which causes a build
error when dumping is enabled. This change adds support for printing bfloat16 tensors to the
console.
  • Loading branch information
tianleiwu authored Jan 25, 2024
1 parent 5b06505 commit 2b285cd
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 40 deletions.
88 changes: 56 additions & 32 deletions onnxruntime/contrib_ops/cuda/transformers/dump_cuda_tensor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -203,33 +203,34 @@ void DumpGpuTensor(const char* name, const Tensor& tensor) {
DumpGpuTensor(nullptr, tensor, static_cast<int>(num_rows), static_cast<int>(row_size));
}

void CudaTensorConsoleDumper::Print(const char* name, const float* tensor, int dim0, int dim1) const {
void CudaTensorConsoleDumper::Print(const char* name, const size_t* tensor, int dim0, int dim1) const {
if (is_enabled_)
DumpGpuTensor<float>(name, tensor, dim0, dim1, true);
DumpGpuTensor<size_t>(name, tensor, dim0, dim1, true);
}

void CudaTensorConsoleDumper::Print(const char* name, const MLFloat16* tensor, int dim0, int dim1) const {
void CudaTensorConsoleDumper::Print(const char* name, const int32_t* tensor, int dim0, int dim1) const {
if (is_enabled_)
DumpGpuTensor<MLFloat16>(name, tensor, dim0, dim1, true);
DumpGpuTensor<int32_t>(name, tensor, dim0, dim1, true);
}

void CudaTensorConsoleDumper::Print(const char* name, const size_t* tensor, int dim0, int dim1) const {
void CudaTensorConsoleDumper::Print(const char* name, const int32_t* tensor, int dim0, int dim1, int dim2) const {
if (is_enabled_)
DumpGpuTensor<size_t>(name, tensor, dim0, dim1, true);
}

void CudaTensorConsoleDumper::Print(const char* name, const half* tensor, int dim0, int dim1) const {
Print(name, reinterpret_cast<const MLFloat16*>(tensor), dim0, dim1);
DumpGpuTensor<int32_t>(name, tensor, dim0, dim1, dim2, true);
}

// Dumps a dim0 x dim1 int64_t tensor through DumpGpuTensor<int64_t>.
// Does nothing when is_enabled_ is false.
void CudaTensorConsoleDumper::Print(const char* name, const int64_t* tensor, int dim0, int dim1) const {
  if (!is_enabled_) {
    return;
  }
  DumpGpuTensor<int64_t>(name, tensor, dim0, dim1, true);
}

void CudaTensorConsoleDumper::Print(const char* name, const int32_t* tensor, int dim0, int dim1) const {
void CudaTensorConsoleDumper::Print(const char* name, const int64_t* tensor, int dim0, int dim1, int dim2) const {
if (is_enabled_)
DumpGpuTensor<int32_t>(name, tensor, dim0, dim1, true);
DumpGpuTensor<int64_t>(name, tensor, dim0, dim1, dim2, true);
}

// Dumps a dim0 x dim1 float tensor through DumpGpuTensor<float>.
// Does nothing when is_enabled_ is false.
void CudaTensorConsoleDumper::Print(const char* name, const float* tensor, int dim0, int dim1) const {
  if (!is_enabled_) {
    return;
  }
  DumpGpuTensor<float>(name, tensor, dim0, dim1, true);
}

void CudaTensorConsoleDumper::Print(const char* name, const float* tensor, int dim0, int dim1, int dim2) const {
Expand All @@ -242,6 +243,11 @@ void CudaTensorConsoleDumper::Print(const char* name, const float* tensor, int d
DumpGpuTensor<float>(name, tensor, dim0, dim1, dim2, dim3, true);
}

// Dumps a dim0 x dim1 MLFloat16 tensor through DumpGpuTensor<MLFloat16>.
// Does nothing when is_enabled_ is false.
void CudaTensorConsoleDumper::Print(const char* name, const MLFloat16* tensor, int dim0, int dim1) const {
  if (!is_enabled_) {
    return;
  }
  DumpGpuTensor<MLFloat16>(name, tensor, dim0, dim1, true);
}

void CudaTensorConsoleDumper::Print(const char* name, const MLFloat16* tensor, int dim0, int dim1, int dim2) const {
if (is_enabled_)
DumpGpuTensor<MLFloat16>(name, tensor, dim0, dim1, dim2, true);
Expand All @@ -252,22 +258,31 @@ void CudaTensorConsoleDumper::Print(const char* name, const MLFloat16* tensor, i
DumpGpuTensor<MLFloat16>(name, tensor, dim0, dim1, dim2, dim3, true);
}

void CudaTensorConsoleDumper::Print(const char* name, const half* tensor, int dim0, int dim1, int dim2) const {
Print(name, reinterpret_cast<const MLFloat16*>(tensor), dim0, dim1, dim2);
// Dumps a dim0 x dim1 BFloat16 tensor through DumpGpuTensor<BFloat16>.
// Does nothing when is_enabled_ is false.
void CudaTensorConsoleDumper::Print(const char* name, const BFloat16* tensor, int dim0, int dim1) const {
  if (!is_enabled_) {
    return;
  }
  DumpGpuTensor<BFloat16>(name, tensor, dim0, dim1, true);
}

void CudaTensorConsoleDumper::Print(const char* name, const half* tensor, int dim0, int dim1, int dim2, int dim3) const {
Print(name, reinterpret_cast<const MLFloat16*>(tensor), dim0, dim1, dim2, dim3);
// Dumps a dim0 x dim1 x dim2 BFloat16 tensor through DumpGpuTensor<BFloat16>.
// Does nothing when is_enabled_ is false.
void CudaTensorConsoleDumper::Print(const char* name, const BFloat16* tensor, int dim0, int dim1, int dim2) const {
  if (!is_enabled_) {
    return;
  }
  DumpGpuTensor<BFloat16>(name, tensor, dim0, dim1, dim2, true);
}

void CudaTensorConsoleDumper::Print(const char* name, const int64_t* tensor, int dim0, int dim1, int dim2) const {
void CudaTensorConsoleDumper::Print(const char* name, const BFloat16* tensor, int dim0, int dim1, int dim2, int dim3) const {
if (is_enabled_)
DumpGpuTensor<int64_t>(name, tensor, dim0, dim1, dim2, true);
DumpGpuTensor<BFloat16>(name, tensor, dim0, dim1, dim2, dim3, true);
}

void CudaTensorConsoleDumper::Print(const char* name, const int32_t* tensor, int dim0, int dim1, int dim2) const {
if (is_enabled_)
DumpGpuTensor<int32_t>(name, tensor, dim0, dim1, dim2, true);
void CudaTensorConsoleDumper::Print(const char* name, const half* tensor, int dim0, int dim1) const {
Print(name, reinterpret_cast<const MLFloat16*>(tensor), dim0, dim1);
}

void CudaTensorConsoleDumper::Print(const char* name, const half* tensor, int dim0, int dim1, int dim2) const {
Print(name, reinterpret_cast<const MLFloat16*>(tensor), dim0, dim1, dim2);
}

void CudaTensorConsoleDumper::Print(const char* name, const half* tensor, int dim0, int dim1, int dim2, int dim3) const {
Print(name, reinterpret_cast<const MLFloat16*>(tensor), dim0, dim1, dim2, dim3);
}

void CudaTensorConsoleDumper::Print(const char* name, const Tensor& tensor) const {
Expand Down Expand Up @@ -301,43 +316,52 @@ void CudaTensorConsoleDumper::Print(const char* name, const std::string& value,
}

#else
void CudaTensorConsoleDumper::Print(const char*, const float*, int, int) const {
// No-op definition for the #else branch: size_t 2-D overload; all arguments are ignored.
void CudaTensorConsoleDumper::Print(const char*, const size_t*, int, int) const {
}

void CudaTensorConsoleDumper::Print(const char*, const MLFloat16*, int, int) const {
// No-op definition for the #else branch: int32_t 2-D overload; all arguments are ignored.
void CudaTensorConsoleDumper::Print(const char*, const int32_t*, int, int) const {
}

void CudaTensorConsoleDumper::Print(const char*, const size_t*, int, int) const {
// No-op definition for the #else branch: int32_t 3-D overload; all arguments are ignored.
void CudaTensorConsoleDumper::Print(const char*, const int32_t*, int, int, int) const {
}

void CudaTensorConsoleDumper::Print(const char*, const half*, int, int) const {
// No-op definition for the #else branch: int64_t 2-D overload; all arguments are ignored.
void CudaTensorConsoleDumper::Print(const char*, const int64_t*, int, int) const {
}

void CudaTensorConsoleDumper::Print(const char*, const int64_t*, int, int) const {
// No-op definition for the #else branch: int64_t 3-D overload; all arguments are ignored.
void CudaTensorConsoleDumper::Print(const char*, const int64_t*, int, int, int) const {
}

void CudaTensorConsoleDumper::Print(const char*, const int32_t*, int, int) const {
// No-op definition for the #else branch: float 2-D overload; all arguments are ignored.
void CudaTensorConsoleDumper::Print(const char*, const float*, int, int) const {
}

// No-op definition for the #else branch: float 3-D overload; all arguments are ignored.
void CudaTensorConsoleDumper::Print(const char*, const float*, int, int, int) const {
}

// No-op definition for the #else branch: float 4-D overload; all arguments are ignored.
void CudaTensorConsoleDumper::Print(const char*, const float*, int, int, int, int) const {
}

// No-op definition for the #else branch: MLFloat16 2-D overload; all arguments are ignored.
void CudaTensorConsoleDumper::Print(const char*, const MLFloat16*, int, int) const {
}

// No-op definition for the #else branch: MLFloat16 3-D overload; all arguments are ignored.
void CudaTensorConsoleDumper::Print(const char*, const MLFloat16*, int, int, int) const {
}

void CudaTensorConsoleDumper::Print(const char*, const half*, int, int, int) const {
// No-op definition for the #else branch: MLFloat16 4-D overload; all arguments are ignored.
void CudaTensorConsoleDumper::Print(const char*, const MLFloat16*, int, int, int, int) const {
}

void CudaTensorConsoleDumper::Print(const char*, const int64_t*, int, int, int) const {
// No-op definition for the #else branch: BFloat16 2-D overload; all arguments are ignored.
void CudaTensorConsoleDumper::Print(const char*, const BFloat16*, int, int) const {
}

void CudaTensorConsoleDumper::Print(const char*, const int32_t*, int, int, int) const {
// No-op definition for the #else branch: BFloat16 3-D overload; all arguments are ignored.
void CudaTensorConsoleDumper::Print(const char*, const BFloat16*, int, int, int) const {
}

void CudaTensorConsoleDumper::Print(const char*, const float*, int, int, int, int) const {
// No-op definition for the #else branch: BFloat16 4-D overload; all arguments are ignored.
void CudaTensorConsoleDumper::Print(const char*, const BFloat16*, int, int, int, int) const {
}

void CudaTensorConsoleDumper::Print(const char*, const MLFloat16*, int, int, int, int) const {
// No-op definition for the #else branch: half 2-D overload; all arguments are ignored.
void CudaTensorConsoleDumper::Print(const char*, const half*, int, int) const {
}

// No-op definition for the #else branch: half 3-D overload; all arguments are ignored.
void CudaTensorConsoleDumper::Print(const char*, const half*, int, int, int) const {
}

void CudaTensorConsoleDumper::Print(const char*, const half*, int, int, int, int) const {
Expand Down
27 changes: 19 additions & 8 deletions onnxruntime/contrib_ops/cuda/transformers/dump_cuda_tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,31 @@ class CudaTensorConsoleDumper : public onnxruntime::contrib::transformers::ICons
public:
CudaTensorConsoleDumper() = default;
virtual ~CudaTensorConsoleDumper() {}
void Print(const char* name, const float* tensor, int dim0, int dim1) const override;
void Print(const char* name, const MLFloat16* tensor, int dim0, int dim1) const override;

void Print(const char* name, const size_t* tensor, int dim0, int dim1) const override;
void Print(const char* name, const half* tensor, int dim0, int dim1) const;
void Print(const char* name, const int64_t* tensor, int dim0, int dim1) const override;

void Print(const char* name, const int32_t* tensor, int dim0, int dim1) const override;
void Print(const char* name, const int32_t* tensor, int dim0, int dim1, int dim2) const override;

void Print(const char* name, const int64_t* tensor, int dim0, int dim1) const override;
void Print(const char* name, const int64_t* tensor, int dim0, int dim1, int dim2) const override;

void Print(const char* name, const float* tensor, int dim0, int dim1) const override;
void Print(const char* name, const float* tensor, int dim0, int dim1, int dim2) const override;
void Print(const char* name, const float* tensor, int dim0, int dim1, int dim2, int dim3) const;
void Print(const char* name, const MLFloat16* tensor, int dim0, int dim1, int dim2) const override;
void Print(const char* name, const MLFloat16* tensor, int dim0, int dim1, int dim2, int dim3) const;

void Print(const char* name, const half* tensor, int dim0, int dim1) const;
void Print(const char* name, const half* tensor, int dim0, int dim1, int dim2) const;
void Print(const char* name, const half* tensor, int dim0, int dim1, int dim2, int dim3) const;
void Print(const char* name, const int64_t* tensor, int dim0, int dim1, int dim2) const override;
void Print(const char* name, const int32_t* tensor, int dim0, int dim1, int dim2) const override;

void Print(const char* name, const MLFloat16* tensor, int dim0, int dim1) const override;
void Print(const char* name, const MLFloat16* tensor, int dim0, int dim1, int dim2) const override;
void Print(const char* name, const MLFloat16* tensor, int dim0, int dim1, int dim2, int dim3) const;

void Print(const char* name, const BFloat16* tensor, int dim0, int dim1) const;
void Print(const char* name, const BFloat16* tensor, int dim0, int dim1, int dim2) const;
void Print(const char* name, const BFloat16* tensor, int dim0, int dim1, int dim2, int dim3) const;

void Print(const char* name, const Tensor& value) const override;
void Print(const char* name, const OrtValue& value) const override;
void Print(const char* name, int index, bool end_line) const override;
Expand Down

0 comments on commit 2b285cd

Please sign in to comment.