Add cpu logits conversion from fp16 to fp32 (#260)
A convenience addition for models that run on devices other than the CPU and
therefore don't output fp32.

Prior to this change, there would be an error if the model output fp16 and the
scoring device was the CPU.
RyanUnderhill authored Apr 12, 2024
1 parent 91eab7b commit e295090
Showing 3 changed files with 19 additions and 12 deletions.
9 changes: 2 additions & 7 deletions src/models/logits.cpp
@@ -9,9 +9,6 @@ Logits::Logits(const Model& model, State& state)
       state_{state},
       shape_{static_cast<int64_t>(state_.params_->batch_size) * state_.params_->search.num_beams, state_.params_->sequence_length, state_.params_->vocab_size},
       type_{model_.session_info_->GetOutputDataType(model_.config_->model.decoder.outputs.logits)} {
-  if (model_.device_type_ == DeviceType::CPU && type_ != Ort::TypeToTensorType<float>::type)
-    throw std::runtime_error("Model logits_type can only be float32 on CPU");
-
   auto logits_tensor = OrtValue::CreateTensor(*model.allocator_device_, shape_, type_);
   if (type_ == Ort::TypeToTensorType<float>::type)
     value32_ = std::move(logits_tensor);
@@ -22,11 +19,9 @@
 RoamingArray<float> Logits::Get() {
   size_t element_count = shape_[0] * shape_[1] * shape_[2];
 
-#if USE_CUDA
   // Convert from float16 to float32 if necessary
-  if (model_.device_type_ == DeviceType::CUDA && type_ == Ort::TypeToTensorType<Ort::Float16_t>::type)
-    ConvertFp16ToFp32(*model_.allocator_device_, model_.cuda_stream_, *value16_, value32_);
-#endif
+  if (type_ == Ort::TypeToTensorType<Ort::Float16_t>::type)
+    ConvertFp16ToFp32(*model_.allocator_device_, *value16_, value32_, model_.device_type_, model_.cuda_stream_);
 
   // First iteration? Then copy the logits over to a {batch_beams, 1, vocab_size} tensor
   // We'll reuse this tensor for all future iterations
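Net effect of this hunk: Logits::Get no longer guards the conversion behind #if USE_CUDA at the call site. Whenever the model's logits come back as fp16, it calls ConvertFp16ToFp32 unconditionally and lets that helper dispatch on the device type (see the switch added to model.cpp below), which is what makes the constructor's old float32-only CPU check removable.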
20 changes: 16 additions & 4 deletions src/models/model.cpp
@@ -363,8 +363,7 @@ std::shared_ptr<GeneratorParams> CreateGeneratorParams() {
   return std::make_shared<GeneratorParams>();
 }
 
-#if USE_CUDA
-void ConvertFp16ToFp32(OrtAllocator& allocator, cudaStream_t stream, OrtValue& in, std::unique_ptr<OrtValue>& p_out) {
+void ConvertFp16ToFp32(OrtAllocator& allocator, OrtValue& in, std::unique_ptr<OrtValue>& p_out, DeviceType device_type, cudaStream_t stream) {
   auto shape_info = in.GetTensorTypeAndShapeInfo();
   auto shape = shape_info->GetShape();
   assert(shape_info->GetElementType() == Ort::TypeToTensorType<Ort::Float16_t>::type);
@@ -383,10 +382,23 @@ void ConvertFp16ToFp32(OrtAllocator& allocator, cudaStream_t stream, OrtValue& in, std::unique_ptr<OrtValue>& p_out) {
   auto* fp16 = in.GetTensorData<uint16_t>();
   auto* fp32 = p_out->GetTensorMutableData<float>();
 
-  cuda::LaunchFp16ToFp32(fp16, fp32, count, stream);
-}
+  switch (device_type) {
+    case DeviceType::CPU:
+      for (int i = 0; i < count; i++)
+        fp32[i] = Float16ToFloat32(fp16[i]);
+      break;
+
+#ifdef USE_CUDA
+    case DeviceType::CUDA:
+      cuda::LaunchFp16ToFp32(fp16, fp32, count, stream);
+      break;
 #endif
+
+    default:
+      throw std::runtime_error("ConvertFp16ToFp32 - Unsupported device type");
+  }
+}
 
 size_t GetOrtTypeSize(ONNXTensorElementDataType type) {
   switch (type) {
     case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT:
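The new CPU branch calls a scalar Float16ToFloat32 helper whose definition is not part of this diff. For reference, here is a minimal sketch of an IEEE 754 binary16-to-binary32 widening of that shape; this is an illustration only, not the repository's actual helper:

#include <cstdint>
#include <cstring>

// Sketch only: the repository's Float16ToFloat32 is not shown in this commit.
inline float Float16ToFloat32(uint16_t v) {
  uint32_t sign = static_cast<uint32_t>(v & 0x8000u) << 16;  // sign -> bit 31
  uint32_t exponent = (v >> 10) & 0x1Fu;                     // 5-bit exponent field
  uint32_t mantissa = v & 0x3FFu;                            // 10-bit mantissa field

  uint32_t bits;
  if (exponent == 0x1Fu) {
    bits = sign | 0x7F800000u | (mantissa << 13);            // inf / NaN
  } else if (exponent != 0) {
    bits = sign | ((exponent - 15 + 127) << 23) | (mantissa << 13);  // normal value
  } else if (mantissa == 0) {
    bits = sign;                                             // signed zero
  } else {
    exponent = 127 - 15 + 1;                                 // subnormal half: renormalize
    while ((mantissa & 0x400u) == 0) {
      mantissa <<= 1;
      --exponent;
    }
    bits = sign | (exponent << 23) | ((mantissa & 0x3FFu) << 13);
  }

  float out;
  std::memcpy(&out, &bits, sizeof(out));                     // bit-exact reinterpretation
  return out;
}

A scalar loop over the logits is linear in batch_beams * vocab_size and runs once per generation step, so on CPU it is rarely a bottleneck next to the model's own forward pass.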
2 changes: 1 addition & 1 deletion src/models/model.h
@@ -7,7 +7,7 @@ namespace Generators {
 
 struct Tokenizer;
 
-void ConvertFp16ToFp32(OrtAllocator& allocator, cudaStream_t stream, OrtValue& in, std::unique_ptr<OrtValue>& p_out);
+void ConvertFp16ToFp32(OrtAllocator& allocator, OrtValue& in, std::unique_ptr<OrtValue>& p_out, DeviceType device_type, cudaStream_t stream);
 
 struct State {
   State(const GeneratorParams& params);
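For illustration, a hypothetical call under the reordered signature (variable names are assumptions modeled on the logits.cpp hunk above; since the declaration is no longer wrapped in #if USE_CUDA, non-CUDA builds presumably see a cudaStream_t stand-in type):

std::unique_ptr<OrtValue> fp32_logits;       // created by the helper if needed
ConvertFp16ToFp32(*model.allocator_device_,  // allocator for the fp32 output
                  *fp16_logits,              // fp16 source tensor
                  fp32_logits,               // fp32 destination, reused across steps
                  model.device_type_,        // selects the CPU or CUDA branch
                  model.cuda_stream_);       // only consulted on the CUDA path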
