Add cpu logits conversion from fp16 to fp32 (#260)
A convenience addition for models that run on devices other than the CPU and
therefore don't output fp32.

Prior to this change, there would be an error if the model output fp16 and the
scoring device was the CPU.
RyanUnderhill authored Apr 12, 2024
1 parent 91eab7b commit e295090
Showing 3 changed files with 19 additions and 12 deletions.
9 changes: 2 additions & 7 deletions src/models/logits.cpp
@@ -9,9 +9,6 @@ Logits::Logits(const Model& model, State& state)
       state_{state},
       shape_{static_cast<int64_t>(state_.params_->batch_size) * state_.params_->search.num_beams, state_.params_->sequence_length, state_.params_->vocab_size},
       type_{model_.session_info_->GetOutputDataType(model_.config_->model.decoder.outputs.logits)} {
-  if (model_.device_type_ == DeviceType::CPU && type_ != Ort::TypeToTensorType<float>::type)
-    throw std::runtime_error("Model logits_type can only be float32 on CPU");
-
   auto logits_tensor = OrtValue::CreateTensor(*model.allocator_device_, shape_, type_);
   if (type_ == Ort::TypeToTensorType<float>::type)
     value32_ = std::move(logits_tensor);
@@ -22,11 +19,9 @@
 RoamingArray<float> Logits::Get() {
   size_t element_count = shape_[0] * shape_[1] * shape_[2];
 
-#if USE_CUDA
   // Convert from float16 to float32 if necessary
-  if (model_.device_type_ == DeviceType::CUDA && type_ == Ort::TypeToTensorType<Ort::Float16_t>::type)
-    ConvertFp16ToFp32(*model_.allocator_device_, model_.cuda_stream_, *value16_, value32_);
-#endif
+  if (type_ == Ort::TypeToTensorType<Ort::Float16_t>::type)
+    ConvertFp16ToFp32(*model_.allocator_device_, *value16_, value32_, model_.device_type_, model_.cuda_stream_);
 
   // First iteration? Then copy the logits over to a {batch_beams, 1, vocab_size} tensor
   // We'll reuse this tensor for all future iterations
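Net effect of this hunk: Logits::Get no longer guards the conversion behind #if USE_CUDA at the call site. Whenever the model's logits come back as fp16, it calls ConvertFp16ToFp32 unconditionally and lets that helper dispatch on the device type (see the switch added to model.cpp below), which is what makes the constructor's old float32-only CPU check removable.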
20 changes: 16 additions & 4 deletions src/models/model.cpp
@@ -363,8 +363,7 @@ std::shared_ptr<GeneratorParams> CreateGeneratorParams() {
   return std::make_shared<GeneratorParams>();
 }
 
-#if USE_CUDA
-void ConvertFp16ToFp32(OrtAllocator& allocator, cudaStream_t stream, OrtValue& in, std::unique_ptr<OrtValue>& p_out) {
+void ConvertFp16ToFp32(OrtAllocator& allocator, OrtValue& in, std::unique_ptr<OrtValue>& p_out, DeviceType device_type, cudaStream_t stream) {
   auto shape_info = in.GetTensorTypeAndShapeInfo();
   auto shape = shape_info->GetShape();
   assert(shape_info->GetElementType() == Ort::TypeToTensorType<Ort::Float16_t>::type);
@@ -383,10 +382,23 @@ void ConvertFp16ToFp32(OrtAllocator& allocator, cudaStream_t stream, OrtValue& in, std::unique_ptr<OrtValue>& p_out) {
   auto* fp16 = in.GetTensorData<uint16_t>();
   auto* fp32 = p_out->GetTensorMutableData<float>();
 
-  cuda::LaunchFp16ToFp32(fp16, fp32, count, stream);
-}
+  switch (device_type) {
+    case DeviceType::CPU:
+      for (int i = 0; i < count; i++)
+        fp32[i] = Float16ToFloat32(fp16[i]);
+      break;
+
+#ifdef USE_CUDA
+    case DeviceType::CUDA:
+      cuda::LaunchFp16ToFp32(fp16, fp32, count, stream);
+      break;
 #endif
+
+    default:
+      throw std::runtime_error("ConvertFp16ToFp32 - Unsupported device type");
+  }
+}
 
 size_t GetOrtTypeSize(ONNXTensorElementDataType type) {
   switch (type) {
     case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT:
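The new CPU branch calls a scalar Float16ToFloat32 helper whose definition is not part of this diff. For reference, here is a minimal sketch of an IEEE 754 binary16-to-binary32 widening of that shape; this is an illustration only, not the repository's actual helper:

#include <cstdint>
#include <cstring>

// Sketch only: the repository's Float16ToFloat32 is not shown in this commit.
inline float Float16ToFloat32(uint16_t v) {
  uint32_t sign = static_cast<uint32_t>(v & 0x8000u) << 16;  // sign -> bit 31
  uint32_t exponent = (v >> 10) & 0x1Fu;                     // 5-bit exponent field
  uint32_t mantissa = v & 0x3FFu;                            // 10-bit mantissa field

  uint32_t bits;
  if (exponent == 0x1Fu) {
    bits = sign | 0x7F800000u | (mantissa << 13);            // inf / NaN
  } else if (exponent != 0) {
    bits = sign | ((exponent - 15 + 127) << 23) | (mantissa << 13);  // normal value
  } else if (mantissa == 0) {
    bits = sign;                                             // signed zero
  } else {
    exponent = 127 - 15 + 1;                                 // subnormal half: renormalize
    while ((mantissa & 0x400u) == 0) {
      mantissa <<= 1;
      --exponent;
    }
    bits = sign | (exponent << 23) | ((mantissa & 0x3FFu) << 13);
  }

  float out;
  std::memcpy(&out, &bits, sizeof(out));                     // bit-exact reinterpretation
  return out;
}

A scalar loop over the logits is linear in batch_beams * vocab_size and runs once per generation step, so on CPU it is rarely a bottleneck next to the model's own forward pass.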
2 changes: 1 addition & 1 deletion src/models/model.h
@@ -7,7 +7,7 @@ namespace Generators {
 
 struct Tokenizer;
 
-void ConvertFp16ToFp32(OrtAllocator& allocator, cudaStream_t stream, OrtValue& in, std::unique_ptr<OrtValue>& p_out);
+void ConvertFp16ToFp32(OrtAllocator& allocator, OrtValue& in, std::unique_ptr<OrtValue>& p_out, DeviceType device_type, cudaStream_t stream);
 
 struct State {
   State(const GeneratorParams& params);
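For illustration, a hypothetical call under the reordered signature (variable names are assumptions modeled on the logits.cpp hunk above; since the declaration is no longer wrapped in #if USE_CUDA, non-CUDA builds presumably see a cudaStream_t stand-in type):

std::unique_ptr<OrtValue> fp32_logits;       // created by the helper if needed
ConvertFp16ToFp32(*model.allocator_device_,  // allocator for the fp32 output
                  *fp16_logits,              // fp16 source tensor
                  fp32_logits,               // fp32 destination, reused across steps
                  model.device_type_,        // selects the CPU or CUDA branch
                  model.cuda_stream_);       // only consulted on the CUDA path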
