microsoft · tianleiwu · Oct 31, 2023 · Oct 13, 2023 · Oct 15, 2023 · Oct 15, 2023
diff --git a/onnxruntime/core/optimizer/insert_cast_transformer.cc b/onnxruntime/core/optimizer/insert_cast_transformer.cc
@@ -32,7 +32,7 @@
                                   int64_t to_type,
                                   onnxruntime::ProviderType providerType) {
   // insert cast op to cast input
-  std::string node_name = graph.GenerateNodeName("InsertedCast_" + old_arg->Name());
+  std::string node_name = graph.GenerateNodeName("InsertedPrecisionFreeCast_" + old_arg->Name());
 
   auto* new_arg = &graph.GetOrCreateNodeArg(node_name, new_type);
 
@@ -231,37 +231,29 @@
   return Status::OK();
 }
 
-enum TypeGroup {
-  Unknown = -1,
-  Bool = 0,
-  Integer = 1,
-  Float = 2,
-};
-
-TypeGroup GetTypeGroup(DataType type) {
-  if (*type == "tensor(bool)") {
-    return Bool;
-  }
-
-  if (*type == "tensor(int16)" || *type == "tensor(int32)" || *type == "tensor(int64)" || *type == "tensor(int8)" ||
-      *type == "tensor(uint16)" || *type == "tensor(uint32)" || *type == "tensor(uint64)" || *type == "tensor(uint8)") {
-    return Integer;
-  }
-
-  if (*type == "tensor(bfloat16)" || *type == "tensor(double)" || *type == "tensor(float)" || *type == "tensor(float16)") {
-    return Float;
-  }
-
-  return Unknown;
-}
-
 /** Transformer to remove duplicate Cast nodes. */
 class RemoveDuplicateCastTransformer : public GraphTransformer {
  public:
   RemoveDuplicateCastTransformer() : GraphTransformer("RemoveDuplicateCastTransformer") {
   }
 
  private:
+  InlinedVector<std::string> cast_ordering{
+      "tensor(bool)", "tensor(uint8)", "tensor(uint16)", "tensor(uint32)", "tensor(uint64)", "tensor(int8)", "tensor(int16)",
+      "tensor(int32)", "tensor(int64)", "tensor(bfloat16)", "tensor(float16)", "tensor(float)", "tensor(double)"};
+  // NOTE(review): all unsigned types rank below all signed ones, so e.g. uint64 -> int8 is not reported as lossy - confirm this is intended.
+  inline bool LossOfPrecision(DataType src_type, DataType dst_type, const Node& node) const {
+    // Returns true when casting src_type -> dst_type may lose information: either type is missing from cast_ordering, or dst_type ranks below src_type.
+    // Exception: casts named "InsertedPrecisionFreeCast_*" are inserted by InsertCastTransformer (f16 <-> f32, for a CPU EP without F16 support) and are never treated as lossy.
+    static constexpr char kInsertedCastPrefix[] = "InsertedPrecisionFreeCast_";
+    const auto src_rank = std::find(cast_ordering.begin(), cast_ordering.end(), *src_type);
+    const auto dst_rank = std::find(cast_ordering.begin(), cast_ordering.end(), *dst_type);
+    if (src_rank == cast_ordering.end() || dst_rank == cast_ordering.end()) {
+      return true;  // unranked type: be conservative and report the cast as lossy
+    }
+    return std::distance(src_rank, dst_rank) < 0 && node.Name().compare(0, sizeof(kInsertedCastPrefix) - 1, kInsertedCastPrefix) != 0;
+  }
+
   Status ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const override {
     auto output_args = graph.GetOutputs();
     InlinedHashSet<const onnxruntime::NodeArg*> graph_outputs;
@@ -293,16 +285,8 @@
         //     - for each consumer cast node, it meets above condition for this optimization.
         auto src_type = node.InputDefs()[0]->Type();
         auto dst_type = node.OutputDefs()[0]->Type();
-        TypeGroup src_type_group = GetTypeGroup(src_type);
-        TypeGroup dst_type_group = GetTypeGroup(dst_type);
-        if (src_type_group == Unknown || dst_type_group == Unknown) {
-          continue;
-        }
 
-        bool loss_precision_cast = false;
-        if (src_type_group > dst_type_group) {
-          loss_precision_cast = true;
-        }
+        bool loss_precision_cast = LossOfPrecision(src_type, dst_type, node);
 
         size_t num_children = node.GetOutputEdgesCount();
 
@@ -312,10 +296,7 @@
           if (output_node.OpType() == "Cast") {
             auto src_type1 = output_node.InputDefs()[0]->Type();
             auto dst_type1 = output_node.OutputDefs()[0]->Type();
-            TypeGroup src_type_group1 = GetTypeGroup(src_type1);
-            TypeGroup dst_type_group1 = GetTypeGroup(dst_type1);
-            if (src_type_group1 == Unknown || dst_type_group1 == Unknown ||
-                (loss_precision_cast && dst_type_group1 > src_type_group1)) {
+            if (loss_precision_cast && LossOfPrecision(dst_type1, src_type1, output_node)) {
               inconsistent_casts = true;
               break;
             }

diff --git a/onnxruntime/test/framework/insert_cast_transformer_test.cc b/onnxruntime/test/framework/insert_cast_transformer_test.cc
@@ -4,6 +4,7 @@
 #include "core/framework/allocator.h"
 #include "core/optimizer/insert_cast_transformer.h"
 #include "core/graph/model.h"
+#include "core/graph/node_attr_utils.h"
 #include "gtest/gtest.h"
 #include "test_utils.h"
 #include "test/test_environment.h"
@@ -110,6 +111,70 @@
   }
 }
 
+TEST(TransformerTest, CastRemovalDoesNotLowerPrecisionTest) {
+  // Graph under test: N1 (double) --Cast to float--> N2 (float) --Cast to double--> N3 (double).
+  auto model_holder = std::make_shared<onnxruntime::Model>("test", false, DefaultLoggingManager().DefaultLogger());
+  onnxruntime::Graph& graph = model_holder->MainGraph();
+  TypeProto f64_type;
+  f64_type.mutable_tensor_type()->set_elem_type(TensorProto_DataType_DOUBLE);
+  TypeProto f32_type;
+  f32_type.mutable_tensor_type()->set_elem_type(TensorProto_DataType_FLOAT);
+  onnxruntime::NodeArg wide_input("N1", &f64_type);
+  onnxruntime::NodeArg narrow_middle("N2", &f32_type);
+  onnxruntime::NodeArg wide_output("N3", &f64_type);
+
+  NodeAttributes to_f32_attrs = {{"to", utils::MakeAttribute("to", static_cast<int64_t>(ONNX_NAMESPACE::TensorProto_DataType_FLOAT))}};
+  NodeAttributes to_f64_attrs = {{"to", utils::MakeAttribute("to", static_cast<int64_t>(ONNX_NAMESPACE::TensorProto_DataType_DOUBLE))}};
+  graph.AddNode("node1", "Cast", "F64 to F32 cast", ArgMap{&wide_input}, ArgMap{&narrow_middle}, &to_f32_attrs);
+  graph.AddNode("node2", "Cast", "F32 to F64 cast", ArgMap{&narrow_middle}, ArgMap{&wide_output}, &to_f64_attrs);
+
+  const auto initial_resolve = graph.Resolve();
+  ASSERT_TRUE(initial_resolve.IsOK()) << initial_resolve.ErrorMessage();
+
+  // Apply the transformer under test, then check that the graph still resolves.
+  bool modified = true;
+  InsertCastTransformer cast_inserter("Test", DefaultCpuExecutionProvider()->GetKernelRegistry().get());
+  const auto apply_status = cast_inserter.Apply(graph, modified, DefaultLoggingManager().DefaultLogger());
+  EXPECT_TRUE(apply_status.IsOK()) << apply_status.ErrorMessage();
+  const auto final_resolve = graph.Resolve();
+  EXPECT_TRUE(final_resolve.IsOK()) << final_resolve.ErrorMessage();
+
+  // f64 -> f32 -> f64 drops precision in the middle, so neither Cast may be optimised away.
+  EXPECT_EQ(graph.NumberOfNodes(), 2);
+}
+
+TEST(TransformerTest, CastRemovalDoesNotRemoveSignednessTest) {
+  // Graph under test: N1 (int32) --Cast to uint32--> N2 (uint32) --Cast to int32--> N3 (int32).
+  auto model_holder = std::make_shared<onnxruntime::Model>("test", false, DefaultLoggingManager().DefaultLogger());
+  onnxruntime::Graph& graph = model_holder->MainGraph();
+  TypeProto i32_type;
+  i32_type.mutable_tensor_type()->set_elem_type(TensorProto_DataType_INT32);
+  TypeProto u32_type;
+  u32_type.mutable_tensor_type()->set_elem_type(TensorProto_DataType_UINT32);
+  onnxruntime::NodeArg signed_input("N1", &i32_type);
+  onnxruntime::NodeArg unsigned_middle("N2", &u32_type);
+  onnxruntime::NodeArg signed_output("N3", &i32_type);
+
+  NodeAttributes to_u32_attrs = {{"to", utils::MakeAttribute("to", static_cast<int64_t>(ONNX_NAMESPACE::TensorProto_DataType_UINT32))}};
+  NodeAttributes to_i32_attrs = {{"to", utils::MakeAttribute("to", static_cast<int64_t>(ONNX_NAMESPACE::TensorProto_DataType_INT32))}};
+  graph.AddNode("node1", "Cast", "I32 to UI32 cast", ArgMap{&signed_input}, ArgMap{&unsigned_middle}, &to_u32_attrs);
+  graph.AddNode("node2", "Cast", "UI32 to I32 cast", ArgMap{&unsigned_middle}, ArgMap{&signed_output}, &to_i32_attrs);
+
+  const auto initial_resolve = graph.Resolve();
+  ASSERT_TRUE(initial_resolve.IsOK()) << initial_resolve.ErrorMessage();
+
+  // Apply the transformer under test, then check that the graph still resolves.
+  bool modified = true;
+  InsertCastTransformer cast_inserter("Test", DefaultCpuExecutionProvider()->GetKernelRegistry().get());
+  const auto apply_status = cast_inserter.Apply(graph, modified, DefaultLoggingManager().DefaultLogger());
+  EXPECT_TRUE(apply_status.IsOK()) << apply_status.ErrorMessage();
+  const auto final_resolve = graph.Resolve();
+  EXPECT_TRUE(final_resolve.IsOK()) << final_resolve.ErrorMessage();
+
+  // i32 -> ui32 -> i32 changes the value in between, so neither Cast may be optimised away.
+  EXPECT_EQ(graph.NumberOfNodes(), 2);
+}
+
 // test that when there are 3 Cast ops in a row we remove the correct ones
 TEST(TransformerTest, ThreeInARowRemoval) {
   auto model_uri = MODEL_FOLDER ORT_TSTR("triple-cast.onnx");