Fix cast removal bug #17953

Merged · 6 commits · Oct 31, 2023
86 changes: 68 additions & 18 deletions onnxruntime/core/optimizer/insert_cast_transformer.cc
@@ -32,7 +32,7 @@
int64_t to_type,
onnxruntime::ProviderType providerType) {
// insert cast op to cast input
std::string node_name = graph.GenerateNodeName("InsertedCast_" + old_arg->Name());
std::string node_name = graph.GenerateNodeName("InsertedPrecisionFreeCast_" + old_arg->Name());

auto* new_arg = &graph.GetOrCreateNodeArg(node_name, new_type);

@@ -235,33 +235,95 @@
Unknown = -1,
Bool = 0,
Integer = 1,
- Float = 2,
+ Unsigned = 2,
+ Float = 3,
};

TypeGroup GetTypeGroup(DataType type) {
if (*type == "tensor(bool)") {
return Bool;
}

if (*type == "tensor(int16)" || *type == "tensor(int32)" || *type == "tensor(int64)" || *type == "tensor(int8)" ||
*type == "tensor(uint16)" || *type == "tensor(uint32)" || *type == "tensor(uint64)" || *type == "tensor(uint8)") {
if (*type == "tensor(int16)" || *type == "tensor(int32)" || *type == "tensor(int64)" || *type == "tensor(int8)") {
return Integer;
}

if (*type == "tensor(uint16)" || *type == "tensor(uint32)" || *type == "tensor(uint64)" || *type == "tensor(uint8)") {
return Unsigned;
}

if (*type == "tensor(bfloat16)" || *type == "tensor(double)" || *type == "tensor(float)" || *type == "tensor(float16)") {
return Float;
}

return Unknown;
}
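
(Editorial sketch, not part of the diff: how GetTypeGroup classifies a few ONNX type strings, assuming DataType is a pointer to the canonical type string, as the dereference `*type` above implies. Note that uint8 and int8 now land in different groups, so signedness changes become visible to the checks in UnsafeCast below.)

std::string u8 = "tensor(uint8)";
std::string i8 = "tensor(int8)";
std::string f16 = "tensor(float16)";
// GetTypeGroup(&u8) == Unsigned, GetTypeGroup(&i8) == Integer, GetTypeGroup(&f16) == Float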

int BitLength(DataType type) {
if (*type == "tensor(bool)") {
return 1;
} else if (*type == "tensor(uint8)" || *type == "tensor(int8)") {
return 8;
} else if (*type == "tensor(int16)" || *type == "tensor(uint16)" || *type == "tensor(bfloat16)" || *type == "tensor(float16)") {

Check warning on line 267 in onnxruntime/core/optimizer/insert_cast_transformer.cc

View workflow job for this annotation

GitHub Actions / Lint C++

[cpplint] reported by reviewdog 🐶 Lines should be <= 120 characters long [whitespace/line_length] [2] Raw Output: onnxruntime/core/optimizer/insert_cast_transformer.cc:267: Lines should be <= 120 characters long [whitespace/line_length] [2]
return 16;
} else if (*type == "tensor(int32)" || *type == "tensor(uint32)" || *type == "tensor(float)") {
return 32;
} else if (*type == "tensor(int64)" || *type == "tensor(uint64)" || *type == "tensor(double)") {
return 64;
} else {
return -1;
}
}
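
(Editorial sketch, not part of the diff: the bit lengths above feed the mantissa check in UnsafeCast below. A 32-bit float carries only a 24-bit mantissa, so a same-width integral -> floating point cast can silently lose information, as this minimal standalone program shows.)

#include <cstdint>
#include <iostream>

int main() {
  uint32_t v = (1u << 24) + 1;      // 16777217 = 2^24 + 1, needs 25 significant bits
  float f = static_cast<float>(v);  // rounds to 16777216.0f (24-bit mantissa)
  std::cout << v << " -> " << static_cast<uint32_t>(f) << "\n";  // 16777217 -> 16777216
  return 0;
}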

/** Transformer to remove duplicate Cast nodes. */
class RemoveDuplicateCastTransformer : public GraphTransformer {
public:
RemoveDuplicateCastTransformer() : GraphTransformer("RemoveDuplicateCastTransformer") {
}

private:
static bool UnsafeCast(DataType src_type, DataType dst_type, const Node& node) {
// This is not a complete cast optimization pass, and is more conservative than it could be.
// For instance, certain integral -> floating point casts could be optimized,
// but this is left to an explicit cast optimization pass.
//
// The comparison with "InsertedPrecisionFreeCast_" reflects cast nodes that are inserted by
// InsertCastTransformer. Such casts should not be considered a loss of precision - the
// inserted upcasts (f16 -> f32) and downcasts (f32 -> f16) are there to support kernels
// when on a CPU EP without F16 support.
auto src_type_group = GetTypeGroup(src_type);
auto dst_type_group = GetTypeGroup(dst_type);
if (Unknown == src_type_group || Unknown == dst_type_group) {
return true;
}

// Do not remove any signed -> unsigned cast.
if ((src_type_group != Bool && src_type_group != Unsigned) && Unsigned == dst_type_group) {
return true;
}

// Do not remove any floating point -> non floating point cast.
if (Float == src_type_group && Float != dst_type_group) {
return true;
}

auto src_bit_length = BitLength(src_type);
auto dst_bit_length = BitLength(dst_type);

// An unsigned integer -> signed integer cast may overflow if the destination signed
// integer is smaller than or equal in width to the source unsigned integer.
if (Unsigned == src_type_group && Integer == dst_type_group) {
return dst_bit_length <= src_bit_length;
}

// An integral -> floating point cast may lose precision if the integer cannot be
// encoded exactly in the mantissa. This check could be more precise.
if ((Integer == src_type_group || Unsigned == src_type_group) && Float == dst_type_group) {
return dst_bit_length <= src_bit_length;
}

if ((*src_type == "tensor(float16)" && *dst_type == "tensor(bfloat16)") || (*src_type == "tensor(bfloat16)" && *dst_type == "tensor(float16)")) {

Check warning on line 320 in onnxruntime/core/optimizer/insert_cast_transformer.cc

View workflow job for this annotation

GitHub Actions / Lint C++

[cpplint] reported by reviewdog 🐶 Lines should be <= 120 characters long [whitespace/line_length] [2] Raw Output: onnxruntime/core/optimizer/insert_cast_transformer.cc:320: Lines should be <= 120 characters long [whitespace/line_length] [2]
return true;
}

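// Note: 26 is the length of the "InsertedPrecisionFreeCast_" prefix; compare() returns 0
// on a prefix match, so a non-zero result means this cast was not inserted by
// InsertCastTransformer and a narrowing cast here is a genuine precision loss.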
return src_bit_length > dst_bit_length && (node.Name().compare(0, 26, "InsertedPrecisionFreeCast_"));
}
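
(Editorial sketch, not part of the diff: the unsigned -> signed hazard behind the `dst_bit_length <= src_bit_length` rule above. A uint32 value above INT32_MAX does not survive a cast to int32 with its value intact.)

#include <cstdint>
#include <iostream>

int main() {
  uint32_t big = 3000000000u;                     // > INT32_MAX (2147483647)
  // Wraps modulo 2^32: well-defined since C++20, and the observed behavior on
  // common two's-complement platforms before that.
  int32_t as_signed = static_cast<int32_t>(big);
  std::cout << big << " -> " << as_signed << "\n";  // 3000000000 -> -1294967296
  return 0;
}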

Status ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const override {
auto output_args = graph.GetOutputs();
InlinedHashSet<const onnxruntime::NodeArg*> graph_outputs;
@@ -293,17 +355,8 @@
// - for each consumer cast node, it meets above condition for this optimization.
auto src_type = node.InputDefs()[0]->Type();
auto dst_type = node.OutputDefs()[0]->Type();
- TypeGroup src_type_group = GetTypeGroup(src_type);
- TypeGroup dst_type_group = GetTypeGroup(dst_type);
- if (src_type_group == Unknown || dst_type_group == Unknown) {
-   continue;
- }
-
- bool loss_precision_cast = false;
- if (src_type_group > dst_type_group) {
-   loss_precision_cast = true;
- }
+ bool loss_precision_cast = UnsafeCast(src_type, dst_type, node);
size_t num_children = node.GetOutputEdgesCount();

bool inconsistent_casts = false;
@@ -312,10 +365,7 @@
if (output_node.OpType() == "Cast") {
auto src_type1 = output_node.InputDefs()[0]->Type();
auto dst_type1 = output_node.OutputDefs()[0]->Type();
- TypeGroup src_type_group1 = GetTypeGroup(src_type1);
- TypeGroup dst_type_group1 = GetTypeGroup(dst_type1);
- if (src_type_group1 == Unknown || dst_type_group1 == Unknown ||
-     (loss_precision_cast && dst_type_group1 > src_type_group1)) {
+ if (loss_precision_cast && UnsafeCast(dst_type1, src_type1, output_node)) {
inconsistent_casts = true;
break;
}
65 changes: 65 additions & 0 deletions onnxruntime/test/framework/insert_cast_transformer_test.cc
@@ -4,6 +4,7 @@
#include "core/framework/allocator.h"
#include "core/optimizer/insert_cast_transformer.h"
#include "core/graph/model.h"
#include "core/graph/node_attr_utils.h"
#include "gtest/gtest.h"
#include "test_utils.h"
#include "test/test_environment.h"
@@ -110,6 +111,70 @@
}
}

TEST(TransformerTest, CastRemovalDoesNotLowerPrecisionTest) {
auto model = std::make_shared<onnxruntime::Model>("test", false, DefaultLoggingManager().DefaultLogger());
onnxruntime::Graph& graph = model->MainGraph();
TypeProto tensor_float_32;
tensor_float_32.mutable_tensor_type()->set_elem_type(TensorProto_DataType_FLOAT);
TypeProto tensor_float_64;
tensor_float_64.mutable_tensor_type()->set_elem_type(TensorProto_DataType_DOUBLE);
onnxruntime::NodeArg n1_def("N1", &tensor_float_64),
n2_def("N2", &tensor_float_32),
n3_def("N3", &tensor_float_64);

NodeAttributes n1_attrs = {{"to", utils::MakeAttribute(
"to", static_cast<int64_t>(ONNX_NAMESPACE::TensorProto_DataType_FLOAT))}};
NodeAttributes n2_attrs = {{"to", utils::MakeAttribute(
"to", static_cast<int64_t>(ONNX_NAMESPACE::TensorProto_DataType_DOUBLE))}};

graph.AddNode("node1", "Cast", "F64 to F32 cast", ArgMap{&n1_def}, ArgMap{&n2_def}, &n1_attrs);
graph.AddNode("node2", "Cast", "F32 to F64 cast", ArgMap{&n2_def}, ArgMap{&n3_def}, &n2_attrs);

auto status = graph.Resolve();
ASSERT_TRUE(status.IsOK()) << status.ErrorMessage();

InsertCastTransformer cast_inserter("Test", DefaultCpuExecutionProvider()->GetKernelRegistry().get());

bool modified = true;
status = cast_inserter.Apply(graph, modified, DefaultLoggingManager().DefaultLogger());
EXPECT_TRUE(status.IsOK()) << status.ErrorMessage();
status = graph.Resolve();
EXPECT_TRUE(status.IsOK()) << status.ErrorMessage();

// When casting f64 -> f32 -> f64 we should not be optimizing away the casts,
// since there is a loss of precision.
EXPECT_EQ(graph.NumberOfNodes(), 2);
}
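
(Editorial sketch, not part of the diff: the precision loss this test guards against. A double with no exact float representation does not survive the f64 -> f32 -> f64 round trip.)

#include <iomanip>
#include <iostream>

int main() {
  double original = 0.1;  // 0.1 has no exact binary representation; nearest double is stored
  double round_trip = static_cast<double>(static_cast<float>(original));
  std::cout << std::setprecision(17)
            << original << "\n"     // 0.10000000000000001
            << round_trip << "\n";  // 0.10000000149011612
  std::cout << (original == round_trip ? "equal" : "not equal") << "\n";  // not equal
  return 0;
}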

TEST(TransformerTest, CastRemovalDoesNotRemoveSignednessTest) {
auto model = std::make_shared<onnxruntime::Model>("test", false, DefaultLoggingManager().DefaultLogger());
onnxruntime::Graph& graph = model->MainGraph();
TypeProto tensor_uint32;
tensor_uint32.mutable_tensor_type()->set_elem_type(TensorProto_DataType_UINT32);
TypeProto tensor_int32;
tensor_int32.mutable_tensor_type()->set_elem_type(TensorProto_DataType_INT32);
onnxruntime::NodeArg n1_def("N1", &tensor_int32),
n2_def("N2", &tensor_uint32),
n3_def("N3", &tensor_int32);

NodeAttributes n1_attrs = {{"to", utils::MakeAttribute(
"to", static_cast<int64_t>(ONNX_NAMESPACE::TensorProto_DataType_UINT32))}};
NodeAttributes n2_attrs = {{"to", utils::MakeAttribute(
"to", static_cast<int64_t>(ONNX_NAMESPACE::TensorProto_DataType_INT32))}};

graph.AddNode("node1", "Cast", "I32 to UI32 cast", ArgMap{&n1_def}, ArgMap{&n2_def}, &n1_attrs);
graph.AddNode("node2", "Cast", "UI32 to I32 cast", ArgMap{&n2_def}, ArgMap{&n3_def}, &n2_attrs);

auto status = graph.Resolve();
ASSERT_TRUE(status.IsOK()) << status.ErrorMessage();

InsertCastTransformer cast_inserter("Test", DefaultCpuExecutionProvider()->GetKernelRegistry().get());

bool modified = true;
status = cast_inserter.Apply(graph, modified, DefaultLoggingManager().DefaultLogger());
EXPECT_TRUE(status.IsOK()) << status.ErrorMessage();
status = graph.Resolve();
EXPECT_TRUE(status.IsOK()) << status.ErrorMessage();

// When casting i32 -> ui32 -> i32 we should not be optimizing away the casts,
// since applying the casts produces a very different result.
EXPECT_EQ(graph.NumberOfNodes(), 2);
}
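
(Editorial sketch, not part of the diff: why the i32 -> ui32 cast changes the observed value. Any consumer of the intermediate uint32 tensor would see a large positive number instead of -1, so the cast pair cannot simply be dropped.)

#include <cstdint>
#include <iostream>

int main() {
  int32_t negative = -1;
  uint32_t as_unsigned = static_cast<uint32_t>(negative);   // wraps to UINT32_MAX
  std::cout << negative << " -> " << as_unsigned << "\n";   // -1 -> 4294967295
  return 0;
}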

// test that when there are 3 Cast ops in a row we remove the correct ones
TEST(TransformerTest, ThreeInARowRemoval) {
auto model_uri = MODEL_FOLDER ORT_TSTR("triple-cast.onnx");