From bebe1030e48e6fd6d9fb2558cbdf292951856616 Mon Sep 17 00:00:00 2001 From: lmoneta Date: Fri, 1 Jul 2022 12:27:34 +0200 Subject: [PATCH] - add optimization vectorization options for SOFIE tests - set in ONNXRuntime also the interopnumthreads to 1 --- root/tmva/sofie/CMakeLists.txt | 10 +++++++--- .../tmva/sofie/ONNXRuntimeInference_Template.cxx.in | 13 +++++++------ 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/root/tmva/sofie/CMakeLists.txt b/root/tmva/sofie/CMakeLists.txt index 118d0148..4fa4cdbb 100644 --- a/root/tmva/sofie/CMakeLists.txt +++ b/root/tmva/sofie/CMakeLists.txt @@ -190,9 +190,13 @@ add_dependencies(RDF_SOFIE_Inference SofieCompileModels) #if (ROOT_PLATFORM MATCHES "linux|macosx" AND CMAKE_SYSTEM_PROCESSOR MATCHES x86_64 AND CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") ## assume we run only on linux/macos with gnu or gcc set(gnu-flags $<$<CXX_COMPILER_ID:GNU>:-fno-signaling-nans>) -target_compile_options(SOFIEInference PRIVATE ${gnu-flags} -fno-trapping-math -O3) -target_compile_options(RDF_SOFIE_Inference PRIVATE ${gnu-flags} -fno-trapping-math -O3) -#endif() +if (APPLE) +target_compile_options(SOFIEInference PRIVATE ${gnu-flags} -ffast-math -fno-trapping-math -O3) +target_compile_options(RDF_SOFIE_Inference PRIVATE ${gnu-flags} -ffast-math -fno-trapping-math -O3) +else() +target_compile_options(SOFIEInference PRIVATE ${gnu-flags} -march=native -ffast-math -fno-trapping-math -O3) +target_compile_options(RDF_SOFIE_Inference PRIVATE ${gnu-flags} -march=native -ffast-math -fno-trapping-math -O3) +endif() endif() # endif blas endif() # endif TMVA/SOFIE diff --git a/root/tmva/sofie/ONNXRuntimeInference_Template.cxx.in b/root/tmva/sofie/ONNXRuntimeInference_Template.cxx.in index 99ed729e..01e2f38c 100644 --- a/root/tmva/sofie/ONNXRuntimeInference_Template.cxx.in +++ b/root/tmva/sofie/ONNXRuntimeInference_Template.cxx.in @@ -19,6 +19,7 @@ static void @FUNC_NAME@(benchmark::State& state, string model_path) Ort::SessionOptions session_options;
session_options.SetIntraOpNumThreads(1); + session_options.SetInterOpNumThreads(1); session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED); //std::cout << "benchmarking model " << model_path << std::endl; @@ -45,12 +46,12 @@ static void @FUNC_NAME@(benchmark::State& state, string model_path) for (int i = 0; i < nout; i++) output_node_dims[i] = session.GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape(); - for (int i = 0; i < nin; i++) { - std::cout << "input " << input_node_names[i] << " shape : "; - for (int j = 0; j < input_node_dims[i].size(); j++) - std::cout << " " << input_node_dims[i][j]; - std::cout << std::endl; - } + // for (int i = 0; i < nin; i++) { + // std::cout << "input " << input_node_names[i] << " shape : "; + // for (int j = 0; j < input_node_dims[i].size(); j++) + // std::cout << " " << input_node_dims[i][j]; + // std::cout << std::endl; + // } // fix negative shapes for (int i = 0; i < nin; i++) { for (int j = 0; j < input_node_dims[i].size(); j++) {