Fix a build warning in SparseTensor code for 32-bit build configs #18766

Merged · 6 commits · Dec 13, 2023
Changes from all commits
73 changes: 48 additions & 25 deletions onnxruntime/contrib_ops/cpu/math/sparse_dense_matmul.cc
@@ -47,7 +47,6 @@ struct ComputeCtx {
float alpha;
};

-#if !defined(__i386__) && !defined(_M_IX86) && !defined(__wasm__) && !defined(__ANDROID__)
template <typename T>
inline void SparseDenseMatMulImpl(const ComputeCtx& ctx, const ConstSparseMatrixMap<T>& map_A,
const ConstEigenMatrixMapRowMajor<T>& map_B, EigenMatrixMapRowMajor<T>& output_map) {
@@ -64,7 +63,8 @@ inline void SparseDenseMatMulImpl(const ComputeCtx& ctx, const ConstSparseMatrix

template <>
inline void SparseDenseMatMulImpl<float>(const ComputeCtx& ctx, const ConstSparseMatrixMap<float>& map_A,
-const ConstEigenMatrixMapRowMajor<float>& map_B, EigenMatrixMapRowMajor<float>& output_map) {
+const ConstEigenMatrixMapRowMajor<float>& map_B,
+EigenMatrixMapRowMajor<float>& output_map) {
if (ctx.trans_A && ctx.trans_B) {
output_map = map_A.transpose() * ctx.alpha * map_B.transpose();
} else if (ctx.trans_A && !ctx.trans_B) {
@@ -84,21 +84,47 @@ struct SparseToDenseCsr {
const auto& b_dims = B.Shape().GetDims();
const auto& out_dims = output.Shape().GetDims();
auto csr_view = A.AsCsr();

-ConstSparseMatrixMap<T> map_A(a_dims[0], a_dims[1], A.NumValues(),
-csr_view.Outer().Data<int64_t>(),
-csr_view.Inner().Data<int64_t>(),
+const Eigen::Index* inner_index_pointer = nullptr;
+const Eigen::Index* outer_index_pointer = nullptr;
+// unique_ptrs that auto-release the converted index buffers when they are allocated.
+std::unique_ptr<Eigen::Index[]> buffer_holder_inner, buffer_holder_outer;
+if constexpr (std::is_integral<Eigen::Index>::value &&
+std::is_signed<Eigen::Index>::value &&
+(sizeof(Eigen::Index) == sizeof(int64_t))) {
+// On macOS the following reinterpret_cast is necessary because Eigen::Index is an alias of `long` but int64_t is
+// `long long`. Though they have the same size, compilers still do not allow an implicit cast between them.
+inner_index_pointer = reinterpret_cast<const Eigen::Index*>(csr_view.Inner().Data<int64_t>());
+outer_index_pointer = reinterpret_cast<const Eigen::Index*>(csr_view.Outer().Data<int64_t>());
+} else {
+// In a 32-bit build we need to narrow the following two index tensors to 32-bit values.
+gsl::span<const int64_t> inner_data = csr_view.Inner().DataAsSpan<int64_t>();
+gsl::span<const int64_t> outer_data = csr_view.Outer().DataAsSpan<int64_t>();
+buffer_holder_inner.reset(new Eigen::Index[inner_data.size()]);
+buffer_holder_outer.reset(new Eigen::Index[outer_data.size()]);
+inner_index_pointer = buffer_holder_inner.get();
+outer_index_pointer = buffer_holder_outer.get();

+std::transform(inner_data.begin(), inner_data.end(),
+buffer_holder_inner.get(), [](int64_t v) -> Eigen::Index {
+return narrow<Eigen::Index>(v);
+});
+std::transform(outer_data.begin(), outer_data.end(),
+buffer_holder_outer.get(), [](int64_t v) -> Eigen::Index {
+return narrow<Eigen::Index>(v);
+});
+}
+ConstSparseMatrixMap<T> map_A(narrow<Eigen::Index>(a_dims[0]), narrow<Eigen::Index>(a_dims[1]),
+narrow<Eigen::Index>(A.NumValues()), outer_index_pointer, inner_index_pointer,
A.Values().Data<T>());
-ConstEigenMatrixMapRowMajor<T> map_B(B.Data<T>(), b_dims[0], b_dims[1]);
-EigenMatrixMapRowMajor<T> output_map(output.MutableData<T>(), out_dims[0], out_dims[1]);
+ConstEigenMatrixMapRowMajor<T> map_B(B.Data<T>(), narrow<Eigen::Index>(b_dims[0]), narrow<Eigen::Index>(b_dims[1]));
+EigenMatrixMapRowMajor<T> output_map(output.MutableData<T>(), narrow<Eigen::Index>(out_dims[0]),
+narrow<Eigen::Index>(out_dims[1]));
// XXX: Consider re-writing it as a parallel loop as Eigen requires it to use OpenMP
// XXX: Consider vectorization
SparseDenseMatMulImpl(ctx, map_A, map_B, output_map);
}
};

-#endif //! defined(__i386__) && !defined(_M_IX86) && !defined(__wasm__) && !defined(__ANDROID__)

template <typename T>
inline T Mul(T a_value, float, T b_value) {
return a_value * b_value;
@@ -121,9 +147,11 @@ struct SparseToDenseCoo {
auto coo_view = A.AsCoo();
const auto& ind_dims = coo_view.Indices().Shape().GetDims();
ORT_RETURN_IF_NOT(ind_dims.size() == 2, "COO indices must be 2-D, got: ", ind_dims.size());
-ConstEigenMatrixMapRowMajor<int64_t> a_indicies_map(coo_view.Indices().Data<int64_t>(), narrow<size_t>(ind_dims[0]), narrow<size_t>(ind_dims[1]));
+ConstEigenMatrixMapRowMajor<int64_t> a_indicies_map(coo_view.Indices().Data<int64_t>(), narrow<size_t>(ind_dims[0]),
+narrow<size_t>(ind_dims[1]));
ConstEigenMatrixMapRowMajor<T> map_b(B.Data<T>(), narrow<size_t>(b_dims[0]), narrow<size_t>(b_dims[1]));
-EigenMatrixMapRowMajor<T> output_map(output.MutableData<T>(), narrow<size_t>(out_dims[0]), narrow<size_t>(out_dims[1]));
+EigenMatrixMapRowMajor<T> output_map(output.MutableData<T>(), narrow<size_t>(out_dims[0]),
+narrow<size_t>(out_dims[1]));
output_map.setZero();

const auto rhs_right = (ctx.trans_B) ? b_dims[0] : b_dims[1];
@@ -140,7 +168,8 @@ struct SparseToDenseCoo {
ORT_RETURN_IF_NOT(m < out_left, "COO m index: ", m, " is out of bounds of out_left: ", out_left);
const T a_value = a_values[i];
for (int64_t n = 0; n < rhs_right; ++n) {
-const T b_value = (ctx.trans_B) ? map_b(narrow<size_t>(n), narrow<size_t>(k)) : map_b(narrow<size_t>(k), narrow<size_t>(n));
+const T b_value =
+(ctx.trans_B) ? map_b(narrow<size_t>(n), narrow<size_t>(k)) : map_b(narrow<size_t>(k), narrow<size_t>(n));
output_map(narrow<size_t>(m), narrow<size_t>(n)) += Mul(a_value, ctx.alpha, b_value);
}
}
@@ -170,8 +199,9 @@ Status SparseToDenseMatMul::Compute(OpKernelContext* ctx) const {
const auto inner_B = (trans_b_attr_) ? b_dims[1] : b_dims[0];
const auto outer_B = (trans_b_attr_) ? b_dims[0] : b_dims[1];

ORT_RETURN_IF_NOT(inner_A == inner_B, "Can not multiply A and B as inner dimension does not match. inner_A: ",
inner_A, " vs inner_B: ", inner_B);
ORT_RETURN_IF_NOT(inner_A == inner_B,
"Can not multiply A and B as inner dimension does not match. inner_A: ", inner_A,
" vs inner_B: ", inner_B);

TensorShape output_shape{outer_A, outer_B};
auto* output = ctx->Output(0, output_shape);
@@ -184,12 +214,10 @@ Status SparseToDenseMatMul::Compute(OpKernelContext* ctx) const {
auto coo_view = A->AsCoo();
const auto num_dims = coo_view.Indices().Shape().NumDimensions();
ORT_RETURN_IF_NOT(num_dims == 2, "Expecting COO 2-D indices shape");
-ORT_RETURN_IF_NOT(A->Values().Shape().Size() * 2 == coo_view.Indices().Shape().Size(), "Expecting 2xValues == indices");
+ORT_RETURN_IF_NOT(A->Values().Shape().Size() * 2 == coo_view.Indices().Shape().Size(),
+"Expecting 2xValues == indices");
auto status = t_disp.InvokeRet<Status, SparseToDenseCoo>(compute_ctx, *A, *B, *output);
ORT_RETURN_IF_ERROR(status);
-// Eigen has a bug in x86 where it calculates reallocation size as -1
-// and throws bad_alloc
-#if !defined(__i386__) && !defined(_M_IX86) && !defined(__wasm__) && !defined(__ANDROID__)
} else if (A->Format() == SparseFormat::kCsrc) {
auto csr_view = A->AsCsr();
ORT_RETURN_IF_NOT(A->Values().Shape().Size() == csr_view.Inner().Shape().Size(),
@@ -199,16 +227,11 @@ Status SparseToDenseMatMul::Compute(OpKernelContext* ctx) const {
} else {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Currently support only COO and CSR(x64) formats");
}
-#else
-} else {
-return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "WASM and 32-bit builds support only COO format");
-}
-#endif //! defined(__i386__) && !defined(_M_IX86) && !defined(__wasm__) && !defined(__ANDROID__)

return Status::OK();
}

} // namespace contrib
} // namespace onnxruntime

#endif //! defined(DISABLE_SPARSE_TENSORS)
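The compile-time dispatch above is the heart of the fix: when Eigen::Index is already a signed 64-bit integer, the int64_t index buffers are aliased directly; otherwise they are copied through a checked narrowing conversion. Below is a minimal standalone sketch of the same pattern — it uses gsl::narrow in place of onnxruntime's internal narrow helper (an assumption; the in-tree helper has its own error handling), and the function name AsEigenIndex is hypothetical.

```cpp
#include <algorithm>
#include <cstdint>
#include <memory>
#include <type_traits>

#include <Eigen/Core>
#include <gsl/gsl>  // gsl::span, gsl::narrow (throws gsl::narrowing_error on lossy casts)

// Returns a pointer to indices usable by an Eigen sparse map. On typical
// 64-bit builds Eigen::Index is a signed 64-bit integer, so the int64_t
// buffer can be aliased directly; on 32-bit builds the values are copied
// into `holder` with a checked narrowing conversion.
inline const Eigen::Index* AsEigenIndex(gsl::span<const int64_t> src,
                                        std::unique_ptr<Eigen::Index[]>& holder) {
  if constexpr (std::is_integral_v<Eigen::Index> && std::is_signed_v<Eigen::Index> &&
                sizeof(Eigen::Index) == sizeof(int64_t)) {
    // Same size and signedness; the cast handles the `long` vs `long long`
    // distinction (they are distinct types even when both are 64 bits wide).
    return reinterpret_cast<const Eigen::Index*>(src.data());
  } else {
    // Narrower index type: materialize a converted copy, checking each value.
    holder = std::make_unique<Eigen::Index[]>(src.size());
    std::transform(src.begin(), src.end(), holder.get(),
                   [](int64_t v) { return gsl::narrow<Eigen::Index>(v); });
    return holder.get();
  }
}
```

On a typical 64-bit build the first branch is selected at compile time and no copy occurs; on a 32-bit or WASM build the else branch materializes a narrowed copy. That is what lets the CSR path compile without the old platform #if exclusions.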
2 changes: 1 addition & 1 deletion onnxruntime/core/util/math_cpuonly.h
@@ -93,7 +93,7 @@ template <typename T>
using ConstEigenMatrixMap = Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>;

template <class T>
-using ConstSparseMatrixMap = Eigen::Map<const Eigen::SparseMatrix<T, Eigen::RowMajor, int64_t>>;
+using ConstSparseMatrixMap = Eigen::Map<const Eigen::SparseMatrix<T, Eigen::RowMajor, Eigen::Index>>;

template <typename T>
using ConstEigenArrayMap = Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
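The one-line typedef change above is what makes the CSR map portable: by default Eigen::Index is an alias for std::ptrdiff_t (overridable via EIGEN_DEFAULT_DENSE_INDEX_TYPE), so keying the sparse-matrix map on Eigen::Index instead of a hard-coded int64_t gives the map a storage-index type that matches the platform word size. A small sketch of that assumption:

```cpp
#include <cstddef>
#include <type_traits>

#include <Eigen/Core>

// Holds for the default Eigen configuration, i.e. when
// EIGEN_DEFAULT_DENSE_INDEX_TYPE has not been overridden.
static_assert(std::is_same_v<Eigen::Index, std::ptrdiff_t>,
              "Eigen::Index defaults to std::ptrdiff_t");
// On mainstream ABIs this makes Eigen::Index 4 bytes wide on 32-bit
// targets and 8 bytes wide on 64-bit targets, matching the pointer size.
```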
2 changes: 0 additions & 2 deletions onnxruntime/test/contrib_ops/math/matmul_sparse_test.cc
@@ -140,7 +140,6 @@ void resize(Index size, double reserveSizeFactor = 0) {
}
*/
#if !defined(DISABLE_SPARSE_TENSORS)
-#if !defined(__i386__) && !defined(_M_IX86) && !defined(__wasm__) && !defined(__ANDROID__)
TEST(SparseToDenseMatMul, TestCsr) {
constexpr int64_t rows = 9;
constexpr int64_t cols = 9;
@@ -261,7 +260,6 @@ TEST(SparseToDenseMatMul, TestCsr) {
tester.Run(OpTester::ExpectResult::kExpectSuccess);
}
}
-#endif // //!defined(__i386__) && !defined(_M_IX86) && !defined(__wasm__) && !defined(__ANDROID__)

TEST(SparseToDenseMatMul, TestCoo) {
constexpr int64_t rows = 9;
3 changes: 1 addition & 2 deletions tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml
@@ -106,8 +106,7 @@ stages:
ls $(Build.BinariesDirectory)/gccbin/bin
mkdir $(Build.BinariesDirectory)/arm32build
cd $(Build.BinariesDirectory)/arm32build
-# TODO: fix the warnings and remove the --compile-no-warning-as-error arg
-cmake --compile-no-warning-as-error $(Build.SourcesDirectory)/cmake -Donnxruntime_ENABLE_CPUINFO=OFF -DPython_EXECUTABLE=/usr/bin/python3 -DPYTHON_EXECUTABLE=/usr/bin/python3 -DCMAKE_BUILD_TYPE=Debug -DCMAKE_TOOLCHAIN_FILE=$(Build.SourcesDirectory)/cmake/linux_arm32_crosscompile_toolchain.cmake -G Ninja
+cmake $(Build.SourcesDirectory)/cmake -Donnxruntime_ENABLE_CPUINFO=OFF -DPython_EXECUTABLE=/usr/bin/python3 -DPYTHON_EXECUTABLE=/usr/bin/python3 -DCMAKE_BUILD_TYPE=Debug -DCMAKE_TOOLCHAIN_FILE=$(Build.SourcesDirectory)/cmake/linux_arm32_crosscompile_toolchain.cmake -G Ninja
ninja
rm -rf $(Build.BinariesDirectory)/arm32build $(Build.BinariesDirectory)/gccbin
displayName: Cross-compile for Linux ARM32 and ARM64
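Context for the pipeline change above: --compile-no-warning-as-error is a CMake 3.24+ configure-time flag that makes the build ignore the COMPILE_WARNING_AS_ERROR property, and it was serving as an escape hatch for exactly this warning in the 32-bit ARM cross-build; with the warning fixed, the TODO and the flag are dropped. A minimal sketch of the mechanism, with hypothetical project and file names:

```cmake
# Hypothetical CMakeLists.txt fragment: opt every target into warnings-as-errors.
cmake_minimum_required(VERSION 3.24)
project(warn_demo CXX)

# Initializes the COMPILE_WARNING_AS_ERROR property on targets created below.
set(CMAKE_COMPILE_WARNING_AS_ERROR ON)
add_executable(warn_demo main.cc)

# A configure run that must tolerate known warnings can override this without
# editing the project:
#   cmake --compile-no-warning-as-error <path-to-source>
```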