From 4abec9749e0cd3bcd22ed3025d8505f91e80f562 Mon Sep 17 00:00:00 2001
From: junchao-loongson <68935141+junchao-loongson@users.noreply.github.com>
Date: Fri, 8 Dec 2023 03:15:59 +0800
Subject: [PATCH] [mlas] add loongarch lsx and lasx optimize code (#17937)

### Description
Hello, we (@lixing-star) are developers from the Loongson team. We add 128-bit (LSX) and 256-bit (LASX) vector optimization code for the LoongArch architecture.

[100% tests passed, 0 tests failed out of 7](https://cloud.a-boat.cn:2021/api/public/dl/6831z1Bi?inline=true)

### Development Environment
```
CPU: Loongson-3C5000L
uname -a: Linux localhost.localdomain 4.19.190-6.4.lns8.loongarch64 #1 SMP Thu Jul 14 12:08:04 CST 2022 loongarch64 loongarch64 loongarch64 GNU/Linux
```

### LoongArch Documents
- [LoongArch Reference Manual - Volume 1: Basic Architecture: This manual describes the basic part of the LoongArch architecture.](https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html)
- [LoongArch ELF psABI: This manual describes the LoongArch ELF psABI.](https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html)
- [more](https://loongson.github.io/LoongArch-Documentation/README-EN.html)
---
 cmake/onnxruntime_mlas.cmake | 22 +
 onnxruntime/core/mlas/inc/mlas.h | 11 +-
 onnxruntime/core/mlas/lib/activate.cpp | 2 +
 onnxruntime/core/mlas/lib/compute.cpp | 13 +-
 onnxruntime/core/mlas/lib/dgemm.cpp | 2 +-
 .../mlas/lib/loongarch64/DgemmKernelCommon.h | 27 +
 .../mlas/lib/loongarch64/DgemmKernelLasx.S | 32 +
 .../mlas/lib/loongarch64/DgemmKernelLsx.S | 217 +++++
 .../mlas/lib/loongarch64/FgemmKernelCommon.h | 100 ++
 .../lib/loongarch64/FgemmKernelLasxCommon.h | 546 +++++++++
 .../lib/loongarch64/FgemmKernelLsxCommon.h | 170 ++++
 .../mlas/lib/loongarch64/SconvKernelLasx.S | 412 +++++++++
 .../lib/loongarch64/SconvKernelLasxCommon.h | 868 ++++++++++++++++++
 .../mlas/lib/loongarch64/SconvKernelLsx.S | 339 +++++++
 .../lib/loongarch64/SconvKernelLsxCommon.h | 669 ++++++++++++++
 .../mlas/lib/loongarch64/SgemmKernelCommon.h | 35 +
 .../mlas/lib/loongarch64/SgemmKernelLasx.S | 33 +
 .../mlas/lib/loongarch64/SgemmKernelLsx.S | 267 ++++++
 .../loongarch64/SgemmTransposePackB16x4LSX.S | 89 ++
 .../loongarch64/SgemmTransposePackB16x4Lasx.S | 126 +++
 .../mlas/lib/loongarch64/SoftmaxKernelLasx.S | 357 +++++++
 .../mlas/lib/loongarch64/SpoolKernelLSX.S | 460 ++++++++++
 .../mlas/lib/loongarch64/SpoolKernelLasx.S | 238 +++++
 .../lib/loongarch64/SpoolKernelLasxCommon.h | 311 +++++++
 .../core/mlas/lib/loongarch64/asmmacro.h | 144 +++
 onnxruntime/core/mlas/lib/mlasi.h | 182 +++-
 onnxruntime/core/mlas/lib/platform.cpp | 79 ++
 onnxruntime/core/mlas/lib/pooling.cpp | 90 ++
 onnxruntime/core/mlas/lib/q4gemm.h | 2 +-
 onnxruntime/core/mlas/lib/qdwconv.cpp | 54 +-
 onnxruntime/core/mlas/lib/qgemm.h | 2 +-
 .../core/mlas/lib/qgemm_kernel_lsx.cpp | 531 +++++++++++
 onnxruntime/core/mlas/lib/qladd.cpp | 113 +++
 onnxruntime/core/mlas/lib/qladd.h | 127 +++
 onnxruntime/core/mlas/lib/qlgavgpool.cpp | 312 ++++++-
 onnxruntime/core/mlas/lib/qlmul.cpp | 164 ++++
 onnxruntime/core/mlas/lib/quantize.cpp | 407 +++++++-
 onnxruntime/core/mlas/lib/reorder.cpp | 33 +-
 onnxruntime/core/mlas/lib/sgemm.cpp | 4 +-
 onnxruntime/core/mlas/lib/snchwc.cpp | 18 +-
 onnxruntime/core/mlas/lib/transpose.cpp | 122 ++-
 41 files changed, 7696 insertions(+), 34 deletions(-)
 create mode 100644 onnxruntime/core/mlas/lib/loongarch64/DgemmKernelCommon.h
 create mode 100644 onnxruntime/core/mlas/lib/loongarch64/DgemmKernelLasx.S
 create mode 100644
onnxruntime/core/mlas/lib/loongarch64/DgemmKernelLsx.S create mode 100644 onnxruntime/core/mlas/lib/loongarch64/FgemmKernelCommon.h create mode 100644 onnxruntime/core/mlas/lib/loongarch64/FgemmKernelLasxCommon.h create mode 100644 onnxruntime/core/mlas/lib/loongarch64/FgemmKernelLsxCommon.h create mode 100644 onnxruntime/core/mlas/lib/loongarch64/SconvKernelLasx.S create mode 100644 onnxruntime/core/mlas/lib/loongarch64/SconvKernelLasxCommon.h create mode 100644 onnxruntime/core/mlas/lib/loongarch64/SconvKernelLsx.S create mode 100644 onnxruntime/core/mlas/lib/loongarch64/SconvKernelLsxCommon.h create mode 100644 onnxruntime/core/mlas/lib/loongarch64/SgemmKernelCommon.h create mode 100644 onnxruntime/core/mlas/lib/loongarch64/SgemmKernelLasx.S create mode 100644 onnxruntime/core/mlas/lib/loongarch64/SgemmKernelLsx.S create mode 100644 onnxruntime/core/mlas/lib/loongarch64/SgemmTransposePackB16x4LSX.S create mode 100644 onnxruntime/core/mlas/lib/loongarch64/SgemmTransposePackB16x4Lasx.S create mode 100644 onnxruntime/core/mlas/lib/loongarch64/SoftmaxKernelLasx.S create mode 100644 onnxruntime/core/mlas/lib/loongarch64/SpoolKernelLSX.S create mode 100644 onnxruntime/core/mlas/lib/loongarch64/SpoolKernelLasx.S create mode 100644 onnxruntime/core/mlas/lib/loongarch64/SpoolKernelLasxCommon.h create mode 100644 onnxruntime/core/mlas/lib/loongarch64/asmmacro.h create mode 100644 onnxruntime/core/mlas/lib/qgemm_kernel_lsx.cpp diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake index 04efa5c2b4f6d..26e4380af4c23 100644 --- a/cmake/onnxruntime_mlas.cmake +++ b/cmake/onnxruntime_mlas.cmake @@ -284,6 +284,8 @@ else() set(X86 TRUE) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|amd64)$") set(X86_64 TRUE) + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^loongarch64.*") + set(LOONGARCH64 TRUE) endif() endif() @@ -575,6 +577,26 @@ else() set(MLAS_SOURCE_IS_NOT_SET 0) endif() endif() + if(LOONGARCH64 AND MLAS_SOURCE_IS_NOT_SET) + set(mlas_platform_srcs + ${MLAS_SRC_DIR}/qgemm_kernel_lsx.cpp + ${MLAS_SRC_DIR}/loongarch64/SgemmKernelLasx.S + ${MLAS_SRC_DIR}/loongarch64/DgemmKernelLsx.S + ${MLAS_SRC_DIR}/loongarch64/DgemmKernelLasx.S + ${MLAS_SRC_DIR}/loongarch64/SgemmKernelLsx.S + ${MLAS_SRC_DIR}/loongarch64/SconvKernelLsx.S + ${MLAS_SRC_DIR}/loongarch64/SconvKernelLasx.S + ${MLAS_SRC_DIR}/loongarch64/SpoolKernelLSX.S + ${MLAS_SRC_DIR}/loongarch64/SpoolKernelLasx.S + ${MLAS_SRC_DIR}/loongarch64/SgemmTransposePackB16x4LSX.S + ${MLAS_SRC_DIR}/loongarch64/SgemmTransposePackB16x4Lasx.S + ${MLAS_SRC_DIR}/loongarch64/SoftmaxKernelLasx.S + ) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mlsx -mlasx") + if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH) + set(MLAS_SOURCE_IS_NOT_SET 0) + endif() + endif() if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH AND MLAS_SOURCE_IS_NOT_SET) file(GLOB_RECURSE mlas_platform_srcs "${MLAS_SRC_DIR}/scalar/*.cpp") diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h index fd6b3df93444b..bdd4dba521eba 100644 --- a/onnxruntime/core/mlas/inc/mlas.h +++ b/onnxruntime/core/mlas/inc/mlas.h @@ -69,6 +69,9 @@ Module Name: #endif #endif +#if defined(__loongarch64) +#define MLAS_TARGET_LARCH64 +#endif // // Define the support levels for the target architecture. 
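The build wiring above is the whole enablement story: CMake detects `loongarch64`, compiles the MLAS sources with `-mlsx -mlasx`, and `mlas.h` defines `MLAS_TARGET_LARCH64` so LoongArch-specific paths can be gated at compile time. As an illustrative sketch only (not part of the patch), the same gating pattern looks like the following, assuming GCC's `<lsxintrin.h>` header that `-mlsx` makes usable:

```cpp
// Illustrative sketch, not part of the patch: compile-time gating on the
// LoongArch target define with a plain scalar fallback.
#include <cstddef>
#include <cstdint>

#if defined(__loongarch64)
#define MLAS_TARGET_LARCH64
#endif

#if defined(MLAS_TARGET_LARCH64)
#include <lsxintrin.h>   // 128-bit LSX intrinsics (assumed available with -mlsx)
#endif

void AddInt32(int32_t* a, int32_t* b, int32_t* c, size_t n)
{
    size_t i = 0;
#if defined(MLAS_TARGET_LARCH64)
    // LSX path: four 32-bit lanes per 128-bit register.
    for (; i + 4 <= n; i += 4) {
        __m128i va = __lsx_vld(a + i, 0);
        __m128i vb = __lsx_vld(b + i, 0);
        __lsx_vst(__lsx_vadd_w(va, vb), c + i, 0);
    }
#endif
    // Scalar fallback and tail.
    for (; i < n; i++) {
        c[i] = a[i] + b[i];
    }
}
```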
// @@ -87,7 +90,7 @@ Module Name: #define MLAS_F16VEC_INTRINSICS_SUPPORTED -#endif // +#endif // #endif // ARM64 #endif // Visual Studio 16 or earlier does not support fp16 intrinsic @@ -1619,7 +1622,7 @@ MlasHalfGemmConvertPackB( * @param Channels # of input channels * @param OutputCount # of output pixels * @param KernelSize # kernel size - * @return + * @return */ void MLASCALL @@ -1657,7 +1660,7 @@ MlasTranspose( * @param Channels C in NHWC * @param OutputCount Number of output pixels * @param KernelSize Size of the kernel - * @return + * @return */ void MLASCALL @@ -1676,7 +1679,7 @@ MlasNhwcMaxPool( * @param Channels C in NHWC * @param OutputCount Number of output pixels * @param KernelSize size of the kernel - * @return + * @return */ void MLASCALL diff --git a/onnxruntime/core/mlas/lib/activate.cpp b/onnxruntime/core/mlas/lib/activate.cpp index 6c4ab8ae118dc..df3b884a7e7c9 100644 --- a/onnxruntime/core/mlas/lib/activate.cpp +++ b/onnxruntime/core/mlas/lib/activate.cpp @@ -143,6 +143,8 @@ struct MLAS_ACTIVATION_FUNCTION return MlasBlendFloat32x4(ValueTimesAlpha, Value, _mm_cmple_ps(ZeroFloat32x4, Value)); #elif defined(MLAS_VSX_INTRINSICS) return vec_sel(ValueTimesAlpha, Value, vec_cmple(ZeroFloat32x4, Value)); +#elif defined(MLAS_LSX_INTRINSICS) + return MlasBlendFloat32x4(ValueTimesAlpha, Value, (__m128)__lsx_vfcmp_cle_s(ZeroFloat32x4, Value)); #else return MlasBlendFloat32x4(ValueTimesAlpha, Value, ZeroFloat32x4 < Value); #endif diff --git a/onnxruntime/core/mlas/lib/compute.cpp b/onnxruntime/core/mlas/lib/compute.cpp index 118351055157d..78cac2e617ff7 100644 --- a/onnxruntime/core/mlas/lib/compute.cpp +++ b/onnxruntime/core/mlas/lib/compute.cpp @@ -148,6 +148,9 @@ Return Value: // instead. normal = _mm_min_epi16(normal, MaximumExponent); normal = _mm_max_epi16(normal, MinimumExponent); +#elif defined(MLAS_LSX_INTRINSICS) + normal = __lsx_vmin_h(normal, MaximumExponent); + normal = __lsx_vmax_h(normal, MinimumExponent); #else normal = MlasMinimumInt32x4(normal, MaximumExponent); normal = MlasMaximumInt32x4(normal, MinimumExponent); @@ -215,6 +218,8 @@ Return Value: // N.B. SSE2 lacks a broadcast load instruction, so avoid a shuffle // and use zeroes for the upper elements. Vector = _mm_load_ss(Input); +#elif defined(MLAS_LSX_INTRINSICS) + Vector = (MLAS_FLOAT32X4)__lsx_vldrepl_w(Input, 0); #else Vector = MlasBroadcastFloat32x4(Input); #endif @@ -467,6 +472,8 @@ Return Value: // N.B. SSE2 lacks a broadcast load instruction, so avoid a shuffle and // use zeroes for the upper elements. MLAS_FLOAT32X4 Vector = _mm_load_ss(Input); +#elif defined(MLAS_LSX_INTRINSICS) + MLAS_FLOAT32X4 Vector = (MLAS_FLOAT32X4)__lsx_vldrepl_w(Input, 0); #else MLAS_FLOAT32X4 Vector = MlasBroadcastFloat32x4(Input); #endif @@ -849,7 +856,7 @@ Return Value: // Find the maximum value for the row. 
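The `activate.cpp` hunk above uses `__lsx_vfcmp_cle_s` to build a per-lane mask and then blends between `Value` and `Value * alpha` for LeakyRelu. A standalone, hedged sketch of that compare-and-select idea follows; it is written directly against GCC's `<lsxintrin.h>`, and since `MlasBlendFloat32x4` is an MLAS-internal helper, `__lsx_vbitsel_v` stands in for it here:

```cpp
// Illustrative sketch, not part of the patch: y = (x >= 0) ? x : alpha * x,
// four floats at a time with LSX.
#include <lsxintrin.h>

__m128 LeakyReluLsx(__m128 Value, const float* Alpha)
{
    __m128 Zero = (__m128)__lsx_vldi(0);                   // all-zero lanes
    __m128 AlphaVec = (__m128)__lsx_vldrepl_w(Alpha, 0);   // broadcast alpha
    __m128 ValueTimesAlpha = __lsx_vfmul_s(Value, AlphaVec);

    // All-ones lanes where 0 <= x, all-zero lanes otherwise.
    __m128i Mask = __lsx_vfcmp_cle_s(Zero, Value);

    // Keep Value where the mask is set, take Value * alpha elsewhere.
    return (__m128)__lsx_vbitsel_v((__m128i)ValueTimesAlpha, (__m128i)Value, Mask);
}
```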
// -#if defined(MLAS_TARGET_AMD64) +#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) float Maximum = GetMlasPlatform().ReduceMaximumF32Kernel(Input, D); #else float Maximum = MlasReduceMaximumF32Kernel(Input, D); @@ -874,7 +881,7 @@ Return Value: float Parameters[] = { NegativeMaximum, std::log(Accumulation)}; -#if defined(MLAS_TARGET_AMD64) +#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) GetMlasPlatform().ComputeLogSoftmaxOutputF32Kernel(Input, Output, D, Parameters); #else MlasComputeLogSoftmaxOutputF32Kernel(Input, Output, D, Parameters); @@ -899,7 +906,7 @@ Return Value: float Parameters[] = { 1.0f / Accumulation }; -#if defined(MLAS_TARGET_AMD64) +#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) GetMlasPlatform().ComputeSoftmaxOutputF32Kernel(Output, D, Parameters); #else MlasComputeSoftmaxOutputF32Kernel(Output, D, Parameters); diff --git a/onnxruntime/core/mlas/lib/dgemm.cpp b/onnxruntime/core/mlas/lib/dgemm.cpp index 1ef63d03c8014..50c62744f1d8e 100644 --- a/onnxruntime/core/mlas/lib/dgemm.cpp +++ b/onnxruntime/core/mlas/lib/dgemm.cpp @@ -530,7 +530,7 @@ Return Value: size_t RowsHandled; -#if defined(MLAS_TARGET_AMD64_IX86) || defined (MLAS_TARGET_POWER) +#if defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER) || defined(MLAS_TARGET_LARCH64) RowsHandled = GetMlasPlatform().GemmDoubleKernel(A, B, C, CountK, CountM, CountN, lda, ldc, alpha, ZeroMode); #else if (ZeroMode) { diff --git a/onnxruntime/core/mlas/lib/loongarch64/DgemmKernelCommon.h b/onnxruntime/core/mlas/lib/loongarch64/DgemmKernelCommon.h new file mode 100644 index 0000000000000..8d812baabdf9d --- /dev/null +++ b/onnxruntime/core/mlas/lib/loongarch64/DgemmKernelCommon.h @@ -0,0 +1,27 @@ +/*++ + +Copyright (C) 2023 Loongson Technology Corporation Limited. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + DgemmKernelCommon.h + +Abstract: + + This module contains common kernel macros and structures for the double + precision matrix/matrix multiply operation (DGEMM). + +--*/ + +#define LFgemmElementShift 3 +#define LFgemmElementSize (1 << LFgemmElementShift) +#define LFgemmYmmElementCount (32/LFgemmElementSize) + +#include "FgemmKernelCommon.h" + +FGEMM_TYPED_INSTRUCTION(xvfadd, xvfadd.d) +FGEMM_TYPED_INSTRUCTION(xvfmadd, xvfmadd.d) +FGEMM_TYPED_INSTRUCTION(xvldrepl, xvldrepl.d) +FGEMM_TYPED_INSTRUCTION(xvfmul, xvfmul.d) diff --git a/onnxruntime/core/mlas/lib/loongarch64/DgemmKernelLasx.S b/onnxruntime/core/mlas/lib/loongarch64/DgemmKernelLasx.S new file mode 100644 index 0000000000000..2f197d6891579 --- /dev/null +++ b/onnxruntime/core/mlas/lib/loongarch64/DgemmKernelLasx.S @@ -0,0 +1,32 @@ +/*++ + +Copyright (C) 2023 Loongson Technology Corporation Limited. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + DgemmKernelLasx.s + +Abstract: + + This module implements the kernels for the double precision matrix/matrix + multiply operation (DGEMM). + + This implementation uses Lasx instructions. + +--*/ + +#include "asmmacro.h" +#include "DgemmKernelCommon.h" +#include "FgemmKernelLasxCommon.h" + + .text + +// +// Generate the GEMM kernel. 
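The `compute.cpp` and `dgemm.cpp` hunks above extend the existing function-pointer dispatch so that LoongArch builds also route through `GetMlasPlatform()`. Below is a simplified, illustrative model only; the real table is the `MLAS_PLATFORM` structure initialized in `mlasi.h`/`platform.cpp`, and `PlatformSketch`/`HasLasx` are stand-in names:

```cpp
// Illustrative sketch, not part of the patch: bind the widest available
// kernel once, then call through the function pointer everywhere else.
#include <cstddef>

using DGemmKernelFn = size_t (*)(const double* A, const double* B, double* C,
                                 size_t CountK, size_t CountM, size_t CountN,
                                 size_t lda, size_t ldc, double alpha, bool ZeroMode);

// Stand-in declarations; the patch itself provides MlasGemmDoubleKernelLSX/Lasx.
size_t DGemmKernelScalar(const double*, const double*, double*,
                         size_t, size_t, size_t, size_t, size_t, double, bool);
size_t DGemmKernelLasx(const double*, const double*, double*,
                       size_t, size_t, size_t, size_t, size_t, double, bool);
bool HasLasx();  // e.g. probe the LASX hardware-capability bit at startup

struct PlatformSketch {
    DGemmKernelFn GemmDoubleKernel;

    PlatformSketch()
    {
        GemmDoubleKernel = HasLasx() ? DGemmKernelLasx : DGemmKernelScalar;
    }
};

PlatformSketch& GetPlatformSketch()
{
    static PlatformSketch Platform;  // selected once, on first use
    return Platform;
}
```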
+// + +FgemmKernelLasxFunction MlasGemmDoubleKernelLasx + + .end diff --git a/onnxruntime/core/mlas/lib/loongarch64/DgemmKernelLsx.S b/onnxruntime/core/mlas/lib/loongarch64/DgemmKernelLsx.S new file mode 100644 index 0000000000000..63395631a9bc5 --- /dev/null +++ b/onnxruntime/core/mlas/lib/loongarch64/DgemmKernelLsx.S @@ -0,0 +1,217 @@ +/*++ + +Copyright (C) 2023 Loongson Technology Corporation Limited. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + DgemmKernelLsx.s + +Abstract: + + This module implements the kernels for the double precision matrix/matrix + multiply operation (DGEMM). + + This implementation uses Lsx instructions. + +--*/ + +#include "asmmacro.h" +#include "FgemmKernelLsxCommon.h" + +FGEMM_TYPED_INSTRUCTION(vfadd, vfadd.d) +/*++ + +Macro Description: + + This macro multiplies and accumulates for a 8xN block of the output matrix. + +Arguments: + + RowCount - Supplies the number of rows to process. + +Implicit Arguments: + + a1 (rsi) - Supplies the address into the matrix B data. + + vr0-vr1 - Supplies up to two elements loaded from matrix A and matrix A + plus one row. + + vr8-vr15 - Supplies the block accumulators. + +--*/ + + .macro ComputeBlockSseBy8 RowCount + + vld $vr4, $a1, 0 + vld $vr5, $a1, 16 +.if \RowCount\() == 2 + vmove $vr6, $vr4 + vmove $vr7, $vr5 +.endif + vfmadd.d $vr8, $vr4, $vr0, $vr8 + vfmadd.d $vr9, $vr5, $vr0, $vr9 +.if \RowCount\() == 2 + vfmadd.d $vr12, $vr6, $vr1, $vr12 + vfmadd.d $vr13, $vr7, $vr1, $vr13 +.endif + vld $vr4, $a1, 32 + vld $vr5, $a1, 48 +.if \RowCount\() == 2 + vmove $vr6, $vr4 + vmove $vr7, $vr5 +.endif + vfmadd.d $vr10, $vr4, $vr0, $vr10 + vfmadd.d $vr11, $vr5, $vr0, $vr11 +.if \RowCount\() == 2 + vfmadd.d $vr14, $vr6, $vr1, $vr14 + vfmadd.d $vr15, $vr7, $vr1, $vr15 +.endif + + .endm + +/*++ + +Macro Description: + + This macro generates code to compute matrix multiplication for a fixed set + of rows. + +Arguments: + + RowCount - Supplies the number of rows to process. + + Fallthrough - Supplies a non-blank value if the macro may fall through to + the ExitKernel label. + +Implicit Arguments: + + a0 - Supplies the address of matrix A. + + a1 - Supplies the address of matrix B. + + t8 - Supplies the address of matrix A. + + a5 - Supplies the number of columns from matrix B and matrix C to iterate + over. + + a2 - Supplies the address of matrix C. + + a3 - Supplies the number of columns from matrix A and the number of rows + from matrix B to iterate over. + + t7 - Supplies the length in bytes of a row from matrix A. + + t5 - Supplies the length in bytes of a row from matrix C. + + s3 - Stores the ZeroMode argument from the stack frame. 
+ +--*/ + + .macro ProcessCountM RowCount, Fallthrough +.LProcessNextColumnLoop8xN\@: + EmitIfCountGE \RowCount\(), 1, "vxor.v $vr8,$vr8,$vr8" + EmitIfCountGE \RowCount\(), 1, "vxor.v $vr9,$vr9,$vr9" + EmitIfCountGE \RowCount\(), 1, "vxor.v $vr10,$vr10,$vr10" + EmitIfCountGE \RowCount\(), 1, "vxor.v $vr11,$vr11,$vr11" + EmitIfCountGE \RowCount\(), 2, "vxor.v $vr12,$vr12,$vr12" + EmitIfCountGE \RowCount\(), 2, "vxor.v $vr13,$vr13,$vr13" + EmitIfCountGE \RowCount\(), 2, "vxor.v $vr14,$vr14,$vr14" + EmitIfCountGE \RowCount\(), 2, "vxor.v $vr15,$vr15,$vr15" + move $t7,$a3 # reload CountK +.LCompute8xNBlockBy1Loop\@: + EmitIfCountGE \RowCount\(), 1, "ld.d $s0, $a0, 0" + EmitIfCountGE \RowCount\(), 1, "vreplgr2vr.d $vr0, $s0" + EmitIfCountGE \RowCount\(), 2, "ldx.d $s0, $a0, $t0" + EmitIfCountGE \RowCount\(), 2, "vreplgr2vr.d $vr1, $s0" + ComputeBlockSseBy8 \RowCount\() + addi.d $a1, $a1, 8*8 # advance matrix B by 8 columns + addi.d $a0, $a0, 8 # advance matrix A by 1 column + addi.d $t7, $t7, -1 + bnez $t7, .LCompute8xNBlockBy1Loop\@ + +.LOutput8xNBlock\@: + movfr2gr.d $s0, $f24 + vreplgr2vr.d $vr2, $s0 + # multiply by alpha + EmitIfCountGE \RowCount\(), 1, "vfmul.d $vr8, $vr8, $vr2" + EmitIfCountGE \RowCount\(), 1, "vfmul.d $vr9, $vr9, $vr2" + EmitIfCountGE \RowCount\(), 1, "vfmul.d $vr10,$vr10, $vr2" + EmitIfCountGE \RowCount\(), 1, "vfmul.d $vr11,$vr11, $vr2" + EmitIfCountGE \RowCount\(), 2, "vfmul.d $vr12,$vr12, $vr2" + EmitIfCountGE \RowCount\(), 2, "vfmul.d $vr13,$vr13, $vr2" + EmitIfCountGE \RowCount\(), 2, "vfmul.d $vr14,$vr14, $vr2" + EmitIfCountGE \RowCount\(), 2, "vfmul.d $vr15,$vr15, $vr2" + li.d $s0, 8 + blt $a5, $s0, .LOutputPartial8xNBlock\@ + sub.d $a5, $a5, $s0 + AccumulateAndStoreBlock \RowCount\(), 4 + addi.d $a2, $a2, 8*8 # advance matrix C by 8 columns + move $a0, $t1 # reload matrix A + bnez $a5, .LProcessNextColumnLoop8xN\@ + b .LExitKernel + +// +// Output a partial 8xN block to the matrix. +// + +.LOutputPartial8xNBlock\@: + li.d $s0, 2 + blt $a5, $s0, .LOutputPartial1xNBlock\@ + li.d $s0, 4 + blt $a5, $s0, .LOutputPartialLessThan4xNBlock\@ + li.d $s0, 6 + blt $a5, $s0, .LOutputPartialLessThan6xNBlock\@ + AccumulateAndStoreBlock \RowCount\(), 3 + andi $s0, $a5, 1 # check if remaining count is small + beqz $s0, .LExitKernel + EmitIfCountGE \RowCount\(), 1, "vmove $vr8,$vr11" + # shift remaining elements down + EmitIfCountGE \RowCount\(), 2, "vmove $vr12,$vr15" + addi.d $a2, $a2, 6*8 # advance matrix C by 6 columns + b .LOutputPartial1xNBlock\@ + +.LOutputPartialLessThan6xNBlock\@: + AccumulateAndStoreBlock \RowCount\(), 2 + andi $s0, $a5,1 # check if remaining count is small + beqz $s0, .LExitKernel + EmitIfCountGE \RowCount\(), 1, "vmove $vr8,$vr10" + # shift remaining elements down + EmitIfCountGE \RowCount\(), 2, "vmove $vr12,$vr14" + addi.d $a2, $a2, 4*8 # advance matrix C by 4 columns + b .LOutputPartial1xNBlock\@ + +.LOutputPartialLessThan4xNBlock\@: + AccumulateAndStoreBlock \RowCount\(), 1 + andi $s0, $a5,1 # check if remaining count is small + beqz $s0, .LExitKernel + EmitIfCountGE \RowCount\(), 1, "vmove $vr8,$vr9" + # shift remaining elements down + EmitIfCountGE \RowCount\(), 2, "vmove $vr12,$vr13" + addi.d $a2, $a2, 2*8 # advance matrix C by 2 columns + +.LOutputPartial1xNBlock\@: + bnez $t5, .LSkipAccumulateOutput1xN\@ # ZeroMode? 
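The LSX macro above keeps a 2-row by 8-column block of C in vr8-vr15, scales it by alpha, and then either stores it directly or adds it to the existing output depending on ZeroMode. A scalar C++ reference of that computation (illustrative only, not part of the patch):

```cpp
// Illustrative sketch, not part of the patch: scalar model of the 2x8 LSX
// DGEMM block, including the alpha scaling and the ZeroMode store/accumulate.
#include <cstddef>

void DGemmBlock2x8Reference(const double* A,   // 2 rows, lda doubles apart
                            const double* B,   // packed: 8 consecutive doubles per k
                            double* C,         // 2 rows, ldc doubles apart
                            size_t CountK, size_t lda, size_t ldc,
                            double alpha, bool ZeroMode)
{
    double acc[2][8] = {};

    for (size_t k = 0; k < CountK; k++) {
        for (size_t r = 0; r < 2; r++) {
            for (size_t c = 0; c < 8; c++) {
                // The assembly keeps acc[][] in vr8-vr15 and uses vfmadd.d here.
                acc[r][c] += A[r * lda + k] * B[k * 8 + c];
            }
        }
    }

    for (size_t r = 0; r < 2; r++) {
        for (size_t c = 0; c < 8; c++) {
            double value = alpha * acc[r][c];
            // ZeroMode: overwrite the output; otherwise accumulate into it.
            C[r * ldc + c] = ZeroMode ? value : C[r * ldc + c] + value;
        }
    }
}
```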
+ + EmitIfCountGE \RowCount\(), 1, "fld.d $f15, $a2, 0" + EmitIfCountGE \RowCount\(), 1, "fadd.d $f15, $f15, $f8" + EmitIfCountGE \RowCount\(), 2, "fldx.d $f16, $a2, $t6" + EmitIfCountGE \RowCount\(), 2, "fadd.d $f16, $f16, $f12" + +.LSkipAccumulateOutput1xN\@: + EmitIfCountGE \RowCount\(), 1, "fst.d $f15, $a2, 0" + EmitIfCountGE \RowCount\(), 2, "fstx.d $f16, $a2, $t6" +.ifb \Fallthrough\() + b .LExitKernel +.endif + + .endm + +// +// Generate the GEMM kernel. +// + +FgemmKernelLsxFunction MlasGemmDoubleKernelLSX + + .end diff --git a/onnxruntime/core/mlas/lib/loongarch64/FgemmKernelCommon.h b/onnxruntime/core/mlas/lib/loongarch64/FgemmKernelCommon.h new file mode 100644 index 0000000000000..777a592590ec4 --- /dev/null +++ b/onnxruntime/core/mlas/lib/loongarch64/FgemmKernelCommon.h @@ -0,0 +1,100 @@ +/*++ + +Copyright (C) 2023 Loongson Technology Corporation Limited. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + FgemmKernelCommon.h + +Abstract: + + This module contains common kernel macros and structures for the floating + point matrix/matrix multiply operation (SGEMM and DGEMM). + +--*/ + +// +// Define the typed instruction template. +// + +#define FGEMM_TYPED_INSTRUCTION(Untyped, Typed) \ + .macro Untyped Operand:vararg; Typed \Operand\(); .endm; + +/*++ + +Macro Description: + + This macro generates code to execute the block compute macro multiple + times and advancing the matrix A and matrix B data pointers. + +Arguments: + + ComputeBlock - Supplies the macro to compute a single block. + + RowCount - Supplies the number of rows to process. + + AdvanceMatrixAPlusRows - Supplies a non-zero value if the data pointer + in rbx should also be advanced as part of the loop. + +Implicit Arguments: + + a0 - Supplies the address into the matrix A data. + + t7 - Supplies the address into the matrix A data plus 3 rows. + + a1 - Supplies the address into the matrix B data. + + a3 - Supplies the number of columns from matrix A and the number of rows + from matrix B to iterate over. + + vr4-vr15 - Supplies the block accumulators. 
+ +--*/ + + .macro ComputeBlockLoop ComputeBlock, RowCount, AdvanceMatrixAPlusRows + + move $t8, $a3 # reload CountK + li.d $s0, 4 + blt $t8, $s0, .LProcessRemainingBlocks\@ + +.LComputeBlockBy4Loop\@: + \ComputeBlock\() \RowCount\(), 0, LFgemmElementSize*0, 64*4 + \ComputeBlock\() \RowCount\(), 2*32, LFgemmElementSize*1, 64*4 + addi.d $a1, $a1, 2*2*32 # advance matrix B by 128 bytes + \ComputeBlock\() \RowCount\(), 0, LFgemmElementSize*2, 64*4 + \ComputeBlock\() \RowCount\(), 2*32, LFgemmElementSize*3, 64*4 + addi.d $a1, $a1, 2*2*32 # advance matrix B by 128 bytes + addi.d $a0, $a0, 4*LFgemmElementSize # advance matrix A by 4 elements +.if \RowCount\() > 3 + addi.d $t7, $t7, 4*LFgemmElementSize # advance matrix A plus rows by 4 elements +.if \RowCount\() == 12 + addi.d $t3, $t3, 4*LFgemmElementSize + addi.d $t4,, $t4, 4*LFgemmElementSize +.endif +.endif + addi.d $t8, $t8, -4 + li.d $s0, 4 + bge $t8, $s0, .LComputeBlockBy4Loop\@ + +.LProcessRemainingBlocks\@: + beqz $t8, .LOutputBlock\@ + +.LComputeBlockBy1Loop\@: + \ComputeBlock\() \RowCount\(), 0, 0 + addi.d $a1, $a1, 2*32 # advance matrix B by 64 bytes + addi.d $a0, $a0, LFgemmElementSize # advance matrix A by 1 element +.if \RowCount\() > 3 + addi.d $t7, $t7, LFgemmElementSize # advance matrix A plus rows by 1 element +.if \RowCount\() == 12 + addi.d $t3, $t3, LFgemmElementSize + addi.d $t4, $t4, LFgemmElementSize +.endif +.endif + addi.d $t8, $t8, -1 + bnez $t8, .LComputeBlockBy1Loop\@ + +.LOutputBlock\@: + + .endm diff --git a/onnxruntime/core/mlas/lib/loongarch64/FgemmKernelLasxCommon.h b/onnxruntime/core/mlas/lib/loongarch64/FgemmKernelLasxCommon.h new file mode 100644 index 0000000000000..b96db848617bf --- /dev/null +++ b/onnxruntime/core/mlas/lib/loongarch64/FgemmKernelLasxCommon.h @@ -0,0 +1,546 @@ + +/*++ + +Copyright (C) 2023 Loongson Technology Corporation Limited. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + FgemmKernelLasxCommon.h + +Abstract: + + This module implements the kernels for the floating point matrix/matrix + multiply operation (SGEMM and DGEMM). + + This implementation uses LASX instructions. + +--*/ + +/*++ + +Macro Description: + + This macro multiplies and accumulates for 2 YMMWORDs by N rows of the output + matrix. + +Arguments: + + RowCount - Supplies the number of rows to process. + + VectorOffset - Supplies the byte offset from matrix B to fetch elements. + + BroadcastOffset - Supplies the byte offset from matrix A to fetch elements. + + PrefetchOffset - Optionally supplies the byte offset from matrix B to + prefetch elements. + +Implicit Arguments: + + a0 - Supplies the address into the matrix A data. + + t7 - Supplies the address into the matrix A data plus 2 rows. + + a1 - Supplies the address into the matrix B data. + + t0 - Supplies the length in bytes of a row from matrix A. + + xr8-xr15 - Supplies the block accumulators. 
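The `ComputeBlockLoop` macro above is a conventional unroll-by-four over the K dimension followed by a one-at-a-time remainder loop. The same structure in plain C++ (illustrative only, not part of the patch):

```cpp
// Illustrative sketch, not part of the patch: the K-loop shape that
// ComputeBlockLoop generates, shown on a simple dot product.
#include <cstddef>

double DotUnrolledBy4(const double* a, const double* b, size_t CountK)
{
    double sum = 0.0;
    size_t k = 0;

    // Main loop: four k steps per iteration (the macro advances matrix A by
    // four elements and matrix B by four packed blocks per pass).
    for (; k + 4 <= CountK; k += 4) {
        sum += a[k + 0] * b[k + 0];
        sum += a[k + 1] * b[k + 1];
        sum += a[k + 2] * b[k + 2];
        sum += a[k + 3] * b[k + 3];
    }

    // Remainder loop: at most three leftover k steps.
    for (; k < CountK; k++) {
        sum += a[k] * b[k];
    }

    return sum;
}
```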
+ +--*/ + + .macro ComputeBlockLasxBy16 RowCount, VectorOffset, BroadcastOffset, PrefetchOffset + +.if \RowCount\() == 1 + xvldrepl.w $xr3, $a0, \BroadcastOffset\() + xvld $xr4, $a1, \VectorOffset\() + xvfmadd $xr8, $xr4, $xr3, $xr8 + xvld $xr5, $a1, \VectorOffset\()+32 + xvfmadd $xr9, $xr5, $xr3, $xr9 +.else + xvld $xr0, $a1, \VectorOffset\() + xvld $xr1, $a1, \VectorOffset\()+32 + EmitIfCountGE \RowCount\(), 1, "xvldrepl $xr3,$a0, \BroadcastOffset\()" + EmitIfCountGE \RowCount\(), 1, "xvfmadd $xr8, $xr3, $xr0, $xr8" + EmitIfCountGE \RowCount\(), 1, "xvfmadd $xr9, $xr3, $xr1, $xr9" + EmitIfCountGE \RowCount\(), 2, "add.d $s0,$a0, $t0" + EmitIfCountGE \RowCount\(), 2, "xvldrepl $xr3,$s0, \BroadcastOffset\()" + EmitIfCountGE \RowCount\(), 2, "xvfmadd $xr10, $xr3, $xr0, $xr10" + EmitIfCountGE \RowCount\(), 2, "xvfmadd $xr11, $xr3, $xr1, $xr11" + + EmitIfCountGE \RowCount\(), 3, "xvldrepl $xr3,$t7, \BroadcastOffset\()" + EmitIfCountGE \RowCount\(), 3, "xvfmadd $xr12, $xr3, $xr0, $xr12" + EmitIfCountGE \RowCount\(), 3, "xvfmadd $xr13, $xr3, $xr1, $xr13" + EmitIfCountGE \RowCount\(), 4, "add.d $s0,$t7, $t0" + EmitIfCountGE \RowCount\(), 4, "xvldrepl $xr3,$s0, \BroadcastOffset\()" + EmitIfCountGE \RowCount\(), 4, "xvfmadd $xr14, $xr3, $xr0, $xr14" + EmitIfCountGE \RowCount\(), 4, "xvfmadd $xr15, $xr3, $xr1, $xr15" +.endif + + .endm + +/*++ + +Macro Description: + + This macro multiplies and accumulates for 1 YMMWORD by N rows of the output + matrix. + +Arguments: + + RowCount - Supplies the number of rows to process. + + VectorOffset - Supplies the byte offset from matrix B to fetch elements. + + BroadcastOffset - Supplies the byte offset from matrix A to fetch elements. + + PrefetchOffset - Optionally supplies the byte offset from matrix B to + prefetch elements. + +Implicit Arguments: + + a0 - Supplies the address into the matrix A data. + + t7 - Supplies the address into the matrix A data plus 2 rows. + + a1 - Supplies the address into the matrix B data. + + t0 - Supplies the length in bytes of a row from matrix A. + + xr8-xr15 - Supplies the block accumulators. + +--*/ + + .macro ComputeBlockLasxBy8 RowCount, VectorOffset, BroadcastOffset, PrefetchOffset + +.if \RowCount\() == 1 + xvldrepl.w $xr3, $a0, \BroadcastOffset\() + xvld $xr5, $a1, \VectorOffset\() + xvfmadd.s $xr9, $xr5, $xr3, $xr9 +.else + xvld $xr0, $a1, \VectorOffset\() + EmitIfCountGE \RowCount\(), 1, "xvldrepl $xr3, $a0, \BroadcastOffset\()" + EmitIfCountGE \RowCount\(), 1, "xvfmadd $xr9, $xr3, $xr0, $xr9" + + EmitIfCountGE \RowCount\(), 2, "add.d $s0, $a0, $t0" + EmitIfCountGE \RowCount\(), 2, "xvldrepl $xr3, $s0, \BroadcastOffset\()" + EmitIfCountGE \RowCount\(), 2, "xvfmadd $xr11, $xr3, $xr0, $xr11" + EmitIfCountGE \RowCount\(), 3, "xvldrepl $xr3, $t7, \BroadcastOffset\()" + EmitIfCountGE \RowCount\(), 3, "xvfmadd $xr13, $xr3, $xr0, $xr13" + EmitIfCountGE \RowCount\(), 4, "add.d $s0, $t7, $t0" + EmitIfCountGE \RowCount\(), 4, "xvldrepl $xr3, $s0, \BroadcastOffset\()" + EmitIfCountGE \RowCount\(), 4, "xvfmadd $xr15, $xr3, $xr0, $xr15" +.endif + + .endm + +/*++ + +Macro Description: + + This macro generates code to execute the block compute macro multiple + times and advancing the matrix A and matrix B data pointers. + +Arguments: + + ComputeBlock - Supplies the macro to compute a single block. + + RowCount - Supplies the number of rows to process. + +Implicit Arguments: + + a0 - Supplies the address into the matrix A data. + + a1 - Supplies the address into the matrix B data. 
+ + a3 - Supplies the number of columns from matrix A and the number of rows + from matrix B to iterate over. + + t0 - Supplies the length in bytes of a row from matrix A. + + vr4-vr15 - Supplies the block accumulators. + +--*/ + + .macro ComputeBlockLasxLoop ComputeBlock, RowCount + +.if \RowCount\() > 2 + # compute matrix A plus 2 rows + slli.d $s0, $t0, 1 + add.d $t7, $a0, $s0 +.endif + ComputeBlockLoop \ComputeBlock\(), \RowCount\(), \RowCount\() > 2 +.if \RowCount\() > 2 + # compute matrix C plus 2 rows + slli.d $s0, $t6, 1 + add.d $t7, $a2, $s0 +.endif + + .endm + + .macro store_n src, num, dst + move $s2, \num\() + beqz $s2, .Lstore_exit\@ + xvstelm.w \src\(), \dst\(), 0, 0 + addi.d $s2, $s2, -1 + beqz $s2, .Lstore_exit\@ + + xvstelm.w \src\(), \dst\(), 4, 1 + addi.d $s2, $s2, -1 + beqz $s2, .Lstore_exit\@ + + xvstelm.w \src\(), \dst\(), 8, 2 + addi.d $s2, $s2, -1 + beqz $s2, .Lstore_exit\@ + + xvstelm.w \src\(), \dst\(), 12, 3 + addi.d $s2, $s2, -1 + beqz $s2, .Lstore_exit\@ + + xvstelm.w \src\(), \dst\(), 16, 4 + addi.d $s2, $s2, -1 + beqz $s2, .Lstore_exit\@ + + xvstelm.w \src\(), \dst\(), 20, 5 + addi.d $s2, $s2, -1 + beqz $s2, .Lstore_exit\@ + + xvstelm.w \src\(), \dst\(), 24, 6 + addi.d $s2, $s2, -1 + beqz $s2, .Lstore_exit\@ + +.Lstore_exit\@: + .endm +/*++ + +Macro Description: + + This macro generates code to compute matrix multiplication for a fixed set + of rows. + +Arguments: + + RowCount - Supplies the number of rows to process. + + Fallthrough - Supplies a non-blank value if the macro may fall through to + the ExitKernel label. + +Implicit Arguments: + + a0 - Supplies the address of matrix A. + + a1 - Supplies the address of matrix B. + + t1 - Supplies the address of matrix A. + + a5 - Supplies the number of columns from matrix B and matrix C to iterate + over. + + a2 - Supplies the address of matrix C. + + a3 - Supplies the number of columns from matrix A and the number of rows + from matrix B to iterate over. + + t0 - Supplies the length in bytes of a row from matrix A. + + t6 - Supplies the length in bytes of a row from matrix C. + + t5 - Stores the ZeroMode argument from the stack frame. + +--*/ + + .macro ProcessCountM RowCount, Fallthrough + + ori $s1, $r0, LFgemmYmmElementCount + bgeu $s1, $a5, .LProcessRemainingCountN\@ + +.LProcessNextColumnLoop2xN\@: + EmitIfCountGE \RowCount\(), 1, "xvxor.v $xr8, $xr8, $xr8" + EmitIfCountGE \RowCount\(), 1, "xvxor.v $xr9, $xr9, $xr9" + EmitIfCountGE \RowCount\(), 2, "xvxor.v $xr10, $xr10, $xr10" + EmitIfCountGE \RowCount\(), 2, "xvxor.v $xr11, $xr11, $xr11" + EmitIfCountGE \RowCount\(), 3, "xvxor.v $xr12, $xr12, $xr12" + EmitIfCountGE \RowCount\(), 3, "xvxor.v $xr13, $xr13, $xr13" + EmitIfCountGE \RowCount\(), 4, "xvxor.v $xr14, $xr14, $xr14" + EmitIfCountGE \RowCount\(), 4, "xvxor.v $xr15, $xr15, $xr15" + + ComputeBlockLasxLoop ComputeBlockLasxBy16, \RowCount\() + EmitIfCountGE \RowCount\(), 1, "xvfmul $xr8, $xr8, $xr2" + EmitIfCountGE \RowCount\(), 1, "xvfmul $xr9, $xr9, $xr2" + EmitIfCountGE \RowCount\(), 2, "xvfmul $xr10, $xr10, $xr2" + EmitIfCountGE \RowCount\(), 2, "xvfmul $xr11, $xr11, $xr2" + EmitIfCountGE \RowCount\(), 3, "xvfmul $xr12, $xr12, $xr2" + EmitIfCountGE \RowCount\(), 3, "xvfmul $xr13, $xr13, $xr2" + EmitIfCountGE \RowCount\(), 4, "xvfmul $xr14, $xr14, $xr2" + EmitIfCountGE \RowCount\(), 4, "xvfmul $xr15, $xr15, $xr2" + + sub.d $a5, $a5, $s1 + sub.d $a5, $a5, $s1 + blt $a5, $zero, .LOutputMasked2xNBlock\@ + andi $s0, $t5, 0xff # ZeroMode? 
+ bnez $s0, .LStore2xNBlock\@ + EmitIfCountGE \RowCount\(), 1, "xvld $xr16, $a2, 0" + EmitIfCountGE \RowCount\(), 1, "xvfadd $xr8, $xr8, $xr16" + EmitIfCountGE \RowCount\(), 1, "xvld $xr16, $a2, 0x20" + EmitIfCountGE \RowCount\(), 1, "xvfadd $xr9, $xr9, $xr16" + EmitIfCountGE \RowCount\(), 2, "xvldx $xr16, $a2, $t6" + EmitIfCountGE \RowCount\(), 2, "xvfadd $xr10, $xr10, $xr16" + EmitIfCountGE \RowCount\(), 2, "add.d $s0, $a2, $t6" + EmitIfCountGE \RowCount\(), 2, "xvld $xr16, $s0, 0x20" + EmitIfCountGE \RowCount\(), 2, "xvfadd $xr11, $xr11, $xr16" + EmitIfCountGE \RowCount\(), 3, "xvld $xr16, $t7, 0" + EmitIfCountGE \RowCount\(), 3, "xvfadd $xr12, $xr12, $xr16" + EmitIfCountGE \RowCount\(), 3, "xvld $xr16, $t7, 0x20" + EmitIfCountGE \RowCount\(), 3, "xvfadd $xr13, $xr13, $xr16" + EmitIfCountGE \RowCount\(), 4, "xvldx $xr16, $t7, $t6" + EmitIfCountGE \RowCount\(), 4, "xvfadd $xr14, $xr14, $xr16" + EmitIfCountGE \RowCount\(), 4, "add.d $s0, $t7, $t6" + EmitIfCountGE \RowCount\(), 4, "xvld $xr16, $s0, 0x20" + EmitIfCountGE \RowCount\(), 4, "xvfadd $xr15, $xr15, $xr16" + +.LStore2xNBlock\@: + EmitIfCountGE \RowCount\(), 1, "xvst $xr8, $a2, 0" + EmitIfCountGE \RowCount\(), 1, "xvst $xr9, $a2, 0x20" + EmitIfCountGE \RowCount\(), 2, "xvstx $xr10, $a2, $t6" + EmitIfCountGE \RowCount\(), 2, "add.d $s0, $a2, $t6" + EmitIfCountGE \RowCount\(), 2, "xvst $xr11, $s0, 0x20" + EmitIfCountGE \RowCount\(), 3, "xvst $xr12, $t7, 0" + EmitIfCountGE \RowCount\(), 3, "xvst $xr13, $t7, 0x20" + EmitIfCountGE \RowCount\(), 4, "xvstx $xr14, $t7, $t6" + EmitIfCountGE \RowCount\(), 4, "add.d $s0, $t7, $t6" + EmitIfCountGE \RowCount\(), 4, "xvst $xr15, $s0, 0x20" + + addi.d $a2, $a2, 0x40 # advance matrix C by 2 XRWORDs + move $a0, $t1 # reload matrix A + bltu $s1, $a5, .LProcessNextColumnLoop2xN\@ + beqz $a5, .LExitKernel + +.LProcessRemainingCountN\@: + EmitIfCountGE \RowCount\(), 1, "xvxor.v $xr9, $xr9, $xr9" + EmitIfCountGE \RowCount\(), 2, "xvxor.v $xr11, $xr11, $xr11" + EmitIfCountGE \RowCount\(), 3, "xvxor.v $xr13, $xr13, $xr13" + EmitIfCountGE \RowCount\(), 4, "xvxor.v $xr15, $xr15, $xr15" + + + ComputeBlockLasxLoop ComputeBlockLasxBy8, \RowCount\() + EmitIfCountGE \RowCount\(), 1, "xvfmul $xr9, $xr9, $xr2" + EmitIfCountGE \RowCount\(), 2, "xvfmul $xr11, $xr11, $xr2" + EmitIfCountGE \RowCount\(), 3, "xvfmul $xr13, $xr13, $xr2" + EmitIfCountGE \RowCount\(), 4, "xvfmul $xr15, $xr15, $xr2" + bltu $a5, $s1, .LOutputMasked1xNBlock\@ + andi $s0, $t5, 0xff # ZeroMode? + bnez $s0, .LStore1xNBlock\@ + EmitIfCountGE \RowCount\(), 1, "xvld $xr16, $a2, 0" + EmitIfCountGE \RowCount\(), 1, "xvfadd $xr9, $xr9, $xr16" + EmitIfCountGE \RowCount\(), 2, "xvldx $xr16, $a2, $t6" + EmitIfCountGE \RowCount\(), 2, "xvfadd $xr11, $xr11, $xr16" + EmitIfCountGE \RowCount\(), 3, "xvld $xr16, $t7, 0" + EmitIfCountGE \RowCount\(), 3, "xvfadd $xr13, $xr13, $xr16" + EmitIfCountGE \RowCount\(), 4, "xvldx $xr16, $t7, $t6" + EmitIfCountGE \RowCount\(), 4, "xvfadd $xr15, $xr15, $xr16" + +.LStore1xNBlock\@: + EmitIfCountGE \RowCount\(), 1, "xvst $xr9, $a2, 0" + EmitIfCountGE \RowCount\(), 2, "xvstx $xr11, $a2, $t6" + EmitIfCountGE \RowCount\(), 3, "xvst $xr13, $t7, 0" + EmitIfCountGE \RowCount\(), 4, "xvstx $xr15, $t7, $t6" + b .LExitKernel + +.LOutputMasked2xNBlock\@: + andi $s0, $t5, 0xff # ZeroMode? 
+ bnez $s0, .LStoreMasked2xNBlock\@ + EmitIfCountGE \RowCount\(), 1, "xvld $xr16, $a2, 0" + EmitIfCountGE \RowCount\(), 1, "xvfadd $xr8, $xr8, $xr16" + EmitIfCountGE \RowCount\(), 2, "xvldx $xr16, $a2, $t6" + EmitIfCountGE \RowCount\(), 2, "xvfadd $xr10, $xr10, $xr16" + EmitIfCountGE \RowCount\(), 3, "xvld $xr16, $t7, 0" + EmitIfCountGE \RowCount\(), 3, "xvfadd $xr12, $xr12, $xr16" + EmitIfCountGE \RowCount\(), 4, "xvldx $xr16, $t7, $t6" + EmitIfCountGE \RowCount\(), 4, "xvfadd $xr14, $xr14, $xr16" + +.LStoreMasked2xNBlock\@: + EmitIfCountGE \RowCount\(), 1, "xvst $xr8, $a2, 0" + EmitIfCountGE \RowCount\(), 2, "xvstx $xr10, $a2, $t6" + EmitIfCountGE \RowCount\(), 3, "xvst $xr12, $t7, 0" + EmitIfCountGE \RowCount\(), 4, "xvstx $xr14, $t7, $t6" + addi.d $a2, $a2, 0x20 # advance matrix C by YMMWORD +.if \RowCount\() > 2 + addi.d $t7, $t7, 0x20 # advance matrix C plus 2 rows by YMMWORD + +.endif + addi.d $a5, $a5, LFgemmYmmElementCount # correct for over-subtract above + + +.LOutputMasked1xNBlock\@: + +.if \RowCount\() > 2 + slli.d $s0, $t0, 1 + add.d $t7, $a0, $s0 +.endif + +.if \RowCount\() == 1 +.else +.endif + +.if \RowCount\() > 2 + slli.d $s0, $t6, 1 + add.d $t7, $a2, $s0 +.endif + + sub.d $a5, $zero, $a5 + la.global $a0, MlasMaskMoveTableLasx + ori $s0, $r0, LFgemmElementSize + mul.d $s0, $a5, $s0 + addi.d $s0, $s0, 8*4 + xvldx $xr0, $a0, $s0 + andi $s0, $t5, 0xff + + sub.d $a5, $zero, $a5 + + bnez $s0, .LStoreMasked1xNBlock\@ + EmitIfCountGE \RowCount\(), 1, "xvld $xr16, $a2, 0" + EmitIfCountGE \RowCount\(), 1, "xvand.v $xr8, $xr16, $xr0" + EmitIfCountGE \RowCount\(), 2, "xvldx $xr16, $a2, $t6" + EmitIfCountGE \RowCount\(), 2, "xvand.v $xr10, $xr16, $xr0" + EmitIfCountGE \RowCount\(), 3, "xvld $xr16, $t7, 0" + EmitIfCountGE \RowCount\(), 3, "xvand.v $xr12, $xr16, $xr0" + EmitIfCountGE \RowCount\(), 4, "xvldx $xr16, $t7, $t6" + EmitIfCountGE \RowCount\(), 4, "xvand.v $xr14, $xr16, $xr0" + + EmitIfCountGE \RowCount\(), 1, "xvfadd $xr9, $xr9, $xr8" + EmitIfCountGE \RowCount\(), 2, "xvfadd $xr11, $xr11, $xr10" + EmitIfCountGE \RowCount\(), 3, "xvfadd $xr13, $xr13, $xr12" + EmitIfCountGE \RowCount\(), 4, "xvfadd $xr15, $xr15, $xr14" +.LStoreMasked1xNBlock\@: + EmitIfCountGE \RowCount\(), 1, "store_n $xr9, $a5, $a2" + + add.d $s3, $a2, $t6 + EmitIfCountGE \RowCount\(), 2, "store_n $xr11, $a5, $s3" + + EmitIfCountGE \RowCount\(), 3, "store_n $xr13, $a5, $t7" + + add.d $s3, $t7, $t6 + EmitIfCountGE \RowCount\(), 4, "store_n $xr15, $a5, $s3" + sub.d $a5, $zero, $a5 +.ifb \Fallthrough\() + b .LExitKernel +.endif + + .endm + +/*++ + +Macro Description: + + This macro generates the inner kernel to compute matrix multiplication. + +Arguments: + + FunctionName - Supplies the name for the generated function. + +--*/ + + .macro FgemmKernelLasxFunction FunctionName + +/*++ + +Routine Description: + + This routine is an inner kernel to compute matrix multiplication for a + set of rows. + +Arguments: + + A a0 - Supplies the address of matrix A. + + B a1 - Supplies the address of matrix B. The matrix data has been packed + using MlasSgemmCopyPackB or MlasSgemmTransposePackB. + + C a2 - Supplies the address of matrix C. + + CountK a3 - Supplies the number of columns from matrix A and the number + of rows from matrix B to iterate over. + + CountM a4 - Supplies the maximum number of rows that can be processed for + matrix A and matrix C. The actual number of rows handled for this + invocation depends on the kernel implementation. 
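The masked path above handles a final column count that is not a multiple of the vector width: a mask loaded from `MlasMaskMoveTableLasx` guards the read of the existing C values, and `store_n` writes the surviving lanes one element at a time with `xvstelm.w`. For the float case this is equivalent to the following scalar tail (illustrative only, not part of the patch):

```cpp
// Illustrative sketch, not part of the patch: scalar model of the masked
// tail store for one row of C.
#include <cstddef>

void StoreTailColumns(const float* acc,   // 8 accumulator lanes (one xr register)
                      float* C,
                      size_t Remaining,   // 1..7 columns left in this row
                      bool ZeroMode)
{
    for (size_t i = 0; i < Remaining; i++) {
        // Lanes at or beyond Remaining are neither read from nor written to C,
        // which is what the load mask and the element-wise stores guarantee.
        C[i] = ZeroMode ? acc[i] : C[i] + acc[i];
    }
}
```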
+ + CountN a5 - Supplies the number of columns from matrix B and matrix C to + iterate over. + + lda a6 - Supplies the first dimension of matrix A. + + ldc a7 - Supplies the first dimension of matrix C. + + Alpha f0 - Supplies the scalar alpha multiplier (see GEMM definition). + + ZeroMode (sp + 0)- Supplies true if the output matrix must be zero initialized, + else false if the output matrix is accumulated into. + +Return Value: + + Returns the number of rows handled. + +--*/ + + FUNCTION_ENTRY \FunctionName\() + + addi.d $sp, $sp, -64 + st.d $ra, $sp, 56 + st.d $s0, $sp, 0*8 + st.d $s1, $sp, 1*8 + fst.s $f0, $sp, 2*8 + fst.d $f16, $sp,3*8 + st.d $s2, $sp, 4*8 + st.d $s3, $sp, 5*8 + + move $t1, $a0 + slli.d $t0, $a6, 2 # convert lda to bytes + slli.d $t6, $a7, 2 # convert ldc to bytes + ld.d $t5, $sp, 64 # get zeromode + fst.s $f0, $sp, 2*8 + xvldrepl.w $xr2, $sp, 0x10 + +// +// Process 4 rows of the matrices. +// + + ori $s0, $zero, 4 + bltu $a4, $s0, .LProcessCountMLessThan4 + li.d $a4, 4 # return 4 rows handled + ProcessCountM 4, Fallthrough + +// +// Restore non-volatile registers and return. +// + +.LExitKernel: + bstrpick.d $a0, $a4, 31, 0 + ld.d $s0, $sp, 0 + ld.d $s1, $sp, 8 + fld.d $f16, $sp,3*8 + ld.d $s2, $sp, 4*8 + ld.d $s3, $sp, 5*8 + ld.d $ra, $sp, 7*8 + addi.d $sp, $sp, 64 + jr $ra + +// +// Process 2 rows of the matrices. +// + +.LProcessCountMLessThan4: + ori $s0, $r0, 2 + bltu $a4, $s0, .LProcessCountMLessThan2 + li.d $a4, 2 # return 2 rows handled + ProcessCountM 2 + +// +// Process 1 row of the matrices. +// + +.LProcessCountMLessThan2: + ProcessCountM 1 + + .endm diff --git a/onnxruntime/core/mlas/lib/loongarch64/FgemmKernelLsxCommon.h b/onnxruntime/core/mlas/lib/loongarch64/FgemmKernelLsxCommon.h new file mode 100644 index 0000000000000..0333af792ba70 --- /dev/null +++ b/onnxruntime/core/mlas/lib/loongarch64/FgemmKernelLsxCommon.h @@ -0,0 +1,170 @@ +/*++ + +Copyright (C) 2023 Loongson Technology Corporation Limited. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + FgemmKernelLsxCommon.h + +Abstract: + + This module implements the kernels for the floating point matrix/matrix + multiply operation (SGEMM and DGEMM). + + This implementation uses Lsx instructions. + +--*/ + +#include "FgemmKernelCommon.h" +/*++ + +Macro Description: + + This stores the block accumulators to the output matrix with an optional + accumulation of the existing contents of the output matrix. + +Arguments: + + RowCount - Supplies the number of rows to process. + + VectorCount - Supplies the number of vector columns to process. + +Implicit Arguments: + + t5 - Supplies the length in bytes of a row from matrix C. + + a2 - Supplies the address of matrix C. + + s3 - Stores the ZeroMode argument from the stack frame. + + vr8-vr15 - Supplies the block accumulators. + +--*/ + + .macro AccumulateAndStoreBlock RowCount, VectorCount + + and $s0, $t5,$t5 # ZeroMode? 
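As the routine description above notes, the kernel returns the number of rows it actually handled (4, 2, or 1 in this implementation). A hedged sketch of how a caller such as the dgemm/sgemm driver loop consumes that value, with `KernelFn` standing in for the dispatched kernel pointer (illustrative only, not part of the patch):

```cpp
// Illustrative sketch, not part of the patch: drive the kernel over all rows,
// advancing by however many rows it reports having handled.
#include <cstddef>

using KernelFn = size_t (*)(const double* A, const double* B, double* C,
                            size_t CountK, size_t CountM, size_t CountN,
                            size_t lda, size_t ldc, double alpha, bool ZeroMode);

void GemmRows(KernelFn Kernel, const double* A, const double* B, double* C,
              size_t CountM, size_t CountN, size_t CountK,
              size_t lda, size_t ldc, double alpha, bool ZeroMode)
{
    while (CountM > 0) {
        size_t RowsHandled = Kernel(A, B, C, CountK, CountM, CountN,
                                    lda, ldc, alpha, ZeroMode);
        // Step past the rows this call covered.
        A += RowsHandled * lda;
        C += RowsHandled * ldc;
        CountM -= RowsHandled;
    }
}
```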
+ bnez $s0 , .LSkipAccumulateOutput\@ + EmitIfCount2GE \RowCount\(), 1, \VectorCount\(), 1, "vld $vr0, $a2, 0" + EmitIfCount2GE \RowCount\(), 1, \VectorCount\(), 2, "vld $vr1, $a2, 16" + EmitIfCount2GE \RowCount\(), 1, \VectorCount\(), 3, "vld $vr2, $a2, 32" + EmitIfCount2GE \RowCount\(), 1, \VectorCount\(), 4, "vld $vr3, $a2, 48" + EmitIfCount2GE \RowCount\(), 2, \VectorCount\(), 1, "vldx $vr4, $a2, $t6" + EmitIfCount2GE \RowCount\(), 2, \VectorCount\(), 2, "addi.d $s0, $t6, 16" + EmitIfCount2GE \RowCount\(), 2, \VectorCount\(), 2, "vldx $vr5, $a2, $s0" + EmitIfCount2GE \RowCount\(), 2, \VectorCount\(), 3, "addi.d $s0, $t6, 32" + EmitIfCount2GE \RowCount\(), 2, \VectorCount\(), 3, "vldx $vr6, $a2, $s0" + EmitIfCount2GE \RowCount\(), 2, \VectorCount\(), 4, "addi.d $s0, $t6, 48" + EmitIfCount2GE \RowCount\(), 2, \VectorCount\(), 4, "vldx $vr7, $a2, $s0" + EmitIfCount2GE \RowCount\(), 1, \VectorCount\(), 1, "vfadd $vr8, $vr8, $vr0" + EmitIfCount2GE \RowCount\(), 1, \VectorCount\(), 2, "vfadd $vr9, $vr9, $vr1" + EmitIfCount2GE \RowCount\(), 1, \VectorCount\(), 3, "vfadd $vr10,$vr10,$vr2" + EmitIfCount2GE \RowCount\(), 1, \VectorCount\(), 4, "vfadd $vr11,$vr11,$vr3" + EmitIfCount2GE \RowCount\(), 2, \VectorCount\(), 1, "vfadd $vr12,$vr12,$vr4" + EmitIfCount2GE \RowCount\(), 2, \VectorCount\(), 2, "vfadd $vr13,$vr13,$vr5" + EmitIfCount2GE \RowCount\(), 2, \VectorCount\(), 3, "vfadd $vr14,$vr14,$vr6" + EmitIfCount2GE \RowCount\(), 2, \VectorCount\(), 4, "vfadd $vr15,$vr15,$vr7" + +.LSkipAccumulateOutput\@: + EmitIfCount2GE \RowCount\(), 1, \VectorCount\(), 1, "vst $vr8, $a2, 0" + EmitIfCount2GE \RowCount\(), 1, \VectorCount\(), 2, "vst $vr9, $a2, 16" + EmitIfCount2GE \RowCount\(), 1, \VectorCount\(), 3, "vst $vr10, $a2, 32" + EmitIfCount2GE \RowCount\(), 1, \VectorCount\(), 4, "vst $vr11, $a2, 48" + EmitIfCount2GE \RowCount\(), 2, \VectorCount\(), 1, "vstx $vr12, $a2, $t6" + EmitIfCount2GE \RowCount\(), 2, \VectorCount\(), 2, "addi.d $s0, $t6, 16" + EmitIfCount2GE \RowCount\(), 2, \VectorCount\(), 2, "vstx $vr13, $a2, $s0" + EmitIfCount2GE \RowCount\(), 2, \VectorCount\(), 3, "addi.d $s0, $t6, 32" + EmitIfCount2GE \RowCount\(), 2, \VectorCount\(), 3, "vstx $vr14, $a2, $s0" + EmitIfCount2GE \RowCount\(), 2, \VectorCount\(), 4, "addi.d $s0, $t6, 48" + EmitIfCount2GE \RowCount\(), 2, \VectorCount\(), 4, "vstx $vr15, $a2, $s0" + + .endm +/*++ + +Macro Description: + + This macro generates the inner kernel to compute matrix multiplication. + +Arguments: + + FunctionName - Supplies the name for the generated function. + +--*/ + + .macro FgemmKernelLsxFunction FunctionName + +/*++ + +Routine Description: + + This routine is an inner kernel to compute matrix multiplication for a + set of rows. + +Arguments: + + A (a0) - Supplies the address of matrix A. + + B (a1) - Supplies the address of matrix B. The matrix data has been packed + using MlasSgemmCopyPackB or MlasSgemmTransposePackB. + + C (a2) - Supplies the address of matrix C. + + CountK (a3) - Supplies the number of columns from matrix A and the number + of rows from matrix B to iterate over. + + CountM (a4) - Supplies the maximum number of rows that can be processed for + matrix A and matrix C. The actual number of rows handled for this + invocation depends on the kernel implementation. + + CountN (a5) - Supplies the number of columns from matrix B and matrix C to + iterate over. + + lda (a6) Supplies the first dimension of matrix A. + + ldc (a7) Supplies the first dimension of matrix C. 
+ + Alpha (f0) - Supplies the scalar alpha multiplier (see GEMM definition). + + ZeroMode (sp 0) - Supplies true if the output matrix must be zero initialized, + else false if the output matrix is accumulated into. + +Return Value: + + Returns the number of rows handled. + +--*/ + +FUNCTION_ENTRY \FunctionName\() + addi.d $sp, $sp, -64 + st.d $t5, $sp, 0 + st.d $s0, $sp, 1*8 + st.d $s1, $sp, 2*8 + st.d $s2, $sp, 3*8 + st.d $s3, $sp, 4*8 + move $t1, $a0 + slli.d $t0, $a6, 2 //convert lda to bytes + slli.d $t6, $a7, 2 //convert ldc to bytes + ld.d $t5, $sp, 64 + fmov.s $f24, $f0 //f0 destroyed by lsx + + li.d $s0, 2 + blt $a4, $s0, .LProcessCountM1 + + li.d $a4, 2 + ProcessCountM 2, Fallthrough + +.LExitKernel: + ld.d $t5, $sp, 0 + ld.d $s0, $sp, 1*8 + ld.d $s1, $sp, 2*8 + ld.d $s2, $sp, 3*8 + ld.d $s3, $sp, 4*8 + addi.d $sp, $sp, 64 + move $a0, $a4 + jr $ra + +.LProcessCountM1: + ProcessCountM 1 + .endm diff --git a/onnxruntime/core/mlas/lib/loongarch64/SconvKernelLasx.S b/onnxruntime/core/mlas/lib/loongarch64/SconvKernelLasx.S new file mode 100644 index 0000000000000..e03503521912a --- /dev/null +++ b/onnxruntime/core/mlas/lib/loongarch64/SconvKernelLasx.S @@ -0,0 +1,412 @@ +/*++ + +Copyright (C) 2023 Loongson Technology Corporation Limited. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + SconvKernelLasx.S + +Abstract: + + This module implements the kernels for the single precision convolution + operation. + + This implementation uses Lasx instructions. + +--*/ + +#include "asmmacro.h" +#include "SconvKernelLasxCommon.h" + + .text + +/*++ + +Macro Description: + + This macro multiplies and accumulates for FilterCount by OutputCount block + of the output buffer. + +Arguments: + + KernelType - Supplies the type of kernel to be generated. + + FilterCount - Supplies the number of rows from the filter to process. + + OutputCount - Supplies the number of output blocks to produce. + + VectorOffset - Supplies the byte offset from the filter buffer to fetch + elements. + + BroadcastOffset - Supplies the byte offset from the input buffer to fetch + elements. + +Implicit Arguments: + + a3 - Supplies the address of the input buffer. + + a2 - Supplies the address of the filter buffer. + + a1 - Supplies the FilterStride parameter (see function description). + + t7 - Supplies the address of the filter buffer plus 2 * FilterStride. + + a5 - Supplies the StrideWidth parameter (see function description). + + xr0-xr7 - Supplies the block accumulators. 
+ +--*/ + + .macro ComputeBlock KernelType, FilterCount, OutputCount, VectorOffset, BroadcastOffset + +.ifeqs "\KernelType\()","Depthwise" + xvld $xr12, $a2, 0 + EmitIfCountGE \OutputCount\(), 1, "xvld $xr8, $a3, 0" + EmitIfCountGE \OutputCount\(), 1, "xvfmadd.s $xr0, $xr8, $xr12, $xr0" + EmitIfCountGE \OutputCount\(), 2, "xvldx $xr9, $a3, $a5" + EmitIfCountGE \OutputCount\(), 2, "xvfmadd.s $xr4, $xr9, $xr12, $xr4" + +.else + EmitIfCountGE \OutputCount\(), 1, "xvldrepl.w $xr13, $a3, \BroadcastOffset\()" + EmitIfCountGE \OutputCount\(), 2, "add.d $s0, $a3, $a5" + EmitIfCountGE \OutputCount\(), 2, "xvldrepl.w $xr14, $s0, \BroadcastOffset\()" +.if \OutputCount\() == 1 + EmitIfCountGE \FilterCount\(), 1, "xvld $xr8, $a2, \VectorOffset\()" + EmitIfCountGE \FilterCount\(), 1, "xvfmadd.s $xr0, $xr8, $xr13, $xr0" + EmitIfCountGE \FilterCount\(), 2, "add.d $s0, $a2, $a1" + EmitIfCountGE \FilterCount\(), 2, "xvld $xr9, $s0, \VectorOffset\()" + EmitIfCountGE \FilterCount\(), 2, "xvfmadd.s $xr1, $xr9, $xr13, $xr1" + EmitIfCountGE \FilterCount\(), 3, "xvld $xr10, $t7, \VectorOffset\()" + EmitIfCountGE \FilterCount\(), 3, "xvfmadd.s $xr2, $xr10, $xr13, $xr2" + EmitIfCountGE \FilterCount\(), 4, "add.d $s0, $t7, $a1" + EmitIfCountGE \FilterCount\(), 4, "xvld $xr11, $s0, \VectorOffset\()" + EmitIfCountGE \FilterCount\(), 4, "xvfmadd.s $xr3, $xr11, $xr13, $xr3" +.else + EmitIfCountGE \FilterCount\(), 1, "xvld $xr12, $a2, \VectorOffset\()" + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 1, "xvfmadd.s $xr0, $xr12, $xr13, $xr0" + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 2, "xvfmadd.s $xr4, $xr12, $xr14, $xr4" + EmitIfCountGE \FilterCount\(), 2, "add.d $s0, $a2, $a1" + EmitIfCountGE \FilterCount\(), 2, "xvld $xr12, $s0, \VectorOffset\()" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 1, "xvfmadd.s $xr1, $xr13, $xr12, $xr1" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 2, "xvfmadd.s $xr5, $xr14, $xr12, $xr5" + EmitIfCountGE \FilterCount\(), 3, "xvld $xr12, $t7, \VectorOffset\()" + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 1, "xvfmadd.s $xr2, $xr13, $xr12, $xr2" + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 2, "xvfmadd.s $xr6, $xr14, $xr12, $xr6" + EmitIfCountGE \FilterCount\(), 4, "add.d $s0, $t7, $a1" + EmitIfCountGE \FilterCount\(), 4, "xvld $xr12, $s0, \VectorOffset\()" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 1, "xvfmadd.s $xr3, $xr13, $xr12, $xr3" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 2, "xvfmadd.s $xr7, $xr14, $xr12, $xr7" +.endif +.endif + + .endm + +/*++ + +Macro Description: + + This macro generates code to compute the convolution for a specified number + of filter rows. + +Arguments: + + KernelFrame - Supplies the symbol name to access the convolution kernel + stack. + + KernelType - Supplies the type of kernel to be generated. + + FilterCount - Supplies the number of rows from the filter to process. + +Implicit Arguments: + + a0 - Supplies the address of the input buffer. + + a1 - Supplies the FilterStride parameter (see function description) when + KernelType!=Depthwise. Supplies the address of the filter buffer when + KernelType=Depthwise. + + t7 - Supplies the DilationWidth parameter (see function description). + + a4 - Supplies the address of the output buffer. + + a5 - Supplies the StrideWidth parameter (see function description). + + t5 - Supplies the InputStride parameter (see function description). 
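In the `Depthwise` variant of `ComputeBlock` above, each of the 8 channels in an nchw8c block is multiplied by its own filter tap and accumulated across the kernel window. A scalar reference of that computation for one output pixel (illustrative only, not part of the patch; the stride and dilation parameters below are simplified stand-ins for the kernel's StrideWidth/DilationWidth/InputStride arguments):

```cpp
// Illustrative sketch, not part of the patch: scalar depthwise convolution
// for a single nchw8c output block.
#include <cstddef>

void DepthwiseConvNchw8cReference(const float* Input,     // top-left tap of the window
                                  const float* Filter,    // KernelH*KernelW taps, 8 floats each
                                  float* Output,          // 8 channels of one output pixel
                                  size_t KernelHeight, size_t KernelWidth,
                                  size_t InputRowStride,  // floats between kernel rows
                                  size_t DilationWidth)   // floats between kernel columns
{
    float acc[8] = {};

    for (size_t kh = 0; kh < KernelHeight; kh++) {
        for (size_t kw = 0; kw < KernelWidth; kw++) {
            const float* in = Input + kh * InputRowStride + kw * DilationWidth;
            const float* flt = Filter + (kh * KernelWidth + kw) * 8;
            for (size_t c = 0; c < 8; c++) {
                acc[c] += in[c] * flt[c];   // one xvfmadd.s across a 256-bit block
            }
        }
    }

    for (size_t c = 0; c < 8; c++) {
        Output[c] = acc[c];
    }
}
```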
+ +--*/ + + .macro ProcessFilterCountN KernelFrame, KernelType, FilterCount + +// +// Process the output blocks that include left padding. +// + + ld.d $t0, $sp, OutputCountLeftPad_arg + beqz $t0, .L\KernelType\().\FilterCount\().ProcessOutputCount + bl MlasConv\KernelType\()FloatSingleLasxFilter\FilterCount\() + +// +// Process the output blocks that do not include any padding. +// + +.L\KernelType\().\FilterCount\().ProcessOutputCount: + ld.d $t0, $sp, OutputCount_arg + li.d $s0, 2 + bltu $t0, $s0, .L\KernelType\().\FilterCount\().ProcessRemainingOutputCount + +.L\KernelType\().\FilterCount\().ProcessNextOutputCountBy2: + ProcessOutputCountN Lasx, \KernelFrame\(), \KernelType\(), 8, \FilterCount\(), 2 + slli.d $s0, $a5, 1 # advance input by 2 elements + add.d $a0, $a0, $s0 + addi.d $t0, $t0, -2 + li.d $s0, 2 + bgeu $t0, $s0, .L\KernelType\().\FilterCount\().ProcessNextOutputCountBy2 + +.L\KernelType\().\FilterCount\().ProcessRemainingOutputCount: + +// +// Process the output blocks that include right padding plus any remaining output +// blocks from above. +// + +.L\KernelType\().\FilterCount\().ProcessOutputCountRightPadAndRemaining: + ld.d $s0, $sp, OutputCountRightPad_arg + add.d $t0, $t0, $s0 + beqz $t0, .L\KernelType\().ExitKernel + bl MlasConv\KernelType\()FloatSingleLasxFilter\FilterCount\() + + .endm + +/*++ + +Macro Description: + + This macro generates code to compute the convolution for a specified number + of filter rows for a pointwise convolution. + +Arguments: + + FilterCount - Supplies the number of rows from the filter to process. + +Implicit Arguments: + + a0 - Supplies the address of the input buffer. + + a1 - Supplies the FilterStride parameter (see function description). + + t8 - Supplies the InputStride parameter (see function description). + + a4 - Supplies the address of the output buffer. + + a5 - Supplies the StrideWidth parameter (see function description). + + t0 - Supplies the OutputCount parameter (see function description). + + t2 - Supplies the address of the filter buffer. + +--*/ + + .macro ProcessPointwiseFilterCountN FilterCount + li.d $s0, 2 + bltu $t0, $s0, .LPointwise.\FilterCount\().ProcessRemainingOutputCount + +.LPointwise.\FilterCount\().ProcessNextOutputCountBy2: + ProcessPointwiseOutputCountN Lasx, 8, \FilterCount\(), 2 + slli.d $s0, $a5, 1 # advance input by 2 elements + add.d $a0, $a0, $s0 + addi.d $t0, $t0, -2 + li.d $s0, 2 + bgeu $t0, $s0, .LPointwise.\FilterCount\().ProcessNextOutputCountBy2 + +.LPointwise.\FilterCount\().ProcessRemainingOutputCount: + beqz $t0, .LPointwise.ExitKernel + ProcessPointwiseOutputCountN Lasx, 8, \FilterCount\(), 1 + + .endm + +// +// Generate the convolution kernels. +// + + SconvKernelFunction Nchw, 8, Lasx + SconvKernelFunction Nchwc, 8, Lasx, BiasFilter + SconvKernelDepthwiseFunction 8, Lasx + SconvKernelPointwiseFunction Lasx, BiasFilter + +/*++ + +Macro Description: + + This macro generates code to process an output block after the inner + convolution kernel has executed and then stores the output block to the + output buffer. + +Arguments: + + FilterCount - Supplies the number of rows from the filter to process. + + OutputCount - Supplies the number of output blocks to produce. 
+ +--*/ + + .macro PostProcessBlock FilterCount, OutputCount + + .globl MlasConvPostProcessFloatLasxFilter\FilterCount\()Output\OutputCount\() + .hidden MlasConvPostProcessFloatLasxFilter\FilterCount\()Output\OutputCount\() +MlasConvPostProcessFloatLasxFilter\FilterCount\()Output\OutputCount\(): + + .globl MlasConvPostProcessFloatFma3Filter\FilterCount\()Output\OutputCount\() + .hidden MlasConvPostProcessFloatFma3Filter\FilterCount\()Output\OutputCount\() +MlasConvPostProcessFloatFma3Filter\FilterCount\()Output\OutputCount\(): + +.if \FilterCount\() > 2 + slli.d $s0, $t6, 1 # compute output plus 2 rows + add.d $t7, $a4, $s0 +.endif + +// +// Test if the existing contents of the output buffer should be accumulated +// with the output block. +// + + andi $s0, $a2, MLAS_CONV_KERNEL_FLAG_ACCUMULATE_OUTPUT + beqz $s0, .LPostProcessBlock.\FilterCount\().\OutputCount\().SkipAccumulateOutput + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 1, "xvld $xr16, $a4, 0" + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 1, "xvfadd.s $xr0, $xr0, $xr16" + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 2, "xvld $xr16, $a4, 32" + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 2, "xvfadd.s $xr4, $xr4, $xr16" + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 3, "xvld $xr16, $a4, 0x40" + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 3, "xvfadd.s $xr8, $xr8, $xr16" + + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 1, "xvldx $xr16, $a4, $t6" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 1, "xvfadd.s $xr1, $xr1, $xr16" + + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 2, "add.d $s0, $a4, $t6" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 2, "xvld $xr16, $s0, 0x20" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 2, "xvfadd.s $xr5, $xr5, $xr16" + + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 3, "add.d $s0, $a4, $t6" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 3, "xvld $xr16, $s0, 0x40" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 3, "xvfadd.s $xr9, $xr9, $xr16" + + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 1, "xvld $xr16,$t7, 0" + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 1, "xvfadd.s $xr2, $xr2, $xr16" + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 2, "xvld $xr16,$t7, 0x20" + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 2, "xvfadd.s $xr6, $xr6, $xr16" + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 3, "xvld $xr16,$t7, 0x40" + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 3, "xvfadd.s $xr10, $xr10, $xr16" + + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 1, "xvldx $xr16,$t7, $t6" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 1, "xvfadd.s $xr3, $xr3, $xr16" + + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 2, "add.d $s0, $t7, $t6" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 2, "xvld $xr16,$s0, 0x20" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 2, "xvfadd.s $xr7, $xr7, $xr16" + + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 3, "add.d $s0, $t7, $t6" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 3, "xvld $xr16,$s0, 0x40" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 3, "xvfadd.s $xr11, $xr11, $xr16" + + +.LPostProcessBlock.\FilterCount\().\OutputCount\().SkipAccumulateOutput: + +// +// Test if the bias buffer should be accumulated with the output block. 
+// + + andi $s0, $a2, MLAS_CONV_KERNEL_FLAG_BIAS_ADDITION + beqz $s0, .LPostProcessBlock.\FilterCount\().\OutputCount\().SkipBiasAddition +.if \OutputCount\() == 1 + EmitIfCountGE \FilterCount\(), 1, "xvld $xr16, $a3, 0" + EmitIfCountGE \FilterCount\(), 1, "xvfadd.s $xr0, $xr0, $xr16" + EmitIfCountGE \FilterCount\(), 2, "xvld $xr16, $a3, 0x20" + EmitIfCountGE \FilterCount\(), 2, "xvfadd.s $xr1, $xr1, $xr16" + EmitIfCountGE \FilterCount\(), 3, "xvld $xr16, $a3, 0x40" + EmitIfCountGE \FilterCount\(), 3, "xvfadd.s $xr2, $xr2, $xr16" + EmitIfCountGE \FilterCount\(), 4, "xvld $xr16, $a3, 0x60" + EmitIfCountGE \FilterCount\(), 4, "xvfadd.s $xr3, $xr3, $xr16" +.else + EmitIfCountGE \FilterCount\(), 1, "xvld $xr12, $a3, 0" + EmitIfCountGE \FilterCount\(), 2, "xvld $xr13, $a3, 0x20" + EmitIfCountGE \FilterCount\(), 3, "xvld $xr14, $a3, 0x40" + EmitIfCountGE \FilterCount\(), 4, "xvld $xr15, $a3, 0x60" + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 1, "xvfadd.s $xr0, $xr0, $xr12" + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 2, "xvfadd.s $xr4, $xr4, $xr12" + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 3, "xvfadd.s $xr8, $xr8, $xr12" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 1, "xvfadd.s $xr1, $xr1, $xr13" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 2, "xvfadd.s $xr5, $xr5, $xr13" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 3, "xvfadd.s $xr9, $xr9, $xr13" + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 1, "xvfadd.s $xr2, $xr2, $xr14" + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 2, "xvfadd.s $xr6, $xr6, $xr14" + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 3, "xvfadd.s $xr10, $xr10, $xr14" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 1, "xvfadd.s $xr3, $xr3, $xr15" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 2, "xvfadd.s $xr7, $xr7, $xr15" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 3, "xvfadd.s $xr11, $xr11, $xr15" + +.endif + +.LPostProcessBlock.\FilterCount\().\OutputCount\().SkipBiasAddition: + +// +// Test for fused ReLU activation. +// + + andi $s0, $a2, MLAS_CONV_KERNEL_FLAG_RELU_ACTIVATION + beqz $s0, .LPostProcessBlock.\FilterCount\().\OutputCount\().SkipReluActivation + xvxor.v $xr15, $xr15, $xr15 + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 1, "xvfmax.s $xr0, $xr15, $xr0" + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 2, "xvfmax.s $xr4, $xr15, $xr4" + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 3, "xvfmax.s $xr8, $xr15, $xr8" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 1, "xvfmax.s $xr1, $xr15, $xr1" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 2, "xvfmax.s $xr5, $xr15, $xr5" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 3, "xvfmax.s $xr9, $xr15, $xr9" + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 1, "xvfmax.s $xr2, $xr15, $xr2" + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 2, "xvfmax.s $xr6, $xr15, $xr6" + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 3, "xvfmax.s $xr10, $xr15, $xr10" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 1, "xvfmax.s $xr3, $xr15, $xr3" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 2, "xvfmax.s $xr7, $xr15, $xr7" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 3, "xvfmax.s $xr11, $xr15, $xr11" + +.LPostProcessBlock.\FilterCount\().\OutputCount\().SkipReluActivation: + +// +// Store the output block in the output buffer. 
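The post-processing sequence above is driven by the `MLAS_CONV_KERNEL_FLAG_*` bits: optionally accumulate the existing output, optionally add the per-channel bias, optionally apply a fused ReLU, then store. The same sequence in scalar C++ for one 8-channel block (illustrative only, not part of the patch):

```cpp
// Illustrative sketch, not part of the patch: flag-driven post-processing of
// one 8-channel output block, in the same order as the assembly above.
#include <algorithm>
#include <cstddef>

constexpr unsigned AccumulateOutput = 0x1;   // MLAS_CONV_KERNEL_FLAG_ACCUMULATE_OUTPUT
constexpr unsigned BiasAddition     = 0x2;   // MLAS_CONV_KERNEL_FLAG_BIAS_ADDITION
constexpr unsigned ReluActivation   = 0x4;   // MLAS_CONV_KERNEL_FLAG_RELU_ACTIVATION

void PostProcessBlockReference(float* Output, const float* Bias,
                               const float* Acc, unsigned Flags)
{
    for (size_t c = 0; c < 8; c++) {
        float value = Acc[c];
        if (Flags & AccumulateOutput) {
            value += Output[c];              // add the existing output contents
        }
        if (Flags & BiasAddition) {
            value += Bias[c];                // per-output-channel bias
        }
        if (Flags & ReluActivation) {
            value = std::max(value, 0.0f);   // fused ReLU (xvfmax.s against zero)
        }
        Output[c] = value;
    }
}
```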
+// + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 1, "xvst $xr0, $a4, 0" + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 2, "xvst $xr4, $a4, 0x20" + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 3, "xvst $xr8, $a4, 0x40" + + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 1, "xvstx $xr1, $a4, $t6" + + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 2, "add.d $s0, $a4, $t6" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 2, "xvst $xr5, $s0, 0x20" + + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 3, "add.d $s0, $a4, $t6" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 3, "xvst $xr9, $s0, 0x40" + + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 1, "xvst $xr2, $t7, 0" + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 2, "xvst $xr6, $t7, 0x20" + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 3, "xvst $xr10, $t7, 0x40" + + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 1, "xvstx $xr3, $t7, $t6" + + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 2, "add.d $s0, $t7, $t6" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 2, "xvst $xr7, $s0, 0x20" + + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 3, "add.d $s0, $t7, $t6" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 3, "xvst $xr11, $s0, 0x40" + + add_immed $a4,\OutputCount\()*8*4 # advance output by N nchw8c blocks + jr $ra + + .endm + + .irp FilterCount, 1, 2, 3, 4 + .irp OutputCount, 1, 2, 3 + PostProcessBlock \FilterCount\(), \OutputCount\() + .endr + .endr + + .end diff --git a/onnxruntime/core/mlas/lib/loongarch64/SconvKernelLasxCommon.h b/onnxruntime/core/mlas/lib/loongarch64/SconvKernelLasxCommon.h new file mode 100644 index 0000000000000..bd2db816ed9ab --- /dev/null +++ b/onnxruntime/core/mlas/lib/loongarch64/SconvKernelLasxCommon.h @@ -0,0 +1,868 @@ +/*++ + +Copyright (C) 2023 Loongson Technology Corporation Limited. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + SconvKernelLasxCommon.h + +Abstract: + + This module contains common kernel macros and structures for the single + precision convolution operation for the Lasx kernels. + +--*/ + + +#define SP_SIZE 32*8 + +#define MLAS_CONV_KERNEL_FLAG_ACCUMULATE_OUTPUT 0x00000001 +#define MLAS_CONV_KERNEL_FLAG_BIAS_ADDITION 0x00000002 +#define MLAS_CONV_KERNEL_FLAG_RELU_ACTIVATION 0x00000004 +#define MLAS_CONV_KERNEL_FLAG_OTHER_ACTIVATION 0x00000008 + +#define OutputStride_arg 6*8 +#define KernelHeight_arg 7*8 +#define KernelWidth_arg 8*8 +#define InputBase_arg 9*8 +#define InputWidth_arg 10*8 +#define DilatedInputWidth_arg 11*8 +#define OutputCountLeftPad_arg 12*8 +#define OutputCount_arg 13*8 +#define OutputCountRightPad_arg 14*8 +#define Bias_arg 15*8 +#define Flags_arg 16*8 +#define InputChannels_arg 17*8 +#define Filter_save_offset 18*8 + +/*++ + +Macro Description: + + This macro generates code to compute the convolution for a vector of input + blocks and a vector of filter blocks to produce a matrix of output blocks. + + OutputCount=1 generates special case code to handle padding blocks. All + other output counts assume no padding. + +Arguments: + + Isa - Supplies the instruction set architecture string for function tags. + + KernelFrame - Supplies the symbol name to access the convolution kernel + stack. + + KernelType - Supplies the type of kernel to be generated. + + BlockSize - Supplies the number of elements per block. + + FilterCount - Supplies the number of rows from the filter to process. + + OutputCount - Supplies the number of output blocks to produce. 
+ +Implicit Arguments: + + a0 - Supplies the address of the input buffer. + + a1 - Supplies the FilterStride parameter (see function description) when + KernelType!=Depthwise. Supplies the address of the filter buffer when + KernelType=Depthwise. + + s8 - Supplies the DilationWidth parameter (see function description). + + a4 - Supplies the address of the output buffer. + + a5 - Supplies the StrideWidth parameter (see function description). + + t5 - Supplies the InputStride parameter (see function description). +--*/ + .macro ProcessOutputCountN Isa, KernelFrame, KernelType, BlockSize, FilterCount, OutputCount + + move $a3, $a0 +.ifeqs "\KernelType\()","Depthwise" + move $a2, $a1 +.else + ld.d $a2, $sp, Filter_save_offset +.endif + ld.d $t1, $sp, KernelHeight_arg + ld.d $t2, $sp, KernelWidth_arg +.if \OutputCount\() == 1 + ld.d $t3, $sp, InputBase_arg + ld.d $t4, $sp, InputWidth_arg + sub.d $t3, $zero, $t3 +.endif + ClearBlock \FilterCount\(), \OutputCount\() + beqz $t1, .L\KernelType\().\FilterCount\().\OutputCount\().HandlePostProcessing + +.L\KernelType\().\FilterCount\().\OutputCount\().ProcessNextRow: + move $t6, $t2 # reload kernel width remaining + +.L\KernelType\().\FilterCount\().\OutputCount\().ProcessNextColumn: +.if \OutputCount\() == 1 + add.d $t7, $a3, $t3 # compute (Input - InputBase) + # (Input - InputBase) >= InputWidth? + bgeu $t7, $t4, .L\KernelType\().\FilterCount\().\OutputCount\().SkipOverPadding +.endif +.if \OutputCount\() > 3 + slli.d $s0, $a5, 1 + add.d $s0, $s0, $a5 + add.d $t4, $a3, $s0 # compute input plus 3 blocks +.endif +.if \FilterCount\() > 2 + slli.d $s0, $a1, 1 # compute filter plus 2 rows + add.d $t7, $a2, $s0 +.endif +.ifeqs "\KernelType\()","Nchwc" +.if \BlockSize\() == 16 + .irp Index, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + ComputeBlock \KernelType\(), \FilterCount\(), \OutputCount\(), \Index\()*16*4, \Index\()*4 + .endr +.else + .irp Index, 0, 1, 2, 3, 4, 5, 6, 7 + ComputeBlock \KernelType\(), \FilterCount\(), \OutputCount\(), (\Index\()-4)*8*4, \Index\()*4 + .endr +.endif +.else + ComputeBlock \KernelType\(), \FilterCount\(), \OutputCount\(), 0, 0 +.endif + +.L\KernelType\().\FilterCount\().\OutputCount\().SkipOverPadding: + # advance input by dilation width + add.d $a3, $a3, $t8 +.ifeqs "\KernelType\()","Nchwc" + # advance filter by 8i8o/16i16o block + addi.d $a2, $a2, \BlockSize\()*\BlockSize\()*4 +.else + addi.d $a2, $a2, \BlockSize\()*4 # advance filter by 8o/16o block +.endif + addi.d $t6, $t6, -1 + bnez $t6, .L\KernelType\().\FilterCount\().\OutputCount\().ProcessNextColumn + add.d $a3, $a3, $t5 # advance input to next row +.if \OutputCount\() == 1 + ld.d $s0, $sp, DilatedInputWidth_arg + # advance input base to next row + sub.d $t3, $t3, $s0 +.endif + addi.d $t1, $t1, -1 # decrement rows remaining + bnez $t1, .L\KernelType\().\FilterCount\().\OutputCount\().ProcessNextRow + +// +// Handle post processing of the output block. +// + +.L\KernelType\().\FilterCount\().\OutputCount\().HandlePostProcessing: + ld.w $a2, $sp, Flags_arg +.if \FilterCount\() > 1 + ld.d $t6, $sp, OutputStride_arg +.endif + ld.d $a3, $sp, Bias_arg + bl MlasConvPostProcessFloat\Isa\()Filter\FilterCount\()Output\OutputCount\() + + .endm + +/*++ + +Macro Description: + + This macro generates code for the inner convolution kernel. + +Arguments: + + KernelType - Supplies the type of kernel to be generated. + + BlockSize - Supplies the number of elements per block. + + Isa - Supplies the instruction set architecture string for function tags. 
+ + BiasFilter - Supplies a non-blank value if the address of the filter buffer + should be biased to point to the middle of a OIhw8i8o block in order to + reduce the code size from relative byte offsets. + +--*/ + + .macro SconvKernelFunction KernelType, BlockSize, Isa, BiasFilter + +/*++ + +Routine Description: + + This routine is the inner kernel to compute a convolution for the elements + of an output row for a set of filter rows. + +Arguments: + + Input (a0) - Supplies the address of the input buffer. + + The address is biased to include padding blocks for the left width + dimension. The address is not biased to include padding rows for the + left height dimension these are accounted for in the outer kernel. + + Filter (a1) - Supplies the address of the filter buffer. + + Output (a2) - Supplies the address of the output buffer. + + StrideWidth (a3) - Supplies the length in bytes of the blocked stride width. + + DilationWidth (a4) - Supplies the length in bytes of the blocked dilation + width. + + FilterCount (a5) - Supplies the number of filters to process in this + iteration. + + InputStride (a6)- Supplies the length in bytes to advance the input buffer to + the next input row. + + FilterStride (a7) - Supplies the length in bytes to advance the filter buffer + to the next set of filters. + + OutputStride (sp + 0)- Supplies the length in bytes to advance the output buffer + to the next output address associated with the next set of filters. + + KernelHeight (sp + 8)- Supplies the height of the kernel to apply. This height may + be less than the original kernel height after removing any padding + rows. + + KernelWidth (sp + 0x10)- Supplies the width of the kernel to apply. + + InputBase (sp + 0x18)- Supplies the address of the valid input buffer. + + This parameter is similar to the Input parameter, but does not include + the padding blocks for the left width dimension. This parameter is used + with the following InputWidth parameter in order to validate that the + current input buffer address in bounds and not in the left or right + width padding region. + + InputWidth (sp + 0x20)- Supplies the length in bytes of the blocked input width. + + DilatedInputWidth (sp + 0x28)- Supplies the length in bytes to advance the input base + buffer to the next input row including dilation. + + OutputCountLeftPad (sp + 0x30)- Supplies the number of output elements that include + one or more padding elements from the left edge. + + OutputCount (sp + 0x38)- Supplies the number of output elements that do not include + any padding elements. + + OutputCountRightPad (sp + 0x40)- Supplies the number of output elements that include + one or more padding elements from the right edge. + + Bias (sp + 0x48)- Supplies the address of the bias buffer. + + Flags (sp + 0x50)- Supplies additional flags controlling the convolution operation, + especially post calculation options. + +Return Value: + + None. 
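+
+Notes:
+
+    The first eight arguments above arrive in registers a0-a7 and the rest on
+    the stack, per the LoongArch64 psABI. As an illustrative sketch only (the
+    parameter order follows the list above, the function name is a placeholder
+    for the MlasConv<KernelType>FloatKernel<Isa> symbols this macro emits, and
+    the C types are assumptions - see mlasi.h for the authoritative
+    declarations), the generated entry points correspond to a prototype of
+    roughly this shape:
+
+        #include <cstddef>   // size_t
+
+        extern "C" void
+        ConvKernelSignatureSketch(      // placeholder name, not a real symbol
+            const float* Input,          // a0
+            const float* Filter,         // a1
+            float* Output,               // a2
+            size_t StrideWidth,          // a3
+            size_t DilationWidth,        // a4
+            size_t FilterCount,          // a5
+            size_t InputStride,          // a6
+            size_t FilterStride,         // a7
+            size_t OutputStride,         // sp + 0
+            size_t KernelHeight,         // sp + 8
+            size_t KernelWidth,          // sp + 0x10
+            const float* InputBase,      // sp + 0x18
+            size_t InputWidth,           // sp + 0x20
+            size_t DilatedInputWidth,    // sp + 0x28
+            size_t OutputCountLeftPad,   // sp + 0x30
+            size_t OutputCount,          // sp + 0x38
+            size_t OutputCountRightPad,  // sp + 0x40
+            const float* Bias,           // sp + 0x48
+            unsigned Flags);             // sp + 0x50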
+ +--*/ + + FUNCTION_ENTRY MlasConv\KernelType\()FloatKernel\Isa\() + + addi.d $sp, $sp, -SP_SIZE + st.d $s0, $sp, 0 + st.d $s1, $sp, 8 + st.d $s2, $sp, 2*8 + st.d $ra, $sp, 5*8 + + ld.d $t0, $sp, SP_SIZE+0*8 + ld.d $t1, $sp, SP_SIZE+1*8 + ld.d $t2, $sp, SP_SIZE+2*8 + ld.d $t3, $sp, SP_SIZE+3*8 + st.d $t0, $sp, OutputStride_arg + st.d $t1, $sp, KernelHeight_arg + st.d $t2, $sp, KernelWidth_arg + st.d $t3, $sp, InputBase_arg + ld.d $t0, $sp, SP_SIZE+4*8 + ld.d $t1, $sp, SP_SIZE+5*8 + ld.d $t2, $sp, SP_SIZE+6*8 + ld.d $t3, $sp, SP_SIZE+7*8 + st.d $t0, $sp, InputWidth_arg + st.d $t1, $sp, DilatedInputWidth_arg + st.d $t2, $sp, OutputCountLeftPad_arg + st.d $t3, $sp, OutputCount_arg + ld.d $t0, $sp, SP_SIZE+8*8 + ld.d $t1, $sp, SP_SIZE+9*8 + ld.d $t2, $sp, SP_SIZE+10*8 + st.d $t0, $sp, OutputCountRightPad_arg + st.d $t1, $sp, Bias_arg + st.d $t2, $sp, Flags_arg + +.ifeqs "\BiasFilter\()","BiasFilter" + addi.d $a1, $a1, 4*8*4 +.endif + st.d $a1, $sp, Filter_save_offset + move $a1, $a7 + move $t5, $a6 + move $t8, $a4 + move $t1, $a5 + move $a4, $a2 + move $a5, $a3 + +// +// Process the specified number of filter rows. +// + + ori $s0, $zero, 3 + beq $t1, $s0, .L\KernelType\().ProcessFilterCount3 + bltu $t1, $s0, .L\KernelType\().ProcessFilterCountLessThan3 + ProcessFilterCountN LSconvKernelFrame, \KernelType\(), 4 + b .L\KernelType\().ExitKernel + +.L\KernelType\().ProcessFilterCount3: + ProcessFilterCountN LSconvKernelFrame, \KernelType\(), 3 + b .L\KernelType\().ExitKernel + +.L\KernelType\().ProcessFilterCountLessThan3: + ori $s0, $zero, 2 + bltu $t1, $s0, .L\KernelType\().ProcessFilterCount1 + ProcessFilterCountN LSconvKernelFrame, \KernelType\(), 2 + b .L\KernelType\().ExitKernel + +.L\KernelType\().ProcessFilterCount1: + ProcessFilterCountN LSconvKernelFrame, \KernelType\(), 1 + +// +// Restore non-volatile registers and return. +// + +.L\KernelType\().ExitKernel: +.ifnes "\Isa\()","LSX" + xvinsgr2vr.d $xr0, $zero, 2 + xvinsgr2vr.d $xr0, $zero, 3 + xvinsgr2vr.d $xr1, $zero, 2 + xvinsgr2vr.d $xr1, $zero, 3 + xvinsgr2vr.d $xr2, $zero, 2 + xvinsgr2vr.d $xr2, $zero, 3 + xvinsgr2vr.d $xr3, $zero, 2 + xvinsgr2vr.d $xr3, $zero, 3 + xvinsgr2vr.d $xr4, $zero, 2 + xvinsgr2vr.d $xr4, $zero, 3 + xvinsgr2vr.d $xr5, $zero, 2 + xvinsgr2vr.d $xr5, $zero, 3 + xvinsgr2vr.d $xr6, $zero, 2 + xvinsgr2vr.d $xr6, $zero, 3 + xvinsgr2vr.d $xr7, $zero, 2 + xvinsgr2vr.d $xr7, $zero, 3 + xvinsgr2vr.d $xr8, $zero, 2 + xvinsgr2vr.d $xr8, $zero, 3 + xvinsgr2vr.d $xr9, $zero, 2 + xvinsgr2vr.d $xr9, $zero, 3 + xvinsgr2vr.d $xr10, $zero, 2 + xvinsgr2vr.d $xr10, $zero, 3 + xvinsgr2vr.d $xr11, $zero, 2 + xvinsgr2vr.d $xr11, $zero, 3 + xvinsgr2vr.d $xr12, $zero, 2 + xvinsgr2vr.d $xr12, $zero, 3 + xvinsgr2vr.d $xr13, $zero, 2 + xvinsgr2vr.d $xr13, $zero, 3 + xvinsgr2vr.d $xr14, $zero, 2 + xvinsgr2vr.d $xr14, $zero, 3 + xvinsgr2vr.d $xr15, $zero, 2 + xvinsgr2vr.d $xr15, $zero, 3 +.endif + ld.d $s0, $sp, 0 + ld.d $s1, $sp, 8 + ld.d $s2, $sp, 2*8 + ld.d $ra, $sp, 5*8 + addi.d $sp, $sp, SP_SIZE + jirl $zero, $ra, 0 + +.ifnes "\Isa\()","LSX" + +// +// Generate out-of-band helpers for handling output blocks involving padding. 
+// + + .irp FilterCount, 1, 2, 3, 4 + +MlasConv\KernelType\()FloatSingle\Isa\()Filter\FilterCount\(): + st.d $ra, $sp, 19*8 +loopMlasConv\KernelType\()FloatSingle\Isa\()Filter\FilterCount\(): + ProcessOutputCountN \Isa\(), LSconvKernelSingleFrame, \KernelType\(), \BlockSize\(), \FilterCount\(), 1 + add.d $a0, $a0, $a5 # advance input by 1 element + addi.d $t0, $t0, -1 # decrement output count remaining + bnez $t0, loopMlasConv\KernelType\()FloatSingle\Isa\()Filter\FilterCount\() + ld.d $ra, $sp, 19*8 + jr $ra + + .endr + +.endif + + .endm + +/*++ + +Macro Description: + + This macro generates code for the inner convolution kernel for the special + case of a depthwise separable convolution. + +Arguments: + + BlockSize - Supplies the number of elements per block. + + Isa - Supplies the instruction set architecture string for function tags. + +--*/ + + .macro SconvKernelDepthwiseFunction BlockSize, Isa + +/*++ + +Routine Description: + + This routine is the inner kernel to compute a convolution for the elements + of an output row for a set of filter rows. + + Depthwise separable convolutions are a form of grouped convolution where + the number of input and output channels per group are one. + +Arguments: + + Input (a0) - Supplies the address of the input buffer. + + The address is biased to include padding blocks for the left width + dimension. The address is not biased to include padding rows for the + left height dimension these are accounted for in the outer kernel. + + Filter (a1) - Supplies the address of the filter buffer. + + Output (a2) - Supplies the address of the output buffer. + + StrideWidth (a3) - Supplies the length in bytes of the blocked stride width. + + DilationWidth (a4) - Supplies the length in bytes of the blocked dilation + width. + + InputStride (a5) - Supplies the length in bytes to advance the input buffer + to the next input row. + + KernelHeight (a6)- Supplies the height of the kernel to apply. This height may + be less than the original kernel height after removing any padding + rows. + + KernelWidth (a7)- Supplies the width of the kernel to apply. + + InputBase (sp + 0 )- Supplies the address of the valid input buffer. + + This parameter is similar to the Input parameter, but does not include + the padding blocks for the left width dimension. This parameter is used + with the following InputWidth parameter in order to validate that the + current input buffer address in bounds and not in the left or right + width padding region. + + InputWidth (sp + 8 )- Supplies the length in bytes of the blocked input width. + + DilatedInputWidth (sp + 0x10)- Supplies the length in bytes to advance the input base + buffer to the next input row including dilation. + + OutputCountLeftPad (sp + 0x18)- Supplies the number of output elements that include + one or more padding elements from the left edge. + + OutputCount (sp + 0x20)- Supplies the number of output elements that do not include + any padding elements. + + OutputCountRightPad (sp + 0x28)- Supplies the number of output elements that include + one or more padding elements from the right edge. + + Bias (sp + 0x30)- Supplies the address of the bias buffer. + + Flags (sp + 0x38)- Supplies additional flags controlling the convolution operation, + especially post calculation options. + +Return Value: + + None. 
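+
+Notes:
+
+    As a scalar reference for what the depthwise kernel computes (an
+    illustrative sketch, not the vectorized implementation; the element strides
+    are analogues of the byte strides documented above and already include the
+    channel-block factor), each output position accumulates one filter tap per
+    channel of the nchw[8|16]c block:
+
+        #include <cstddef>   // size_t
+
+        // Scalar model of one output position of a depthwise NCHWc convolution.
+        void DepthwiseReference(const float* Input, const float* Filter,
+                                float* Output, size_t BlockSize,
+                                size_t KernelHeight, size_t KernelWidth,
+                                size_t RowStrideElems, size_t ColStrideElems) {
+            for (size_t c = 0; c < BlockSize; c++) {
+                float Acc = 0.0f;
+                for (size_t kh = 0; kh < KernelHeight; kh++) {
+                    for (size_t kw = 0; kw < KernelWidth; kw++) {
+                        Acc += Input[kh * RowStrideElems + kw * ColStrideElems + c] *
+                               Filter[(kh * KernelWidth + kw) * BlockSize + c];
+                    }
+                }
+                Output[c] = Acc;   // accumulate/bias/ReLU are applied by PostProcessBlock
+            }
+        }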
+ +--*/ + + FUNCTION_ENTRY MlasConvDepthwiseFloatKernel\Isa\() + + addi.d $sp, $sp, -SP_SIZE + st.d $s0, $sp, 0 + st.d $s1, $sp, 8 + st.d $s2, $sp, 2*8 + st.d $ra, $sp, 5*8 + + st.d $a6, $sp, KernelHeight_arg + st.d $a7, $sp, KernelWidth_arg + + ld.d $t0, $sp, SP_SIZE+0*8 + ld.d $t1, $sp, SP_SIZE+1*8 + ld.d $t2, $sp, SP_SIZE+2*8 + ld.d $t3, $sp, SP_SIZE+3*8 + st.d $t0, $sp, InputBase_arg + st.d $t1, $sp, InputWidth_arg + st.d $t2, $sp, DilatedInputWidth_arg + st.d $t3, $sp, OutputCountLeftPad_arg + ld.d $t0, $sp, SP_SIZE+4*8 + ld.d $t1, $sp, SP_SIZE+5*8 + ld.d $t2, $sp, SP_SIZE+6*8 + ld.d $t3, $sp, SP_SIZE+7*8 + st.d $t0, $sp, OutputCount_arg + st.d $t1, $sp, OutputCountRightPad_arg + st.d $t2, $sp, Bias_arg + st.d $t3, $sp, Flags_arg + + move $t8, $a4 + move $t5, $a5 + move $a4, $a2 + move $a5, $a3 + +// +// Process the specified number of filter rows. +// + + ProcessFilterCountN LSconvKernelDepthwiseFrame, Depthwise, 1 + +// +// Restore non-volatile registers and return. +// + +.LDepthwise.ExitKernel: +.ifnes "\Isa\()","LSX" + xvinsgr2vr.d $xr0, $zero, 2 + xvinsgr2vr.d $xr0, $zero, 3 + xvinsgr2vr.d $xr1, $zero, 2 + xvinsgr2vr.d $xr1, $zero, 3 + xvinsgr2vr.d $xr2, $zero, 2 + xvinsgr2vr.d $xr2, $zero, 3 + xvinsgr2vr.d $xr3, $zero, 2 + xvinsgr2vr.d $xr3, $zero, 3 + xvinsgr2vr.d $xr4, $zero, 2 + xvinsgr2vr.d $xr4, $zero, 3 + xvinsgr2vr.d $xr5, $zero, 2 + xvinsgr2vr.d $xr5, $zero, 3 + xvinsgr2vr.d $xr6, $zero, 2 + xvinsgr2vr.d $xr6, $zero, 3 + xvinsgr2vr.d $xr7, $zero, 2 + xvinsgr2vr.d $xr7, $zero, 3 + xvinsgr2vr.d $xr8, $zero, 2 + xvinsgr2vr.d $xr8, $zero, 3 + xvinsgr2vr.d $xr9, $zero, 2 + xvinsgr2vr.d $xr9, $zero, 3 + xvinsgr2vr.d $xr10, $zero, 2 + xvinsgr2vr.d $xr10, $zero, 3 + xvinsgr2vr.d $xr11, $zero, 2 + xvinsgr2vr.d $xr11, $zero, 3 + xvinsgr2vr.d $xr12, $zero, 2 + xvinsgr2vr.d $xr12, $zero, 3 + xvinsgr2vr.d $xr13, $zero, 2 + xvinsgr2vr.d $xr13, $zero, 3 + xvinsgr2vr.d $xr14, $zero, 2 + xvinsgr2vr.d $xr14, $zero, 3 + xvinsgr2vr.d $xr15, $zero, 2 + xvinsgr2vr.d $xr15, $zero, 3 +.endif + ld.d $s0, $sp, 0 + ld.d $s1, $sp, 8 + ld.d $s2, $sp, 2*8 + ld.d $ra, $sp, 5*8 + addi.d $sp, $sp, SP_SIZE + jr $ra + +.ifnes "\Isa\()","LSX" + +// +// Generate out-of-band helpers for handling output blocks involving padding. +// + +MlasConvDepthwiseFloatSingle\Isa\()Filter1: + st.d $ra, $sp, 20*8 +MlasConvDepthwiseFloatSingle\Isa\()Filter1_loop: + ProcessOutputCountN \Isa\(), LSconvKernelDepthwiseSingleFrame, Depthwise, \BlockSize\(), 1, 1 + add.d $a0, $a0, $a5 # advance input by 1 element + addi.d $t0, $t0, -1 # decrement output count remaining + + bnez $t0, MlasConvDepthwiseFloatSingle\Isa\()Filter1_loop + ld.d $ra, $sp, 20*8 + jr $ra + +.endif + + .endm + +/*++ + +Macro Description: + + This macro generates code to compute the convolution for a vector of input + blocks and a vector of filter blocks to produce a matrix of output blocks + for a pointwise convolution. + +Arguments: + + Isa - Supplies the instruction set architecture string for function tags. + + BlockSize - Supplies the number of elements per block. + + FilterCount - Supplies the number of rows from the filter to process. + + OutputCount - Supplies the number of output blocks to produce. + +Implicit Arguments: + + a0 - Supplies the address of the input buffer. + + a1 - Supplies the FilterStride parameter (see function description). + + t8 - Supplies the InputStride parameter (see function description). + + a4 - Supplies the address of the output buffer. + + a5 - Supplies the StrideWidth parameter (see function description). 
+ + t2 - Supplies the address of the filter buffer. + +--*/ + + .macro ProcessPointwiseOutputCountN Isa, BlockSize, FilterCount, OutputCount + + move $a3, $a0 + move $a2, $t2 + ld.d $t1, $sp, InputChannels_arg + ClearBlock \FilterCount\(), \OutputCount\() + +.LPointwise.\FilterCount\().\OutputCount\().ProcessNextInputBlock: +.if \OutputCount\() > 3 + slli.d $s0, $a5, 1 + add.d $s0, $s0, $a5 + add.d $t4, $s0, $a3 +.endif +.if \FilterCount\() > 2 + slli.d $s0, $a1, 1 + add.d $t7, $a2, $s0 +.endif +.if \BlockSize\() == 16 + .irp Index, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + ComputeBlock Pointwise, \FilterCount\(), \OutputCount\(), \Index\()*16*4, \Index\()*4 + .endr +.else + .irp Index, 0, 1, 2, 3, 4, 5, 6, 7 + ComputeBlock Pointwise, \FilterCount\(), \OutputCount\(), (\Index\()-4)*8*4, \Index\()*4 + .endr +.endif + add.d $a3, $a3, $t8 # advance input to next channel block + + addi.d $a2, $a2, \BlockSize\()*\BlockSize\()*4 # advance filter by 8i8o/16i16o block + addi.d $t1, $t1, -1 # decrement input blocks remaining + + bnez $t1, .LPointwise.\FilterCount\().\OutputCount\().ProcessNextInputBlock + +// +// Handle post processing of the output block. +// + + ld.w $a2, $sp, Flags_arg +.if \FilterCount\() > 1 + ld.d $t6, $sp, OutputStride_arg +.endif + ld.d $a3, $sp, Bias_arg + bl MlasConvPostProcessFloat\Isa\()Filter\FilterCount\()Output\OutputCount\() + + .endm + +/*++ + +Macro Description: + + This macro generates code for the inner convolution kernel for the special + case where the kernel dimensions are 1. + +Arguments: + + Isa - Supplies the instruction set architecture string for function tags. + + BiasFilter - Supplies a non-blank value if the address of the filter buffer + should be biased to point to the middle of a OIhw8i8o block in order to + reduce the code size from relative byte offsets. + +--*/ + + .macro SconvKernelPointwiseFunction Isa, BiasFilter + +/*++ + +Routine Description: + + This routine is the inner kernel to compute a convolution for the elements + of an output row for a set of filter rows. + + Pointwise convolutions have a kernel size of one. To simplify this + implementation, no input padding is allowed, which matches typical usage in + models. + +Arguments: + + Input (a0) - Supplies the address of the input buffer. + + Filter (a1) - Supplies the address of the filter buffer. + + Output (a2) - Supplies the address of the output buffer. + + StrideWidth (a3) - Supplies the length in bytes of the blocked stride width. + + InputChannels (a4) - Supplies the number of input channels to process. + + FilterCount (a5) - Supplies the number of rows from the filter to process. + + InputStride (a6) - Supplies the length in bytes to advance the input buffer to + the next input channel of the same input row. + + FilterStride (a7) - Supplies the length in bytes to advance the filter buffer + to the next set of filters. + + OutputStride (sp + 0)- Supplies the length in bytes to advance the output buffer + to the next output address associated with the next set of filters. + + OutputCount (sp + 8)- Supplies the number of output elements. + + Bias (sp + 0x10)- Supplies the address of the bias buffer. + + Flags (sp + 0x18)- Supplies additional flags controlling the convolution operation, + especially post calculation options. + +Return Value: + + None. 
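+
+Notes:
+
+    As a scalar reference (an illustrative sketch, not the vectorized
+    implementation; InputBlockStrideElems is an element-count analogue of the
+    InputStride byte parameter), one output position of a pointwise NCHWc
+    convolution for a single filter block reduces over the input channel
+    blocks like this:
+
+        #include <cstddef>   // size_t
+
+        // Scalar model of one output position for one OIhw[8i8o|16i16o] filter block.
+        void PointwiseReference(const float* Input, const float* Filter,
+                                float* Output, size_t BlockSize,
+                                size_t InputChannelBlocks,
+                                size_t InputBlockStrideElems) {
+            for (size_t o = 0; o < BlockSize; o++) {
+                float Acc = 0.0f;
+                for (size_t cb = 0; cb < InputChannelBlocks; cb++) {
+                    for (size_t i = 0; i < BlockSize; i++) {
+                        Acc += Input[cb * InputBlockStrideElems + i] *
+                               Filter[(cb * BlockSize + i) * BlockSize + o];
+                    }
+                }
+                Output[o] = Acc;   // bias/ReLU are applied by PostProcessBlock
+            }
+        }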
+ +--*/ + + FUNCTION_ENTRY MlasConvPointwiseFloatKernel\Isa\() + + addi.d $sp, $sp, -SP_SIZE + st.d $s0, $sp, 0*8 + st.d $s1, $sp, 1*8 + st.d $s2, $sp, 2*8 + st.d $ra, $sp, 5*8 + + ld.d $t0, $sp, SP_SIZE+0*8 + ld.d $t1, $sp, SP_SIZE+1*8 + ld.d $t2, $sp, SP_SIZE+2*8 + ld.d $t3, $sp, SP_SIZE+3*8 + st.d $t0, $sp, OutputStride_arg + st.d $t1, $sp, OutputCount_arg + st.d $t2, $sp, Bias_arg + st.d $t3, $sp, Flags_arg + st.d $a4, $sp, InputChannels_arg + +.ifeqs "\BiasFilter\()","BiasFilter" + addi.d $t2, $a1, 4*8*4 +.else + move $t2, $a1 +.endif + ld.d $t0, $sp, OutputCount_arg + move $a1, $a7 + move $t8, $a6 + move $t1, $a5 + move $a4, $a2 + move $a5, $a3 + +// +// Process the specified number of filter rows. +// + + ori $s0, $zero, 3 + beq $t1, $s0, .LPointwise.ProcessFilterCount3 + bltu $t1, $s0, .LPointwise.ProcessFilterCountLessThan3 + ProcessPointwiseFilterCountN 4 + b .LPointwise.ExitKernel + +.LPointwise.ProcessFilterCount3: + ProcessPointwiseFilterCountN 3 + b .LPointwise.ExitKernel + +.LPointwise.ProcessFilterCountLessThan3: + ori $s0, $zero, 2 + bltu $t1, $s0, .LPointwise.ProcessFilterCount1 + ProcessPointwiseFilterCountN 2 + b .LPointwise.ExitKernel + +.LPointwise.ProcessFilterCount1: + ProcessPointwiseFilterCountN 1 + +// +// Restore non-volatile registers and return. +// + +.LPointwise.ExitKernel: +.ifnes "\Isa\()","LSX" + xvinsgr2vr.d $xr0, $zero, 2 + xvinsgr2vr.d $xr0, $zero, 3 + xvinsgr2vr.d $xr1, $zero, 2 + xvinsgr2vr.d $xr1, $zero, 3 + xvinsgr2vr.d $xr2, $zero, 2 + xvinsgr2vr.d $xr2, $zero, 3 + xvinsgr2vr.d $xr3, $zero, 2 + xvinsgr2vr.d $xr3, $zero, 3 + xvinsgr2vr.d $xr4, $zero, 2 + xvinsgr2vr.d $xr4, $zero, 3 + xvinsgr2vr.d $xr5, $zero, 2 + xvinsgr2vr.d $xr5, $zero, 3 + xvinsgr2vr.d $xr6, $zero, 2 + xvinsgr2vr.d $xr6, $zero, 3 + xvinsgr2vr.d $xr7, $zero, 2 + xvinsgr2vr.d $xr7, $zero, 3 + xvinsgr2vr.d $xr8, $zero, 2 + xvinsgr2vr.d $xr8, $zero, 3 + xvinsgr2vr.d $xr9, $zero, 2 + xvinsgr2vr.d $xr9, $zero, 3 + xvinsgr2vr.d $xr10, $zero, 2 + xvinsgr2vr.d $xr10, $zero, 3 + xvinsgr2vr.d $xr11, $zero, 2 + xvinsgr2vr.d $xr11, $zero, 3 + xvinsgr2vr.d $xr12, $zero, 2 + xvinsgr2vr.d $xr12, $zero, 3 + xvinsgr2vr.d $xr13, $zero, 2 + xvinsgr2vr.d $xr13, $zero, 3 + xvinsgr2vr.d $xr14, $zero, 2 + xvinsgr2vr.d $xr14, $zero, 3 + xvinsgr2vr.d $xr15, $zero, 2 + xvinsgr2vr.d $xr15, $zero, 3 +.endif + ld.d $s0, $sp, 0*8 + ld.d $s1, $sp, 1*8 + ld.d $s2, $sp, 2*8 + ld.d $ra, $sp, 5*8 + addi.d $sp, $sp, SP_SIZE + jr $ra + + .endm + +/*++ + +Macro Description: + + This macro generates code to clear the block accumulators. + +Arguments: + + FilterCount - Supplies the number of rows from the filter to process. + + OutputCount - Supplies the number of output blocks to produce. + +Implicit Arguments: + + xr0-xr11 - Supplies the block accumulators. 
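+
+    Note: the accumulators are tiled so that xr0-xr3 hold output block 0 for
+    filter rows 0-3, xr4-xr7 hold output block 1, and xr8-xr11 hold output
+    block 2, matching the EmitIfCount2GE selections below.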
+ +--*/ + + .macro ClearBlock FilterCount, OutputCount + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 1, "xvxor.v $xr0, $xr0, $xr0" + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 2, "xvxor.v $xr4, $xr4, $xr4" + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 3, "xvxor.v $xr8, $xr8, $xr8" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 1, "xvxor.v $xr1, $xr1, $xr1" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 2, "xvxor.v $xr5, $xr5, $xr5" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 3, "xvxor.v $xr9, $xr9, $xr9" + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 1, "xvxor.v $xr2, $xr2, $xr2" + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 2, "xvxor.v $xr6, $xr6, $xr6" + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 3, "xvxor.v $xr10, $xr10, $xr10" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 1, "xvxor.v $xr3, $xr3, $xr3" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 2, "xvxor.v $xr7, $xr7, $xr7" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 3, "xvxor.v $xr11, $xr11, $xr11" + + .endm diff --git a/onnxruntime/core/mlas/lib/loongarch64/SconvKernelLsx.S b/onnxruntime/core/mlas/lib/loongarch64/SconvKernelLsx.S new file mode 100644 index 0000000000000..04b8dc14d067d --- /dev/null +++ b/onnxruntime/core/mlas/lib/loongarch64/SconvKernelLsx.S @@ -0,0 +1,339 @@ +/*++ + +Copyright (C) 2023 Loongson Technology Corporation Limited. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + SconvKernelLsx.S + +Abstract: + + This module implements the kernels for the single precision convolution + operation. + + This implementation uses Lsx instructions. + +--*/ + +#include "asmmacro.h" +#include "SconvKernelLsxCommon.h" + +/*++ + +Macro Description: + + This macro generates code to clear the block accumulators. + +Arguments: + + FilterCount - Supplies the number of rows from the filter to process. + + OutputCount - Supplies the number of output blocks to produce. + +Implicit Arguments: + + vr0-vr7 - Supplies the block accumulators. + +--*/ + + .macro ClearBlock FilterCount, OutputCount + + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 1, "vxor.v $vr0,$vr0,$vr0" + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 1, "vxor.v $vr1,$vr1,$vr1" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 1, "vxor.v $vr2,$vr2,$vr2" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 1, "vxor.v $vr3,$vr3,$vr3" + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 1, "vxor.v $vr4,$vr4,$vr4" + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 1, "vxor.v $vr5,$vr5,$vr5" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 1, "vxor.v $vr6,$vr6,$vr6" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 1, "vxor.v $vr7,$vr7,$vr7" + + .endm + +/*++ + +Macro Description: + + This macro multiplies and accumulates for FilterCount by OutputCount block + of the output buffer. + +Arguments: + + KernelType - Supplies the type of kernel to be generated. + + FilterCount - Supplies the number of rows from the filter to process. + + OutputCount - Supplies the number of output blocks to produce. + + VectorOffset - Supplies the byte offset from the filter buffer to fetch + elements. + + BroadcastOffset - Supplies the byte offset from the input buffer to fetch + elements. + +Implicit Arguments: + + a3 - Supplies the address of the input buffer. + + a2 - Supplies the address of the filter buffer. + + a1 - Supplies the FilterStride parameter (see function description). 
+ + t6 - Supplies the address of the filter buffer plus 2 * FilterStride. + + a5 - Supplies the StrideWidth parameter (see function description). + + vr0-vr7 - Supplies the block accumulators. + +--*/ + .macro ComputeBlock KernelType, FilterCount, OutputCount, VectorOffset, BroadcastOffset + +.ifeqs "\KernelType\()","Depthwise" + vld $vr8, $a2, 0 + vld $vr9, $a2, 16 + vld $vr10, $a3, 0 + vld $vr11, $a3, 16 + vfmadd.s $vr0, $vr8, $vr10, $vr0 + vfmadd.s $vr1, $vr9, $vr11, $vr1 +.else + EmitIfCountGE \OutputCount\(), 1, "ld.w $s0, $a3, \BroadcastOffset\()" + EmitIfCountGE \OutputCount\(), 1, "vreplgr2vr.w $vr12, $s0" + EmitIfCountGE \FilterCount\(), 1, "vld $vr8, $a2, \VectorOffset\()" + EmitIfCountGE \FilterCount\(), 1, "vld $vr9, $a2, \VectorOffset\()+16" + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 1, "vfmadd.s $vr0, $vr8, $vr12, $vr0" + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 1, "vfmadd.s $vr1, $vr9, $vr12, $vr1" + EmitIfCountGE \FilterCount\(), 2, "addi.d $s0, $a1, +\VectorOffset\()" + EmitIfCountGE \FilterCount\(), 2, "vldx $vr8, $a2, $s0" + EmitIfCountGE \FilterCount\(), 2, "addi.d $s0, $a1, +\VectorOffset\()+16" + EmitIfCountGE \FilterCount\(), 2, "vldx $vr9, $a2, $s0" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 1, "vfmadd.s $vr2, $vr8, $vr12, $vr2" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 1, "vfmadd.s $vr3, $vr9, $vr12, $vr3" + EmitIfCountGE \FilterCount\(), 3, "vld $vr8, $t7, \VectorOffset\()" + EmitIfCountGE \FilterCount\(), 3, "vld $vr9, $t7, \VectorOffset\()+16" + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 1, "vfmadd.s $vr4, $vr8, $vr12, $vr4" + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 1, "vfmadd.s $vr5, $vr9, $vr12, $vr5" + EmitIfCountGE \FilterCount\(), 4, "addi.d $s0, $a1, \VectorOffset\()" + EmitIfCountGE \FilterCount\(), 4, "vldx $vr8, $t7, $s0" + EmitIfCountGE \FilterCount\(), 4, "addi.d $s0, $a1, \VectorOffset\()+16" + EmitIfCountGE \FilterCount\(), 4, "vldx $vr9, $t7, $s0" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 1, "vfmadd.s $vr6, $vr8, $vr12, $vr6" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 1, "vfmadd.s $vr7, $vr9, $vr12, $vr7" +.endif + .endm +/*++ + +Macro Description: + + This macro generates code to compute the convolution for a specified number + of filter rows. + +Arguments: + + KernelFrame - Supplies the symbol name to access the convolution kernel + stack. + + KernelType - Supplies the type of kernel to be generated. + + FilterCount - Supplies the number of rows from the filter to process. + +Implicit Arguments: + + a0 - Supplies the address of the input buffer. + + a1 - Supplies the FilterStride parameter (see function description) when + KernelType!=Depthwise. Supplies the address of the filter buffer when + KernelType=Depthwise. + + s8 - Supplies the DilationWidth parameter (see function description). + + a4 - Supplies the address of the output buffer. + + a5 - Supplies the StrideWidth parameter (see function description). + + s3 - Supplies the InputStride parameter (see function description). 
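+
+    Note: this Lsx path routes every output position through the OutputCount=1
+    case; the macro body below folds OutputCountLeftPad, OutputCount and
+    OutputCountRightPad into a single loop count, whereas the Lasx helpers are
+    also generated for OutputCount 2 and 3.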
+ +--*/ + + .macro ProcessFilterCountN KernelFrame, KernelType, FilterCount + ld.d $s0, $sp, OutputCountLeftPad_arg //OutputCountLeftPad + ld.d $s1, $sp, OutputCount_arg //OutputCount + add.d $s0, $s0, $s1 + ld.d $s1, $sp, OutputCountRightPad_arg //OutputCountRightPad + add.d $t0, $s0, $s1 +.L\KernelType\().\FilterCount\().ProcessNextOutputCount: + ProcessOutputCountN Sse, \KernelFrame\(), \KernelType\(), 8, \FilterCount\(), 1 + add.d $a0, $a0, $a5 + addi.d $t0, $t0, -1 + bnez $t0, .L\KernelType\().\FilterCount\().ProcessNextOutputCount + .endm + +/*++ + +Macro Description: + + This macro generates code to compute the convolution for a specified number + of filter rows for a pointwise convolution. + +Arguments: + + FilterCount - Supplies the number of rows from the filter to process. + +Implicit Arguments: + + a0 - Supplies the address of the input buffer. + + a1 - Supplies the FilterStride parameter (see function description). + + s8 - Supplies the InputStride parameter (see function description). + + a4 - Supplies the address of the output buffer. + + a5 - Supplies the StrideWidth parameter (see function description). + + t7 - Supplies the OutputCount parameter (see function description). + + s5 - Supplies the address of the filter buffer. + +--*/ + + .macro ProcessPointwiseFilterCountN FilterCount +.LPointwise.\FilterCount\().ProcessNextOutputCount: + ProcessPointwiseOutputCountN Sse, 8, \FilterCount\(), 1 + add.d $a0, $a0, $a5 + addi.d $t0, $t0, -1 + bnez $t0, .LPointwise.\FilterCount\().ProcessNextOutputCount + .endm + +// +// Generate the convolution kernels. +// + + SconvKernelFunction Nchw, 8, LSX + SconvKernelFunction Nchwc, 8, LSX, BiasFilter + SconvKernelDepthwiseFunction 8, LSX + SconvKernelPointwiseFunction LSX, BiasFilter + +/*++ + +Macro Description: + + This macro generates code to process an output block after the inner + convolution kernel has executed and then stores the output block to the + output buffer. + +Arguments: + + FilterCount - Supplies the number of rows from the filter to process. + + OutputCount - Supplies the number of output blocks to produce. 
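+
+    Note: the post-processing below mirrors the MLAS_CONV_KERNEL_FLAG_* bits
+    defined in SconvKernelLsxCommon.h. As an illustrative scalar sketch of the
+    sequence applied to each output element (not part of the build):
+
+        // Accumulate existing output, add bias, apply ReLU, then store.
+        float PostProcessSketch(float Acc, float Existing, float Bias,
+                                unsigned Flags) {
+            if (Flags & 0x1) {            // MLAS_CONV_KERNEL_FLAG_ACCUMULATE_OUTPUT
+                Acc += Existing;
+            }
+            if (Flags & 0x2) {            // MLAS_CONV_KERNEL_FLAG_BIAS_ADDITION
+                Acc += Bias;
+            }
+            if (Flags & 0x4) {            // MLAS_CONV_KERNEL_FLAG_RELU_ACTIVATION
+                Acc = Acc > 0.0f ? Acc : 0.0f;
+            }
+            return Acc;                   // written back to the output buffer
+        }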
+--*/ + + .macro PostProcessBlock FilterCount, OutputCount + + .globl MlasConvPostProcessFloatSseFilter\FilterCount\()Output\OutputCount\() +#if !defined(__APPLE__) + .hidden MlasConvPostProcessFloatSseFilter\FilterCount\()Output\OutputCount\() +#endif +MlasConvPostProcessFloatSseFilter\FilterCount\()Output\OutputCount\(): + +.if \FilterCount\() > 2 + li.d $s0, 2 + mul.d $s0, $s0, $t6 + add.d $t7, $a4, $s0 +.endif + andi $s0, $a2, MLAS_CONV_KERNEL_FLAG_ACCUMULATE_OUTPUT + andi $s0, $s0, 0xff + beqz $s0, .LPostProcessBlock.\FilterCount\().\OutputCount\().SkipAccumulateOutput + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 1, "vld $vr8, $a4, 0" + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 1, "vld $vr9, $a4, 16" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 1, "vldx $vr10, $a4, $t6" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 1, "addi.d $s0, $t6, 16" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 1, "vldx $vr11, $a4, $s0" + + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 1, "vld $vr12, $t7, 0" + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 1, "vld $vr13, $t7, 16" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 1, "vldx $vr14, $t7, $t6" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 1, "addi.d $s0, $t6, 16" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 1, "vldx $vr15, $t7, $s0" + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 1, "vfadd.s $vr0, $vr0, $vr8" + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 1, "vfadd.s $vr1, $vr1, $vr9" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 1, "vfadd.s $vr2, $vr2, $vr10" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 1, "vfadd.s $vr3, $vr3, $vr11" + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 1, "vfadd.s $vr4, $vr4, $vr12" + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 1, "vfadd.s $vr5, $vr5, $vr13" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 1, "vfadd.s $vr6, $vr6, $vr14" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 1, "vfadd.s $vr7, $vr7, $vr15" + +.LPostProcessBlock.\FilterCount\().\OutputCount\().SkipAccumulateOutput: +// +// Test if the bias buffer should be accumulated with the output block. 
+// + + andi $s0, $a2, MLAS_CONV_KERNEL_FLAG_BIAS_ADDITION + andi $s0, $s0, 0xff + beqz $s0, .LPostProcessBlock.\FilterCount\().\OutputCount\().SkipBiasAddition + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 1, "vld $vr8, $a3, 0" + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 1, "vld $vr9, $a3, 16" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 1, "vld $vr10, $a3, 32" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 1, "vld $vr11, $a3, 48" + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 1, "vld $vr12, $a3, 64" + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 1, "vld $vr13, $a3, 80" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 1, "vld $vr14, $a3, 96" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 1, "vld $vr15, $a3, 112" + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 1, "vfadd.s $vr0, $vr0, $vr8" + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 1, "vfadd.s $vr1, $vr1, $vr9" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 1, "vfadd.s $vr2, $vr2, $vr10" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 1, "vfadd.s $vr3, $vr3, $vr11" + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 1, "vfadd.s $vr4, $vr4, $vr12" + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 1, "vfadd.s $vr5, $vr5, $vr13" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 1, "vfadd.s $vr6, $vr6, $vr14" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 1, "vfadd.s $vr7, $vr7, $vr15" + +.LPostProcessBlock.\FilterCount\().\OutputCount\().SkipBiasAddition: + +// +// Test for fused ReLU activation. +// + + andi $s0, $a2, MLAS_CONV_KERNEL_FLAG_RELU_ACTIVATION + andi $s0, $s0, 0xff + beqz $s0, .LPostProcessBlock.\FilterCount\().\OutputCount\().SkipReluActivation + vxor.v $vr15,$vr15, $vr15 + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 1, "vfmax.s $vr0, $vr0, $vr15" + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 1, "vfmax.s $vr1, $vr1, $vr15" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 1, "vfmax.s $vr2, $vr2, $vr15" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 1, "vfmax.s $vr3, $vr3, $vr15" + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 1, "vfmax.s $vr4, $vr4, $vr15" + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 1, "vfmax.s $vr5, $vr5, $vr15" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 1, "vfmax.s $vr6, $vr6, $vr15" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 1, "vfmax.s $vr7, $vr7, $vr15" + +.LPostProcessBlock.\FilterCount\().\OutputCount\().SkipReluActivation: + +// +// Store the output block in the output buffer. 
+// + + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 1, "vst $vr0, $a4,0" + EmitIfCount2GE \FilterCount\(), 1, \OutputCount\(), 1, "vst $vr1, $a4, 16" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 1, "vstx $vr2, $a4, $t6" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 1, "addi.d $s0, $t6, 16" + EmitIfCount2GE \FilterCount\(), 2, \OutputCount\(), 1, "vstx $vr3, $a4, $s0" + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 1, "vst $vr4, $t7, 0" + EmitIfCount2GE \FilterCount\(), 3, \OutputCount\(), 1, "vst $vr5, $t7, 16" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 1, "vstx $vr6, $t7, $t6" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 1, "addi.d $s0, $t6, 16" + EmitIfCount2GE \FilterCount\(), 4, \OutputCount\(), 1, "vstx $vr7, $t7, $s0" + add_immed $a4, \OutputCount\()*8*4 # advance output by N nchw8c blocks + jr $ra + + .endm + + .irp FilterCount, 1, 2, 3, 4 + .irp OutputCount, 1 + PostProcessBlock \FilterCount\(), \OutputCount\() + .endr + .endr + + .end diff --git a/onnxruntime/core/mlas/lib/loongarch64/SconvKernelLsxCommon.h b/onnxruntime/core/mlas/lib/loongarch64/SconvKernelLsxCommon.h new file mode 100644 index 0000000000000..d03714f654500 --- /dev/null +++ b/onnxruntime/core/mlas/lib/loongarch64/SconvKernelLsxCommon.h @@ -0,0 +1,669 @@ +/*++ + +Copyright (C) 2023 Loongson Technology Corporation Limited. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + SconvKernelLsxCommon.h + +Abstract: + + This module contains common kernel macros and structures for the single + precision convolution operation for the Lsx kernels. + +--*/ + +#define SP_SIZE 32*8 + +#define MLAS_CONV_KERNEL_FLAG_ACCUMULATE_OUTPUT 0x00000001 +#define MLAS_CONV_KERNEL_FLAG_BIAS_ADDITION 0x00000002 +#define MLAS_CONV_KERNEL_FLAG_RELU_ACTIVATION 0x00000004 +#define MLAS_CONV_KERNEL_FLAG_OTHER_ACTIVATION 0x00000008 + +#define Filter_save_offset 18*8 + +#define OutputStride_arg 6*8 +#define KernelHeight_arg 7*8 +#define KernelWidth_arg 8*8 +#define InputBase_arg 9*8 +#define InputWidth_arg 10*8 +#define DilatedInputWidth_arg 11*8 +#define OutputCountLeftPad_arg 12*8 +#define OutputCount_arg 13*8 +#define OutputCountRightPad_arg 14*8 +#define Bias_arg 15*8 +#define Flags_arg 16*8 +#define InputChannels_arg 17*8 + +/*++ + +Macro Description: + + This macro generates code to compute the convolution for a vector of input + blocks and a vector of filter blocks to produce a matrix of output blocks. + + OutputCount=1 generates special case code to handle padding blocks. All + other output counts assume no padding. + +Arguments: + + Isa - Supplies the instruction set architecture string for function tags. + + KernelFrame - Supplies the symbol name to access the convolution kernel + stack. + + KernelType - Supplies the type of kernel to be generated. + + BlockSize - Supplies the number of elements per block. + + FilterCount - Supplies the number of rows from the filter to process. + + OutputCount - Supplies the number of output blocks to produce. + +Implicit Arguments: + + a0 - Supplies the address of the input buffer. + + a1 - Supplies the FilterStride parameter (see function description) when + KernelType!=Depthwise. Supplies the address of the filter buffer when + KernelType=Depthwise. + + s8 - Supplies the DilationWidth parameter (see function description). + + a4 - Supplies the address of the output buffer. + + a5 - Supplies the StrideWidth parameter (see function description). + + s3 - Supplies the InputStride parameter (see function description). 
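+
+    Note: for the Nchwc kernel type the filter pointer advances by one 8i8o
+    block per kernel tap, i.e. BlockSize*BlockSize*4 = 8*8*4 = 256 bytes with
+    the BlockSize of 8 used by the Lsx kernels, while the other kernel types
+    advance by a single 8o block of BlockSize*4 = 32 bytes, matching the
+    addi.d adjustments in the macro body below.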
+--*/ + + .macro ProcessOutputCountN Isa, KernelFrame, KernelType, BlockSize, FilterCount, OutputCount + move $a3, $a0 +.ifeqs "\KernelType\()","Depthwise" + move $a2, $a1 +.else + ld.d $a2, $sp, Filter_save_offset +.endif + ld.d $t1, $sp, KernelHeight_arg //KernelHeight + ld.d $t2, $sp, KernelWidth_arg //KernelWidth +.if \OutputCount\() == 1 + ld.d $t3, $sp, InputBase_arg //InputBase + ld.d $t4, $sp, InputWidth_arg //InputWidth + sub.d $t3, $zero, $t3 # keep negative for lea usage below +.endif + ClearBlock \FilterCount\(), \OutputCount\() + beqz $t1, .L\KernelType\().\FilterCount\().\OutputCount\().HandlePostProcessing + +.L\KernelType\().\FilterCount\().\OutputCount\().ProcessNextRow: + move $t6, $t2 # reload kernel width remaining +.L\KernelType\().\FilterCount\().\OutputCount\().ProcessNextColumn: +.if \OutputCount\() == 1 + add.d $t7, $a3, $t3 + bgeu $t7, $t4, .L\KernelType\().\FilterCount\().\OutputCount\().SkipOverPadding +.endif +.if \OutputCount\() > 3 + li.d $s2, 2 + mul.d $s2, $a5, $s2 + add.d $t4, $a5, $s2 + + add.d $t4, $t4, $a3 # compute input plus 3 blocks +.endif +.if \FilterCount\() > 2 + li.d $s2, 2 + mul.d $s2, $s2, $a1 + add.d $t7, $a2, $s2 //t6 is rbx used by ComputeBlock +.endif +.ifeqs "\KernelType\()","Nchwc" +.if \BlockSize\() == 16 + .irp Index, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + ComputeBlock \KernelType\(), \FilterCount\(), \OutputCount\(), \Index\()*16*4, \Index\()*4 + .endr +.else + .irp Index, 0, 1, 2, 3, 4, 5, 6, 7 + ComputeBlock \KernelType\(), \FilterCount\(), \OutputCount\(), (\Index\()-4)*8*4, \Index\()*4 + .endr +.endif +.else + ComputeBlock \KernelType\(), \FilterCount\(), \OutputCount\(), 0, 0 +.endif +.L\KernelType\().\FilterCount\().\OutputCount\().SkipOverPadding: + add.d $a3, $a3, $t8 # advance input by dilation width +.ifeqs "\KernelType\()","Nchwc" + addi.d $a2, $a2, \BlockSize\()*\BlockSize\()*4 + # advance filter by 8i8o/16i16o block +.else + addi.d $a2, $a2, \BlockSize\()*4 # advance filter by 8o/16o block +.endif + addi.d $t6, $t6, -1 # decrement columns remaining + bnez $t6, .L\KernelType\().\FilterCount\().\OutputCount\().ProcessNextColumn + add.d $a3, $a3, $t5 +.if \OutputCount\() == 1 + ld.d $s0, $sp, DilatedInputWidth_arg #DilatedInputWidth + sub.d $t3, $t3, $s0 + # advance input base to next row +.endif + addi.d $t1, $t1, -1 # decrement rows remaining + bnez $t1, .L\KernelType\().\FilterCount\().\OutputCount\().ProcessNextRow + +// +// Handle post processing of the output block. +// +.L\KernelType\().\FilterCount\().\OutputCount\().HandlePostProcessing: + ld.w $a2, $sp, Flags_arg + +.if \FilterCount\() > 1 + ld.d $t6, $sp, OutputStride_arg +.endif + ld.d $a3, $sp, Bias_arg + bl MlasConvPostProcessFloat\Isa\()Filter\FilterCount\()Output\OutputCount\() +.endm +/*++ + +Macro Description: + + This macro generates code for the inner convolution kernel. + +Arguments: + + KernelType - Supplies the type of kernel to be generated. + + BlockSize - Supplies the number of elements per block. + + Isa - Supplies the instruction set architecture string for function tags. + + BiasFilter - Supplies a non-blank value if the address of the filter buffer + should be biased to point to the middle of a OIhw8i8o block in order to + reduce the code size from relative byte offsets. + +--*/ + + .macro SconvKernelFunction KernelType, BlockSize, Isa, BiasFilter + +/*++ + +Routine Description: + + This routine is the inner kernel to compute a convolution for the elements + of an output row for a set of filter rows. 
+ +Arguments: + + Input (a0) - Supplies the address of the input buffer. + + The address is biased to include padding blocks for the left width + dimension. The address is not biased to include padding rows for the + left height dimension these are accounted for in the outer kernel. + + Filter (a1) - Supplies the address of the filter buffer. + + Output (a2) - Supplies the address of the output buffer. + + StrideWidth (a3) - Supplies the length in bytes of the blocked stride width. + + DilationWidth (a4) - Supplies the length in bytes of the blocked dilation + width. + + FilterCount (a5) - Supplies the number of filters to process in this + iteration. + + InputStride (a6) - Supplies the length in bytes to advance the input buffer to + the next input row. + + FilterStride (a7)- Supplies the length in bytes to advance the filter buffer + to the next set of filters. + + OutputStride (sp,8*0) - Supplies the length in bytes to advance the output buffer + to the next output address associated with the next set of filters. + + KernelHeight (sp,8*1)- Supplies the height of the kernel to apply. This height may + be less than the original kernel height after removing any padding + rows. + + KernelWidth (sp, 8*2)- Supplies the width of the kernel to apply. + + InputBase (sp, 8*3)- Supplies the address of the valid input buffer. + + This parameter is similar to the Input parameter, but does not include + the padding blocks for the left width dimension. This parameter is used + with the following InputWidth parameter in order to validate that the + current input buffer address in bounds and not in the left or right + width padding region. + + InputWidth (sp, 8*4)- Supplies the length in bytes of the blocked input width. + + DilatedInputWidth (sp, 8*5)- Supplies the length in bytes to advance the input base + buffer to the next input row including dilation. + + OutputCountLeftPad (sp, 8*6)- Supplies the number of output elements that include + one or more padding elements from the left edge. + + OutputCount (sp, 8*7)- Supplies the number of output elements that do not include + any padding elements. + + OutputCountRightPad (sp, 8*8)- Supplies the number of output elements that include + one or more padding elements from the right edge. + + Bias (sp, 8*9)- Supplies the address of the bias buffer. + + Flags (sp, 8*10)- Supplies additional flags controlling the convolution operation, + especially post calculation options. + +Return Value: + + None. 
+ +--*/ + + FUNCTION_ENTRY MlasConv\KernelType\()FloatKernel\Isa\() + addi.d $sp, $sp, -SP_SIZE + st.d $s0, $sp, 0*8 + st.d $s1, $sp, 1*8 + st.d $s2, $sp, 2*8 + st.d $s3, $sp, 3*8 + st.d $s4, $sp, 4*8 + st.d $ra, $sp, 5*8 + ld.d $s0, $sp, SP_SIZE+0*8 + ld.d $s1, $sp, SP_SIZE+1*8 + ld.d $s2, $sp, SP_SIZE+2*8 + ld.d $s3, $sp, SP_SIZE+3*8 + st.d $s0, $sp, OutputStride_arg + st.d $s1, $sp, KernelHeight_arg + st.d $s2, $sp, KernelWidth_arg + st.d $s3, $sp, InputBase_arg + ld.d $s0, $sp, SP_SIZE+4*8 + ld.d $s1, $sp, SP_SIZE+5*8 + ld.d $s2, $sp, SP_SIZE+6*8 + ld.d $s3, $sp, SP_SIZE+7*8 + st.d $s0, $sp, InputWidth_arg + st.d $s1, $sp, DilatedInputWidth_arg + st.d $s2, $sp, OutputCountLeftPad_arg + st.d $s3, $sp, OutputCount_arg + ld.d $s0, $sp, SP_SIZE+8*8 + ld.d $s1, $sp, SP_SIZE+9*8 + ld.d $s2, $sp, SP_SIZE+10*8 + st.d $s0, $sp, OutputCountRightPad_arg + st.d $s1, $sp, Bias_arg + st.d $s2, $sp, Flags_arg + +.ifeqs "\BiasFilter\()","BiasFilter" + addi.d $a1, $a1,4*8*4 +.endif + st.d $a1, $sp, Filter_save_offset //store Filter + move $a1, $a7 + move $t5, $a6 + move $t8, $a4 # shuffle to Win64 register usage + move $t1, $a5 + move $a4, $a2 + move $a5, $a3 + + li.d $s0, 3 + beq $t1, $s0, .L\KernelType\().ProcessFilterCount3 + blt $t1, $s0, .L\KernelType\().ProcessFilterCountLessThan3 + ProcessFilterCountN SconvKernelFrame, \KernelType\(), 4 + b .L\KernelType\().ExitKernel + +.L\KernelType\().ProcessFilterCount3: + ProcessFilterCountN SconvKernelFrame, \KernelType\(), 3 + b .L\KernelType\().ExitKernel + +.L\KernelType\().ProcessFilterCountLessThan3: + li.d $s0,2 + blt $t1, $s0, .L\KernelType\().ProcessFilterCount1 + ProcessFilterCountN SconvKernelFrame, \KernelType\(), 2 + b .L\KernelType\().ExitKernel + +.L\KernelType\().ProcessFilterCount1: + ProcessFilterCountN SconvKernelFrame, \KernelType\(), 1 + +// +// Restore non-volatile registers and return. +// + +.L\KernelType\().ExitKernel: + ld.d $a1, $sp, Filter_save_offset //restore Filter + ld.d $s0, $sp, 0*8 + ld.d $s1, $sp, 1*8 + ld.d $s2, $sp, 2*8 + ld.d $s3, $sp, 3*8 + ld.d $s4, $sp, 4*8 + ld.d $ra, $sp, 5*8 + + addi.d $sp, $sp, SP_SIZE + jr $ra +.endm + +/*++ + +Macro Description: + + This macro generates code for the inner convolution kernel for the special + case of a depthwise separable convolution. + +Arguments: + + BlockSize - Supplies the number of elements per block. + + Isa - Supplies the instruction set architecture string for function tags. + +--*/ + + .macro SconvKernelDepthwiseFunction BlockSize, Isa + +/*++ + +Routine Description: + + This routine is the inner kernel to compute a convolution for the elements + of an output row for a set of filter rows. + + Depthwise separable convolutions are a form of grouped convolution where + the number of input and output channels per group are one. + +Arguments: + + Input a0 - Supplies the address of the input buffer. + + The address is biased to include padding blocks for the left width + dimension. The address is not biased to include padding rows for the + left height dimension these are accounted for in the outer kernel. + + Filter a1 - Supplies the address of the filter buffer. + + Output a2 - Supplies the address of the output buffer. + + StrideWidth a3 - Supplies the length in bytes of the blocked stride width. + + DilationWidth a4 - Supplies the length in bytes of the blocked dilation + width. + + InputStride a5 - Supplies the length in bytes to advance the input buffer + to the next input row. + + KernelHeight a6 - Supplies the height of the kernel to apply. 
This height may + be less than the original kernel height after removing any padding + rows. + + KernelWidth a7- Supplies the width of the kernel to apply. + + InputBase (sp, 0*8)- Supplies the address of the valid input buffer. + + This parameter is similar to the Input parameter, but does not include + the padding blocks for the left width dimension. This parameter is used + with the following InputWidth parameter in order to validate that the + current input buffer address in bounds and not in the left or right + width padding region. + + InputWidth (sp, 1*8)- Supplies the length in bytes of the blocked input width. + + DilatedInputWidth (sp, 2*8)- Supplies the length in bytes to advance the input base + buffer to the next input row including dilation. + + OutputCountLeftPad (sp, 3*8)- Supplies the number of output elements that include + one or more padding elements from the left edge. + + OutputCount (sp, 4*8)- Supplies the number of output elements that do not include + any padding elements. + + OutputCountRightPad (sp, 5*8)- Supplies the number of output elements that include + one or more padding elements from the right edge. + + Bias (sp, 6*8)- Supplies the address of the bias buffer. + + Flags (sp, 7*8)- Supplies additional flags controlling the convolution operation, + especially post calculation options. + +Return Value: + + None. + +--*/ + + FUNCTION_ENTRY MlasConvDepthwiseFloatKernel\Isa\() + addi.d $sp, $sp, -SP_SIZE + st.d $s0, $sp, 0*8 + st.d $s1, $sp, 1*8 + st.d $s2, $sp, 2*8 + st.d $s3, $sp, 3*8 + st.d $s4, $sp, 4*8 + st.d $ra, $sp, 5*8 + + st.d $a6, $sp, KernelHeight_arg + st.d $a7, $sp, KernelWidth_arg + + ld.d $s0, $sp, SP_SIZE+0*8 + ld.d $s1, $sp, SP_SIZE+1*8 + ld.d $s2, $sp, SP_SIZE+2*8 + ld.d $s3, $sp, SP_SIZE+3*8 + st.d $s0, $sp, InputBase_arg + st.d $s1, $sp, InputWidth_arg + st.d $s2, $sp, DilatedInputWidth_arg + st.d $s3, $sp, OutputCountLeftPad_arg + ld.d $s0, $sp, SP_SIZE+4*8 + ld.d $s1, $sp, SP_SIZE+5*8 + ld.d $s2, $sp, SP_SIZE+6*8 + ld.d $s3, $sp, SP_SIZE+7*8 + st.d $s0, $sp, OutputCount_arg + st.d $s1, $sp, OutputCountRightPad_arg + st.d $s2, $sp, Bias_arg + st.d $s3, $sp, Flags_arg +// +// Process the specified number of filter rows. +// + move $t8, $a4 // shuffle to Win64 register usage + move $t5, $a5 + move $a4, $a2 + move $a5, $a3 + ProcessFilterCountN SconvKernelDepthwiseFrame, Depthwise, 1 + +// +// Restore non-volatile registers and return. + ld.d $s0, $sp, 0*8 + ld.d $s1, $sp, 1*8 + ld.d $s2, $sp, 2*8 + ld.d $s3, $sp, 3*8 + ld.d $s4, $sp, 4*8 + ld.d $ra, $sp, 5*8 + addi.d $sp, $sp, SP_SIZE +// + jr $ra +.endm + +/*++ + +Macro Description: + + This macro generates code to compute the convolution for a vector of input + blocks and a vector of filter blocks to produce a matrix of output blocks + for a pointwise convolution. + +Arguments: + + Isa - Supplies the instruction set architecture string for function tags. + + BlockSize - Supplies the number of elements per block. + + FilterCount - Supplies the number of rows from the filter to process. + + OutputCount - Supplies the number of output blocks to produce. + +Implicit Arguments: + + (a0) - Supplies the address of the input buffer. + + (a1) - Supplies the FilterStride parameter (see function description). + + (s8) - Supplies the InputStride parameter (see function description). + + (a4) - Supplies the address of the output buffer. + + (a5) - Supplies the StrideWidth parameter (see function description). + + (s5) - Supplies the address of the filter buffer. 
+ +--*/ + + .macro ProcessPointwiseOutputCountN Isa, BlockSize, FilterCount, OutputCount + + move $a3, $a0 + move $a2, $t2 + ld.d $t1, $sp, InputChannels_arg + ClearBlock \FilterCount\(), \OutputCount\() + +.LPointwise.\FilterCount\().\OutputCount\().ProcessNextInputBlock: +.if \OutputCount\() > 3 + li.d $s0, 2 + mul $s0, $s0, $a5 + add.d $t4, $a5, $s0 + add.d $t4, $t4, $a3 # compute input plus 3 blocks +.endif +.if \FilterCount\() > 2 + li.d $s0, 2 # compute filter plus 2 rows + mul.d $s0, $s0, $a1 + add.d $t7, $a2, $s0 +.endif + +.if \BlockSize\() == 16 + .irp Index, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + ComputeBlock Pointwise, \FilterCount\(), \OutputCount\(), \Index\()*16*4, \Index\()*4 + .endr +.else + .irp Index, 0, 1, 2, 3, 4, 5, 6, 7 + ComputeBlock Pointwise, \FilterCount\(), \OutputCount\(), (\Index\()-4)*8*4, \Index\()*4 + .endr +.endif + add.d $a3, $a3, $t8 # advance input to next channel block + addi.d $a2, $a2, \BlockSize\()*\BlockSize\()*4 + # advance filter by 8i8o/16i16o block + addi.d $t1, $t1, -1 //InputChannels decrement input blocks remaining + bnez $t1, .LPointwise.\FilterCount\().\OutputCount\().ProcessNextInputBlock + +// +// Handle post processing of the output block. +// + ld.w $a2, $sp, Flags_arg #load flag +.if \FilterCount\() > 1 + ld.d $t6 ,$sp, OutputStride_arg #load .LSconvKernelPointwiseFrame_OutputStride +.endif + ld.d $a3, $sp, Bias_arg # load .LSconvKernelPointwiseFrame_Bias + bl MlasConvPostProcessFloat\Isa\()Filter\FilterCount\()Output\OutputCount\() +.endm + + .macro SconvKernelPointwiseFunction Isa, BiasFilter + +/*++ + +Routine Description: + + This routine is the inner kernel to compute a convolution for the elements + of an output row for a set of filter rows. + + Pointwise convolutions have a kernel size of one. To simplify this + implementation, no input padding is allowed, which matches typical usage in + models. + +Arguments: + + Input (a0) - Supplies the address of the input buffer. + + Filter (a1) - Supplies the address of the filter buffer. + + Output (a2) - Supplies the address of the output buffer. + + StrideWidth (a3) - Supplies the length in bytes of the blocked stride width. + + InputChannels (a4) - Supplies the number of input channels to process. + + FilterCount (a5) - Supplies the number of rows from the filter to process. + + InputStride (a6) - Supplies the length in bytes to advance the input buffer to + the next input channel of the same input row. + + FilterStride (a7) - Supplies the length in bytes to advance the filter buffer + to the next set of filters. + + OutputStride (sp+0) - Supplies the length in bytes to advance the output buffer + to the next output address associated with the next set of filters. + + OutputCount (sp+8) - Supplies the number of output elements. + + Bias (sp+16) - Supplies the address of the bias buffer. + + Flags (sp+24) - Supplies additional flags controlling the convolution operation, + especially post calculation options. + +Return Value: + + None. 
+ +--*/ + + FUNCTION_ENTRY MlasConvPointwiseFloatKernel\Isa\() + addi.d $sp, $sp, -SP_SIZE + st.d $s0, $sp, 0*8 + st.d $s1, $sp, 1*8 + st.d $s2, $sp, 2*8 + st.d $s3, $sp, 3*8 + st.d $s4, $sp, 4*8 + st.d $ra, $sp, 5*8 + + ld.d $s0, $sp, SP_SIZE+0*8 + ld.d $s1, $sp, SP_SIZE+1*8 + ld.d $s2, $sp, SP_SIZE+2*8 + ld.d $s3, $sp, SP_SIZE+3*8 + st.d $s0, $sp, OutputStride_arg + st.d $s1, $sp, OutputCount_arg + st.d $s2, $sp, Bias_arg + st.d $s3, $sp, Flags_arg + st.d $a4, $sp, InputChannels_arg + +.ifeqs "\BiasFilter\()","BiasFilter" + addi.d $t2, $a1, 4*8*4 +.else + move $t2, $a1 +.endif + + ld.d $t0, $sp, OutputCount_arg //OutputCount + move $a1, $a7 // FilterStride + move $t8, $a6 // InputStride + move $t1, $a5 // shuffle to Win64 register usage + move $a4, $a2 + move $a5, $a3 + +// +// Process the specified number of filter rows. +// + li.d $s0, 3 + beq $t1, $s0, .LPointwise.ProcessFilterCount3 + blt $t1, $s0, .LPointwise.ProcessFilterCountLessThan3 + ProcessPointwiseFilterCountN 4 + b .LPointwise.ExitKernel + +.LPointwise.ProcessFilterCount3: + ProcessPointwiseFilterCountN 3 + b .LPointwise.ExitKernel + +.LPointwise.ProcessFilterCountLessThan3: + li.d $s0, 2 + blt $t1, $s0, .LPointwise.ProcessFilterCount1 + ProcessPointwiseFilterCountN 2 + b .LPointwise.ExitKernel + +.LPointwise.ProcessFilterCount1: + ProcessPointwiseFilterCountN 1 + +// +// Restore non-volatile registers and return. +// +.LPointwise.ExitKernel: + + ld.d $s0, $sp, 0*8 + ld.d $s1, $sp, 1*8 + ld.d $s2, $sp, 2*8 + ld.d $s3, $sp, 3*8 + ld.d $s4, $sp, 4*8 + ld.d $ra, $sp, 5*8 + addi.d $sp, $sp, SP_SIZE + jr $ra +.endm diff --git a/onnxruntime/core/mlas/lib/loongarch64/SgemmKernelCommon.h b/onnxruntime/core/mlas/lib/loongarch64/SgemmKernelCommon.h new file mode 100644 index 0000000000000..93b109c90ae4f --- /dev/null +++ b/onnxruntime/core/mlas/lib/loongarch64/SgemmKernelCommon.h @@ -0,0 +1,35 @@ +/*++ + +Copyright (C) 2023 Loongson Technology Corporation Limited. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + SgemmKernelCommon.h + +Abstract: + + This module contains common kernel macros and structures for the single + precision matrix/matrix multiply operation (SGEMM). + +--*/ + +// +// Define the single precision parameters. +// + +#define LFgemmElementShift 2 +#define LFgemmElementSize (1 << LFgemmElementShift) +#define LFgemmYmmElementCount (32/LFgemmElementSize) + +#include "FgemmKernelCommon.h" + +// +// Define the typed instructions for single precision. +// + +FGEMM_TYPED_INSTRUCTION(xvfadd, xvfadd.s) +FGEMM_TYPED_INSTRUCTION(xvfmadd, xvfmadd.s) +FGEMM_TYPED_INSTRUCTION(xvldrepl, xvldrepl.w) +FGEMM_TYPED_INSTRUCTION(xvfmul, xvfmul.s) diff --git a/onnxruntime/core/mlas/lib/loongarch64/SgemmKernelLasx.S b/onnxruntime/core/mlas/lib/loongarch64/SgemmKernelLasx.S new file mode 100644 index 0000000000000..d537742016d01 --- /dev/null +++ b/onnxruntime/core/mlas/lib/loongarch64/SgemmKernelLasx.S @@ -0,0 +1,33 @@ +/*++ + +Copyright (C) 2023 Loongson Technology Corporation Limited. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + SgemmKernelLasx.s + +Abstract: + + This module implements the kernels for the single precision matrix/matrix + multiply operation (SGEMM). + + This implementation uses LASX instructions. + +--*/ + +#include "asmmacro.h" +#include "SgemmKernelCommon.h" +#include "FgemmKernelLasxCommon.h" + + + .text + +// +// Generate the GEMM kernel. 
+// + +FgemmKernelLasxFunction MlasGemmFloatKernelLasx + + .end diff --git a/onnxruntime/core/mlas/lib/loongarch64/SgemmKernelLsx.S b/onnxruntime/core/mlas/lib/loongarch64/SgemmKernelLsx.S new file mode 100644 index 0000000000000..86b5ef8b51b00 --- /dev/null +++ b/onnxruntime/core/mlas/lib/loongarch64/SgemmKernelLsx.S @@ -0,0 +1,267 @@ +/*++ + +Copyright (C) 2023 Loongson Technology Corporation Limited. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + SgemmKernelLsx.s + +Abstract: + + This module implements the kernels for the single precision matrix/matrix + multiply operation (SGEMM). + + This implementation uses Lsx instructions. + +--*/ + +#include "asmmacro.h" +#include "FgemmKernelLsxCommon.h" + +FGEMM_TYPED_INSTRUCTION(vfadd, vfadd.s) + +/*++ + +Macro Description: + + This macro multiplies and accumulates for a 16xN block of the output matrix. + +Arguments: + + RowCount - Supplies the number of rows to process. + + VectorOffset - Supplies the byte offset from matrix B to fetch elements. + + Shuffle - Supplies the shuffle mask to extract the element from matrix A. + +Implicit Arguments: + + a1 - Supplies the address into the matrix B data. + + vr0-vr1 - Supplies up to four elements loaded from matrix A and matrix A + plus one row. + + vr8-vr15 - Supplies the block accumulators. + +--*/ + + .macro ComputeBlockSseBy16 RowCount, VectorOffset, Shuffle + vld $vr4, $a1, \VectorOffset + vld $vr5, $a1, \VectorOffset + 16 + vreplvei.w $vr2, $vr0, \Shuffle +.if \RowCount\() == 2 + vreplvei.w $vr3, $vr1, \Shuffle + vmove $vr6, $vr4 + vmove $vr7, $vr5 +.endif + vfmadd.s $vr8, $vr4, $vr2, $vr8 + vfmadd.s $vr9, $vr5, $vr2, $vr9 +.if \RowCount\() == 2 + vfmadd.s $vr12, $vr6, $vr3, $vr12 + vfmadd.s $vr13, $vr7, $vr3, $vr13 +.endif + vld $vr4, $a1, \VectorOffset + 32 + vld $vr5, $a1, \VectorOffset + 48 +.if \RowCount\() == 2 + vmove $vr6, $vr4 + vmove $vr7, $vr5 +.endif + vfmadd.s $vr10, $vr4, $vr2, $vr10 + vfmadd.s $vr11, $vr5, $vr2, $vr11 +.if \RowCount\() == 2 + vfmadd.s $vr14, $vr6, $vr3, $vr14 + vfmadd.s $vr15, $vr7, $vr3, $vr15 +.endif + .endm + + +/*++ + +Macro Description: + + This macro generates code to compute matrix multiplication for a fixed set + of rows. + +Arguments: + + RowCount - Supplies the number of rows to process. + + Fallthrough - Supplies a non-blank value if the macro may fall through to + the ExitKernel label. + +Implicit Arguments: + + a0 - Supplies the address of matrix A. + + a1 - Supplies the address of matrix B. + + t8 - Supplies the address of matrix A. + + a5 - Supplies the number of columns from matrix B and matrix C to iterate + over. + + a2 - Supplies the address of matrix C. + + a3 - Supplies the number of columns from matrix A and the number of rows + from matrix B to iterate over. + + t7 - Supplies the length in bytes of a row from matrix A. + + t5 - Supplies the length in bytes of a row from matrix C. + + s3 - Stores the ZeroMode argument from the stack frame. 
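To map the register choreography back to the math, here is a minimal scalar sketch (not from the patch) of what the micro-kernel produces. Assumptions: alpha is the value broadcast from $f24, ZeroMode selects store versus accumulate, indices are in elements rather than bytes, and B is shown in plain row-major form instead of the packed 16-column panels the kernel actually reads.

```cpp
#include <cstddef>

// Illustrative contract of the LSX SGEMM micro-kernel: C = alpha * A * B,
// stored over C (ZeroMode) or accumulated into the existing C contents.
void SgemmMicroKernelSketch(
    const float* A, size_t lda,   // RowCount x K, row stride lda
    const float* B,               // K x N (row-major here for clarity)
    float* C, size_t ldc,         // RowCount x N, row stride ldc
    size_t RowCount, size_t K, size_t N,
    float alpha, bool ZeroMode)
{
    for (size_t m = 0; m < RowCount; m++) {         // kernel handles 1 or 2 rows
        for (size_t n = 0; n < N; n++) {
            float sum = 0.0f;
            for (size_t k = 0; k < K; k++) {
                sum += A[m * lda + k] * B[k * N + n];
            }
            float value = alpha * sum;
            C[m * ldc + n] = ZeroMode ? value : C[m * ldc + n] + value;
        }
    }
}
```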
+ +--*/ + + .macro ProcessCountM RowCount, Fallthrough +.LProcessNextColumnLoop16xN\@: + EmitIfCountGE \RowCount\(), 1, "vxor.v $vr8, $vr8,$vr8" + EmitIfCountGE \RowCount\(), 1, "vxor.v $vr9, $vr9,$vr9" + EmitIfCountGE \RowCount\(), 1, "vxor.v $vr10, $vr10,$vr10" + EmitIfCountGE \RowCount\(), 1, "vxor.v $vr11, $vr11,$vr11" + EmitIfCountGE \RowCount\(), 2, "vxor.v $vr12, $vr12,$vr12" + EmitIfCountGE \RowCount\(), 2, "vxor.v $vr13, $vr13,$vr13" + EmitIfCountGE \RowCount\(), 2, "vxor.v $vr14, $vr14,$vr14" + EmitIfCountGE \RowCount\(), 2, "vxor.v $vr15, $vr15,$vr15" + move $t8, $a3 + li.d $s0, 4 + blt $t8, $s0, .LProcessRemaining16xNBlocks\@ +.LCompute16xNBlockBy4Loop\@: + EmitIfCountGE \RowCount\(), 1, "vld $vr0, $a0, 0" + EmitIfCountGE \RowCount\(), 2, "vldx $vr1, $a0, $t0" #second line of A + ComputeBlockSseBy16 2, 0, 0x0 + ComputeBlockSseBy16 2, 16*4, 0x1 + addi.d $a1, $a1, 32*4 # advance matrix B by 32 columns + ComputeBlockSseBy16 2, 0, 0x2 + ComputeBlockSseBy16 2, 16*4, 0x3 + addi.d $a1, $a1, 32*4 # advance matrix B by 32 columns + addi.d $a0, $a0, 4*4 # advance matrix A by 4 columns + addi.d $t8, $t8, -4 + li.d $s0, 4 #check matrix A remaining less than 4 + bge $t8, $s0, .LCompute16xNBlockBy4Loop\@ + +.LProcessRemaining16xNBlocks\@: + beqz $t8, .LOutput16xNBlock\@ + +.LCompute16xNBlockBy1Loop\@: + EmitIfCountGE \RowCount\(), 1, "ld.w $s0, $a0, 0" + EmitIfCountGE \RowCount\(), 1, "vinsgr2vr.w $vr0, $s0, 0" + EmitIfCountGE \RowCount\(), 2, "ldx.w $s0,$a0, $t0" + EmitIfCountGE \RowCount\(), 2, "vinsgr2vr.w $vr1,$s0, 0" + ComputeBlockSseBy16 2, 0, 0x00 + addi.d $a1, $a1, 16*4 #advance matrix B by 16 columns + addi.d $a0, $a0, 1*4 #advance matrix A by 1 column + addi.d $t8, $t8, -1 + bnez $t8, .LCompute16xNBlockBy1Loop\@ + +.LOutput16xNBlock\@: + movfr2gr.s $s0, $f24 + vreplgr2vr.w $vr2, $s0 + EmitIfCountGE \RowCount\(), 1, "vfmul.s $vr8,$vr8,$vr2" + # multiply by alpha + EmitIfCountGE \RowCount\(), 1, "vfmul.s $vr9,$vr9,$vr2" + EmitIfCountGE \RowCount\(), 1, "vfmul.s $vr10,$vr10,$vr2" + EmitIfCountGE \RowCount\(), 1, "vfmul.s $vr11,$vr11,$vr2" + EmitIfCountGE \RowCount\(), 2, "vfmul.s $vr12,$vr12,$vr2" + EmitIfCountGE \RowCount\(), 2, "vfmul.s $vr13,$vr13,$vr2" + EmitIfCountGE \RowCount\(), 2, "vfmul.s $vr14,$vr14,$vr2" + EmitIfCountGE \RowCount\(), 2, "vfmul.s $vr15,$vr15,$vr2" + li.d $s0, 16 + blt $a5, $s0, .LOutputPartial16xNBlock\@ + sub.d $a5, $a5, $s0 + AccumulateAndStoreBlock \RowCount\(), 4 + addi.d $a2, $a2, 16*4 # advance matrix C by 16 columns + move $a0, $t1 # reload matrix A + bnez $a5, .LProcessNextColumnLoop16xN\@ + b .LExitKernel + +// +// Output a partial 16xN block to the matrix. 
+// + +.LOutputPartial16xNBlock\@: + li.d $s0, 4 + blt $a5, $s0, .LOutputPartialLessThan4xNBlock\@ + li.d $s0, 8 + blt $a5, $s0, .LOutputPartialLessThan8xNBlock\@ + li.d $s0, 12 + blt $a5, $s0, .LOutputPartialLessThan12xNBlock\@ + AccumulateAndStoreBlock \RowCount\(), 3 + andi $a5, $a5, 3 + beqz $a5, .LExitKernel + EmitIfCountGE \RowCount\(), 1, "vmove $vr8, $vr11" + # shift remaining elements down + EmitIfCountGE \RowCount\(), 2, "vmove $vr12, $vr15" + addi.d $a2, $a2,12*4 # advance matrix C by 12 columns + b .LOutputPartialLessThan4xNBlock\@ + +.LOutputPartialLessThan12xNBlock\@: + AccumulateAndStoreBlock \RowCount\(), 2 + andi $a5, $a5, 3 + beqz $a5, .LExitKernel + EmitIfCountGE \RowCount\(), 1, "vmove $vr8, $vr10" + # shift remaining elements down + EmitIfCountGE \RowCount\(), 2, "vmove $vr12, $vr14" + addi.d $a2, $a2,8*4 # advance matrix C by 8 columns + b .LOutputPartialLessThan4xNBlock\@ + +.LOutputPartialLessThan8xNBlock\@: + AccumulateAndStoreBlock \RowCount\(), 1 + andi $a5, $a5, 3 + beqz $a5, .LExitKernel + EmitIfCountGE \RowCount\(), 1, "vmove $vr8, $vr9" + # shift remaining elements down + EmitIfCountGE \RowCount\(), 2, "vmove $vr12, $vr13" + addi.d $a2, $a2, 4*4 # advance matrix C by 4 columns + +.LOutputPartialLessThan4xNBlock\@: + andi $s0, $a5, 2 + beqz $s0, .LOutputPartial1xNBlock\@ + and $s0, $t5, $t5 # ZeroMode? + bnez $s0, .LSkipAccumulateOutput2xN\@ + EmitIfCountGE \RowCount\(), 1, "vxor.v $vr0, $vr0, $vr0" + EmitIfCountGE \RowCount\(), 1, "ld.d $s0, $a2, 0" + EmitIfCountGE \RowCount\(), 1, "vinsgr2vr.d $vr0, $s0, 0" + EmitIfCountGE \RowCount\(), 2, "vxor.v $vr1, $vr1, $vr1" + EmitIfCountGE \RowCount\(), 2, "ldx.d $s0, $a2, $t6" + EmitIfCountGE \RowCount\(), 2, "vinsgr2vr.d $vr1, $s0, 0" + EmitIfCountGE \RowCount\(), 1, "vfadd.s $vr8, $vr8, $vr0" + EmitIfCountGE \RowCount\(), 2, "vfadd.s $vr12, $vr12, $vr1" + +.LSkipAccumulateOutput2xN\@: + EmitIfCountGE \RowCount\(), 1, "vstelm.d $vr8, $a2, 0, 0" + EmitIfCountGE \RowCount\(), 2, "vpickve2gr.d $s0, $vr12, 0" + EmitIfCountGE \RowCount\(), 2, "stx.d $s0, $a2, $t6" + andi $s0, $a5, 1 + beqz $s0, .LExitKernel + EmitIfCountGE \RowCount\(), 1, "vpermi.w $vr8, $vr8, 0xee" + # shift third element down + EmitIfCountGE \RowCount\(), 2, "vpermi.w $vr12, $vr12, 0xee" + addi.d $a2, $a2, 2*4 # advance matrix C by 2 columns + +.LOutputPartial1xNBlock\@: + and $s0, $t5, $t5 # ZeroMode? + bnez $s0, .LSkipAccumulateOutput1xN\@ + + EmitIfCountGE \RowCount\(), 1, "fld.s $f16, $a2, 0" + EmitIfCountGE \RowCount\(), 1, "fadd.s $f8, $f16, $f8" + EmitIfCountGE \RowCount\(), 2, "fldx.s $f17, $a2, $t6" + EmitIfCountGE \RowCount\(), 2, "fadd.s $f12, $f12, $f17" + +.LSkipAccumulateOutput1xN\@: + EmitIfCountGE \RowCount\(), 1, "fst.s $f8, $a2, 0" + EmitIfCountGE \RowCount\(), 2, "fstx.s $f12, $a2, $t6" +.ifb \Fallthrough\() + b .LExitKernel +.endif + .endm + +// +// Generate the GEMM kernel. +// + +FgemmKernelLsxFunction MlasGemmFloatKernelLSX + + .end diff --git a/onnxruntime/core/mlas/lib/loongarch64/SgemmTransposePackB16x4LSX.S b/onnxruntime/core/mlas/lib/loongarch64/SgemmTransposePackB16x4LSX.S new file mode 100644 index 0000000000000..cd1747745d2a4 --- /dev/null +++ b/onnxruntime/core/mlas/lib/loongarch64/SgemmTransposePackB16x4LSX.S @@ -0,0 +1,89 @@ +/*++ + +Copyright (C) 2023 Loongson Technology Corporation Limited. All rights reserved. + +Licensed under the MIT License. 
+ +Module Name: + + SgemmTransposePackB16x4LSX.s + +Abstract: + + This module implements routines for packing buffers for the single precision + matrix/matrix multiply operation (SGEMM). + + This implementation uses Lsx instructions. + +--*/ + +#include "asmmacro.h" + + .text + +/*++ + +Routine Description: + + This routine transposes elements from the source matrix to the destination + packed buffer. + + 4 columns of 16 rows from the source matrix are transposed to 16 columns of 4 + rows in the destination packed buffer. + +Arguments: + + D (a0) - Supplies the address of the destination packed buffer. + + B (a1) - Supplies the address of the source matrix. + + ldb (a2) - Supplies the number of elements per row of the source matrix. + +Return Value: + + None. + +--*/ + + FUNCTION_ENTRY MlasSgemmTransposePackB16x4LSX + addi.d $sp, $sp, -64 + st.d $s0, $sp, 0*8 + st.d $s1, $sp, 1*8 + slli.d $a2, $a2, 2 # convert ldb to bytes + ori $a3, $zero, 4 # transpose four 4x4 blocks + vxor.v $vr7, $vr7, $vr7 +.LTransposeBlockLoop: + slli.d $s0, $a2, 1 + add.d $s1, $a1, $s0 + vld $vr0, $a1, 0 + vldx $vr1, $a1, $a2 + vld $vr2, $s1, 0 + vldx $vr3, $s1, $a2 + + vor.v $vr4, $vr0, $vr7 + vilvl.w $vr4, $vr1, $vr4 + vilvh.w $vr0, $vr1, $vr0 + vor.v $vr5, $vr2, $vr7 + vilvl.w $vr5, $vr3, $vr5 + vilvh.w $vr2, $vr3, $vr2 + vor.v $vr1, $vr4, $vr7 + vilvl.d $vr1, $vr5, $vr1 + vilvh.d $vr4, $vr5, $vr4 + vor.v $vr3, $vr0, $vr7 + vilvl.d $vr3, $vr2, $vr3 + vilvh.d $vr0, $vr2, $vr0 + vst $vr1, $a0, 0 + vst $vr4, $a0, 0x40 + vst $vr3, $a0, 0x80 + vst $vr0, $a0, 0xc0 + addi.d $a0, $a0, 0x10 + slli.d $s0, $a2, 1 + add.d $a1, $s0, $s1 + addi.d $a3, $a3, -1 + bnez $a3, .LTransposeBlockLoop + ld.d $s0, $sp, 0*8 + ld.d $s1, $sp, 1*8 + addi.d $sp, $sp, 64 + jr $ra + + .end diff --git a/onnxruntime/core/mlas/lib/loongarch64/SgemmTransposePackB16x4Lasx.S b/onnxruntime/core/mlas/lib/loongarch64/SgemmTransposePackB16x4Lasx.S new file mode 100644 index 0000000000000..e617419989c4d --- /dev/null +++ b/onnxruntime/core/mlas/lib/loongarch64/SgemmTransposePackB16x4Lasx.S @@ -0,0 +1,126 @@ +/*++ + +Copyright (C) 2023 Loongson Technology Corporation Limited. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + SgemmTransposePackB16x4Lasx.s + +Abstract: + + This module implements routines for packing buffers for the single precision + matrix/matrix multiply operation (SGEMM). + + This implementation uses Lasx instructions. + +--*/ + +#include "asmmacro.h" + + .text + +/*++ + +Macro Description: + + 4 columns of 8 rows from the source matrix are transposed to 8 columns of 4 + rows in the destination packed buffer. + +Arguments: + + StoreOffset - Supplies the relative byte offset into the destination packed + buffer. + +Implicit Arguments: + + a0 - Supplies the address of the destination packed buffer. + + a1 - Supplies the address of the source matrix. + + a2 - Supplies the number of elements per row of the source matrix. + +--*/ + + .macro TransposePackB8x4BlockLasx StoreOffset + +// +// Load 4 columns from 8 rows of the source matrix into the lower and upper +// halves of 4 XR registers. 
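For reference, the 16x4 packing performed by both the LSX and LASX transpose routines, as a scalar sketch (hypothetical function name; the assembly achieves the same data movement with vector interleaves over 4x4 sub-blocks):

```cpp
#include <cstddef>

// Illustrative reference for MlasSgemmTransposePackB16x4*: a 16-row by
// 4-column tile of B (row stride ldb elements) becomes a packed 4x16 tile
// in D, so that each packed row holds one source column.
void TransposePackB16x4Sketch(float* D, const float* B, size_t ldb)
{
    for (size_t row = 0; row < 16; row++) {
        for (size_t col = 0; col < 4; col++) {
            D[col * 16 + row] = B[row * ldb + col];
        }
    }
}
```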
+// + + add.d $t0, $a2, $a2 + add.d $t6, $a1, $t0 + vld $vr0, $a1, 0 + vldx $vr1, $a1, $a2 + add.d $t0, $a2, $a2 + add.d $a1, $t6, $t0 + vld $vr2, $t6, 0 + vldx $vr3, $t6, $a2 + add.d $t0, $a2, $a2 + add.d $t6, $a1, $t0 + + vld $vr4, $a1, 0 + xvpermi.q $xr0, $xr4, 0x2 + vldx $vr5, $a1, $a2 + xvpermi.q $xr1, $xr5, 0x2 + vld $vr4, $t6, 0 + xvpermi.q $xr2, $xr4, 0x2 + vldx $vr5, $t6, $a2 + xvpermi.q $xr3, $xr5, 0x2 + +// +// Transpose the lower and upper halves of the 4 XR registers as two 4x4 +// matrices and store the output to the destination packed buffer. +// + + xvilvl.w $xr4, $xr1, $xr0 + xvilvh.w $xr5, $xr1, $xr0 + xvilvl.w $xr0, $xr3, $xr2 + xvilvh.w $xr1, $xr3, $xr2 + xvilvl.d $xr2, $xr0, $xr4 + xvilvh.d $xr3, $xr0, $xr4 + xvst $xr2, $a0, \StoreOffset\() + xvst $xr3, $a0, 0x40+\StoreOffset\() + xvilvl.d $xr0, $xr1, $xr5 + xvilvh.d $xr4, $xr1, $xr5 + xvst $xr0, $a0, 0x80+\StoreOffset\() + xvst $xr4, $a0, 0xc0+\StoreOffset\() + + .endm + +/*++ + +Routine Description: + + This routine transposes elements from the source matrix to the destination + packed buffer. + + 4 columns of 16 rows from the source matrix are transposed to 16 columns of 4 + rows in the destination packed buffer. + +Arguments: + + D (a0) - Supplies the address of the destination packed buffer. + + B (a1) - Supplies the address of the source matrix. + + ldb (a2) - Supplies the number of elements per row of the source matrix. + +Return Value: + + None. + +--*/ + + FUNCTION_ENTRY MlasSgemmTransposePackB16x4Lasx + + slli.d $a2, $a2, 2 # convert ldb to bytes + TransposePackB8x4BlockLasx 0*4 + add.d $t0, $a2, $a2 + add.d $a1, $t0, $t6 + TransposePackB8x4BlockLasx 8*4 + jr $ra + + .end diff --git a/onnxruntime/core/mlas/lib/loongarch64/SoftmaxKernelLasx.S b/onnxruntime/core/mlas/lib/loongarch64/SoftmaxKernelLasx.S new file mode 100644 index 0000000000000..aaaa3cbf9138d --- /dev/null +++ b/onnxruntime/core/mlas/lib/loongarch64/SoftmaxKernelLasx.S @@ -0,0 +1,357 @@ +/*++ + +Copyright (C) 2023 Loongson Technology Corporation Limited. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + SoftmaxKernelLasx.s + +Abstract: + + This module implements the kernels for the single precision softmax + operation. + + This implementation uses Lasx instructions. + +--*/ + +#include "asmmacro.h" + + .text + +/*++ + +Routine Description: + + This routine implements a vectorized kernel to find the maximum value of + the supplied buffer. + +Arguments: + + Input (a0) - Supplies the input buffer. + + N (a1) - Supplies the number of elements to process. + +Return Value: + + Returns the maximum value of the supplied buffer. 
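A scalar sketch of the reduction (not from the patch), assuming MlasMinimumF32Value is the most negative finite float, consistent with the 0xFF7FFFFF constant used by the pooling kernels in this patch. The vector code keeps four 8-wide accumulators to consume 32 elements per iteration before folding them down to one scalar.

```cpp
#include <cfloat>
#include <cstddef>

// Illustrative reference for the maximum reduction: the accumulator starts
// at -FLT_MAX, which is also what the kernel returns for an empty buffer.
float ReduceMaximumSketch(const float* Input, size_t N)
{
    float maximum = -FLT_MAX;
    for (size_t i = 0; i < N; i++) {
        maximum = (Input[i] > maximum) ? Input[i] : maximum;
    }
    return maximum;
}
```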
+ +--*/ + + FUNCTION_ENTRY MlasReduceMaximumF32KernelLasx + addi.d $sp, $sp, -32 + + la.global $t0, MlasMinimumF32Value + ld.w $t0, $t0, 0 + xvreplgr2vr.w $xr0, $t0 + beqz $a1, .LReduceMaximum.ExitKernel + ori $t0, $zero, 8 + bltu $a1, $t0, .LReduceMaximum.ProcessRemainingCountBy1 + ori $t1, $zero, 32 + bltu $a1, $t1, .LReduceMaximum.ProcessRemainingCountBy8 + xvreplgr2vr.w $xr16, $zero + xvor.v $xr1, $xr0, $xr16 + xvor.v $xr2, $xr0, $xr16 + xvor.v $xr3, $xr0, $xr16 + +.LReduceMaximum.ProcessRemainingCountBy32: + xvld $xr16, $a0, 0 + xvfmax.s $xr0, $xr0, $xr16 + xvld $xr16, $a0, 8*4 + xvfmax.s $xr1, $xr1, $xr16 + addi.d $a1, $a1, -0x20 + xvld $xr16, $a0, 16*4 + xvfmax.s $xr2, $xr2, $xr16 + xvld $xr16, $a0, 24*4 + xvfmax.s $xr3, $xr3, $xr16 + addi.d $a0, $a0, 32*4 # advance input by 32 elements + ori $t1, $zero, 32 + bgeu $a1, $t1, .LReduceMaximum.ProcessRemainingCountBy32 + xvfmax.s $xr0, $xr0, $xr1 + xvfmax.s $xr2, $xr2, $xr3 + xvfmax.s $xr0, $xr0, $xr2 + +.LReduceMaximum.ProcessRemainingCountBy8: + ori $t1, $zero, 8 + bltu $a1, $t1, .LReduceMaximum.ProcessRemainingCountLessThan8 + xvld $xr16, $a0, 0 + xvfmax.s $xr0, $xr0, $xr16 + addi.d $a1, $a1, -8 + addi.d $a0, $a0, 8*4 + b .LReduceMaximum.ProcessRemainingCountBy8 + +.LReduceMaximum.ProcessRemainingCountLessThan8: + xvst $xr0, $sp, 0 + vld $vr1, $sp, 0x10 + vld $vr0, $sp, 0 + vfmax.s $vr0, $vr0, $vr1 + vshuf4i.w $vr1, $vr0, 0xee + vfmax.s $vr0, $vr0, $vr1 + vshuf4i.w $vr1, $vr0, 0x55 + vfmax.s $vr0, $vr0, $vr1 + beqz $a1, .LReduceMaximum.ExitKernel + +.LReduceMaximum.ProcessRemainingCountBy1: + vld $vr16, $a0, 0 + vfmax.s $vr0, $vr0, $vr16 + addi.d $a0, $a0, 4 # advance input by 1 element + addi.d $a1, $a1, -1 + bnez $a1, .LReduceMaximum.ProcessRemainingCountBy1 + +.LReduceMaximum.ExitKernel: + xvinsgr2vr.d $xr0, $zero, 2 + xvinsgr2vr.d $xr0, $zero, 3 + xvinsgr2vr.d $xr1, $zero, 2 + xvinsgr2vr.d $xr1, $zero, 3 + xvinsgr2vr.d $xr2, $zero, 2 + xvinsgr2vr.d $xr2, $zero, 3 + xvinsgr2vr.d $xr3, $zero, 2 + xvinsgr2vr.d $xr3, $zero, 3 + xvinsgr2vr.d $xr4, $zero, 2 + xvinsgr2vr.d $xr4, $zero, 3 + xvinsgr2vr.d $xr5, $zero, 2 + xvinsgr2vr.d $xr5, $zero, 3 + xvinsgr2vr.d $xr6, $zero, 2 + xvinsgr2vr.d $xr6, $zero, 3 + xvinsgr2vr.d $xr7, $zero, 2 + xvinsgr2vr.d $xr7, $zero, 3 + xvinsgr2vr.d $xr8, $zero, 2 + xvinsgr2vr.d $xr8, $zero, 3 + xvinsgr2vr.d $xr9, $zero, 2 + xvinsgr2vr.d $xr9, $zero, 3 + xvinsgr2vr.d $xr10, $zero, 2 + xvinsgr2vr.d $xr10, $zero, 3 + xvinsgr2vr.d $xr11, $zero, 2 + xvinsgr2vr.d $xr11, $zero, 3 + xvinsgr2vr.d $xr12, $zero, 2 + xvinsgr2vr.d $xr12, $zero, 3 + xvinsgr2vr.d $xr13, $zero, 2 + xvinsgr2vr.d $xr13, $zero, 3 + xvinsgr2vr.d $xr14, $zero, 2 + xvinsgr2vr.d $xr14, $zero, 3 + xvinsgr2vr.d $xr15, $zero, 2 + xvinsgr2vr.d $xr15, $zero, 3 + addi.d $sp, $sp, 32 + jr $ra + +/*++ + +Routine Description: + + This routine implements a vectorized kernel to produce the final output for + the softmax operation. + +Arguments: + + Output (a0) - Supplies the output buffer. + + N (a1) - Supplies the number of elements to process. + + Parameters (a2) - Supplies an array containing the scale value. + +Return Value: + + None. 
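The arithmetic here is a single in-place scale; the vector kernel only adds 32-wide and 8-wide blocking plus a scalar tail. A sketch (not from the patch), assuming Parameters[0] carries the scale value, typically the reciprocal of the exponential sum computed by the caller:

```cpp
#include <cstddef>

// Illustrative reference for the final softmax scaling pass.
void ComputeSoftmaxOutputSketch(float* Output, size_t N, const float* Parameters)
{
    const float scale = Parameters[0];
    for (size_t i = 0; i < N; i++) {
        Output[i] *= scale;       // scale each exponentiated value in place
    }
}
```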
+ +--*/ + + FUNCTION_ENTRY MlasComputeSoftmaxOutputF32KernelLasx + + ld.w $t0, $a2, 0 + xvreplgr2vr.w $xr4, $t0 + ori $t1, $zero, 0x20 + bltu $a1, $t1, .LComputeSoftmaxOutput.ProcessRemainingCountBy8 + +.LComputeSoftmaxOutput.ProcessRemainingCountBy32: + xvld $xr16, $a0, 0 + xvfmul.s $xr0, $xr4, $xr16 + xvld $xr16, $a0, 8*4 + xvfmul.s $xr1, $xr4, $xr16 + addi.d $a1, $a1, -0x20 + xvld $xr16, $a0, 16*4 + xvfmul.s $xr2, $xr4, $xr16 + xvld $xr16, $a0, 24*4 + xvfmul.s $xr3, $xr4, $xr16 + xvst $xr0, $a0, 0 + xvst $xr1, $a0, 8*4 + xvst $xr2, $a0, 16*4 + xvst $xr3, $a0, 24*4 + addi.d $a0, $a0, 0x80 # advance output by 32 elements + bgeu $a1, $t1, .LComputeSoftmaxOutput.ProcessRemainingCountBy32 + +.LComputeSoftmaxOutput.ProcessRemainingCountBy8: + ori $t2, $zero, 8 + bltu $a1, $t2, .LComputeSoftmaxOutput.ProcessRemainingCountLessThan8 + xvld $xr16, $a0, 0 + xvfmul.s $xr0, $xr4, $xr16 + addi.d $a1, $a1, -8 + xvst $xr0, $a0, 0 + addi.d $a0, $a0, 8*4 # advance output by 8 elements + b .LComputeSoftmaxOutput.ProcessRemainingCountBy8 + +.LComputeSoftmaxOutput.ProcessRemainingCountLessThan8: + beqz $a1, .LComputeSoftmaxOutput.ExitKernel + +.LComputeSoftmaxOutput.ProcessRemainingCountBy1: + fld.s $f16, $a0, 0 + fmul.s $f0, $f4, $f16 + fst.s $f0, $a0, 0 + addi.d $a0, $a0, 4 # advance output by 1 element + addi.d $a1, $a1, -1 + bnez $a1, .LComputeSoftmaxOutput.ProcessRemainingCountBy1 + +.LComputeSoftmaxOutput.ExitKernel: + xvinsgr2vr.d $xr0, $zero, 2 + xvinsgr2vr.d $xr0, $zero, 3 + xvinsgr2vr.d $xr1, $zero, 2 + xvinsgr2vr.d $xr1, $zero, 3 + xvinsgr2vr.d $xr2, $zero, 2 + xvinsgr2vr.d $xr2, $zero, 3 + xvinsgr2vr.d $xr3, $zero, 2 + xvinsgr2vr.d $xr3, $zero, 3 + xvinsgr2vr.d $xr4, $zero, 2 + xvinsgr2vr.d $xr4, $zero, 3 + xvinsgr2vr.d $xr5, $zero, 2 + xvinsgr2vr.d $xr5, $zero, 3 + xvinsgr2vr.d $xr6, $zero, 2 + xvinsgr2vr.d $xr6, $zero, 3 + xvinsgr2vr.d $xr7, $zero, 2 + xvinsgr2vr.d $xr7, $zero, 3 + xvinsgr2vr.d $xr8, $zero, 2 + xvinsgr2vr.d $xr8, $zero, 3 + xvinsgr2vr.d $xr9, $zero, 2 + xvinsgr2vr.d $xr9, $zero, 3 + xvinsgr2vr.d $xr10, $zero, 2 + xvinsgr2vr.d $xr10, $zero, 3 + xvinsgr2vr.d $xr11, $zero, 2 + xvinsgr2vr.d $xr11, $zero, 3 + xvinsgr2vr.d $xr12, $zero, 2 + xvinsgr2vr.d $xr12, $zero, 3 + xvinsgr2vr.d $xr13, $zero, 2 + xvinsgr2vr.d $xr13, $zero, 3 + xvinsgr2vr.d $xr14, $zero, 2 + xvinsgr2vr.d $xr14, $zero, 3 + xvinsgr2vr.d $xr15, $zero, 2 + xvinsgr2vr.d $xr15, $zero, 3 + jr $ra + +/*++ + +Routine Description: + + This routine implements a vectorized kernel to produce the final output for + the log softmax operation. + +Arguments: + + Input (a0) - Supplies the output buffer. + + Output (a1) - Supplies the output buffer. + + N (a2) - Supplies the number of elements to process. + + Parameters (a3) - Supplies an array containing the negative maximum and + logarithm values. + +Return Value: + + None. 
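A scalar sketch of the computation (not from the patch), with Parameters[0] as the negated row maximum and Parameters[1] as log(sum(exp(x - max))), applied as two separate operations exactly as the "do as two steps for numeric stability" comments in the vector loop describe:

```cpp
#include <cstddef>

// Illustrative reference for the log softmax output pass.
void ComputeLogSoftmaxOutputSketch(
    const float* Input, float* Output, size_t N, const float* Parameters)
{
    const float negative_maximum = Parameters[0];
    const float logarithm = Parameters[1];
    for (size_t i = 0; i < N; i++) {
        // Add the negated maximum first, then subtract log(SumExp).
        Output[i] = (Input[i] + negative_maximum) - logarithm;
    }
}
```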
+ +--*/ + + FUNCTION_ENTRY MlasComputeLogSoftmaxOutputF32KernelLasx + + ld.w $t0, $a3, 0 + ld.w $t1, $a3, 4 + ori $t2, $zero, 0x20 + xvreplgr2vr.w $xr4, $t0 # broadcast negative minimum value + xvreplgr2vr.w $xr5, $t1 # broadcast log(SumExp) + bltu $a2, $t2, .LComputeLogSoftmaxOutput.ProcessRemainingCountBy8 + +.LComputeLogSoftmaxOutput.ProcessRemainingCountBy32: + xvld $xr16, $a0, 0 + xvfadd.s $xr0, $xr4, $xr16 + xvld $xr16, $a0, 0x20 + xvfadd.s $xr1, $xr4, $xr16 + addi.d $a2, $a2, -0x20 + xvld $xr16, $a0, 0x40 + xvfadd.s $xr2, $xr4, $xr16 + xvld $xr16, $a0, 0x60 + xvfadd.s $xr3, $xr4, $xr16 + addi.d $a0, $a0, 0x80 # advance input by 32 elements + xvfsub.s $xr0, $xr0, $xr5 # do as two steps for numeric stability + xvfsub.s $xr1, $xr1, $xr5 # do as two steps for numeric stability + xvfsub.s $xr2, $xr2, $xr5 # do as two steps for numeric stability + xvfsub.s $xr3, $xr3, $xr5 # do as two steps for numeric stability + xvst $xr0, $a1, 0 + xvst $xr1, $a1, 0x20 + xvst $xr2, $a1, 0x40 + xvst $xr3, $a1, 0x60 + addi.d $a1, $a1, 0x80 # advance output by 32 elements + bgeu $a2, $t2, .LComputeLogSoftmaxOutput.ProcessRemainingCountBy32 + +.LComputeLogSoftmaxOutput.ProcessRemainingCountBy8: + ori $t3, $zero, 8 + bltu $a2, $t3, .LComputeLogSoftmaxOutput.ProcessRemainingCountLessThan8 + xvld $xr16, $a0, 0 + xvfadd.s $xr0, $xr4, $xr16 + addi.d $a0, $a0, 0x20 + xvfsub.s $xr0, $xr0, $xr5 + addi.d $a2, $a2, -8 + xvst $xr0, $a1, 0 + addi.d $a1, $a1, 0x20 # advance output by 8 elements + b .LComputeLogSoftmaxOutput.ProcessRemainingCountBy8 + +.LComputeLogSoftmaxOutput.ProcessRemainingCountLessThan8: + beqz $a2, .LComputeLogSoftmaxOutput.ExitKernel + +.LComputeLogSoftmaxOutput.ProcessRemainingCountBy1: + fld.s $f16, $a0, 0 + fadd.s $f0, $f4, $f16 + + addi.d $a0, $a0, 4 + fsub.s $f0, $f0, $f5 + fst.s $f0, $a1, 0 + + addi.d $a1, $a1, 4 + addi.d $a2, $a2, -1 + bnez $a2, .LComputeLogSoftmaxOutput.ProcessRemainingCountBy1 + +.LComputeLogSoftmaxOutput.ExitKernel: + xvinsgr2vr.d $xr0, $zero, 2 + xvinsgr2vr.d $xr0, $zero, 3 + xvinsgr2vr.d $xr1, $zero, 2 + xvinsgr2vr.d $xr1, $zero, 3 + xvinsgr2vr.d $xr2, $zero, 2 + xvinsgr2vr.d $xr2, $zero, 3 + xvinsgr2vr.d $xr3, $zero, 2 + xvinsgr2vr.d $xr3, $zero, 3 + xvinsgr2vr.d $xr4, $zero, 2 + xvinsgr2vr.d $xr4, $zero, 3 + xvinsgr2vr.d $xr5, $zero, 2 + xvinsgr2vr.d $xr5, $zero, 3 + xvinsgr2vr.d $xr6, $zero, 2 + xvinsgr2vr.d $xr6, $zero, 3 + xvinsgr2vr.d $xr7, $zero, 2 + xvinsgr2vr.d $xr7, $zero, 3 + xvinsgr2vr.d $xr8, $zero, 2 + xvinsgr2vr.d $xr8, $zero, 3 + xvinsgr2vr.d $xr9, $zero, 2 + xvinsgr2vr.d $xr9, $zero, 3 + xvinsgr2vr.d $xr10, $zero, 2 + xvinsgr2vr.d $xr10, $zero, 3 + xvinsgr2vr.d $xr11, $zero, 2 + xvinsgr2vr.d $xr11, $zero, 3 + xvinsgr2vr.d $xr12, $zero, 2 + xvinsgr2vr.d $xr12, $zero, 3 + xvinsgr2vr.d $xr13, $zero, 2 + xvinsgr2vr.d $xr13, $zero, 3 + xvinsgr2vr.d $xr14, $zero, 2 + xvinsgr2vr.d $xr14, $zero, 3 + xvinsgr2vr.d $xr15, $zero, 2 + xvinsgr2vr.d $xr15, $zero, 3 + jr $ra + + .end diff --git a/onnxruntime/core/mlas/lib/loongarch64/SpoolKernelLSX.S b/onnxruntime/core/mlas/lib/loongarch64/SpoolKernelLSX.S new file mode 100644 index 0000000000000..96bda3bb12c6f --- /dev/null +++ b/onnxruntime/core/mlas/lib/loongarch64/SpoolKernelLSX.S @@ -0,0 +1,460 @@ +/*++ + +Copyright (C) 2023 Loongson Technology Corporation Limited. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + SpoolKernelLSX.s + +Abstract: + + This module implements the kernels for the single precision pooling + operation. + + This implementation uses LSX instructions. 
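Before the macro machinery, it may help to restate the per-output-element math each pooling type reduces to. This is an illustrative sketch with hypothetical names, not code from the patch; it assumes at least one valid sample for the exclude-pad case.

```cpp
#include <cfloat>
#include <cstddef>

enum class PoolingKind { Maximum, AverageExcludePad, AverageIncludePad };

// Values holds the input samples actually read for one output position;
// ActualKernelSize matches the kernel parameter of the same name.
float PoolSketch(PoolingKind Kind, const float* Values, size_t ValidCount,
                 size_t ActualKernelSize)
{
    if (Kind == PoolingKind::Maximum) {
        float maximum = -FLT_MAX;     // same role as the 0xFF7FFFFF constant
        for (size_t i = 0; i < ValidCount; i++) {
            maximum = (Values[i] > maximum) ? Values[i] : maximum;
        }
        return maximum;
    }
    float sum = 0.0f;
    for (size_t i = 0; i < ValidCount; i++) {
        sum += Values[i];
    }
    // ExcludePad divides by the number of samples actually read;
    // IncludePad divides by the full original kernel size.
    const size_t divisor =
        (Kind == PoolingKind::AverageExcludePad) ? ValidCount : ActualKernelSize;
    return sum / static_cast<float>(divisor);
}
```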
+ +--*/ + +#define SP_SIZE 32*8 +#define InputBase_arg SP_SIZE+0*8 +#define InputWidth_arg SP_SIZE+1*8 +#define DilatedInputWidth_arg SP_SIZE+2*8 +#define OutputCountLeftPad_arg SP_SIZE+3*8 +#define OutputCount_arg SP_SIZE+4*8 +#define OutputCountRightPad_arg SP_SIZE+5*8 + + .macro FUNCTION_ENTRY FunctionName + + .p2align 4 + .globl \FunctionName\() + .type \FunctionName\(),@function +\FunctionName\(): + + .endm + + + .text + +/*++ + +Macro Description: + + This macro generates code to initialize registers used across the kernel. + +Arguments: + + PoolingType - Supplies the pooling type string. + +--*/ + + .macro InitializeKernel PoolingType + +.ifeqs "\PoolingType\()","Maximum" + li.w $s0, 0xFF7FFFFF + vreplgr2vr.w $vr5, $s0 +.endif + +.ifeqs "\PoolingType\()","AverageIncludePad" + vreplgr2vr.w $vr5, $a5 + vffint.s.w $vr5, $vr5 +.endif + + .endm +/*++ + +Macro Description: + + This macro generates the common prologue code for the pooling kernels. + +Arguments: + + PoolingType - Supplies the pooling type string. + +--*/ + + .macro SpoolKernelEntry PoolingType + + addi.d $sp, $sp, -SP_SIZE + st.d $s0, $sp, 0*8 + st.d $s1, $sp, 1*8 + st.d $s2, $sp, 2*8 + st.d $s3, $sp, 3*8 + st.d $s4, $sp, 4*8 + st.d $ra, $sp, 5*8 + fst.d $f24,$sp, 6*8 + + InitializeKernel \PoolingType\() + # move InputStride to s8 + or $t8, $a4, $r0 + # move StrideWidth to a4 + or $a4, $a2, $r0 + # move DilationWidth to a5 + or $a5, $a3, $r0 + # move Output to a2 + or $a2, $a1, $r0 + + .endm + +/*++ + +Macro Description: + + This macro generates the common epilogue code for the pooling kernels. + +Arguments: + + None. + +--*/ + + .macro SpoolKernelExit + + ld.d $s0, $sp, 0*8 + ld.d $s1, $sp, 1*8 + ld.d $s2, $sp, 2*8 + ld.d $s3, $sp, 3*8 + ld.d $s4, $sp, 4*8 + ld.d $ra, $sp, 5*8 + fld.d $f24,$sp, 6*8 + + addi.d $sp, $sp, SP_SIZE + jr $ra + + .endm + + +/*++ + +Macro Description: + + This macro generates code to clear the pooling intermediates. + + For PoolingType==Maximum, the pooling intermediates are set to the minimum + float value. Otherwise, the pooling intermediates are cleared to zero. + +Arguments: + + PoolingType - Supplies the pooling type string. + + OutputCount - Supplies the number of output blocks to produce. + +Implicit Arguments: + + a1 - Supplies the number of blocks accessed by ComputeBlock, if + PoolingType=AverageExcludePad and OutputCount=1. + + vr0-vr1 - Supplies the pooling intermediates. + + vr2 - Supplies a vector containing the minimum float value broadcasted, + if PoolingType==Maximum. + +--*/ + + .macro ClearBlock PoolingType, OutputCount + +.ifeqs "\PoolingType\()","Maximum" + vor.v $vr0, $vr5, $vr5 + vor.v $vr1, $vr5, $vr5 +.else + vxor.v $vr0, $vr0, $vr0 + vxor.v $vr1, $vr1, $vr1 +.endif + +.ifeqs "\PoolingType\()","AverageExcludePad" + xor $a1, $a1, $a1 # reset valid block counter +.endif + + .endm + +/*++ + +Macro Description: + + This macro generates code to sample the input buffer and update the pooling + intermediates as appropriate. + +Arguments: + + PoolingType - Supplies the pooling type string. + + OutputCount - Supplies the number of output blocks to produce. + +Implicit Arguments: + + a3 - Supplies the address of the input buffer. + + a1 - Supplies the number of blocks accessed by ComputeBlock, if + PoolingType=AverageExcludePad and OutputCount=1. + + a4 - Supplies the StrideWidth parameter (see function description). + + vr0-vr1 - Supplies the pooling intermediates. 
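The 0xFF7FFFFF constant that InitializeKernel broadcasts for Maximum pooling is the bit pattern of the most negative finite float, so any real sample wins the first fmax. A small self-check of that claim (not part of the patch):

```cpp
#include <cassert>
#include <cfloat>
#include <cstdint>
#include <cstring>

int main()
{
    const uint32_t bits = 0xFF7FFFFFu;   // constant loaded by InitializeKernel
    float value;
    std::memcpy(&value, &bits, sizeof(value));
    assert(value == -FLT_MAX);           // most negative finite float
    return 0;
}
```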
+ +--*/ + + .macro ComputeBlock PoolingType, OutputCount + +.ifeqs "\PoolingType\()","Maximum" + vld $vr24, $a3, 0 + vfmax.s $vr0, $vr0, $vr24 + vld $vr24, $a3, 16 + vfmax.s $vr1, $vr1, $vr24 +.else + vld $vr24, $a3, 0 + vfadd.s $vr0, $vr0, $vr24 + vld $vr24, $a3, 16 + vfadd.s $vr1, $vr1, $vr24 +.endif + +.ifeqs "\PoolingType\()","AverageExcludePad" + # increment valid block counter + addi.d $a1, $a1, 1 +.endif + + .endm + +/*++ + +Macro Description: + + This macro generates code to process and store the pooling intermediates. + +Arguments: + + PoolingType - Supplies the pooling type string. + + OutputCount - Supplies the number of output blocks to produce. + +Implicit Arguments: + + a2 - Supplies the address of the output buffer. + + a1 - Supplies the number of blocks accessed by ComputeBlock, if + PoolingType=AverageExcludePad and OutputCount=1. + + vr0-vr1 - Supplies the pooling intermediates. + + vr5 - Supplies the kernel size computed by InitializeKernel, if + PoolingType=AverageExcludePad, else the actual kernel size, if + PoolingType=AverageIncludePad. + +--*/ + + .macro PostProcessBlock PoolingType, OutputCount + +// +// If PoolingType=AverageExcludePad, divide the sum by the number of non-padding +// blocks. +// + +.ifeqs "\PoolingType\()","AverageExcludePad" + # convert valid block counter + vreplgr2vr.w $vr4, $a1 + vffint.s.w $vr4, $vr4 + vfdiv.s $vr0, $vr0, $vr4 + vfdiv.s $vr1, $vr1, $vr4 +.endif + +// +// If PoolingType=AverageIncludePad, divide the sum by the actual kernel size. +// + +.ifeqs "\PoolingType\()","AverageIncludePad" + vfdiv.s $vr0, $vr0, $vr5 + vfdiv.s $vr1, $vr1, $vr5 +.endif + +// +// Store the output block in the output buffer. +// + + vst $vr0, $a2, 0 + vst $vr1, $a2, 16 + # advance output by 1 nchw8c block + addi.d $a2, $a2, 8*4 + + .endm + +/*++ + +Macro Description: + + This macro generates code to compute pooling for a vector of input blocks + to produce a matrix of output blocks. + + OutputCount=1 generates special case code to handle padding blocks. All + other output counts assume no padding. + +Arguments: + + KernelFrame - Supplies the symbol name to access the convolution kernel + stack. + + OutputCount - Supplies the number of output blocks to produce. + +Implicit Arguments: + + a0 - Supplies the address of the input buffer. + + a2 - Supplies the address of the output buffer. + + a4 - Supplies the StrideWidth parameter (see function description). + + a5 - Supplies the DilationWidth parameter (see function description). + + s8 - Supplies the InputStride parameter (see function description). + +--*/ + + .macro ProcessOutputCountN KernelFrame, PoolingType, OutputCount + + move $a3, $a0 + move $t1, $a6 + move $t2, $a7 +.if \OutputCount\() == 1 + ld.d $t3, $sp, InputBase_arg + ld.d $t4, $sp, InputWidth_arg + sub.d $t3, $r0, $t3 # keep negative for lea usage below +.endif + ClearBlock \PoolingType\(), \OutputCount\() + beqz $t1, .L\PoolingType\().\OutputCount\().HandlePostProcessing + +.L\PoolingType\().\OutputCount\().ProcessNextRow: + or $t6, $t2, $t2 + +.L\PoolingType\().\OutputCount\().ProcessNextColumn: +.if \OutputCount\() == 1 + # (Input - InputBase) >= InputWidth? 
+ add.d $t7, $a3, $t3 + bgeu $t7, $t4, .L\PoolingType\().\OutputCount\().SkipOverPadding +.endif + ComputeBlock \PoolingType\(), \OutputCount\() + +.L\PoolingType\().\OutputCount\().SkipOverPadding: + add.d $a3, $a3, $a5 # advance input by dilation width + # decrement columns remaining + addi.d $t6, $t6, -1 + bnez $t6, .L\PoolingType\().\OutputCount\().ProcessNextColumn + add.d $a3, $a3, $t8 # advance input to next row +.if \OutputCount\() == 1 + ld.d $s0, $sp, DilatedInputWidth_arg + # advance input base to next row + sub.d $t3, $t3, $s0 +.endif + addi.d $t1, $t1, -1 + bnez $t1, .L\PoolingType\().\OutputCount\().ProcessNextRow + +.L\PoolingType\().\OutputCount\().HandlePostProcessing: + PostProcessBlock \PoolingType\(), \OutputCount\() + + .endm +/*++ + +Macro Description: + + This macro generates code for the inner pooling kernel. + +Arguments: + + PoolingType - Supplies the pooling type string. + + Isa - Supplies the instruction set architecture string for function tags. + +--*/ + + .macro SpoolKernelFunction PoolingType, Isa + +/*++ + +Routine Description: + + This routine is the inner kernel to compute pooling for the elements of an + output row for a set of filter rows. + +Arguments: + + Input (a0) - Supplies the address of the input buffer. + + The address is biased to include padding blocks for the left width + dimension. The address is not biased to include padding rows for the + left height dimension these are accounted for in the outer kernel. + + Output (a1) - Supplies the address of the output buffer. + + StrideWidth (a2) - Supplies the length in bytes of the blocked stride width. + + DilationWidth (a3) - Supplies the length in bytes of the blocked dilation + width. + + InputStride (a4) - Supplies the length in bytes to advance the input buffer to + the next input row. + + ActualKernelSize (a5) - Supplies the size of the kernel based on the original + kernel dimensions, used for PoolingType=AverageIncludePad. + + KernelHeight (a6) - Supplies the height of the kernel to apply. This height may + be less than the original kernel height after removing any padding + rows. + + KernelWidth (a7) - Supplies the width of the kernel to apply. + + InputBase (0)- Supplies the address of the valid input buffer. + + This parameter is similar to the Input parameter, but does not include + the padding blocks for the left width dimension. This parameter is used + with the following InputWidth parameter in order to validate that the + current input buffer address in bounds and not in the left or right + width padding region. + + InputWidth (1*8)- Supplies the length in bytes of the blocked input width. + + DilatedInputWidth (2*8)- Supplies the length in bytes to advance the input base + buffer to the next input row including dilation. + + OutputCountLeftPad (3*8)- Supplies the number of output elements that include + one or more padding elements from the left edge. + + OutputCount (4*8)- Supplies the number of output elements that do not include + any padding elements. + + OutputCountRightPad (5*8)- Supplies the number of output elements that include + one or more padding elements from the right edge. + +Return Value: + + None. 
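The ProcessOutputCountN macro above guards padded columns with one unsigned comparison: it keeps the negated InputBase so that adding it to the current pointer yields (Input - InputBase), and a single bgeu against InputWidth rejects both the left and right padding regions. A sketch of the equivalent check (not from the patch):

```cpp
#include <cstdint>

// Equivalent to: InputBase <= Input && Input < InputBase + InputWidth.
// Addresses left of the valid region wrap around to huge unsigned values,
// so they fail the same bound test as addresses past the right edge.
bool IsInsideValidInput(uintptr_t Input, uintptr_t InputBase, uintptr_t InputWidth)
{
    return (Input - InputBase) < InputWidth;   // unsigned wrap-around intended
}
```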
+ +--*/ + + FUNCTION_ENTRY MlasPool\PoolingType\()FloatKernel\Isa\() + SpoolKernelEntry \PoolingType\() + + ld.d $s0, $sp, OutputCountLeftPad_arg + ld.d $s1, $sp, OutputCount_arg + add.d $t0, $s0, $s1 + ld.d $s0, $sp, OutputCountRightPad_arg + add.d $t0, $t0, $s0 + beqz $t0, .L\PoolingType\().ExitKernel + +.L\PoolingType\().ProcessNextOutputCount: + ProcessOutputCountN .LSpoolKernelFrame, \PoolingType\(), 1 + add.d $a0, $a0, $a4 + addi.d $t0, $t0, -1 + bnez $t0, .L\PoolingType\().ProcessNextOutputCount + +.L\PoolingType\().ExitKernel: + SpoolKernelExit + + .endm + +// +// Generate the pooling kernels. +// + + SpoolKernelFunction Maximum, LSX + SpoolKernelFunction AverageExcludePad, LSX + SpoolKernelFunction AverageIncludePad, LSX + + .end diff --git a/onnxruntime/core/mlas/lib/loongarch64/SpoolKernelLasx.S b/onnxruntime/core/mlas/lib/loongarch64/SpoolKernelLasx.S new file mode 100644 index 0000000000000..6e5f0136cd4ab --- /dev/null +++ b/onnxruntime/core/mlas/lib/loongarch64/SpoolKernelLasx.S @@ -0,0 +1,238 @@ +/*++ + +Copyright (C) 2023 Loongson Technology Corporation Limited. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + SpoolKernelLasx.s + +Abstract: + + This module implements the kernels for the single precision pooling + operation. + + This implementation uses Lasx instructions. + +--*/ + +#include "asmmacro.h" +#include "SpoolKernelLasxCommon.h" + + .text + +/*++ + +Macro Description: + + This macro generates code to initialize registers used across the kernel. + +Arguments: + + PoolingType - Supplies the pooling type string. + +Implicit Arguments: + + a5 - Supplies the ActualKernelSize parameter (see function description). + +--*/ + + .macro InitializeKernel PoolingType + +.ifeqs "\PoolingType\()","Maximum" + li.w $s0, 0xFF7FFFFF + xvreplgr2vr.w $xr5, $s0 +.else + xvxor.v $xr5, $xr5, $xr5 +.ifeqs "\PoolingType\()","AverageExcludePad" + move $t6, $a6 + mul.d $t6, $t6, $a7 + xvreplgr2vr.w $xr5, $t6 +.else + xvreplgr2vr.w $xr5, $a5 +.endif + xvffint.s.w $xr5, $xr5 +.endif + + .endm + +/*++ + +Macro Description: + + This macro generates code to clear the pooling intermediates. + + For PoolingType==Maximum, the pooling intermediates are set to the minimum + float value. Otherwise, the pooling intermediates are cleared to zero. + +Arguments: + + PoolingType - Supplies the pooling type string. + + OutputCount - Supplies the number of output blocks to produce. + +Implicit Arguments: + + a1 - Supplies the number of blocks accessed by ComputeBlock, if + PoolingType=AverageExcludePad and OutputCount=1. + + xr0-xr2 - Supplies the pooling intermediates. + + xr5 - Supplies a vector containing the minimum float value broadcasted, + if PoolingType==Maximum. + +--*/ + + .macro ClearBlock PoolingType, OutputCount + +.ifeqs "\PoolingType\()","Maximum" + EmitIfCountGE \OutputCount\(), 1, "xvor.v $xr0, $xr5, $xr5" + EmitIfCountGE \OutputCount\(), 2, "xvor.v $xr1, $xr5, $xr5" + EmitIfCountGE \OutputCount\(), 3, "xvor.v $xr2, $xr5, $xr5" +.else + EmitIfCountGE \OutputCount\(), 1, "xvxor.v $xr0, $xr0, $xr0" + EmitIfCountGE \OutputCount\(), 2, "xvxor.v $xr1, $xr1, $xr1" + EmitIfCountGE \OutputCount\(), 3, "xvxor.v $xr2, $xr2, $xr2" +.endif + +.ifeqs "\PoolingType\()","AverageExcludePad" +.if \OutputCount\() == 1 + xor $a1, $a1, $a1 # reset valid block counter +.endif +.endif + + .endm + +/*++ + +Macro Description: + + This macro generates code to sample the input buffer and update the pooling + intermediates as appropriate. 
+ +Arguments: + + PoolingType - Supplies the pooling type string. + + OutputCount - Supplies the number of output blocks to produce. + +Implicit Arguments: + + a3 - Supplies the address of the input buffer. + + a1 - Supplies the number of blocks accessed by ComputeBlock, if + PoolingType=AverageExcludePad and OutputCount=1. + + a4 - Supplies the StrideWidth parameter (see function description). + + xr0-xr2 - Supplies the pooling intermediates. + +--*/ + + .macro ComputeBlock PoolingType, OutputCount + +.ifeqs "\PoolingType\()","Maximum" + EmitIfCountGE \OutputCount\(), 1, "xvld $xr16, $a3, 0" + EmitIfCountGE \OutputCount\(), 1, "xvfmax.s $xr0, $xr0, $xr16" + EmitIfCountGE \OutputCount\(), 2, "xvldx $xr16, $a3, $a4" + EmitIfCountGE \OutputCount\(), 2, "xvfmax.s $xr1, $xr1, $xr16" + EmitIfCountGE \OutputCount\(), 3, "slli.d $s0, $a4, 1" + EmitIfCountGE \OutputCount\(), 3, "xvldx $xr16, $a3, $s0" + EmitIfCountGE \OutputCount\(), 3, "xvfmax.s $xr2, $xr2, $xr16" +.else + EmitIfCountGE \OutputCount\(), 1, "xvld $xr16, $a3, 0" + EmitIfCountGE \OutputCount\(), 1, "xvfadd.s $xr0, $xr0, $xr16" + EmitIfCountGE \OutputCount\(), 2, "xvldx $xr16, $a3, $a4" + EmitIfCountGE \OutputCount\(), 2, "xvfadd.s $xr1, $xr1, $xr16" + EmitIfCountGE \OutputCount\(), 3, "slli.d $s0, $a4, 1" + EmitIfCountGE \OutputCount\(), 3, "xvldx $xr16, $a3, $s0" + EmitIfCountGE \OutputCount\(), 3, "xvfadd.s $xr2, $xr2, $xr16" +.endif + +.ifeqs "\PoolingType\()","AverageExcludePad" +.if \OutputCount\() == 1 + addi.d $a1, $a1, 1 # increment valid block counter +.endif +.endif + + .endm + +/*++ + +Macro Description: + + This macro generates code to process and store the pooling intermediates. + +Arguments: + + PoolingType - Supplies the pooling type string. + + OutputCount - Supplies the number of output blocks to produce. + +Implicit Arguments: + + a2 - Supplies the address of the output buffer. + + a1 - Supplies the number of blocks accessed by ComputeBlock, if + PoolingType=AverageExcludePad and OutputCount=1. + + xr0-xr2 - Supplies the pooling intermediates. + + xr5 - Supplies the kernel size computed by InitializeKernel, if + PoolingType=AverageExcludePad, else the actual kernel size, if + PoolingType=AverageIncludePad. + +--*/ + + .macro PostProcessBlock PoolingType, OutputCount + +// +// If PoolingType=AverageExcludePad, divide the sum by the number of non-padding +// blocks. OutputCount=1 generates code to count the number of blocks accessed by +// ComputeBlock. Other cases use the kernel size computed by InitializeKernel. +// + +.ifeqs "\PoolingType\()","AverageExcludePad" +.if \OutputCount\() == 1 + xvxor.v $xr4, $xr4, $xr4 + xvreplgr2vr.w $xr4, $a1 + xvffint.s.w $xr4, $xr4 + xvfdiv.s $xr0, $xr0, $xr4 +.else + EmitIfCountGE \OutputCount\(), 1, "xvfdiv.s $xr0, $xr0, $xr5" + EmitIfCountGE \OutputCount\(), 2, "xvfdiv.s $xr1, $xr1, $xr5" + EmitIfCountGE \OutputCount\(), 3, "xvfdiv.s $xr2, $xr2, $xr5" +.endif +.endif + +// +// If PoolingType=AverageIncludePad, divide the sum by the actual kernel size. +// + +.ifeqs "\PoolingType\()","AverageIncludePad" + EmitIfCountGE \OutputCount\(), 1, "xvfdiv.s $xr0, $xr0, $xr5" + EmitIfCountGE \OutputCount\(), 2, "xvfdiv.s $xr1, $xr1, $xr5" + EmitIfCountGE \OutputCount\(), 3, "xvfdiv.s $xr2, $xr2, $xr5" +.endif + +// +// Store the output block in the output buffer. 
+// + + EmitIfCountGE \OutputCount\(), 1, "xvst $xr0, $a2, 0" + EmitIfCountGE \OutputCount\(), 2, "xvst $xr1, $a2, 0x20" + EmitIfCountGE \OutputCount\(), 3, "xvst $xr2, $a2, 0x40" + add_immed $a2,\OutputCount\()*8*4 # advance output by N nchw8c blocks + + .endm + +// +// Generate the pooling kernels. +// + + SpoolKernelFunction Maximum, Lasx + SpoolKernelFunction AverageExcludePad, Lasx + SpoolKernelFunction AverageIncludePad, Lasx + + .end diff --git a/onnxruntime/core/mlas/lib/loongarch64/SpoolKernelLasxCommon.h b/onnxruntime/core/mlas/lib/loongarch64/SpoolKernelLasxCommon.h new file mode 100644 index 0000000000000..066c75d34f3f9 --- /dev/null +++ b/onnxruntime/core/mlas/lib/loongarch64/SpoolKernelLasxCommon.h @@ -0,0 +1,311 @@ +/*++ + +Copyright (C) 2023 Loongson Technology Corporation Limited. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + SpoolKernelasxCommon.h + +Abstract: + + This module contains common kernel macros and structures for the single + precision pooling operation for the Lasx kernels. + +--*/ + +// +// Stack frame layout for the pooling kernels. +// + +#define SP_SIZE 8*8 +#define InputBase_arg SP_SIZE+0*8 +#define InputWidth_arg SP_SIZE+1*8 +#define DilatedInputWidth_arg SP_SIZE+2*8 +#define OutputCountLeftPad_arg SP_SIZE+3*8 +#define OutputCount_arg SP_SIZE+4*8 +#define OutputCountRightPad_arg SP_SIZE+5*8 +/*++ + +Macro Description: + + This macro generates the common prologue code for the pooling kernels. + +Arguments: + + PoolingType - Supplies the pooling type string. + +--*/ + + .macro SpoolKernelEntry PoolingType + + addi.d $sp, $sp, -SP_SIZE + st.d $s0, $sp, 0 + st.d $s1, $sp, 1*8 + fst.d $f16, $sp, 2*8 + st.d $ra, $sp, 5*8 + + InitializeKernel \PoolingType\() + move $t8, $a4 + move $a4, $a2 + move $a5, $a3 + move $a2, $a1 + + .endm + +/*++ + +Macro Description: + + This macro generates the common epilogue code for the pooling kernels. + +Arguments: + + None. + +--*/ + + .macro SpoolKernelExit + + ld.d $s0, $sp, 0 + ld.d $s1, $sp, 1*8 + fld.d $f16, $sp, 2*8 + ld.d $ra, $sp, 5*8 + addi.d $sp, $sp, SP_SIZE + jr $ra + + .endm + +/*++ + +Macro Description: + + This macro generates code to compute pooling for a vector of input blocks + to produce a matrix of output blocks. + + OutputCount=1 generates special case code to handle padding blocks. All + other output counts assume no padding. + +Arguments: + + KernelFrame - Supplies the symbol name to access the convolution kernel + stack. + + OutputCount - Supplies the number of output blocks to produce. + +Implicit Arguments: + + a0 - Supplies the address of the input buffer. + + a2 - Supplies the address of the output buffer. + + a4 - Supplies the StrideWidth parameter (see function description). + + a5 - Supplies the DilationWidth parameter (see function description). + + t8 - Supplies the InputStride parameter (see function description). + +--*/ + + .macro ProcessOutputCountN KernelFrame, PoolingType, OutputCount + + move $a3, $a0 + move $t1, $a6 + move $t2, $a7 +.if \OutputCount\() == 1 + ld.d $t3, $sp, InputBase_arg + ld.d $t4, $sp, InputWidth_arg + sub.d $t3, $zero, $t3 +.endif + ClearBlock \PoolingType\(), \OutputCount\() + beqz $t1, .L\PoolingType\().\OutputCount\().HandlePostProcessing + +.L\PoolingType\().\OutputCount\().ProcessNextRow: + move $t6, $t2 + +.L\PoolingType\().\OutputCount\().ProcessNextColumn: +.if \OutputCount\() == 1 + add.d $t7, $a3, $t3 # compute (Input - InputBase) + # (Input - InputBase) >= InputWidth? 
+ bgeu $t7, $t4, .L\PoolingType\().\OutputCount\().SkipOverPadding +.endif + ComputeBlock \PoolingType\(), \OutputCount\() + +.L\PoolingType\().\OutputCount\().SkipOverPadding: + add.d $a3, $a3, $a5 # advance input by dilation width + addi.d $t6, $t6, -1 # decrement columns remaining + bnez $t6, .L\PoolingType\().\OutputCount\().ProcessNextColumn + add.d $a3, $a3, $t8 # advance input to next row +.if \OutputCount\() == 1 + ld.d $s0, $sp, DilatedInputWidth_arg + sub.d $t3, $t3, $s0 + # advance input base to next row +.endif + addi.d $t1, $t1, -1 + bnez $t1, .L\PoolingType\().\OutputCount\().ProcessNextRow + +.L\PoolingType\().\OutputCount\().HandlePostProcessing: + PostProcessBlock \PoolingType\(), \OutputCount\() + + .endm +/*++ + +Macro Description: + + This macro generates code for the inner pooling kernel. + +Arguments: + + PoolingType - Supplies the pooling type string. + + Isa - Supplies the instruction set architecture string for function tags. + +--*/ + + .macro SpoolKernelFunction PoolingType, Isa + +/*++ + +Routine Description: + + This routine is the inner kernel to compute pooling for the elements of an + output row for a set of filter rows. + +Arguments: + + Input (a0) - Supplies the address of the input buffer. + + The address is biased to include padding blocks for the left width + dimension. The address is not biased to include padding rows for the + left height dimension these are accounted for in the outer kernel. + + Output (a1) - Supplies the address of the output buffer. + + StrideWidth (a2) - Supplies the length in bytes of the blocked stride width. + + DilationWidth (a3) - Supplies the length in bytes of the blocked dilation + width. + + InputStride (a4) - Supplies the length in bytes to advance the input buffer to + the next input row. + + ActualKernelSize (a5) - Supplies the size of the kernel based on the original + kernel dimensions, used for PoolingType=AverageIncludePad. + + KernelHeight (a6) - Supplies the height of the kernel to apply. This height may + be less than the original kernel height after removing any padding + rows. + + KernelWidth (a7)- Supplies the width of the kernel to apply. + + InputBase (sp + 0)- Supplies the address of the valid input buffer. + + This parameter is similar to the Input parameter, but does not include + the padding blocks for the left width dimension. This parameter is used + with the following InputWidth parameter in order to validate that the + current input buffer address in bounds and not in the left or right + width padding region. + + InputWidth (sp + 0x8)- Supplies the length in bytes of the blocked input width. + + DilatedInputWidth (sp + 0x10)- Supplies the length in bytes to advance the input base + buffer to the next input row including dilation. + + OutputCountLeftPad (sp + 0x18)- Supplies the number of output elements that include + one or more padding elements from the left edge. + + OutputCount (sp + 0x20)- Supplies the number of output elements that do not include + any padding elements. + + OutputCountRightPad (sp + 0x28)- Supplies the number of output elements that include + one or more padding elements from the right edge. + +Return Value: + + None. 
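The function body that follows splits an output row into three phases: outputs that may touch left padding go one at a time through the padding-aware single-output helper, interior outputs are processed three at a time by the unrolled path, and the remainder plus the right-padding outputs fall back to the same single-output helper. A sketch of that driver structure (not from the patch; callback names are hypothetical):

```cpp
#include <cstddef>

void PoolRowDriverSketch(
    size_t OutputCountLeftPad, size_t OutputCount, size_t OutputCountRightPad,
    void (*ProcessSingleWithPadding)(), void (*ProcessThreeNoPadding)())
{
    for (size_t i = 0; i < OutputCountLeftPad; i++) {
        ProcessSingleWithPadding();          // may read padded columns
    }
    size_t remaining = OutputCount;
    while (remaining >= 3) {
        ProcessThreeNoPadding();             // interior, no padding checks
        remaining -= 3;
    }
    for (size_t i = 0; i < remaining + OutputCountRightPad; i++) {
        ProcessSingleWithPadding();          // leftover interior + right edge
    }
}
```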
+ +--*/ + + FUNCTION_ENTRY MlasPool\PoolingType\()FloatKernel\Isa\() + + SpoolKernelEntry \PoolingType\() + +.L\PoolingType\().ProcessOutputCountLeftPad: + ld.d $t0, $sp, OutputCountLeftPad_arg + + beqz $t0, .L\PoolingType\().ProcessOutputCount + bl MlasPool\PoolingType\()FloatSingle\Isa\() + +.L\PoolingType\().ProcessOutputCount: + ld.d $t0, $sp, OutputCount_arg + li.d $s0, 3 + bltu $t0, $s0, .L\PoolingType\().ProcessRemainingOutputCount + +.L\PoolingType\().ProcessNextOutputCountBy3: + ProcessOutputCountN .LSpoolKernelFrame, \PoolingType\(), 3 + slli.d $s0, $a4, 1 + add.d $t6, $s0, $a4 + add.d $a0, $a0, $t6 # advance input by 3 elements + addi.d $t0, $t0, -3 + li.d $s0, 3 + bgeu $t0, $s0, .L\PoolingType\().ProcessNextOutputCountBy3 + +.L\PoolingType\().ProcessRemainingOutputCount: + +.L\PoolingType\().ProcessOutputCountRightPad: + ld.d $s0, $sp, OutputCountRightPad_arg + add.d $t0, $t0, $s0 + beqz $t0, .L\PoolingType\().ExitKernel + bl MlasPool\PoolingType\()FloatSingle\Isa\() + +.L\PoolingType\().ExitKernel: + xvinsgr2vr.d $xr0, $zero, 2 + xvinsgr2vr.d $xr0, $zero, 3 + xvinsgr2vr.d $xr1, $zero, 2 + xvinsgr2vr.d $xr1, $zero, 3 + xvinsgr2vr.d $xr2, $zero, 2 + xvinsgr2vr.d $xr2, $zero, 3 + xvinsgr2vr.d $xr3, $zero, 2 + xvinsgr2vr.d $xr3, $zero, 3 + xvinsgr2vr.d $xr4, $zero, 2 + xvinsgr2vr.d $xr4, $zero, 3 + xvinsgr2vr.d $xr5, $zero, 2 + xvinsgr2vr.d $xr5, $zero, 3 + xvinsgr2vr.d $xr6, $zero, 2 + xvinsgr2vr.d $xr6, $zero, 3 + xvinsgr2vr.d $xr7, $zero, 2 + xvinsgr2vr.d $xr7, $zero, 3 + xvinsgr2vr.d $xr8, $zero, 2 + xvinsgr2vr.d $xr8, $zero, 3 + xvinsgr2vr.d $xr9, $zero, 2 + xvinsgr2vr.d $xr9, $zero, 3 + xvinsgr2vr.d $xr10, $zero, 2 + xvinsgr2vr.d $xr10, $zero, 3 + xvinsgr2vr.d $xr11, $zero, 2 + xvinsgr2vr.d $xr11, $zero, 3 + xvinsgr2vr.d $xr12, $zero, 2 + xvinsgr2vr.d $xr12, $zero, 3 + xvinsgr2vr.d $xr13, $zero, 2 + xvinsgr2vr.d $xr13, $zero, 3 + xvinsgr2vr.d $xr14, $zero, 2 + xvinsgr2vr.d $xr14, $zero, 3 + xvinsgr2vr.d $xr15, $zero, 2 + xvinsgr2vr.d $xr15, $zero, 3 + SpoolKernelExit + +// +// Generate out-of-band helpers for handling output blocks involving padding. +// + +MlasPool\PoolingType\()FloatSingle\Isa\(): + st.d $ra, $sp, 6*8 +loopMlasPool\PoolingType\()FloatSingle\Isa\(): + ProcessOutputCountN .LSpoolKernelSingleFrame, \PoolingType\(), 1 + add.d $a0, $a0, $a4 # advance input by 1 element + addi.d $t0, $t0, -1 # decrement output count remaining + bnez $t0, loopMlasPool\PoolingType\()FloatSingle\Isa\() + ld.d $ra, $sp, 6*8 + jr $ra + + .endm diff --git a/onnxruntime/core/mlas/lib/loongarch64/asmmacro.h b/onnxruntime/core/mlas/lib/loongarch64/asmmacro.h new file mode 100644 index 0000000000000..837aca77dd883 --- /dev/null +++ b/onnxruntime/core/mlas/lib/loongarch64/asmmacro.h @@ -0,0 +1,144 @@ +/*++ + +Copyright (C) 2023 Loongson Technology Corporation Limited. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + asmmacro.h + +Abstract: + + This module implements common macros for the assembly modules. + +--*/ + +#define C_UNDERSCORE(symbol) symbol + +.macro vmove dst src + vand.v \dst, \src, \src +.endm + +/*++ + +Macro Description: + + This macro emits the assembler directives to annotate a new function. + +Arguments: + + FunctionName - Supplies the name of the function. 
+ +--*/ + + .macro FUNCTION_ENTRY FunctionName + .align 2 + .globl \FunctionName\() + .type \FunctionName\(),@function +\FunctionName\(): + + .endm + +/*++ + +Macro Description: + + This macro generates an optimization for "add reg,128" which can instead + be encoded as "sub reg,-128" to reduce code size by using a signed 8-bit + value. + +Arguments: + + Register - Supplies the register to be added to. + + Immediate - Supplies the immediate to add to the register. + +--*/ + + .macro add_immed Register, Immediate + +.if (\Immediate\() != 128) + addi.d \Register\(),\Register\(),\Immediate\() +.else + addi.d \Register\(),\Register\(),\Immediate\() # smaller encoding +.endif + + .endm + +/*++ + +Macro Description: + + This macro conditionally emits the statement if Count is greater than or + equal to Value. + +Arguments: + + Count - Supplies the variable used in the comparison. + + Value - Supplies the static used in the comparison. + + Statement - Supplies the statement to conditionally emit. + +--*/ + + .macro EmitIfCountGE Count1, Value1, Statement + +.if (\Count1\() >= \Value1\()) + \Statement\() +.endif + + .endm + +/*++ + +Macro Description: + + This macro conditionally emits the statement if Count1 is greater than or + equal to Value1 and Count2 is greater than or equal to Value2. + +Arguments: + + Count1 - Supplies the variable used in the comparison. + + Value1 - Supplies the static used in the comparison. + + Count2 - Supplies the variable used in the comparison. + + Value2 - Supplies the static used in the comparison. + + Statement - Supplies the statement to conditionally emit. + +--*/ + + .macro EmitIfCount2GE Count1, Value1, Count2, Value2, Statement + +.if (\Count1\() >= \Value1\()) && (\Count2\() >= \Value2\()) + \Statement\() +.endif + + .endm + +/*++ + +Macro Description: + + This macro emits the statement for each register listed in the register + list. The statement can use RegItem to access the current register. + +Arguments: + + RegList - Supplies the list of registers. + + Statement - Supplies the statement to emit. + +--*/ + + .macro EmitForEachRegister RegList, Statement + + .irp RegItem, \RegList\() + \Statement\() + .endr + + .endm diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h index 6c859e4e4f44b..7bda1bb504173 100644 --- a/onnxruntime/core/mlas/lib/mlasi.h +++ b/onnxruntime/core/mlas/lib/mlasi.h @@ -67,6 +67,9 @@ Module Name: #undef pixel #undef bool #endif +#if defined(__loongarch64) +#include +#endif #if defined(MLAS_TARGET_WASM_SIMD) #include #endif @@ -317,7 +320,8 @@ static_assert(sizeof(MLAS_FP16) == FP16_SIZE); // Define the prototypes of the platform optimized routines. 
// -#if defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER) +#if defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER) || \ + defined(MLAS_TARGET_LARCH64) typedef size_t @@ -694,6 +698,30 @@ extern "C" { MLAS_GEMM_DOUBLE_KERNEL MlasDgemmKernelPOWER10; MLAS_QUANTIZE_LINEAR_S8_KERNEL MlasQuantizeLinearS8KernelVSX; MLAS_QUANTIZE_LINEAR_U8_KERNEL MlasQuantizeLinearU8KernelVSX; +#elif defined(MLAS_TARGET_LARCH64) + MLAS_GEMM_FLOAT_KERNEL MlasGemmFloatKernelLSX; + MLAS_GEMM_FLOAT_KERNEL MlasGemmFloatKernelLasx; + MLAS_GEMM_DOUBLE_KERNEL MlasGemmDoubleKernelLSX; + MLAS_GEMM_DOUBLE_KERNEL MlasGemmDoubleKernelLasx; + MLAS_CONV_FLOAT_KERNEL MlasConvNchwFloatKernelLSX; + MLAS_CONV_FLOAT_KERNEL MlasConvNchwcFloatKernelLSX; + MLAS_CONV_DEPTHWISE_FLOAT_KERNEL MlasConvDepthwiseFloatKernelLSX; + MLAS_CONV_POINTWISE_FLOAT_KERNEL MlasConvPointwiseFloatKernelLSX; + MLAS_CONV_FLOAT_KERNEL MlasConvNchwFloatKernelLasx; + MLAS_CONV_FLOAT_KERNEL MlasConvNchwcFloatKernelLasx; + MLAS_CONV_DEPTHWISE_FLOAT_KERNEL MlasConvDepthwiseFloatKernelLasx; + MLAS_CONV_POINTWISE_FLOAT_KERNEL MlasConvPointwiseFloatKernelLasx; + MLAS_POOL_FLOAT_KERNEL MlasPoolMaximumFloatKernelLSX; + MLAS_POOL_FLOAT_KERNEL MlasPoolAverageExcludePadFloatKernelLSX; + MLAS_POOL_FLOAT_KERNEL MlasPoolAverageIncludePadFloatKernelLSX; + MLAS_POOL_FLOAT_KERNEL MlasPoolMaximumFloatKernelLasx; + MLAS_POOL_FLOAT_KERNEL MlasPoolAverageExcludePadFloatKernelLasx; + MLAS_POOL_FLOAT_KERNEL MlasPoolAverageIncludePadFloatKernelLasx; + MLAS_SGEMM_TRANSPOSE_PACKB_BLOCK_ROUTINE MlasSgemmTransposePackB16x4LSX; + MLAS_SGEMM_TRANSPOSE_PACKB_BLOCK_ROUTINE MlasSgemmTransposePackB16x4Lasx; + MLAS_REDUCE_MAXIMUM_FLOAT_KERNEL MlasReduceMaximumF32KernelLasx; + MLAS_COMPUTE_SOFTMAX_OUTPUT_FLOAT_KERNEL MlasComputeSoftmaxOutputF32KernelLasx; + MLAS_COMPUTE_LOGSOFTMAX_OUTPUT_FLOAT_KERNEL MlasComputeLogSoftmaxOutputF32KernelLasx; #else MLAS_GEMM_FLOAT_KERNEL MlasSgemmKernelZero; MLAS_GEMM_FLOAT_KERNEL MlasSgemmKernelAdd; @@ -854,6 +882,7 @@ MlasSgemmOperation( struct MLAS_GEMM_QUANT_DISPATCH; extern const MLAS_GEMM_QUANT_DISPATCH MlasGemmU8X8DispatchSse; +extern const MLAS_GEMM_QUANT_DISPATCH MlasGemmU8X8DispatchLSX; extern const MLAS_GEMM_QUANT_DISPATCH MlasGemmU8S8DispatchSse41; extern const MLAS_GEMM_QUANT_DISPATCH MlasGemmU8S8DispatchAvx2; extern const MLAS_GEMM_QUANT_DISPATCH MlasGemmU8U8DispatchAvx2; @@ -979,7 +1008,22 @@ struct MLAS_PLATFORM { #if defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER) MLAS_GEMM_FLOAT_KERNEL* GemmFloatKernel; #endif - +#if defined(MLAS_TARGET_LARCH64) + const MLAS_GEMM_QUANT_DISPATCH* GemmU8S8Dispatch; + const MLAS_GEMM_QUANT_DISPATCH* GemmU8U8Dispatch; + MLAS_GEMM_FLOAT_KERNEL* GemmFloatKernel; + MLAS_GEMM_DOUBLE_KERNEL* GemmDoubleKernel; + MLAS_CONV_FLOAT_KERNEL* ConvNchwFloatKernel; + MLAS_CONV_FLOAT_KERNEL* ConvNchwcFloatKernel; + MLAS_CONV_DEPTHWISE_FLOAT_KERNEL* ConvDepthwiseFloatKernel; + MLAS_CONV_POINTWISE_FLOAT_KERNEL* ConvPointwiseFloatKernel; + MLAS_POOL_FLOAT_KERNEL* PoolFloatKernel[MlasPoolingKindCount]; + MLAS_SGEMM_TRANSPOSE_PACKB_BLOCK_ROUTINE* TransposePackB16x4Routine; + MLAS_REDUCE_MAXIMUM_FLOAT_KERNEL* ReduceMaximumF32Kernel; + MLAS_COMPUTE_SOFTMAX_OUTPUT_FLOAT_KERNEL* ComputeSoftmaxOutputF32Kernel; + MLAS_COMPUTE_LOGSOFTMAX_OUTPUT_FLOAT_KERNEL* ComputeLogSoftmaxOutputF32Kernel; + uint32_t NchwcBlockSize; +#endif #if defined(MLAS_TARGET_AMD64_IX86) const MLAS_GEMM_QUANT_DISPATCH* GemmU8S8Dispatch; const MLAS_GEMM_QUANT_DISPATCH* GemmU8U8Dispatch; @@ -1256,6 +1300,8 @@ MlasConvDepthwiseFloat_CHW( 
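The MLAS_TARGET_LARCH64 members added to MLAS_PLATFORM above are plain function pointers: the platform constructor (later in this patch) selects an LSX or LASX implementation once at startup, and callers such as q4gemm.h then dispatch through GetMlasPlatform(). A minimal sketch of that pattern follows; the stub kernel names and the HasLasx flag are illustrative stand-ins, not the real MLAS symbols.

```
// Sketch of the function-pointer dispatch used by MLAS_PLATFORM.
#include <cstddef>
#include <cstdio>

typedef size_t (GEMM_FLOAT_KERNEL)(const float* A, const float* B, float* C,
                                   size_t CountK, size_t CountM, size_t CountN);

static size_t GemmFloatKernelLsxStub(const float*, const float*, float*,
                                     size_t, size_t, size_t) { return 1; }
static size_t GemmFloatKernelLasxStub(const float*, const float*, float*,
                                      size_t, size_t, size_t) { return 2; }

struct PLATFORM {
    GEMM_FLOAT_KERNEL* GemmFloatKernel;
    explicit PLATFORM(bool HasLasx)
    {
        // Pick the widest kernel the CPU supports, once, at startup.
        GemmFloatKernel = HasLasx ? GemmFloatKernelLasxStub : GemmFloatKernelLsxStub;
    }
};

int main()
{
    PLATFORM Platform(/*HasLasx=*/true);
    // Callers never name a specific kernel; they go through the pointer.
    std::printf("rows handled: %zu\n",
                Platform.GemmFloatKernel(nullptr, nullptr, nullptr, 0, 0, 0));
    return 0;
}
```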
#endif #elif defined(MLAS_TARGET_WASM_SIMD) #define MLAS_WASM_SIMD_INTRINSICS +#elif defined(MLAS_TARGET_LARCH64) +#define MLAS_LSX_INTRINSICS #endif #if defined(MLAS_NEON_INTRINSICS) @@ -1271,6 +1317,9 @@ typedef __vector unsigned MLAS_UINT32X4; #elif defined(MLAS_WASM_SIMD_INTRINSICS) typedef v128_t MLAS_FLOAT32X4; typedef v128_t MLAS_INT32X4; +#elif defined(MLAS_LSX_INTRINSICS) +typedef __m128 MLAS_FLOAT32X4; +typedef __m128i MLAS_INT32X4; #else typedef float MLAS_FLOAT32X4 __attribute__ ((vector_size(16))); typedef int32_t MLAS_INT32X4 __attribute__ ((vector_size(16))); @@ -1284,6 +1333,8 @@ MlasReinterpretAsInt32x4(MLAS_FLOAT32X4 Vector) return vreinterpretq_s32_f32(Vector); #elif defined(MLAS_SSE2_INTRINSICS) return _mm_castps_si128(Vector); +#elif defined(MLAS_LSX_INTRINSICS) + return (MLAS_INT32X4)Vector; #else return MLAS_INT32X4(Vector); #endif @@ -1299,6 +1350,8 @@ MlasCastToInt32x4(MLAS_FLOAT32X4 Vector) return _mm_cvttps_epi32(Vector); #elif defined(MLAS_VSX_INTRINSICS) return vec_cts(Vector, 0); +#elif defined(MLAS_LSX_INTRINSICS) + return __lsx_vftint_w_s(Vector); #elif defined(MLAS_WASM_SIMD_INTRINSICS) return (MLAS_INT32X4)__builtin_convertvector((__f32x4)Vector, __i32x4); #else @@ -1318,6 +1371,8 @@ MlasCastToFloat32x4(MLAS_INT32X4 Vector) return vec_ctf(Vector, 0); #elif defined(MLAS_WASM_SIMD_INTRINSICS) return wasm_f32x4_convert_i32x4(Vector); +#elif defined(MLAS_LSX_INTRINSICS) + return __lsx_vffint_s_w(Vector); #else return MLAS_FLOAT32X4{float(Vector[0]), float(Vector[1]), float(Vector[2]), float(Vector[3])}; #endif @@ -1335,6 +1390,8 @@ MlasBroadcastInt32x4(int32_t Value) return wasm_i32x4_splat(Value); #elif defined(MLAS_VSX_INTRINSICS) return vec_splats(Value); +#elif defined(MLAS_LSX_INTRINSICS) + return __lsx_vreplgr2vr_w(Value); #else return MLAS_INT32X4{Value, Value, Value, Value}; #endif @@ -1352,6 +1409,8 @@ MlasLoadInt32x4(const int32_t* Buffer) return vec_vsx_ld(0, Buffer); #elif defined(MLAS_WASM_SIMD_INTRINSICS) return wasm_v128_load(Buffer); +#elif defined(MLAS_LSX_INTRINSICS) + return __lsx_vld((const MLAS_INT32X4*)Buffer, 0); #else return *((MLAS_INT32X4*)Buffer); #endif @@ -1369,6 +1428,8 @@ MlasStoreInt32x4(int32_t* Buffer, MLAS_INT32X4 Vector) vec_vsx_st(Vector, 0, Buffer); #elif defined(MLAS_WASM_SIMD_INTRINSICS) wasm_v128_store(Buffer, Vector); +#elif defined(MLAS_LSX_INTRINSICS) + __lsx_vst(Vector, (MLAS_INT32X4 *)Buffer, 0); #else *((MLAS_INT32X4*)Buffer) = Vector; #endif @@ -1386,6 +1447,8 @@ MlasAddInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2) return wasm_i32x4_add(Vector1, Vector2); #elif defined(MLAS_VSX_INTRINSICS) return vec_add(Vector1, Vector2); +#elif defined(MLAS_LSX_INTRINSICS) + return __lsx_vadd_w(Vector1, Vector2); #else return Vector1 + Vector2; #endif @@ -1401,6 +1464,8 @@ MlasSubtractInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2) return _mm_sub_epi32(Vector1, Vector2); #elif defined(MLAS_WASM_SIMD_INTRINSICS) return wasm_i32x4_sub(Vector1, Vector2); +#elif defined(MLAS_LSX_INTRINSICS) + return __lsx_vsub_w(Vector1, Vector2); #else return Vector1 - Vector2; #endif @@ -1416,6 +1481,8 @@ MlasAndInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2) return _mm_and_si128(Vector1, Vector2); #elif defined(MLAS_WASM_SIMD_INTRINSICS) return wasm_v128_and(Vector1, Vector2); +#elif defined(MLAS_LSX_INTRINSICS) + return __lsx_vand_v(Vector1, Vector2); #else return Vector1 & Vector2; #endif @@ -1431,6 +1498,8 @@ MlasOrInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2) return _mm_or_si128(Vector1, Vector2); #elif 
defined(MLAS_WASM_SIMD_INTRINSICS) return wasm_v128_or(Vector1, Vector2); +#elif defined(MLAS_LSX_INTRINSICS) + return __lsx_vor_v(Vector1, Vector2); #else return Vector1 | Vector2; #endif @@ -1446,6 +1515,8 @@ MlasAndNotInt32x4(MLAS_INT32X4 VectorNot, MLAS_INT32X4 Vector) return _mm_andnot_si128(VectorNot, Vector); #elif defined(MLAS_WASM_SIMD_INTRINSICS) return wasm_v128_andnot(Vector, VectorNot); +#elif defined(MLAS_LSX_INTRINSICS) + return __lsx_vandn_v(VectorNot, Vector); #else return (~VectorNot) & Vector; #endif @@ -1463,6 +1534,8 @@ MlasXorInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2) return wasm_v128_xor(Vector1, Vector2); #elif defined(MLAS_VSX_INTRINSICS) return vec_xor(Vector1, Vector2); +#elif defined(MLAS_LSX_INTRINSICS) + return __lsx_vxor_v(Vector1, Vector2); #else return Vector1 ^ Vector2; #endif @@ -1486,6 +1559,8 @@ MlasShiftLeftInt32x4(MLAS_INT32X4 Vector) return _mm_slli_epi32(Vector, ShiftCount); #elif defined(MLAS_WASM_SIMD_INTRINSICS) return wasm_i32x4_shl(Vector, ShiftCount); +#elif defined(MLAS_LSX_INTRINSICS) + return __lsx_vslli_w(Vector, ShiftCount); #else return Vector << ShiftCount; #endif @@ -1505,6 +1580,8 @@ MlasMaximumInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2) return vec_vmaxsw(Vector1, Vector2); #elif defined(MLAS_WASM_SIMD_INTRINSICS) return wasm_i32x4_max(Vector1, Vector2); +#elif defined(MLAS_LSX_INTRINSICS) + return __lsx_vmax_w(Vector1, Vector2); #else return MlasBlendInt32x4(Vector2, Vector1, Vector1 > Vector2); #endif @@ -1524,6 +1601,8 @@ MlasMinimumInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2) return vec_vminsw(Vector1, Vector2); #elif defined(MLAS_WASM_SIMD_INTRINSICS) return wasm_i32x4_min(Vector1, Vector2); +#elif defined(MLAS_LSX_INTRINSICS) + return __lsx_vmin_w(Vector1, Vector2); #else return MlasBlendInt32x4(Vector2, Vector1, Vector2 > Vector1); #endif @@ -1537,6 +1616,8 @@ MlasReinterpretAsFloat32x4(MLAS_INT32X4 Vector) return vreinterpretq_f32_s32(Vector); #elif defined(MLAS_SSE2_INTRINSICS) return _mm_castsi128_ps(Vector); +#elif defined(MLAS_LSX_INTRINSICS) + return MLAS_FLOAT32X4(Vector); #else return MLAS_FLOAT32X4(Vector); #endif @@ -1556,6 +1637,8 @@ MlasBroadcastFloat32x4(float Value) // Suppress wrong GCC warnings MLAS_UNREFERENCED_PARAMETER(Value); return vec_splats(Value); +#elif defined(MLAS_LSX_INTRINSICS) + return MLAS_FLOAT32X4{Value, Value, Value, Value}; #else return MLAS_FLOAT32X4{Value, Value, Value, Value}; #endif @@ -1573,6 +1656,8 @@ MlasBroadcastFloat32x4(const float* Value) return wasm_v128_load32_splat(Value); #elif defined(MLAS_VSX_INTRINSICS) return vec_splats(*Value); +#elif defined(MLAS_LSX_INTRINSICS) + return MLAS_FLOAT32X4{*Value, *Value, *Value, *Value}; #else return MLAS_FLOAT32X4{*Value, *Value, *Value, *Value}; #endif @@ -1588,6 +1673,8 @@ MlasZeroFloat32x4(void) return _mm_setzero_ps(); #elif defined(MLAS_WASM_SIMD_INTRINSICS) return wasm_f32x4_const(0.0f, 0.0f, 0.0f, 0.0f); +#elif defined(MLAS_LSX_INTRINSICS) + return MlasBroadcastFloat32x4(0.0f); #else return MlasBroadcastFloat32x4(0.0f); #endif @@ -1605,6 +1692,9 @@ MlasLoadFloat32x4(const float* Buffer) return vec_vsx_ld(0, Buffer); #elif defined(MLAS_WASM_SIMD_INTRINSICS) return wasm_v128_load(Buffer); +#elif defined(MLAS_LSX_INTRINSICS) + // return MlasReinterpretAsFloat32x4(__lsx_vld((const MLAS_INT32X4 *)Buffer, 0)); + return (MLAS_FLOAT32X4)__lsx_vld((const MLAS_INT32X4 *)Buffer, 0); #else return *((MLAS_FLOAT32X4*)Buffer); #endif @@ -1622,6 +1712,8 @@ MlasStoreFloat32x4(float* Buffer, MLAS_FLOAT32X4 Vector) 
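Because the LSX cases are added to the existing MlasLoadFloat32x4/MlasStoreFloat32x4-style wrappers, generic MLAS C++ code picks up LSX with no source changes. Below is a rough sketch of a loop written against those wrappers; the function name is only for illustration and it assumes mlasi.h is on the include path. On a LoongArch build it lowers to __lsx_vld, __lsx_vfmadd_s and __lsx_vst through the overloads added here.

```
// Sketch only: a portable fused multiply-add over the MLAS wrapper types.
#include "mlasi.h"

void
ScaleAndAccumulate(const float* Input, float Scale, float* Output, size_t N)
{
    const MLAS_FLOAT32X4 ScaleVector = MlasBroadcastFloat32x4(Scale);

    while (N >= 4) {
        MLAS_FLOAT32X4 Value = MlasLoadFloat32x4(Input);
        MLAS_FLOAT32X4 Accumulator = MlasLoadFloat32x4(Output);
        // Value * Scale + Accumulator, stored back in place.
        MlasStoreFloat32x4(Output, MlasMultiplyAddFloat32x4(Value, ScaleVector, Accumulator));
        Input += 4;
        Output += 4;
        N -= 4;
    }

    while (N > 0) {
        *Output++ += *Input++ * Scale;
        N -= 1;
    }
}
```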
vec_vsx_st(Vector, 0, Buffer); #elif defined(MLAS_WASM_SIMD_INTRINSICS) wasm_v128_store(Buffer, Vector); +#elif defined(MLAS_LSX_INTRINSICS) + __lsx_vst(MlasReinterpretAsInt32x4(Vector), Buffer, 0); #else *((MLAS_FLOAT32X4*)Buffer) = Vector; #endif @@ -1642,6 +1734,8 @@ MlasStoreAlignedFloat32x4(float* Buffer, MLAS_FLOAT32X4 Vector) vec_st(Vector, 0, Buffer); #elif defined(MLAS_WASM_SIMD_INTRINSICS) wasm_v128_store(Buffer, Vector); +#elif defined(MLAS_LSX_INTRINSICS) + MlasStoreFloat32x4(Buffer, Vector); #else MlasStoreFloat32x4(Buffer, Vector); #endif @@ -1660,6 +1754,8 @@ MlasStoreLaneFloat32x4(float* Buffer, MLAS_FLOAT32X4 Vector) _mm_store_ss(Buffer, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(Lane, Lane, Lane, Lane))); #elif defined(MLAS_WASM_SIMD_INTRINSICS) *Buffer = ((__f32x4)(Vector))[Lane]; +#elif defined(MLAS_LSX_INTRINSICS) + *Buffer = Vector[Lane]; #else *Buffer = Vector[Lane]; #endif @@ -1675,6 +1771,9 @@ MlasStoreLowHalfFloat32x4(float* Buffer, MLAS_FLOAT32X4 Vector) _mm_storel_pi((__m64*)Buffer, Vector); #elif defined(MLAS_VSX_INTRINSICS) *((long long*)Buffer) = ((__vector long long)Vector)[0]; +#elif defined(MLAS_LSX_INTRINSICS) + MlasStoreLaneFloat32x4<0>(&Buffer[0], Vector); + MlasStoreLaneFloat32x4<1>(&Buffer[1], Vector); #else MlasStoreLaneFloat32x4<0>(&Buffer[0], Vector); MlasStoreLaneFloat32x4<1>(&Buffer[1], Vector); @@ -1692,6 +1791,8 @@ MlasExtractLaneFloat32x4(MLAS_FLOAT32X4 Vector) return _mm_cvtss_f32(_mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(Lane, Lane, Lane, Lane))); #elif defined(MLAS_WASM_SIMD_INTRINSICS) return wasm_f32x4_extract_lane(Vector, Lane); +#elif defined(MLAS_LSX_INTRINSICS) + return Vector[Lane]; #else return Vector[Lane]; #endif @@ -1736,6 +1837,9 @@ MlasShuffleFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) return wasm_i32x4_shuffle(Vector1, Vector2, Index0, Index1, Index2, Index3); #elif defined(__clang__) return __builtin_shufflevector(Vector1, Vector2, Index0, Index1, Index2, Index3); +#elif defined(MLAS_LSX_INTRINSICS) + typedef int32_t GEN_INT32X4 __attribute__ ((vector_size(16))); + return __builtin_shuffle(Vector1, Vector2, GEN_INT32X4{Index0, Index1, Index2, Index3}); #else return __builtin_shuffle(Vector1, Vector2, MLAS_INT32X4{Index0, Index1, Index2, Index3}); #endif @@ -1764,6 +1868,8 @@ MlasInterleaveLowFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) return _mm_unpacklo_ps(Vector1, Vector2); #elif defined(MLAS_VSX_INTRINSICS) return vec_mergeh(Vector1, Vector2); +#elif defined(MLAS_LSX_INTRINSICS) + return (MLAS_FLOAT32X4)__lsx_vilvl_w(MlasReinterpretAsInt32x4(Vector2), MlasReinterpretAsInt32x4(Vector1)); #else return MlasShuffleFloat32x4<0, 4, 1, 5>(Vector1, Vector2); #endif @@ -1782,6 +1888,8 @@ MlasInterleaveHighFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) return _mm_unpackhi_ps(Vector1, Vector2); #elif defined(MLAS_VSX_INTRINSICS) return vec_mergel(Vector1, Vector2); +#elif defined(MLAS_LSX_INTRINSICS) + return (MLAS_FLOAT32X4)__lsx_vilvh_w(MlasReinterpretAsInt32x4(Vector2), MlasReinterpretAsInt32x4(Vector1)); #else return MlasShuffleFloat32x4<2, 6, 3, 7>(Vector1, Vector2); #endif @@ -1799,6 +1907,8 @@ MlasAddFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) return wasm_f32x4_add(Vector1, Vector2); #elif defined(MLAS_VSX_INTRINSICS) return vec_add(Vector1, Vector2); +#elif defined(MLAS_LSX_INTRINSICS) + return __lsx_vfadd_s(Vector1, Vector2); #else return Vector1 + Vector2; #endif @@ -1816,6 +1926,8 @@ MlasSubtractFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) return 
wasm_f32x4_sub(Vector1, Vector2); #elif defined(MLAS_VSX_INTRINSICS) return vec_sub(Vector1, Vector2); +#elif defined(MLAS_LSX_INTRINSICS) + return __lsx_vfsub_s(Vector1, Vector2); #else return Vector1 - Vector2; #endif @@ -1836,6 +1948,8 @@ MlasMultiplyFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) MLAS_UNREFERENCED_PARAMETER(Vector1); MLAS_UNREFERENCED_PARAMETER(Vector2); return vec_mul(Vector1, Vector2); +#elif defined(MLAS_LSX_INTRINSICS) + return __lsx_vfmul_s(Vector1, Vector2); #else return Vector1 * Vector2; #endif @@ -1855,6 +1969,8 @@ MlasMultiplyAddFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2, MLAS_FL return vec_madd(Vector1, Vector2, Vector3); #elif defined(MLAS_WASM_SIMD_INTRINSICS) return wasm_f32x4_add(wasm_f32x4_mul(Vector1, Vector2), Vector3); +#elif defined(MLAS_LSX_INTRINSICS) + return __lsx_vfmadd_s(Vector1, Vector2, Vector3); #else return Vector1 * Vector2 + Vector3; #endif @@ -1890,6 +2006,8 @@ MlasDivideFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) return _mm_div_ps(Vector1, Vector2); #elif defined(MLAS_WASM_SIMD_INTRINSICS) return wasm_f32x4_div(Vector1, Vector2); +#elif defined(MLAS_LSX_INTRINSICS) + return __lsx_vfdiv_s(Vector1, Vector2); #else return Vector1 / Vector2; #endif @@ -1907,6 +2025,8 @@ MlasGreaterThanFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) return wasm_f32x4_gt(Vector1, Vector2); #elif defined(MLAS_VSX_INTRINSICS) return MLAS_FLOAT32X4(vec_cmpgt(Vector1, Vector2)); +#elif defined(MLAS_LSX_INTRINSICS) + return (MLAS_FLOAT32X4)__lsx_vfcmp_clt_s(Vector2, Vector1); #else return Vector1 > Vector2; #endif @@ -1920,6 +2040,8 @@ MlasAndFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) return _mm_and_ps(Vector1, Vector2); #elif defined(MLAS_WASM_SIMD_INTRINSICS) return wasm_v128_and(Vector1, Vector2); +#elif defined(MLAS_LSX_INTRINSICS) + return MlasReinterpretAsFloat32x4(MlasAndInt32x4(MlasReinterpretAsInt32x4(Vector1), MlasReinterpretAsInt32x4(Vector2))); #else return MlasReinterpretAsFloat32x4(MlasAndInt32x4(MlasReinterpretAsInt32x4(Vector1), MlasReinterpretAsInt32x4(Vector2))); #endif @@ -1933,6 +2055,8 @@ MlasOrFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) return _mm_or_ps(Vector1, Vector2); #elif defined(MLAS_WASM_SIMD_INTRINSICS) return wasm_v128_or(Vector1, Vector2); +#elif defined(MLAS_LSX_INTRINSICS) + return MlasReinterpretAsFloat32x4(MlasOrInt32x4(MlasReinterpretAsInt32x4(Vector1), MlasReinterpretAsInt32x4(Vector2))); #else return MlasReinterpretAsFloat32x4(MlasOrInt32x4(MlasReinterpretAsInt32x4(Vector1), MlasReinterpretAsInt32x4(Vector2))); #endif @@ -1946,6 +2070,8 @@ MlasAndNotFloat32x4(MLAS_FLOAT32X4 VectorNot, MLAS_FLOAT32X4 Vector) return _mm_andnot_ps(VectorNot, Vector); #elif defined(MLAS_WASM_SIMD_INTRINSICS) return wasm_v128_andnot(Vector, VectorNot); +#elif defined(MLAS_LSX_INTRINSICS) + return MlasReinterpretAsFloat32x4(MlasAndNotInt32x4(MlasReinterpretAsInt32x4(VectorNot), MlasReinterpretAsInt32x4(Vector))); #else return MlasReinterpretAsFloat32x4(MlasAndNotInt32x4(MlasReinterpretAsInt32x4(VectorNot), MlasReinterpretAsInt32x4(Vector))); #endif @@ -1959,6 +2085,8 @@ MlasXorFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) return _mm_xor_ps(Vector1, Vector2); #elif defined(MLAS_WASM_SIMD_INTRINSICS) return wasm_v128_xor(Vector1, Vector2); +#elif defined(MLAS_LSX_INTRINSICS) + return MlasReinterpretAsFloat32x4(MlasXorInt32x4(MlasReinterpretAsInt32x4(Vector1), MlasReinterpretAsInt32x4(Vector2))); #else return 
MlasReinterpretAsFloat32x4(MlasXorInt32x4(MlasReinterpretAsInt32x4(Vector1), MlasReinterpretAsInt32x4(Vector2))); #endif @@ -1984,6 +2112,8 @@ MlasMaximumFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) return vec_sel(Vector2, Vector1, vec_cmpgt(Vector1, Vector2)); #elif defined(MLAS_WASM_SIMD_INTRINSICS) return wasm_f32x4_max(Vector1, Vector2); +#elif defined(MLAS_LSX_INTRINSICS) + return __lsx_vfmax_s(Vector1, Vector2); #else return MlasBlendFloat32x4(Vector2, Vector1, Vector1 > Vector2); #endif @@ -2002,6 +2132,8 @@ MlasMinimumFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) return vec_sel(Vector2, Vector1, vec_cmpgt(Vector2, Vector1)); #elif defined(MLAS_WASM_SIMD_INTRINSICS) return wasm_f32x4_min(Vector1, Vector2); +#elif defined(MLAS_LSX_INTRINSICS) + return __lsx_vfmin_s(Vector1, Vector2); #else return MlasBlendFloat32x4(Vector2, Vector1, Vector2 > Vector1); #endif @@ -2108,6 +2240,8 @@ MlasPowerOf2Float32x4(MLAS_FLOAT32X4 Vector) typedef __m128d MLAS_FLOAT64X2; #elif defined(MLAS_VSX_INTRINSICS) typedef __vector double MLAS_FLOAT64X2; +#elif defined(MLAS_LSX_INTRINSICS) +typedef __m128d MLAS_FLOAT64X2; #else #define MLAS_FLOAT64X2_UNSUPPORTED #endif @@ -2129,6 +2263,27 @@ MlasMultiplyAddFloat64x2(MLAS_FLOAT64X2 Vector1, MLAS_FLOAT64X2 Vector2, MLAS_FL return vec_madd(Vector1, Vector2, Vector3); } +MLAS_FORCEINLINE +MLAS_FLOAT64X2 +MlasBroadcastFloat64x2(const double *Value) +{ + return MLAS_FLOAT64X2{*Value, *Value}; +} +#elif defined(MLAS_LSX_INTRINSICS) +template +MLAS_FORCEINLINE +double +MlasExtractLaneFloat64x2(MLAS_FLOAT64X2 Vector) +{ + return Vector[Lane]; +} +MLAS_FORCEINLINE +MLAS_FLOAT64X2 +MlasMultiplyAddFloat64x2(MLAS_FLOAT64X2 Vector1, MLAS_FLOAT64X2 Vector2, MLAS_FLOAT64X2 Vector3) +{ + return __lsx_vfmadd_d(Vector1, Vector2, Vector3); +} + MLAS_FORCEINLINE MLAS_FLOAT64X2 MlasBroadcastFloat64x2(const double *Value) @@ -2144,6 +2299,8 @@ MlasBroadcastFloat64x2(double Value) return _mm_set1_pd(Value); #elif defined(MLAS_VSX_INTRINSICS) return MLAS_FLOAT64X2{Value, Value}; +#elif defined(MLAS_LSX_INTRINSICS) + return MLAS_FLOAT64X2{Value, Value}; #endif } @@ -2155,6 +2312,8 @@ MlasZeroFloat64x2(void) return _mm_setzero_pd(); #elif defined(MLAS_VSX_INTRINSICS) return MlasBroadcastFloat64x2(0.0f); +#elif defined(MLAS_LSX_INTRINSICS) + return MlasBroadcastFloat64x2(0.0f); #endif } @@ -2166,6 +2325,8 @@ MlasLoadFloat64x2(const double* Buffer) return _mm_loadu_pd(Buffer); #elif defined(MLAS_VSX_INTRINSICS) return vec_vsx_ld(0, Buffer); +#elif defined(MLAS_LSX_INTRINSICS) + return MLAS_FLOAT64X2(__lsx_vld((const MLAS_INT32X4 *)Buffer, 0)); #endif } @@ -2177,6 +2338,8 @@ MlasStoreFloat64x2(double* Buffer, MLAS_FLOAT64X2 Vector) _mm_storeu_pd(Buffer, Vector); #elif defined(MLAS_VSX_INTRINSICS) vec_vsx_st(Vector, 0, Buffer); +#elif defined(MLAS_LSX_INTRINSICS) + (__lsx_vst(MLAS_INT32X4(Vector), Buffer, 0)); #endif } @@ -2188,6 +2351,8 @@ MlasStoreAlignedFloat64x2(double* Buffer, MLAS_FLOAT64X2 Vector) _mm_store_pd(Buffer, Vector); #elif defined(MLAS_VSX_INTRINSICS) *((MLAS_FLOAT64X2*)Buffer) = Vector; +#elif defined(MLAS_LSX_INTRINSICS) + (__lsx_vst(MLAS_INT32X4(Vector), Buffer, 0)); #endif } @@ -2199,6 +2364,8 @@ MlasMultiplyFloat64x2(MLAS_FLOAT64X2 Vector1, MLAS_FLOAT64X2 Vector2) return _mm_mul_pd(Vector1, Vector2); #elif defined(MLAS_VSX_INTRINSICS) return Vector1 * Vector2; +#elif defined(MLAS_LSX_INTRINSICS) + return __lsx_vfmul_d(Vector1, Vector2); #endif } @@ -2233,6 +2400,17 @@ MlasReadTimeStampCounter(void) ); return ((uint64_t)edx << 32) | eax; 
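+//
+// N.B. The LoongArch branch below reads the 64-bit constant-frequency stable
+// counter with rdtime.d; the second output operand receives the counter ID,
+// which is not needed here.
+//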
+#elif defined(MLAS_TARGET_LARCH64) + uint64_t time_cnt, id; + + __asm__ __volatile__ + ( + "rdtime.d %0, %1\n\t" + : "=r" (time_cnt), "=r" (id) + :: + ); + + return time_cnt; #else return 0; #endif diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp index fec56c6ee063f..8329a34f1338f 100644 --- a/onnxruntime/core/mlas/lib/platform.cpp +++ b/onnxruntime/core/mlas/lib/platform.cpp @@ -185,6 +185,28 @@ MlasInitAMX() #endif // MLAS_TARGET_AMD64_IX86 +#ifdef MLAS_TARGET_LARCH64 + +#if defined(__linux__) +#include +#include +#endif +// +// Stores a vector to build a conditional load/store mask for vmaskmovps. +// + +MLAS_INTERNAL_DATA MLAS_DECLSPEC_ALIGN(const uint32_t MlasMaskMoveLasx[8], 32) = { 0, 1, 2, 3, 4, 5, 6, 7 }; + +// +// Stores a table of AVX vmaskmovps/vmaskmovpd load/store masks. +// + +MLAS_INTERNAL_DATA MLAS_DECLSPEC_ALIGN(const uint32_t MlasMaskMoveTableLasx[16], 32) = { + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + +#endif MLAS_PLATFORM::MLAS_PLATFORM( void ) @@ -536,6 +558,63 @@ Return Value: #endif // __linux__ #endif // MLAS_TARGET_POWER +#if defined(MLAS_TARGET_LARCH64) + + // + // Default to the baseline LSX support. + // + + int hwcap = getauxval(AT_HWCAP); + bool cap_lasx = hwcap & HWCAP_LOONGARCH_LASX; + bool cap_lsx = hwcap & HWCAP_LOONGARCH_LSX; + + if( cap_lasx ){ + this->GemmFloatKernel = MlasGemmFloatKernelLasx; + this->GemmDoubleKernel = MlasGemmDoubleKernelLasx; + this->ConvNchwFloatKernel = MlasConvNchwFloatKernelLasx; + this->ConvNchwcFloatKernel = MlasConvNchwcFloatKernelLasx; + this->ConvDepthwiseFloatKernel = MlasConvDepthwiseFloatKernelLasx; + this->ConvPointwiseFloatKernel = MlasConvPointwiseFloatKernelLasx; + this->PoolFloatKernel[MlasMaximumPooling] = MlasPoolMaximumFloatKernelLasx; + this->PoolFloatKernel[MlasAveragePoolingExcludePad] = MlasPoolAverageExcludePadFloatKernelLasx; + this->PoolFloatKernel[MlasAveragePoolingIncludePad] = MlasPoolAverageIncludePadFloatKernelLasx; + this->ReduceMaximumF32Kernel = MlasReduceMaximumF32KernelLasx; + this->ComputeSoftmaxOutputF32Kernel = MlasComputeSoftmaxOutputF32KernelLasx; + this->ComputeLogSoftmaxOutputF32Kernel = MlasComputeLogSoftmaxOutputF32KernelLasx; + this->TransposePackB16x4Routine = MlasSgemmTransposePackB16x4Lasx; + + this->GemmU8S8Dispatch = &MlasGemmU8X8DispatchLSX; + this->GemmU8U8Dispatch = &MlasGemmU8X8DispatchLSX; + }else if( cap_lsx ){ + this->GemmFloatKernel = MlasGemmFloatKernelLSX; + this->GemmU8S8Dispatch = &MlasGemmU8X8DispatchLSX; + this->GemmU8U8Dispatch = &MlasGemmU8X8DispatchLSX; + this->TransposePackB16x4Routine = MlasSgemmTransposePackB16x4LSX; + this->GemmDoubleKernel = MlasGemmDoubleKernelLSX; + this->ConvNchwFloatKernel = MlasConvNchwFloatKernelLSX; + this->ConvNchwcFloatKernel = MlasConvNchwcFloatKernelLSX; + this->ConvDepthwiseFloatKernel = MlasConvDepthwiseFloatKernelLSX; + this->ConvPointwiseFloatKernel = MlasConvPointwiseFloatKernelLSX; + + this->PoolFloatKernel[MlasMaximumPooling] = MlasPoolMaximumFloatKernelLSX; + this->PoolFloatKernel[MlasAveragePoolingExcludePad] = MlasPoolAverageExcludePadFloatKernelLSX; + this->PoolFloatKernel[MlasAveragePoolingIncludePad] = MlasPoolAverageIncludePadFloatKernelLSX; + this->ReduceMaximumF32Kernel = MlasReduceMaximumF32Kernel; + this->ComputeSoftmaxOutputF32Kernel = MlasComputeSoftmaxOutputF32Kernel; + this->ComputeLogSoftmaxOutputF32Kernel = 
MlasComputeLogSoftmaxOutputF32Kernel; + }else{ + this->ReduceMaximumF32Kernel = MlasReduceMaximumF32Kernel; + this->ComputeSoftmaxOutputF32Kernel = MlasComputeSoftmaxOutputF32Kernel; + this->ComputeLogSoftmaxOutputF32Kernel = MlasComputeLogSoftmaxOutputF32Kernel; + } + + this->NchwcBlockSize = 8; + // this->PreferredBufferAlignment = MLAS_DEFAULT_PREFERRED_BUFFER_ALIGNMENT; + + // this->MaximumThreadCount = MLAS_MAXIMUM_THREAD_COUNT; + +#endif // MLAS_TARGET_LARCH64 + } size_t diff --git a/onnxruntime/core/mlas/lib/pooling.cpp b/onnxruntime/core/mlas/lib/pooling.cpp index 12128f6c700fd..50dcf19224510 100644 --- a/onnxruntime/core/mlas/lib/pooling.cpp +++ b/onnxruntime/core/mlas/lib/pooling.cpp @@ -1569,6 +1569,96 @@ Return Value: c -= 16; } +#elif defined(MLAS_LSX_INTRINSICS) + uint32_t val = 0x80808080; + const __m128i BitFlipVector = __lsx_vreplgr2vr_w(val); + if constexpr (std::is_unsigned::value) { + MLAS_UNREFERENCED_PARAMETER(BitFlipVector); + } + + while (c >= 32) { + + __m128i MaximumVector0 = __lsx_vldi(0); + __m128i MaximumVector1 = __lsx_vldi(0); + + for (size_t k = 0; k < KernelSize; k++) { + + __m128i InputVector0 = __lsx_vld((const __m128i*)&Input[k][ChannelOffset], 0); + __m128i InputVector1 = __lsx_vld((const __m128i*)&Input[k][ChannelOffset + 16], 0); + + if constexpr (std::is_signed::value) { + InputVector0 = __lsx_vxor_v(InputVector0, BitFlipVector); + InputVector1 = __lsx_vxor_v(InputVector1, BitFlipVector); + } + + MaximumVector0 = __lsx_vmax_bu(MaximumVector0, InputVector0); + MaximumVector1 = __lsx_vmax_bu(MaximumVector1, InputVector1); + } + + if constexpr (std::is_signed::value) { + MaximumVector0 = __lsx_vxor_v(MaximumVector0, BitFlipVector); + MaximumVector1 = __lsx_vxor_v(MaximumVector1, BitFlipVector); + } + + __lsx_vst(MaximumVector0, (__m128i*)&Output[0], 0); + __lsx_vst(MaximumVector1, (__m128i*)&Output[16], 0); + Output += 32; + + ChannelOffset += 32; + c -= 32; + } + + while (c >= 16) { + + __m128i MaximumVector0 = __lsx_vldi(0); + + for (size_t k = 0; k < KernelSize; k++) { + + __m128i InputVector0 = __lsx_vld((const __m128i*)&Input[k][ChannelOffset], 0); + + if constexpr (std::is_signed::value){ + InputVector0 = __lsx_vxor_v(InputVector0, BitFlipVector); + } + + MaximumVector0 = __lsx_vmax_bu(MaximumVector0, InputVector0); + } + + if constexpr (std::is_signed::value) { + MaximumVector0 = __lsx_vxor_v(MaximumVector0, BitFlipVector); + } + + __lsx_vst(MaximumVector0, (__m128i*)&Output[0], 0); + Output += 16; + + ChannelOffset += 16; + c -= 16; + } + + if (c >= 8) { + + __m128i MaximumVector0 = __lsx_vldi(0); + + for (size_t k = 0; k < KernelSize; k++) { + + __m128i InputVector0 = __lsx_vinsgr2vr_d(__lsx_vld((const __m128i*)&Input[k][ChannelOffset], 0), 0, 1); + + if constexpr (std::is_signed::value){ + InputVector0 = __lsx_vxor_v(InputVector0, BitFlipVector); + } + + MaximumVector0 = __lsx_vmax_bu(MaximumVector0, InputVector0); + } + + if constexpr (std::is_signed::value) { + MaximumVector0 = __lsx_vxor_v(MaximumVector0, BitFlipVector); + } + + __lsx_vst(__lsx_vinsgr2vr_d(__lsx_vld((__m128i*)&Output[0] , 0), __lsx_vpickve2gr_d(MaximumVector0, 0), 0), (__m128i*)&Output[0], 0); + Output += 8; + + ChannelOffset += 8; + c -= 8; + } #endif while (c > 0) { diff --git a/onnxruntime/core/mlas/lib/q4gemm.h b/onnxruntime/core/mlas/lib/q4gemm.h index b1b51dd53c4fc..d16798eb8945f 100644 --- a/onnxruntime/core/mlas/lib/q4gemm.h +++ b/onnxruntime/core/mlas/lib/q4gemm.h @@ -126,7 +126,7 @@ MlasQ4GemmOperation( size_t RowsRemaining = RangeCountM; while (RowsRemaining > 0) 
{ -#if defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER) +#if defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER) || defined(MLAS_TARGET_LARCH64) auto RowsHandled = GetMlasPlatform().GemmFloatKernel( a_row, dequant_b, c_blk, K, RowsRemaining, CountN, lda, ldc, 1.f, true); #else diff --git a/onnxruntime/core/mlas/lib/qdwconv.cpp b/onnxruntime/core/mlas/lib/qdwconv.cpp index 924009ab5ccf4..59f6877f70d56 100644 --- a/onnxruntime/core/mlas/lib/qdwconv.cpp +++ b/onnxruntime/core/mlas/lib/qdwconv.cpp @@ -41,6 +41,10 @@ MlasConvDepthwiseKernel( #elif defined(MLAS_NEON_INTRINSICS) const uint8x8_t InputZeroPointVector = vdup_n_u8(uint8_t(InputZeroPoint)); const uint8x8_t FilterZeroPointVector = vdup_n_u8(uint8_t(FilterZeroPoint)); +#elif defined(MLAS_LSX_INTRINSICS) + const __m128i ZeroVector = __lsx_vldi(0); + const __m128i InputZeroPointVector = __lsx_vreplgr2vr_h(InputZeroPoint); + const __m128i FilterZeroPointVector = __lsx_vreplgr2vr_h(FilterZeroPoint); #endif while (OutputCount > 0) { @@ -141,6 +145,54 @@ MlasConvDepthwiseKernel( vst1q_s32(&Output[4], Accumulator1); Output += 8; + ChannelOffset += 8; + c -= 8; + } +#elif defined(MLAS_LSX_INTRINSICS) + + while (c >= 8) { + __m128i Accumulator0 = __lsx_vldi(0); + __m128i Accumulator1 = __lsx_vldi(0); + size_t ChannelKernelOffset = ChannelOffset; + + for (size_t k = 0; k < KernelSize; k++) { + __m128i InputVector = __lsx_vld((const __m128i*)&Input[k][ChannelOffset], 0); + __lsx_vinsgr2vr_d(InputVector, 0, 1); + __m128i FilterVector = + __lsx_vld((const __m128i*)&Filter[ChannelKernelOffset], 0); + __lsx_vinsgr2vr_d(FilterVector, 0, 1); + + if (std::is_signed::value) { + InputVector = __lsx_vsrai_h(__lsx_vilvl_b(InputVector, ZeroVector), 8); + } else { + InputVector = __lsx_vilvl_b(ZeroVector, InputVector ); + } + + if (std::is_signed::value) { + FilterVector = __lsx_vsrai_h(__lsx_vilvl_b(FilterVector, ZeroVector), 8); + } else { + FilterVector = __lsx_vilvl_b(ZeroVector, FilterVector); + } + + InputVector = __lsx_vsub_h(InputVector, InputZeroPointVector); + FilterVector = __lsx_vsub_h(FilterVector, FilterZeroPointVector); + + // N.B. Emulate PMULLD functionality on LSX by computing the low + // and high parts of the result and interleaving the results. 
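+            // For each 16-bit lane, vmul.h and vmuh.h produce the low and
+            // high halves of the widened product; interleaving them with
+            // vilvl.h/vilvh.h reassembles the full 32-bit products in lane
+            // order.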
+ __m128i MultiplyLowWords = __lsx_vmul_h(InputVector, FilterVector); + __m128i MultiplyHighWords = __lsx_vmuh_h(InputVector, FilterVector); + __m128i Multiply0 = __lsx_vilvl_h(MultiplyHighWords, MultiplyLowWords); + __m128i Multiply1 = __lsx_vilvh_h(MultiplyHighWords, MultiplyLowWords); + + Accumulator0 = __lsx_vadd_w(Accumulator0, Multiply0); + Accumulator1 = __lsx_vadd_w(Accumulator1, Multiply1); + ChannelKernelOffset += Channels; + } + + __lsx_vst(Accumulator0, (__m128i*)&Output[0], 0); + __lsx_vst(Accumulator1, (__m128i*)&Output[4], 0); + Output += 8; + ChannelOffset += 8; c -= 8; } @@ -322,4 +374,4 @@ Return Value: ); } } -} \ No newline at end of file +} diff --git a/onnxruntime/core/mlas/lib/qgemm.h b/onnxruntime/core/mlas/lib/qgemm.h index 1fcd44e78a28c..75c17a6b5a177 100644 --- a/onnxruntime/core/mlas/lib/qgemm.h +++ b/onnxruntime/core/mlas/lib/qgemm.h @@ -871,7 +871,7 @@ MlasGemmQuantGetDispatch( GemmQuantDispatch = &MlasGemmQuantDispatchDefault; } -#if defined(MLAS_TARGET_AMD64_IX86) +#if defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_LARCH64) if (!AIsSigned) { if (BIsSigned) { GemmQuantDispatch = GetMlasPlatform().GemmU8S8Dispatch; diff --git a/onnxruntime/core/mlas/lib/qgemm_kernel_lsx.cpp b/onnxruntime/core/mlas/lib/qgemm_kernel_lsx.cpp new file mode 100644 index 0000000000000..7d5817335bd77 --- /dev/null +++ b/onnxruntime/core/mlas/lib/qgemm_kernel_lsx.cpp @@ -0,0 +1,531 @@ +/*++ + +Copyright (C) 2023 Loongson Technology Corporation Limited. + +Licensed under the MIT License. + +Module Name: + + qgemm_kernel_lsx.cpp + +Abstract: + + This module implements QGEMM kernels for LSX. + +--*/ + +#include "mlasi.h" +#include "qgemm.h" +#include + +struct MLAS_GEMM_U8X8_KERNEL_LSX +{ + typedef int16_t PackedAType; + typedef int16_t PackedBType; + typedef uint8_t OffsetAType; + typedef int8_t OffsetBType; + + static constexpr size_t PackedK = 2; + static constexpr MLAS_GEMM_QUANT_STRIDES Strides{ 12, 128, 128 }; + static constexpr MLAS_GEMM_QUANT_STRIDES PackedStrides{0, 0, 0}; +}; + +constexpr size_t MLAS_GEMM_U8X8_KERNEL_LSX::PackedK; +constexpr MLAS_GEMM_QUANT_STRIDES MLAS_GEMM_U8X8_KERNEL_LSX::Strides; + +template<> +MLAS_FORCEINLINE constexpr +int32_t +MlasGemmQuantFixupZeroPointB( + int32_t ZeroPointB, + bool BIsSigned + ) +{ + if (!BIsSigned) { + ZeroPointB = MLAS_GEMM_U8X8_KERNEL_LSX::OffsetBType(ZeroPointB ^ 0x80); + } + + return ZeroPointB; +} + +template<> +void +MlasGemmQuantCopyPackA( + MLAS_GEMM_U8X8_KERNEL_LSX::PackedAType* D, + const uint8_t* A, + size_t lda, + size_t CountM, + size_t CountK, + int32_t* RowSumBuffer, + bool AIsSigned + ) +{ + MLAS_UNREFERENCED_PARAMETER(AIsSigned); + const __m128i ZeroVector = __lsx_vrepli_d(0); + uint16_t val = 1; + const __m128i OnesWordBroadcast = __lsx_vreplgr2vr_h(val); + uint8_t PaddedMatrixAData[8] = { 0 }; + + // + // Process a single row of matrix A in a loop. + // + + while (CountM > 0) { + + const uint8_t* a = A; + size_t k = CountK; + __m128i ReductionVector = ZeroVector; + + // + // Zero extend the source bytes to 16-bits and write to the packed + // buffer. + // + // The packed buffer has the same data ordering as the source bytes, + // but CountK is aligned up to a multiple of 2 to maintain 32-bit + // alignment. All extra bytes are zero-padded. + // + // These 16-bit values are also accumulated into an intermediate per-row + // accumulator. CountK cannot be greater than 128 to avoid overflowing + // these signed 16-bit accumulators. 
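+    // (With CountK <= 128, the worst case per row is 128 * 255 = 32640,
+    // which still fits in a signed 16-bit accumulator.)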
+ // + + while (k >= 8) { + + __m128i Bytes = __lsx_vld((const __m128i*) & a[0], 0); + __lsx_vinsgr2vr_d(Bytes, 0, 1); + __m128i Words = __lsx_vilvl_b(ZeroVector, Bytes); + + ReductionVector = __lsx_vadd_h(ReductionVector, Words); + + __lsx_vst(Words, (__m128i*) & D[0], 0); + + a += 8; + D += 8; + k -= 8; + } + + if (k > 0) { + + // + // Copy the remaining bytes to the zero padded stack buffer. + // + + uint8_t* padded = PaddedMatrixAData; + uint8_t* padded_end = padded + k; + + do { + padded[0] = a[0]; + padded++; + a++; + } while (padded < padded_end); + + __m128i Bytes = __lsx_vld((__m128i*)PaddedMatrixAData, 0); + __lsx_vinsgr2vr_d(Bytes, 0, 1); + __m128i Words = __lsx_vilvl_b(ZeroVector, Bytes); + + ReductionVector = __lsx_vadd_h(ReductionVector, Words); + + // + // Copy pairs of 16-bit values from the vector to the packed + // buffer and rotate the vector for the next iteration. + // + + for (size_t pairs = (k + 1) / 2; pairs > 0; pairs--) { + __lsx_vstelm_w(Words, (int32_t*)D, 0 , 0); + D += 2; + Words = __lsx_vshuf4i_w(Words, 0x39); //(0, 3, 2, 1) + } + } + + // + // Reduce the partial accumulators. + // + __m128i tmp1 = ZeroVector, tmp2 = ZeroVector; + tmp1 = __lsx_vmaddwev_w_h(tmp1, ReductionVector, OnesWordBroadcast); + tmp2 = __lsx_vmaddwod_w_h(tmp2, ReductionVector, OnesWordBroadcast); + ReductionVector = __lsx_vadd_w(tmp1, tmp2); + ReductionVector = __lsx_vadd_w(ReductionVector, + __lsx_vshuf4i_w(ReductionVector, 0xee)); + ReductionVector = __lsx_vadd_w(ReductionVector, + __lsx_vshuf4i_w(ReductionVector, 0x11)); + + __lsx_vstelm_w(ReductionVector, RowSumBuffer++, 0 , 0); + + A += lda; + CountM -= 1; + } +} + +MLAS_FORCEINLINE +void +MlasGemmU8X8CopyPackBProcessLSX( + MLAS_GEMM_U8X8_KERNEL_LSX::PackedBType* D, + __m128i BytesRow0, + __m128i BytesRow1, + __m128i BitFlipVector, + __m128i ColumnSums[2] +) +{ + __m128i BytesInterleaved = __lsx_vilvl_b(BytesRow1, BytesRow0); + + BytesInterleaved = __lsx_vxor_v(BytesInterleaved, BitFlipVector); + + __m128i WordsInterleaved0 = __lsx_vsrai_h(__lsx_vilvl_b(BytesInterleaved, BytesInterleaved), 8); + __m128i WordsInterleaved1 = __lsx_vsrai_h(__lsx_vilvh_b(BytesInterleaved, BytesInterleaved), 8); + + ColumnSums[0] = __lsx_vadd_h(ColumnSums[0], WordsInterleaved0); + ColumnSums[1] = __lsx_vadd_h(ColumnSums[1], WordsInterleaved1); + + __lsx_vst(WordsInterleaved0, (__m128i*) & D[0], 0); + __lsx_vst(WordsInterleaved1, (__m128i*) & D[8], 0); +} + +template<> +void +MlasGemmQuantCopyPackB( + MLAS_GEMM_U8X8_KERNEL_LSX::PackedBType* D, + const uint8_t* B, + size_t ldb, + size_t CountN, + size_t CountK, + int32_t* ColumnSumBuffer, + bool BIsSigned + ) +{ + uint16_t val = 1; + const __m128i OnesWordBroadcast = __lsx_vreplgr2vr_h(val); + const __m128i BitFlipVector = __lsx_vreplgr2vr_w(BIsSigned ? 0 : 0x80808080); + + // + // Process 8 columns of matrix B in a loop. + // + + while (CountN >= 8) { + + const uint8_t* b = B; + size_t k = CountK; + __m128i ColumnSums[2]; + + ColumnSums[0] = __lsx_vldi(0); + ColumnSums[1] = __lsx_vldi(0); + + // + // Interleave rows of matrix B and write to the packed buffer. + // + // These values are also zero-extended and accumulated into an + // intermediate per-column accumulator. CountK cannot be greater than + // 128 to avoid overflowing these signed 16-bit accumulators. 
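+    // Each group of 16 packed values holds two consecutive rows of the same
+    // eight columns, matching PackedK == 2 and the pairwise
+    // __lsx_vmaddwev_w_h/__lsx_vmaddwod_w_h multiplies in the compute kernel
+    // below.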
+ // + + while (k >= MLAS_GEMM_U8X8_KERNEL_LSX::PackedK) { + + __m128i BytesRow0 = __lsx_vld((const __m128i*) & b[0], 0); + __lsx_vinsgr2vr_d(BytesRow0, 0, 1); + __m128i BytesRow1 = __lsx_vld((const __m128i*) & b[ldb], 0); + __lsx_vinsgr2vr_d(BytesRow1, 0, 1); + + MlasGemmU8X8CopyPackBProcessLSX(D, BytesRow0, BytesRow1, BitFlipVector, ColumnSums); + + b += ldb * 2; + D += 16; + k -= 2; + } + + if (k > 0) { + + __m128i BytesRow0 = __lsx_vld((const __m128i*) & b[0], 0); + __lsx_vinsgr2vr_d(BytesRow0, 0, 1); + + MlasGemmU8X8CopyPackBProcessLSX(D, BytesRow0, BitFlipVector, BitFlipVector, ColumnSums); + + D += 16; + } + + __m128i tmp1, tmp2; + tmp1 = tmp2 = __lsx_vldi(0); + tmp1 = __lsx_vmaddwev_w_h(tmp1, ColumnSums[0], OnesWordBroadcast); + tmp2 = __lsx_vmaddwod_w_h(tmp2, ColumnSums[0], OnesWordBroadcast); + ColumnSums[0]= __lsx_vadd_w(tmp1, tmp2); + tmp1 = tmp2 = __lsx_vldi(0); + tmp1 = __lsx_vmaddwev_w_h(tmp1, ColumnSums[1], OnesWordBroadcast); + tmp2 = __lsx_vmaddwod_w_h(tmp2, ColumnSums[1], OnesWordBroadcast); + ColumnSums[1]= __lsx_vadd_w(tmp1, tmp2); + + __lsx_vst(ColumnSums[0], (__m128i*) & ColumnSumBuffer[0], 0); + __lsx_vst(ColumnSums[1], (__m128i*) & ColumnSumBuffer[4], 0); + ColumnSumBuffer += 8; + + B += 8; + CountN -= 8; + } + + // + // Process the remaining columns of matrix B. + // + + if (CountN > 0) { + + const uint8_t* b = B; + size_t k = CountK; + __m128i ColumnSums[2]; + uint8_t PaddedMatrixBData[16]; + + __lsx_vst(BitFlipVector, (__m128i*)PaddedMatrixBData, 0); + + ColumnSums[0] = __lsx_vldi(0); + ColumnSums[1] = __lsx_vldi(0); + + // + // Interleave rows of matrix B using an intermediate zero padded stack + // buffer and write to the packed buffer. + // + + while (k >= MLAS_GEMM_U8X8_KERNEL_LSX::PackedK) { + + const uint8_t* bcopy = b; + uint8_t* padded = PaddedMatrixBData; + uint8_t* padded_end = padded + CountN; + + do { + padded[0] = bcopy[0]; + padded[8] = bcopy[ldb]; + padded++; + bcopy++; + } while (padded < padded_end); + + __m128i BytesRow0 = __lsx_vld((__m128i*) & PaddedMatrixBData[0], 0); + __lsx_vinsgr2vr_d(BytesRow0, 0, 1); + __m128i BytesRow1 = __lsx_vld((__m128i*) & PaddedMatrixBData[8], 0); + __lsx_vinsgr2vr_d(BytesRow1, 0, 1); + + MlasGemmU8X8CopyPackBProcessLSX(D, BytesRow0, BytesRow1, BitFlipVector, ColumnSums); + + b += ldb * 2; + D += 16; + k -= 2; + } + + if (k > 0) { + + const uint8_t* bcopy = b; + uint8_t* padded = PaddedMatrixBData; + uint8_t* padded_end = padded + CountN; + + do { + padded[0] = bcopy[0]; + padded++; + bcopy++; + } while (padded < padded_end); + + __m128i BytesRow0 = __lsx_vld((__m128i*) & PaddedMatrixBData[0], 0); + __lsx_vinsgr2vr_d(BytesRow0, 0, 1); + + MlasGemmU8X8CopyPackBProcessLSX(D, BytesRow0, BitFlipVector, BitFlipVector, ColumnSums); + } + + __m128i tmp1, tmp2; + tmp1 = tmp2 = __lsx_vldi(0); + tmp1 = __lsx_vmaddwev_w_h(tmp1, ColumnSums[0], OnesWordBroadcast); + tmp2 = __lsx_vmaddwod_w_h(tmp2, ColumnSums[0], OnesWordBroadcast); + ColumnSums[0]= __lsx_vadd_w(tmp1, tmp2); + tmp1 = tmp2 = __lsx_vldi(0); + tmp1 = __lsx_vmaddwev_w_h(tmp1, ColumnSums[1], OnesWordBroadcast); + tmp2 = __lsx_vmaddwod_w_h(tmp2, ColumnSums[1], OnesWordBroadcast); + ColumnSums[1]= __lsx_vadd_w(tmp1, tmp2); + + __lsx_vst(ColumnSums[0], (__m128i*) & ColumnSumBuffer[0], 0); + __lsx_vst(ColumnSums[1], (__m128i*) & ColumnSumBuffer[4], 0); + } +} + +MLAS_FORCEINLINE +void +MlasGemmU8X8MultiplyAccumulateRowLSX( + __m128i ABroadcast, + const int16_t* B, + __m128i Accumulators[2] +) +{ + __m128i BElements0 = __lsx_vld((__m128i*) & B[0], 0); + __m128i 
BElements1 = __lsx_vld((__m128i*) & B[8], 0); + + __m128i tmp1, tmp2; + tmp1 = tmp2 = __lsx_vldi(0); + tmp1 = __lsx_vmaddwev_w_h(tmp1, BElements0, ABroadcast); + tmp2 = __lsx_vmaddwod_w_h(tmp2, BElements0, ABroadcast); + Accumulators[0] = __lsx_vadd_w(Accumulators[0], __lsx_vadd_w(tmp1, tmp2)); + tmp1 = tmp2 = __lsx_vldi(0); + tmp1 = __lsx_vmaddwev_w_h(tmp1, BElements1, ABroadcast); + tmp2 = __lsx_vmaddwod_w_h(tmp2, BElements1, ABroadcast); + Accumulators[1] = __lsx_vadd_w(Accumulators[1], __lsx_vadd_w(tmp1, tmp2)); +} + +template<> +size_t +MlasGemmQuantKernel( + const MLAS_GEMM_U8X8_KERNEL_LSX::PackedAType* A, + const MLAS_GEMM_U8X8_KERNEL_LSX::PackedBType* B, + int32_t* C, + size_t PackedCountK, + size_t CountM, + size_t CountN, + size_t ldc, + const int32_t* RowSumBuffer, + const int32_t* ColumnSumBuffer, + const int32_t* ZeroPointB, + bool ZeroMode + ) +{ + MLAS_UNREFERENCED_PARAMETER(CountM); + MLAS_UNREFERENCED_PARAMETER(ldc); + + while (CountN > 0) { + + __m128i Accumulators[2]; + + // + // Initialize the accumulators with the row and column sums. + // + + int32_t RowSumValue = RowSumBuffer[0]; + + if (ZeroPointB != nullptr) { + + int32_t ScaledRowSumBuffer[8]; + + for (size_t i = 0; i < 8; i++) { + ScaledRowSumBuffer[i] = RowSumValue * ZeroPointB[i]; + } + + ZeroPointB += 8; + + Accumulators[0] = __lsx_vld((__m128i*) & ScaledRowSumBuffer[0], 0); + Accumulators[1] = __lsx_vld((__m128i*) & ScaledRowSumBuffer[4], 0); + + } + else { + + Accumulators[0] = __lsx_vreplgr2vr_w(RowSumValue); + Accumulators[1] = Accumulators[0]; + } + + Accumulators[0] = __lsx_vadd_w(Accumulators[0], __lsx_vld((const __m128i*) & ColumnSumBuffer[0], 0)); + Accumulators[1] = __lsx_vadd_w(Accumulators[1], __lsx_vld((const __m128i*) & ColumnSumBuffer[4], 0)); + ColumnSumBuffer += 8; + + // + // Broadcast each pair of 16-bit values from the matrix A and multiply + // with the pair of 16-bit values from matrix B, and add the 32-bit + // intermediate into the accumulator registers. + // + + const int16_t* a = A; + size_t k = PackedCountK; + + while (k >= 4) { + + __m128i AElements = __lsx_vld((__m128i*)a, 0); + __m128i ABroadcast; + + ABroadcast = __lsx_vreplvei_w(AElements, 0); + MlasGemmU8X8MultiplyAccumulateRowLSX(ABroadcast, &B[0], Accumulators); + + ABroadcast = __lsx_vreplvei_w(AElements, 1); + MlasGemmU8X8MultiplyAccumulateRowLSX(ABroadcast, &B[16], Accumulators); + + ABroadcast = __lsx_vreplvei_w(AElements, 2); + MlasGemmU8X8MultiplyAccumulateRowLSX(ABroadcast, &B[32], Accumulators); + + ABroadcast = __lsx_vreplvei_w(AElements, 3); + MlasGemmU8X8MultiplyAccumulateRowLSX(ABroadcast, &B[48], Accumulators); + + a += 4 * 2; + B += 4 * 16; + k -= 4; + } + + while (k > 0) { + + __m128i ABroadcast = __lsx_vldrepl_w((int32_t*)a, 0); + MlasGemmU8X8MultiplyAccumulateRowLSX(ABroadcast, &B[0], Accumulators); + + a += 2; + B += 16; + k -= 1; + } + + // + // Output the accumulator block after optionally accumulating the values + // from matrix C. + // + + if (CountN >= 8) { + + if (!ZeroMode) { + Accumulators[0] = __lsx_vadd_w(Accumulators[0], __lsx_vld((__m128i*) & C[0], 0)); + Accumulators[1] = __lsx_vadd_w(Accumulators[1], __lsx_vld((__m128i*) & C[4], 0)); + } + + __lsx_vst(Accumulators[0], (__m128i*) & C[0], 0); + __lsx_vst(Accumulators[1], (__m128i*) & C[4], 0); + + C += 8; + CountN -= 8; + + } + else { + + // + // Output the remaining partial output block. 
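+            // The tail is written in chunks of 4, 2 and then 1 column,
+            // sliding the remaining accumulator lanes down after each
+            // partial store.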
+ // + + if ((CountN & 4) != 0) { + + if (!ZeroMode) { + Accumulators[0] = __lsx_vadd_w(Accumulators[0], __lsx_vld((__m128i*) & C[0], 0)); + } + + __lsx_vst(Accumulators[0], (__m128i*) & C[0], 0); + C += 4; + + Accumulators[0] = Accumulators[1]; + } + + if ((CountN & 2) != 0) { + + if (!ZeroMode) { + Accumulators[0] = __lsx_vadd_w(Accumulators[0], __lsx_vinsgr2vr_d(__lsx_vld((__m128i*) & C[0], 0), 0, 1)); + } + + *((uint64_t *)&C[0]) = __lsx_vpickve2gr_d(Accumulators[0], 0); + C += 2; + + Accumulators[0] = __lsx_vshuf4i_w(Accumulators[0], 0xee); + } + + if ((CountN & 1) != 0) { + + int32_t AccumulatorValue = __lsx_vpickve2gr_w(Accumulators[0], 0); + + if (!ZeroMode) { + AccumulatorValue += C[0]; + } + + C[0] = AccumulatorValue; + } + + CountN = 0; + } + } + + return 1; +} + +const MLAS_GEMM_QUANT_DISPATCH MlasGemmU8X8DispatchLSX = { + MlasGemmQuantOperation, + nullptr, + nullptr, + MLAS_GEMM_U8X8_KERNEL_LSX::PackedK, + 0, + 1 // aLSXmbly kernel M stride +}; diff --git a/onnxruntime/core/mlas/lib/qladd.cpp b/onnxruntime/core/mlas/lib/qladd.cpp index 971ea0161d7af..5dafa17c2ae66 100644 --- a/onnxruntime/core/mlas/lib/qladd.cpp +++ b/onnxruntime/core/mlas/lib/qladd.cpp @@ -552,6 +552,119 @@ MlasQLinearAddKernelHelper( InputA, ScaleA, ZeroPointA, InputB, ScaleB, ZeroPointB, ScaleC, ZeroPointC, OutputC, N); } } +#elif defined(MLAS_LSX_INTRINSICS) + +template +static +void +MlasQLinearAddKernelHelper( + const DataType* InputA, + float ScaleA, + int32_t ZeroPointA, + const DataType* InputB, + float ScaleB, + int32_t ZeroPointB, + float ScaleC, + int32_t ZeroPointC, + DataType* OutputC, + size_t N + ) +{ + const float ScaleRatio_AC = ScaleA / ScaleC; + const float ScaleRatio_BC = ScaleB / ScaleC; + const auto VectorScaleRatio_AC = MlasBroadcastFloat32x4(ScaleRatio_AC); + const auto VectorScaleRatio_BC = MlasBroadcastFloat32x4(ScaleRatio_BC); + auto VectorFixedPart = MlasBroadcastFloat32x4((float)ZeroPointC - (ScaleRatio_AC * ZeroPointA + ScaleRatio_BC * ZeroPointB)); + + MLAS_FLOAT32X4 va_lo, va_hi, vb_lo, vb_hi; + if (IsScalarB) { + float tmp_f = (float)*InputB; + uint32_t *tmp_p = (uint32_t *)&tmp_f; + vb_lo = MlasReinterpretAsFloat32x4(__lsx_vreplgr2vr_w(*tmp_p)); + VectorFixedPart = __lsx_vfmadd_s(vb_lo, VectorScaleRatio_BC, VectorFixedPart); + } + + __m128i tmp, tmp1; + + while (N >= 8) { + const auto va_low_half = __lsx_vinsgr2vr_d(__lsx_vld((const MLAS_INT32X4*)InputA, 0), 0 ,1); + const auto va_i16x8 = __lsx_vilvl_b(va_low_half, va_low_half); + InputA += 8; + va_lo = __lsx_vffint_s_w(MlasShiftRightInt32(__lsx_vilvl_h(va_i16x8, va_i16x8), 24)); + va_hi = __lsx_vffint_s_w(MlasShiftRightInt32(__lsx_vilvh_h(va_i16x8, va_i16x8), 24)); + + if (!IsScalarB) { + const auto vb_low_half = __lsx_vinsgr2vr_d(__lsx_vld((const MLAS_INT32X4*)InputB, 0), 0 ,1); + const auto vb_i16x8 = __lsx_vilvl_b(vb_low_half, vb_low_half); + InputB += 8; + vb_lo = __lsx_vffint_s_w(MlasShiftRightInt32(__lsx_vilvl_h(vb_i16x8, vb_i16x8), 24)); + vb_hi = __lsx_vffint_s_w(MlasShiftRightInt32(__lsx_vilvh_h(vb_i16x8, vb_i16x8), 24)); + } + + MLAS_INT32X4 r_lo, r_hi; + if (IsScalarB) { + r_lo = __lsx_vftint_w_s(__lsx_vfmadd_s(va_lo, VectorScaleRatio_AC, VectorFixedPart)); + r_hi = __lsx_vftint_w_s(__lsx_vfmadd_s(va_hi, VectorScaleRatio_AC, VectorFixedPart)); + } else { + r_lo = __lsx_vftint_w_s(__lsx_vfadd_s(__lsx_vfmadd_s(va_lo, VectorScaleRatio_AC, VectorFixedPart), __lsx_vfmul_s(vb_lo, VectorScaleRatio_BC))); + r_hi = __lsx_vftint_w_s(__lsx_vfadd_s(__lsx_vfmadd_s(va_hi, VectorScaleRatio_AC, VectorFixedPart), 
__lsx_vfmul_s(vb_hi, VectorScaleRatio_BC))); + } + tmp = __lsx_vsat_w(r_lo, 15); + tmp1 = __lsx_vsat_w(r_hi, 15); + const auto vc_i16x8 = __lsx_vpickev_h(tmp1, tmp); + + MLAS_INT32X4 vc = MlasPackS16_128(vc_i16x8, vc_i16x8); + + N -= 8; + __lsx_vst(__lsx_vinsgr2vr_d(__lsx_vld((MLAS_INT32X4*)OutputC, 0), __lsx_vpickve2gr_d(vc, 0), 0), (MLAS_INT32X4*)OutputC, 0); + OutputC += 8; + } + + if (N > 0) { + uint8_t TailData[8] = { 0 }; + + MlasCopyTailBytes(TailData, (const uint8_t*)InputA, N); + const auto va_low_half = __lsx_vinsgr2vr_d(__lsx_vld((const MLAS_INT32X4*)TailData, 0), 0 ,1); + const auto va_i16x8 = __lsx_vilvl_b(va_low_half, va_low_half); + va_lo = __lsx_vffint_s_w(MlasShiftRightInt32(__lsx_vilvl_h(va_i16x8, va_i16x8), 24)); + va_hi = __lsx_vffint_s_w(MlasShiftRightInt32(__lsx_vilvh_h(va_i16x8, va_i16x8), 24)); + + if (!IsScalarB) { + MlasCopyTailBytes(TailData, (const uint8_t*)InputB, N); + const auto vb_low_half = __lsx_vinsgr2vr_d(__lsx_vld((const MLAS_INT32X4*)TailData, 0), 0 ,1); + const auto vb_i16x8 = __lsx_vilvl_b(vb_low_half, vb_low_half); + vb_lo = __lsx_vffint_s_w(MlasShiftRightInt32(__lsx_vilvl_h(vb_i16x8, vb_i16x8), 24)); + vb_hi = __lsx_vffint_s_w(MlasShiftRightInt32(__lsx_vilvh_h(vb_i16x8, vb_i16x8), 24)); + } + + MLAS_INT32X4 r_lo, r_hi; + if (IsScalarB) { + r_lo = __lsx_vftint_w_s(__lsx_vfmadd_s(va_lo, VectorScaleRatio_AC, VectorFixedPart)); + r_hi = __lsx_vftint_w_s(__lsx_vfmadd_s(va_hi, VectorScaleRatio_AC, VectorFixedPart)); + } else { + r_lo = __lsx_vftint_w_s(__lsx_vfadd_s(__lsx_vfmadd_s(va_lo, VectorScaleRatio_AC, VectorFixedPart), __lsx_vfmul_s(vb_lo, VectorScaleRatio_BC))); + r_hi = __lsx_vftint_w_s(__lsx_vfadd_s(__lsx_vfmadd_s(va_hi, VectorScaleRatio_AC, VectorFixedPart), __lsx_vfmul_s(vb_hi, VectorScaleRatio_BC))); + } + tmp = __lsx_vsat_w(r_lo, 15); + tmp1 = __lsx_vsat_w(r_hi, 15); + const auto vc_i16x8 = __lsx_vpickev_h(tmp1, tmp); + + MLAS_INT32X4 vc = MlasPackS16_128(vc_i16x8, vc_i16x8); + + if (N & 4) { + __lsx_vstelm_w(vc, (int*)OutputC, 0, 0); + N -= 4; + OutputC += 4; + vc = __lsx_vshuf4i_w(vc, 0x39); //_MM_SHUFFLE(0, 3, 2, 1) + } + + uint32_t PackedValueC = (uint32_t)__lsx_vpickve2gr_w(vc, 0); + for (size_t i = 0; i < N; ++i) { + *((uint8_t*)OutputC + i) = (uint8_t)PackedValueC; + PackedValueC >>= 8; + } + } +} #else template diff --git a/onnxruntime/core/mlas/lib/qladd.h b/onnxruntime/core/mlas/lib/qladd.h index 8c05a6185324a..94568941a5660 100644 --- a/onnxruntime/core/mlas/lib/qladd.h +++ b/onnxruntime/core/mlas/lib/qladd.h @@ -463,5 +463,132 @@ MlasPackS16_128( { return reinterpret_cast(vec_packs(a, b)); } +#elif defined(MLAS_LSX_INTRINSICS) +#define LSX_DBG 1 +template +MLAS_FORCEINLINE +MLAS_INT32X4 +MlasShiftRightInt32( + MLAS_INT32X4 v, + int imm + ); + +template<> +MLAS_FORCEINLINE +MLAS_INT32X4 +MlasShiftRightInt32( + MLAS_INT32X4 v, + int imm + ) +{ +#if LSX_DBG + MLAS_INT32X4 imm_v = __lsx_vreplgr2vr_w(imm); + return __lsx_vsra_w(v, imm_v); +#else + return __lsx_vsrai_w(v, imm); +#endif +} + +template<> +MLAS_FORCEINLINE +MLAS_INT32X4 +MlasShiftRightInt32( + MLAS_INT32X4 v, + int imm + ) +{ +#if LSX_DBG + MLAS_INT32X4 imm_v = __lsx_vreplgr2vr_w(imm); + return __lsx_vsrl_w(v, imm_v); +#else + return __lsx_vsrli_w(v, imm); +#endif +} + +template +MLAS_FORCEINLINE +MLAS_INT32X4 +MlasShiftRightInt16( + MLAS_INT32X4 v, + int imm + ); + +template<> +MLAS_FORCEINLINE +MLAS_INT32X4 +MlasShiftRightInt16( + MLAS_INT32X4 v, + int imm + ) +{ +#if LSX_DBG + MLAS_INT32X4 imm_v = __lsx_vreplgr2vr_h(imm); + return __lsx_vsra_h(v, imm_v); +#else + 
return __lsx_vsrai_h(v, imm); +#endif +} + +template<> +MLAS_FORCEINLINE +MLAS_INT32X4 +MlasShiftRightInt16( + MLAS_INT32X4 v, + int imm + ) +{ +#if LSX_DBG + MLAS_INT32X4 imm_v = __lsx_vreplgr2vr_h(imm); + return __lsx_vsrl_h(v, imm_v); +#else + return __lsx_vsrli_h(v, imm); +#endif +} + +template +MLAS_FORCEINLINE +MLAS_INT32X4 +MlasPackS16_128( + MLAS_INT32X4 a, + MLAS_INT32X4 b + ); + +template <> +MLAS_FORCEINLINE +MLAS_INT32X4 +MlasPackS16_128( + MLAS_INT32X4 a, + MLAS_INT32X4 b + ) +{ + // return _mm_packus_epi16(a, b); + __m128i zero = __lsx_vldi(0); + __m128i tmp, tmp2, tmp3; + + tmp = __lsx_vmax_h(zero, a); + tmp2 = __lsx_vsat_hu(tmp, 7); + + tmp = __lsx_vmax_h(zero, b); + tmp3 = __lsx_vsat_hu(tmp, 7); + return __lsx_vpickev_b(tmp3, tmp2); + +} + +template <> +MLAS_FORCEINLINE +MLAS_INT32X4 +MlasPackS16_128( + MLAS_INT32X4 a, + MLAS_INT32X4 b + ) +{ + // return _mm_packs_epi16(a, b); + __m128i tmp, tmp1; + + tmp = __lsx_vsat_h(a, 7); + tmp1 = __lsx_vsat_h(b, 7); + return __lsx_vpickev_b(tmp1, tmp); + +} #endif diff --git a/onnxruntime/core/mlas/lib/qlgavgpool.cpp b/onnxruntime/core/mlas/lib/qlgavgpool.cpp index 1c2be0a833a3e..e44d7ad25c446 100644 --- a/onnxruntime/core/mlas/lib/qlgavgpool.cpp +++ b/onnxruntime/core/mlas/lib/qlgavgpool.cpp @@ -689,6 +689,316 @@ MlasQLinearGlobalAveragePoolNhwcSingleBatch( Output_zero_point, 0, 0, 1, Channels); } +#elif defined(MLAS_LSX_INTRINSICS) + +template +void MLASCALL +MlasQLinearGlobalAveragePoolNchw( + const T8Bits* Input, + float ScaleInput, + int32_t ZeroPointInput, + T8Bits* Output, + float ScaleOutput, + int32_t ZeroPointOutput, + size_t Channels, + size_t ImageSize, + int32_t* AccumulateBuffer + ) +{ + float scale = CheckQLinearGlobalAveragePoolScaleAndSize(ScaleInput, ScaleOutput, ImageSize); + const int32_t bias[] = {-ZeroPointInput * static_cast(ImageSize), 0, 0, 0}; + const auto vbias = __lsx_vld((const __m128i*)&bias, 0); + const auto vzero = __lsx_vldi(0); + uint8_t buffer[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + + int32_t* sum_buffer = AccumulateBuffer; + for (size_t c = Channels; c > 0; c--) { + + __m128i vacc_lo = vbias; + __m128i vacc_hi = vzero; + auto Len = ImageSize; + for (; Len >= 32; Len -= 32) { + + const __m128i vi0 = __lsx_vld((const __m128i*)Input, 0); + __lsx_vinsgr2vr_d(vi0, 0, 1); + const __m128i vi1 = __lsx_vld((const __m128i*)(Input + 8), 0); + __lsx_vinsgr2vr_d(vi1, 0, 1); + const __m128i vi2 = __lsx_vld((const __m128i*)(Input + 16), 0); + __lsx_vinsgr2vr_d(vi2, 0, 1); + const __m128i vi3 = __lsx_vld((const __m128i*)(Input + 24), 0); + __lsx_vinsgr2vr_d(vi3, 0, 1); + + if constexpr (std::is_signed::value) { + + const __m128i vxi0 = __lsx_vsrai_h(__lsx_vilvl_b(vi0, vzero), 8); + const __m128i vxi1 = __lsx_vsrai_h(__lsx_vilvl_b(vi1, vzero), 8); + const __m128i vxi2 = __lsx_vsrai_h(__lsx_vilvl_b(vi2, vzero), 8); + const __m128i vxi3 = __lsx_vsrai_h(__lsx_vilvl_b(vi3, vzero), 8); + const __m128i vsum = __lsx_vadd_h(__lsx_vadd_h(vxi0, vxi1), + __lsx_vadd_h(vxi2, vxi3)); + vacc_lo = __lsx_vadd_w(vacc_lo, __lsx_vsrai_w(__lsx_vilvl_h(vsum, vzero), 16)); + vacc_hi = __lsx_vadd_w(vacc_hi, __lsx_vsrai_w(__lsx_vilvh_h(vsum, vzero), 16)); + } else { + + const __m128i vxi0 = __lsx_vilvl_b(vzero, vi0); + const __m128i vxi1 = __lsx_vilvl_b(vzero, vi1); + const __m128i vxi2 = __lsx_vilvl_b(vzero, vi2); + const __m128i vxi3 = __lsx_vilvl_b(vzero, vi3); + const __m128i vsum = __lsx_vadd_h(__lsx_vadd_h(vxi0, vxi1), + __lsx_vadd_h(vxi2, vxi3)); + vacc_lo = __lsx_vadd_w(vacc_lo, __lsx_vilvl_h(vzero, vsum)); + vacc_hi = 
__lsx_vadd_w(vacc_hi, __lsx_vilvh_h(vzero, vsum)); + } + + Input += 32; + } + for (; Len >= 8; Len -= 8) { + + if constexpr (std::is_signed::value) { + + const __m128i vsum = __lsx_vsrai_h(__lsx_vilvl_b(__lsx_vinsgr2vr_d(__lsx_vld((const __m128i*)Input, 0), 0, 1), vzero), 8); + vacc_lo = __lsx_vadd_w(vacc_lo, __lsx_vsrai_w(__lsx_vilvl_h(vsum, vzero), 16)); + vacc_hi = __lsx_vadd_w(vacc_hi, __lsx_vsrai_w(__lsx_vilvh_h(vsum, vzero), 16)); + } else { + + const __m128i vsum = __lsx_vilvl_b(vzero, __lsx_vinsgr2vr_d(__lsx_vld((const __m128i*)Input, 0), 0, 1)); + vacc_lo = __lsx_vadd_w(vacc_lo, __lsx_vilvl_h(vzero, vsum)); + vacc_hi = __lsx_vadd_w(vacc_hi, __lsx_vilvh_h(vzero, vsum)); + } + + Input += 8; + } + if (Len > 0) { + + memcpy(buffer, Input, Len); + + if constexpr (std::is_signed::value) { + + const __m128i vsum = __lsx_vsrai_h(__lsx_vilvl_b(__lsx_vinsgr2vr_d(__lsx_vld((const __m128i*)buffer, 0), 0, 1), vzero), 8); + vacc_lo = __lsx_vadd_w(vacc_lo, __lsx_vsrai_w(__lsx_vilvl_h(vsum, vzero), 16)); + vacc_hi = __lsx_vadd_w(vacc_hi, __lsx_vsrai_w(__lsx_vilvh_h(vsum, vzero), 16)); + } else { + + const __m128i vsum = __lsx_vilvl_b(vzero, __lsx_vinsgr2vr_d(__lsx_vld((const __m128i*)buffer, 0), 0, 1)); + vacc_lo = __lsx_vadd_w(vacc_lo, __lsx_vilvl_h(vzero, vsum)); + vacc_hi = __lsx_vadd_w(vacc_hi, __lsx_vilvh_h(vzero, vsum)); + } + + Input += Len; + } + + __m128i vacc = __lsx_vadd_w(vacc_lo, vacc_hi); // [ D C | B A ] + __m128i vshuf = __lsx_vshuf4i_w(vacc, 0xb1); // [ C D | A B ] _MM_SHUFFLE(2, 3, 0, 1) + __m128i vsums = __lsx_vadd_w(vacc, vshuf); // [ D+C C+D | B+A A+B ] + vshuf = __lsx_vshuf4i_w(vsums, 0x4e); // [ B+A A+B | D+C C+D ] _MM_SHUFFLE(1, 0, 3, 2) + vsums = __lsx_vadd_w(vsums, vshuf); + __lsx_vstelm_w(vsums, sum_buffer++, 0 , 0); + } + + MlasRequantizeOutput(AccumulateBuffer, Channels, Output, Channels, nullptr, &scale, false, + static_cast(ZeroPointOutput), 0, 0, 1, Channels); +} + +template +MLAS_FORCEINLINE +void +MlasQLinearGlobalAveragePoolNhwcSingleBatch( + const T8Bits* Input, + T8Bits* Output, + const T8Bits* LastOf8, + size_t ImageSize, + size_t Channels, + size_t Stride, + int32_t Bias, + float Scale, + T8Bits Output_zero_point, + int32_t* AccumulateBuffer, + const T8Bits* ZeroBuffer + ) +{ + + constexpr size_t PixelsPerIteration = 7; +#define LOAD_FULL_CHANNELS() \ + const __m128i vi0 = __lsx_vinsgr2vr_d(__lsx_vld((const __m128i*)i0, 0), 0 , 1); \ + i0 += 8; \ + const __m128i vi1 = __lsx_vinsgr2vr_d(__lsx_vld((const __m128i*)i1, 0), 0 , 1); \ + i1 += 8; \ + const __m128i vi2 = __lsx_vinsgr2vr_d(__lsx_vld((const __m128i*)i2, 0), 0 , 1); \ + i2 += 8; \ + const __m128i vi3 = __lsx_vinsgr2vr_d(__lsx_vld((const __m128i*)i3, 0), 0 , 1); \ + i3 += 8; \ + const __m128i vi4 = __lsx_vinsgr2vr_d(__lsx_vld((const __m128i*)i4, 0), 0 , 1); \ + i4 += 8; \ + const __m128i vi5 = __lsx_vinsgr2vr_d(__lsx_vld((const __m128i*)i5, 0), 0 , 1); \ + i5 += 8; \ + const __m128i vi6 = __lsx_vinsgr2vr_d(__lsx_vld((const __m128i*)i6, 0), 0 , 1); \ + i6 += 8 + +#define CALCULATE_ACCUMULATE_VECTORS() \ + __m128i vacc_lo = finish_one_pass ? __lsx_vld((__m128i*)acc, 0) : vbias; \ + __m128i vacc_hi = finish_one_pass ? 
__lsx_vld(((__m128i*)acc) + 1, 0) : vbias; \ + __m128i vxi0; \ + __m128i vxi1; \ + __m128i vxi2; \ + __m128i vxi3; \ + __m128i vxi4; \ + __m128i vxi5; \ + __m128i vxi6; \ + if constexpr (std::is_signed::value) { \ + vxi0 = __lsx_vsrai_h(__lsx_vilvl_b(vi0, vzero), 8); \ + vxi1 = __lsx_vsrai_h(__lsx_vilvl_b(vi1, vzero), 8); \ + vxi2 = __lsx_vsrai_h(__lsx_vilvl_b(vi2, vzero), 8); \ + vxi3 = __lsx_vsrai_h(__lsx_vilvl_b(vi3, vzero), 8); \ + vxi4 = __lsx_vsrai_h(__lsx_vilvl_b(vi4, vzero), 8); \ + vxi5 = __lsx_vsrai_h(__lsx_vilvl_b(vi5, vzero), 8); \ + vxi6 = __lsx_vsrai_h(__lsx_vilvl_b(vi6, vzero), 8); \ + } else { \ + vxi0 = __lsx_vilvl_b(vzero, vi0); \ + vxi1 = __lsx_vilvl_b(vzero, vi1); \ + vxi2 = __lsx_vilvl_b(vzero, vi2); \ + vxi3 = __lsx_vilvl_b(vzero, vi3); \ + vxi4 = __lsx_vilvl_b(vzero, vi4); \ + vxi5 = __lsx_vilvl_b(vzero, vi5); \ + vxi6 = __lsx_vilvl_b(vzero, vi6); \ + } \ + const __m128i vsum01 = __lsx_vadd_h(vxi0, vxi1); \ + const __m128i vsum23 = __lsx_vadd_h(vxi2, vxi3); \ + const __m128i vsum45 = __lsx_vadd_h(vxi4, vxi5); \ + const __m128i vsum016 = __lsx_vadd_h(vsum01, vxi6); \ + const __m128i vsum2345 = __lsx_vadd_h(vsum23, vsum45); \ + const __m128i vsum = __lsx_vadd_h(vsum016, vsum2345); \ + if constexpr (std::is_signed::value) { \ + vacc_lo = __lsx_vadd_w(vacc_lo, __lsx_vsrai_w(__lsx_vilvl_h(vsum, vzero), 16)); \ + vacc_hi = __lsx_vadd_w(vacc_hi, __lsx_vsrai_w(__lsx_vilvh_h(vsum, vzero), 16)); \ + } else { \ + vacc_lo = __lsx_vadd_w(vacc_lo, __lsx_vilvl_h(vzero, vsum)); \ + vacc_hi = __lsx_vadd_w(vacc_hi, __lsx_vilvh_h(vzero, vsum)); \ + } + + + T8Bits tail[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + bool finish_one_pass = false; + const __m128i vbias = __lsx_vreplgr2vr_w(Bias); + const __m128i vzero = __lsx_vldi(0); + size_t step_next_group = PixelsPerIteration * Stride - (Channels & ~size_t{7}); + + const T8Bits* i0 = Input; + const T8Bits* i1 = i0 + Stride; + const T8Bits* i2 = i1 + Stride; + const T8Bits* i3 = i2 + Stride; + const T8Bits* i4 = i0 + Stride * 4; + const T8Bits* i5 = i4 + Stride; + const T8Bits* i6 = i5 + Stride; + + for (; ImageSize > PixelsPerIteration; ImageSize -= PixelsPerIteration) { + + int32_t* acc = AccumulateBuffer; + size_t c = Channels; + for (; c >= 8; c -= 8) { + + LOAD_FULL_CHANNELS(); + + CALCULATE_ACCUMULATE_VECTORS(); + + __lsx_vst(vacc_lo, (__m128i*)acc, 0); + __lsx_vst(vacc_hi, ((__m128i*)acc) + 1, 0); + acc += 8; + } + if (c > 0) { + const __m128i vi0 = + __lsx_vinsgr2vr_d(__lsx_vld((const __m128i*)(i0 >= LastOf8 ? memcpy(tail, i0, c) : i0), 0), 0 ,1); + const __m128i vi1 = + __lsx_vinsgr2vr_d(__lsx_vld((const __m128i*)(i1 >= LastOf8 ? memcpy(tail, i1, c) : i1), 0), 0 ,1); + const __m128i vi2 = + __lsx_vinsgr2vr_d(__lsx_vld((const __m128i*)(i2 >= LastOf8 ? memcpy(tail, i2, c) : i2), 0), 0 ,1); + const __m128i vi3 = + __lsx_vinsgr2vr_d(__lsx_vld((const __m128i*)(i3 >= LastOf8 ? memcpy(tail, i3, c) : i3), 0), 0 ,1); + const __m128i vi4 = + __lsx_vinsgr2vr_d(__lsx_vld((const __m128i*)(i4 >= LastOf8 ? memcpy(tail, i4, c) : i4), 0), 0 ,1); + const __m128i vi5 = + __lsx_vinsgr2vr_d(__lsx_vld((const __m128i*)(i5 >= LastOf8 ? memcpy(tail, i5, c) : i5), 0), 0 ,1); + const __m128i vi6 = + __lsx_vinsgr2vr_d(__lsx_vld((const __m128i*)(i6 >= LastOf8 ? 
memcpy(tail, i6, c) : i6), 0), 0 ,1); + + CALCULATE_ACCUMULATE_VECTORS(); + + __lsx_vst(vacc_lo, (__m128i*)acc, 0); + __lsx_vst(vacc_hi, ((__m128i*)acc) + 1, 0); + } + finish_one_pass = true; + + i0 += step_next_group; + i1 += step_next_group; + i2 += step_next_group; + i3 += step_next_group; + i4 += step_next_group; + i5 += step_next_group; + i6 += step_next_group; + } + + if (ImageSize > 0) { + switch (ImageSize) { + case 1: + i1 = ZeroBuffer; + [[fallthrough]]; + case 2: + i2 = ZeroBuffer; + [[fallthrough]]; + case 3: + i3 = ZeroBuffer; + [[fallthrough]]; + case 4: + i4 = ZeroBuffer; + [[fallthrough]]; + case 5: + i5 = ZeroBuffer; + [[fallthrough]]; + case 6: + i6 = ZeroBuffer; + [[fallthrough]]; + default: + break; + } + + int32_t* acc = AccumulateBuffer; + size_t c = Channels; + for (; c >= 8; c -= 8) { + + LOAD_FULL_CHANNELS(); + + CALCULATE_ACCUMULATE_VECTORS(); + + __lsx_vst(vacc_lo, (__m128i*)acc, 0); + __lsx_vst(vacc_hi, ((__m128i*)acc) + 1, 0); + acc += 8; + } + + if (c > 0) { + const __m128i vi0 = + __lsx_vinsgr2vr_d(__lsx_vld((const __m128i*)(i0 >= LastOf8 ? memcpy(tail, i0, c) : i0), 0), 0 ,1); + const __m128i vi1 = __lsx_vinsgr2vr_d(__lsx_vld( + (const __m128i*)(1 < ImageSize && i1 >= LastOf8 ? memcpy(tail, i1, c) : i1), 0), 0, 1); + const __m128i vi2 = __lsx_vinsgr2vr_d(__lsx_vld( + (const __m128i*)(2 < ImageSize && i2 >= LastOf8 ? memcpy(tail, i2, c) : i2), 0), 0, 1); + const __m128i vi3 = __lsx_vinsgr2vr_d(__lsx_vld( + (const __m128i*)(3 < ImageSize && i3 >= LastOf8 ? memcpy(tail, i3, c) : i3), 0), 0, 1); + const __m128i vi4 = __lsx_vinsgr2vr_d(__lsx_vld( + (const __m128i*)(4 < ImageSize && i4 >= LastOf8 ? memcpy(tail, i4, c) : i4), 0), 0, 1); + const __m128i vi5 = __lsx_vinsgr2vr_d(__lsx_vld( + (const __m128i*)(5 < ImageSize && i5 >= LastOf8 ? memcpy(tail, i5, c) : i5), 0), 0, 1); + const __m128i vi6 = __lsx_vinsgr2vr_d(__lsx_vld( + (const __m128i*)(6 < ImageSize && i6 >= LastOf8 ? 
memcpy(tail, i6, c) : i6), 0), 0, 1); + + CALCULATE_ACCUMULATE_VECTORS(); + + __lsx_vst(vacc_lo, (__m128i*)acc, 0); + __lsx_vst(vacc_hi, ((__m128i*)acc) + 1, 0); + } + } + MlasRequantizeOutput(AccumulateBuffer, Channels, Output, Channels, nullptr, &Scale, false, + Output_zero_point, 0, 0, 1, Channels); +} + #else // Pure C++ Implementation @@ -771,7 +1081,7 @@ MlasQLinearGlobalAveragePoolNhwc( #endif -#if defined(MLAS_NEON_INTRINSICS) || defined(MLAS_SSE2_INTRINSICS) +#if defined(MLAS_NEON_INTRINSICS) || defined(MLAS_SSE2_INTRINSICS) || defined(MLAS_LSX_INTRINSICS) template void diff --git a/onnxruntime/core/mlas/lib/qlmul.cpp b/onnxruntime/core/mlas/lib/qlmul.cpp index 4b8537f2b378f..38818e1190d21 100644 --- a/onnxruntime/core/mlas/lib/qlmul.cpp +++ b/onnxruntime/core/mlas/lib/qlmul.cpp @@ -377,6 +377,170 @@ MlasQLinearMulKernel( MLAS_UNREFERENCED_PARAMETER(ValueBVector); } +#elif defined(MLAS_LSX_INTRINSICS) + +template +MLAS_FORCEINLINE +static +__m128i +MlasExtendToS16( + __m128i Int8Vector, + __m128i ZeroVector + ); + +template <> +MLAS_FORCEINLINE +__m128i +MlasExtendToS16( + __m128i Int8Vector, + __m128i ZeroVector + ) +{ + return __lsx_vilvl_b(ZeroVector, Int8Vector); +} + +template <> +MLAS_FORCEINLINE +__m128i +MlasExtendToS16( + __m128i Int8Vector, + __m128i ZeroVector + ) +{ + return __lsx_vilvh_b(ZeroVector, Int8Vector); +} + +template <> +MLAS_FORCEINLINE +__m128i +MlasExtendToS16( + __m128i Int8Vector, + __m128i ZeroVector + ) +{ + MLAS_UNREFERENCED_PARAMETER(ZeroVector); + return __lsx_vsrai_h(__lsx_vilvl_b(Int8Vector, Int8Vector), 8); +} + +template <> +MLAS_FORCEINLINE +__m128i +MlasExtendToS16( + __m128i Int8Vector, + __m128i ZeroVector + ) +{ + MLAS_UNREFERENCED_PARAMETER(ZeroVector); + return __lsx_vsrai_h(__lsx_vilvh_b(Int8Vector, Int8Vector), 8); +} + +template +MLAS_FORCEINLINE +static +__m128i +MlasExtendToS16Debias( + __m128i Int8Vector, + __m128i ZeroVector, + __m128i VectorBias + ) +{ + return __lsx_vsub_h(MlasExtendToS16(Int8Vector, ZeroVector), VectorBias); +} + +MLAS_FORCEINLINE +static +__m128i +MlasQLinearMulVectorS16( + __m128i va_s16x8, + __m128i vb_s16x8, + __m128 VectorScaleRatio, + __m128 VectorZeroPointC + ) +{ + __m128i tmp, tmp1; + + const auto ab_lo = __lsx_vmul_h(va_s16x8, vb_s16x8); + const auto ab_hi = __lsx_vmuh_h(va_s16x8, vb_s16x8); + auto r_lo = __lsx_vilvl_h(ab_hi, ab_lo); + auto r_hi = __lsx_vilvh_h(ab_hi, ab_lo); + r_lo = __lsx_vftint_w_s(__lsx_vfmadd_s(__lsx_vffint_s_w(r_lo), VectorScaleRatio, VectorZeroPointC)); + r_hi = __lsx_vftint_w_s(__lsx_vfmadd_s(__lsx_vffint_s_w(r_hi), VectorScaleRatio, VectorZeroPointC)); + + tmp = __lsx_vsat_w(r_lo, 15); + tmp1 = __lsx_vsat_w(r_hi, 15); + return __lsx_vpickev_h(tmp1, tmp); +} + +template +static +void +MlasQLinearMulKernel( + const DataType* InputA, + float ScaleA, + int32_t ZeroPointA, + const DataType* InputB, + float ScaleB, + int32_t ZeroPointB, + float ScaleC, + int32_t ZeroPointC, + DataType* OutputC, + size_t N + ) +{ + const auto VectorZeroPointA = __lsx_vreplgr2vr_h((int16_t)ZeroPointA); + const auto VectorZeroPointB = __lsx_vreplgr2vr_h((int16_t)ZeroPointB); + const auto VectorZeroPointC = MlasBroadcastFloat32x4((float)ZeroPointC); + const auto VectorScaleRatio = MlasBroadcastFloat32x4(ScaleA * ScaleB / ScaleC); + const auto ZeroVector = __lsx_vldi(0); + + uint8_t TailDataA[16] = { 0 }; + uint8_t TailDataB[16] = { 0 }; + __m128i vb_lo_s16x8, vb_hi_s16x8; + + if (IsScalarB) { + vb_lo_s16x8 = __lsx_vsub_h(__lsx_vreplgr2vr_h((int16_t)*InputB), VectorZeroPointB); + vb_hi_s16x8 = 
vb_lo_s16x8; + } + + while (N > 0) { + if (N < 16) { + MlasCopyTailBytes(TailDataA, (const uint8_t*)InputA, N); + InputA = (const DataType*)TailDataA; + if (!IsScalarB) { + MlasCopyTailBytes(TailDataB, (const uint8_t*)InputB, N); + InputB = (const DataType*)TailDataB; + } + } + + const auto va_i8x16 = __lsx_vld((const MLAS_INT32X4*)InputA, 0); + InputA += 16; + const auto va_lo_s16x8 = MlasExtendToS16Debias(va_i8x16, ZeroVector, VectorZeroPointA); + const auto va_hi_s16x8 = MlasExtendToS16Debias(va_i8x16, ZeroVector, VectorZeroPointA); + + if (!IsScalarB) { + const auto vb_i8x16 = __lsx_vld((const MLAS_INT32X4*)InputB, 0); + InputB += 16; + vb_lo_s16x8 = MlasExtendToS16Debias(vb_i8x16, ZeroVector, VectorZeroPointB); + vb_hi_s16x8 = MlasExtendToS16Debias(vb_i8x16, ZeroVector, VectorZeroPointB); + } + + const auto vc_lo_s16x8 = MlasQLinearMulVectorS16(va_lo_s16x8, vb_lo_s16x8, VectorScaleRatio, VectorZeroPointC); + const auto vc_hi_s16x8 = MlasQLinearMulVectorS16(va_hi_s16x8, vb_hi_s16x8, VectorScaleRatio, VectorZeroPointC); + auto vc = MlasPackS16_128(vc_lo_s16x8, vc_hi_s16x8); + + if (N >= 16) { + __lsx_vst(vc, (__m128i*)OutputC, 0); + OutputC += 16; + N -= 16; + } else { + __lsx_vst(vc, (__m128i*)TailDataA, 0); + MlasCopyTailBytes((uint8_t*)OutputC, TailDataA, N); + N = 0; + } + } +} + + #else // Pure C++ implementation. diff --git a/onnxruntime/core/mlas/lib/quantize.cpp b/onnxruntime/core/mlas/lib/quantize.cpp index 133ad79594c55..ffecc2dbeff9e 100644 --- a/onnxruntime/core/mlas/lib/quantize.cpp +++ b/onnxruntime/core/mlas/lib/quantize.cpp @@ -20,7 +20,9 @@ Module Name: #include "mlasi.h" -#if defined(MLAS_NEON64_INTRINSICS) || defined(MLAS_SSE2_INTRINSICS) +#if defined(MLAS_NEON64_INTRINSICS) || defined(MLAS_SSE2_INTRINSICS) || \ + defined(MLAS_LSX_INTRINSICS) + #include // @@ -49,6 +51,9 @@ MlasQuantizeLinearVector( // is a NaN. FloatVector = vmaxnmq_f32(FloatVector, MinimumValueVector); FloatVector = vminnmq_f32(FloatVector, MaximumValueVector); +#elif defined(MLAS_LSX_INTRINSICS) + FloatVector = __lsx_vfmax_s(FloatVector, MinimumValueVector); + FloatVector = __lsx_vfmin_s(FloatVector, MaximumValueVector); #else // N.B. MINPS and MAXPS returns the value from the second vector if the // value from the first vector is a NaN. @@ -64,6 +69,9 @@ MlasQuantizeLinearVector( #if defined(MLAS_NEON64_INTRINSICS) auto IntegerVector = vcvtnq_s32_f32(FloatVector); IntegerVector = vaddq_s32(IntegerVector, ZeroPointVector); +#elif defined(MLAS_LSX_INTRINSICS) + auto IntegerVector = __lsx_vftint_w_s(FloatVector); + IntegerVector = __lsx_vadd_w(IntegerVector, ZeroPointVector); #else // N.B. Assumes MXCSR has been configured with the default rounding mode of // "round to nearest even". 
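For reference, the arithmetic that MlasQLinearMulKernel above vectorizes reduces to the following per-element computation. This is a minimal scalar sketch, assuming the same scale-ratio folding (ScaleA * ScaleB / ScaleC) and round-to-nearest conversion used by the LSX code; the helper name QLinearMulScalarRef is illustrative and not part of the patch.

```
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>

// Scalar reference for one element of the QLinearMul kernel: de-bias both
// inputs, multiply, rescale by ScaleA * ScaleB / ScaleC, re-bias by
// ZeroPointC, and saturate to the output type.
template <typename DataType>
DataType QLinearMulScalarRef(
    DataType a, float ScaleA, int32_t ZeroPointA,
    DataType b, float ScaleB, int32_t ZeroPointB,
    float ScaleC, int32_t ZeroPointC)
{
    const int32_t va = int32_t(a) - ZeroPointA;
    const int32_t vb = int32_t(b) - ZeroPointB;
    const float ScaleRatio = ScaleA * ScaleB / ScaleC;
    const float c = float(va * vb) * ScaleRatio + float(ZeroPointC);
    const int32_t rounded = int32_t(std::nearbyintf(c));
    const int32_t lo = std::numeric_limits<DataType>::lowest();
    const int32_t hi = std::numeric_limits<DataType>::max();
    return static_cast<DataType>(std::clamp(rounded, lo, hi));
}
```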
@@ -213,6 +221,121 @@ MlasQuantizeLinearStoreSingleValue(
     vst1q_lane_s16(Output, vreinterpretq_s16_s32(IntegerVector), 0);
 }
 
+#elif defined(MLAS_LSX_INTRINSICS)
+template<>
+MLAS_FORCEINLINE
+MLAS_INT32X4
+MlasQuantizeLinearPackBytes<uint8_t>(
+    MLAS_INT32X4 integervector
+    )
+{
+    // Emulate _mm_packus_epi16 (vmax_h/vsat_hu/vpickev_b) twice to narrow the
+    // four 32-bit values to bytes with unsigned saturation.
+    __m128i zero = __lsx_vldi(0);
+    __m128i tmp, tmp2;
+
+    tmp = __lsx_vmax_h(integervector, zero);
+    tmp2 = __lsx_vsat_hu(tmp, 7);
+    integervector = __lsx_vpickev_b(tmp2, tmp2);
+
+    tmp = __lsx_vmax_h(integervector, zero);
+    tmp2 = __lsx_vsat_hu(tmp, 7);
+    integervector = __lsx_vpickev_b(tmp2, tmp2);
+    return integervector;
+}
+
+template<>
+MLAS_FORCEINLINE
+MLAS_INT32X4
+MlasQuantizeLinearPackBytes<int8_t>(
+    MLAS_INT32X4 integervector
+    )
+{
+    // Emulate _mm_packs_epi16 (vsat_h/vpickev_b) twice to narrow the four
+    // 32-bit values to bytes with signed saturation.
+    __m128i tmp, tmp1;
+
+    tmp = __lsx_vsat_h(integervector, 7);
+    tmp1 = __lsx_vsat_h(integervector, 7);
+    integervector = __lsx_vpickev_b(tmp1, tmp);
+
+    tmp = __lsx_vsat_h(integervector, 7);
+    tmp1 = __lsx_vsat_h(integervector, 7);
+    integervector = __lsx_vpickev_b(tmp1, tmp);
+    return integervector;
+}
+
+template <typename OutputType>
+MLAS_FORCEINLINE
+void
+MlasQuantizeLinearStore4PackedValues(
+    MLAS_INT32X4 IntegerVector,
+    OutputType* Output
+    )
+{
+    // Copies the lower 4 packed elements of the vector into memory (Output).
+
+    if constexpr (std::is_same_v<OutputType, uint8_t> || std::is_same_v<OutputType, int8_t>) {
+        __lsx_vstelm_w(IntegerVector, reinterpret_cast<int32_t*>(Output), 0, 0);
+    } else {
+        static_assert(std::is_same_v<OutputType, uint16_t> || std::is_same_v<OutputType, int16_t>);
+
+        __lsx_vstelm_d(IntegerVector, reinterpret_cast<int64_t*>(Output), 0, 0);
+    }
+}
+
+template <typename OutputType>
+MLAS_FORCEINLINE
+void
+MlasQuantizeLinearStoreSingleValue(
+    MLAS_INT32X4 IntegerVector,
+    OutputType* Output
+    )
+{
+    static_assert(std::is_same_v<OutputType, uint8_t> ||
+                  std::is_same_v<OutputType, int8_t> ||
+                  std::is_same_v<OutputType, uint16_t> ||
+                  std::is_same_v<OutputType, int16_t>);
+
+    // Copies the lower element of the vector into memory (Output).
+    // Expects that the 32-bit element in lane 0 is already within the valid numerical
+    // range of the OutputType.
+    *Output = static_cast<OutputType>(__lsx_vpickve2gr_w(IntegerVector, 0));
+}
+
+template<>
+MLAS_FORCEINLINE
+MLAS_INT32X4
+MlasQuantizeLinearPackBytes<uint16_t>(
+    MLAS_INT32X4 IntegerVector
+    )
+{
+    // Narrow the four 32-bit values to 16 bits with unsigned saturation.
+    __m128i zero = __lsx_vldi(0);
+    __m128i tmp, tmp2;
+
+    tmp = __lsx_vmax_w(IntegerVector, zero);
+    tmp2 = __lsx_vsat_wu(tmp, 15);
+
+    IntegerVector = __lsx_vpickev_h(tmp2, tmp2);
+    return IntegerVector;
+}
+
+template<>
+MLAS_FORCEINLINE
+MLAS_INT32X4
+MlasQuantizeLinearPackBytes<int16_t>(
+    MLAS_INT32X4 IntegerVector
+    )
+{
+    // Narrow the four 32-bit values to 16 bits with signed saturation.
+    __m128i tmp, tmp1;
+
+    tmp = __lsx_vsat_w(IntegerVector, 15);
+    tmp1 = __lsx_vsat_w(IntegerVector, 15);
+    IntegerVector = __lsx_vpickev_h(tmp1, tmp);
+    return IntegerVector;
+}
 #else
 
 template<>
@@ -384,6 +507,8 @@ Return Value:
 
 #if defined(MLAS_NEON64_INTRINSICS)
         auto FloatVector = vld1q_dup_f32(Input + n);
+#elif defined(MLAS_LSX_INTRINSICS)
+        MLAS_FLOAT32X4 FloatVector = (MLAS_FLOAT32X4)__lsx_vldrepl_w(Input+n, 0);
 #else
         auto FloatVector = _mm_load_ss(Input + n);
 #endif
@@ -1362,6 +1487,286 @@ MlasRequantizeOutput(
     }
 }
 
+#elif defined(MLAS_LSX_INTRINSICS)
+
+template <typename OutputType>
+void
+MlasRequantizeOutput(
+    const int32_t* Input,
+    size_t InputLeadingDimension,
+    OutputType* Output,
+    size_t OutputLeadingDimension,
+    const int32_t* Bias,
+    const float* Scale,
+    bool PerColumnScale,
+    OutputType ZeroPoint,
+    size_t StartM,
+    size_t StartN,
+    size_t CountM,
+    size_t CountN
+    )
+{
+    // To be checked: the clamp bounds are derived from the OutputType range shifted by ZeroPoint.
+    float min_f = float(std::numeric_limits<OutputType>::lowest() - ZeroPoint);
+    float max_f = float(std::numeric_limits<OutputType>::max() - ZeroPoint);
+    const __m128 PerMatrixScaleVector = PerColumnScale ?
MlasReinterpretAsFloat32x4(__lsx_vldi(0)) : MlasReinterpretAsFloat32x4(__lsx_vldrepl_w(Scale, 0)); + const __m128 MinimumValueVector = MlasReinterpretAsFloat32x4(__lsx_vreplgr2vr_w( *((uint32_t*)&min_f))); + const __m128 MaximumValueVector = MlasReinterpretAsFloat32x4(__lsx_vreplgr2vr_w( *((uint32_t*)&max_f))); + const __m128i ZeroPointVector = __lsx_vreplgr2vr_w(ZeroPoint); + + if (nullptr != Bias) { + Bias += StartN; + } + if (PerColumnScale) { + Scale += StartN; + } + + Input += StartM * InputLeadingDimension + StartN; + Output += StartM * OutputLeadingDimension + StartN; + // + // Step through each row of the output matrix. + // + + while (CountM-- > 0) { + + const int32_t* bias = Bias; + const float* scale = PerColumnScale ? Scale : nullptr; + size_t n = CountN; + + auto* RowInput = Input; + auto* RowOutput = Output; + + // + // Process 16 columns of the matrices at a time. + // + + while (n >= 16) { + + // + // Load the input data and optionally add the per-column bias. + // + + __m128i IntegerVector0 = __lsx_vld((const __m128i*)&RowInput[0], 0); + __m128i IntegerVector1 = __lsx_vld((const __m128i*)&RowInput[4], 0); + __m128i IntegerVector2 = __lsx_vld((const __m128i*)&RowInput[8], 0); + __m128i IntegerVector3 = __lsx_vld((const __m128i*)&RowInput[12], 0); + RowInput += 16; + + if (bias != nullptr) { + IntegerVector0 = __lsx_vadd_w(IntegerVector0, __lsx_vld((const __m128i *)&bias[0], 0)); + IntegerVector1 = __lsx_vadd_w(IntegerVector1, __lsx_vld((const __m128i *)&bias[4], 0)); + IntegerVector2 = __lsx_vadd_w(IntegerVector2, __lsx_vld((const __m128i *)&bias[8], 0)); + IntegerVector3 = __lsx_vadd_w(IntegerVector3, __lsx_vld((const __m128i *)&bias[12], 0)); + bias += 16; + } + + // + // Convert to integer values to float and apply the per-tensor or + // per-column scaling. 
+ // + + __m128 FloatVector0 = __lsx_vffint_s_w(IntegerVector0); + __m128 FloatVector1 = __lsx_vffint_s_w(IntegerVector1); + __m128 FloatVector2 = __lsx_vffint_s_w(IntegerVector2); + __m128 FloatVector3 = __lsx_vffint_s_w(IntegerVector3); + + if (scale != nullptr) { + + FloatVector0 = __lsx_vfmul_s(FloatVector0, MlasReinterpretAsFloat32x4(__lsx_vld((__m128i *)&scale[0], 0))); + FloatVector1 = __lsx_vfmul_s(FloatVector1, MlasReinterpretAsFloat32x4(__lsx_vld((__m128i *)&scale[4], 0))); + FloatVector2 = __lsx_vfmul_s(FloatVector2, MlasReinterpretAsFloat32x4(__lsx_vld((__m128i *)&scale[8], 0))); + FloatVector3 = __lsx_vfmul_s(FloatVector3, MlasReinterpretAsFloat32x4(__lsx_vld((__m128i *)&scale[12], 0))); + scale += 16; + + } else { + + FloatVector0 = __lsx_vfmul_s(FloatVector0, PerMatrixScaleVector); + FloatVector1 = __lsx_vfmul_s(FloatVector1, PerMatrixScaleVector); + FloatVector2 = __lsx_vfmul_s(FloatVector2, PerMatrixScaleVector); + FloatVector3 = __lsx_vfmul_s(FloatVector3, PerMatrixScaleVector); + } + FloatVector0 = __lsx_vfmax_s(FloatVector0, MinimumValueVector); + FloatVector1 = __lsx_vfmax_s(FloatVector1, MinimumValueVector); + FloatVector2 = __lsx_vfmax_s(FloatVector2, MinimumValueVector); + FloatVector3 = __lsx_vfmax_s(FloatVector3, MinimumValueVector); + + FloatVector0 = __lsx_vfmin_s(FloatVector0, MaximumValueVector); + FloatVector1 = __lsx_vfmin_s(FloatVector1, MaximumValueVector); + FloatVector2 = __lsx_vfmin_s(FloatVector2, MaximumValueVector); + FloatVector3 = __lsx_vfmin_s(FloatVector3, MaximumValueVector); + + IntegerVector0 = __lsx_vftint_w_s(FloatVector0); + IntegerVector1 = __lsx_vftint_w_s(FloatVector1); + IntegerVector2 = __lsx_vftint_w_s(FloatVector2); + IntegerVector3 = __lsx_vftint_w_s(FloatVector3); + + IntegerVector0 = __lsx_vadd_w(IntegerVector0, ZeroPointVector); + IntegerVector1 = __lsx_vadd_w(IntegerVector1, ZeroPointVector); + IntegerVector2 = __lsx_vadd_w(IntegerVector2, ZeroPointVector); + IntegerVector3 = __lsx_vadd_w(IntegerVector3, ZeroPointVector); + + __m128i WordVector0; + __m128i WordVector1; + __m128i ByteVector; + + if (std::is_signed::value) { + + __m128i tmp, tmp1; + tmp = __lsx_vsat_w(IntegerVector0, 15); + tmp1 = __lsx_vsat_w(IntegerVector1, 15); + WordVector0 = __lsx_vpickev_h(tmp1, tmp); + + tmp = __lsx_vsat_w(IntegerVector2, 15); + tmp1 = __lsx_vsat_w(IntegerVector3, 15); + WordVector1 = __lsx_vpickev_h(tmp1, tmp); + + tmp = __lsx_vsat_h(WordVector0, 7); + tmp1 = __lsx_vsat_h(WordVector1, 7); + ByteVector = __lsx_vpickev_b(tmp1, tmp); + + + } else { + + __m128i zero = __lsx_vldi(0); + __m128i tmp, tmp2, tmp3; + + tmp = __lsx_vmax_h(IntegerVector0, zero); + tmp2 = __lsx_vsat_hu(tmp, 7); + + tmp = __lsx_vmax_h(IntegerVector1, zero); + tmp3 = __lsx_vsat_hu(tmp, 7); + WordVector0 = __lsx_vpickev_b(tmp3, tmp2); + + tmp = __lsx_vmax_h(IntegerVector2, zero); + tmp2 = __lsx_vsat_hu(tmp, 7); + + tmp = __lsx_vmax_h(IntegerVector3, zero); + tmp3 = __lsx_vsat_hu(tmp, 7); + WordVector1 = __lsx_vpickev_b(tmp3, tmp2); + + tmp = __lsx_vmax_h(WordVector0, zero); + tmp2 = __lsx_vsat_hu(tmp, 7); + + tmp = __lsx_vmax_h(WordVector1, zero); + tmp3 = __lsx_vsat_hu(tmp, 7); + ByteVector = __lsx_vpickev_b(tmp3, tmp2); + + } + + __lsx_vst(ByteVector, (__m128i*)RowOutput, 0); + RowOutput += 16; + + n -= 16; + } + + // + // Process the remaining columns of the matrices. + // + + while (n > 0) { + + // + // Load the input data and optionally add the per-column bias. 
+ // + + __m128i IntegerVector; + + if (n >= 4) { + + IntegerVector = __lsx_vld((const __m128i*)&RowInput[0], 0); + RowInput += 4; + + if (bias != nullptr) { + IntegerVector = __lsx_vadd_w(IntegerVector, __lsx_vld((const __m128i*)&bias[0], 0)); + bias += 4; + } + + } else { + + int32_t IntegerValue = *RowInput++; + + if (bias != nullptr) { + IntegerValue += *bias++; + } + IntegerVector = __lsx_vldrepl_w(&IntegerValue, 0); + } + + // + // Convert to integer values to float and apply the per-tensor or + // per-column scaling. + // + __m128 FloatVector = __lsx_vffint_s_w(IntegerVector); + __m128 ScaleVector; + + if (scale != nullptr) { + + if (n >= 4) { + ScaleVector = MlasReinterpretAsFloat32x4(__lsx_vld((__m128i *)scale, 0)); + scale += 4; + } else { + ScaleVector = (__m128)__lsx_vldrepl_w(scale, 0); + scale += 1; + } + + } else { + ScaleVector = PerMatrixScaleVector; + } + FloatVector = __lsx_vfmul_s(FloatVector, ScaleVector); + + FloatVector = __lsx_vfmax_s(FloatVector, MinimumValueVector); + FloatVector = __lsx_vfmin_s(FloatVector, MaximumValueVector); + + IntegerVector = __lsx_vftint_w_s(FloatVector); + IntegerVector = __lsx_vadd_w(IntegerVector, ZeroPointVector); + + if (std::is_signed::value) { + + __m128i tmp; + tmp = __lsx_vsat_w(IntegerVector, 15); + IntegerVector = __lsx_vpickev_h(tmp, tmp); + + tmp = __lsx_vsat_h(IntegerVector, 7); + IntegerVector = __lsx_vpickev_b(tmp, tmp); + + } else { + + __m128i zero = __lsx_vldi(0); + __m128i tmp, tmp2; + + tmp = __lsx_vmax_h(IntegerVector, zero); + tmp2 = __lsx_vsat_hu(tmp, 7); + IntegerVector = __lsx_vpickev_b(tmp2, tmp2); + + tmp = __lsx_vmax_h(IntegerVector, zero); + tmp2 = __lsx_vsat_hu(tmp, 7); + IntegerVector = __lsx_vpickev_b(tmp2, tmp2); + + } + + uint32_t OutputValue = uint32_t(__lsx_vpickve2gr_w(IntegerVector, 0)); + + if (n >= 4) { + + *reinterpret_cast(RowOutput) = OutputValue; + RowOutput += 4; + + n -= 4; + + } else { + + *RowOutput = uint8_t(OutputValue); + RowOutput += 1; + + n -= 1; + } + } + + // Next Row + Input += InputLeadingDimension; + Output += OutputLeadingDimension; + } +} + #else template diff --git a/onnxruntime/core/mlas/lib/reorder.cpp b/onnxruntime/core/mlas/lib/reorder.cpp index 99c1dbac3b692..b329ea2ffb149 100644 --- a/onnxruntime/core/mlas/lib/reorder.cpp +++ b/onnxruntime/core/mlas/lib/reorder.cpp @@ -180,6 +180,31 @@ Return Value: v[2] = _mm_movelh_ps(t[2], t[3]); v[3] = _mm_movehl_ps(t[3], t[2]); + MlasStoreFloat32x4(&D[ScatterStride * 0], v[0]); + MlasStoreFloat32x4(&D[ScatterStride * 1], v[1]); + MlasStoreFloat32x4(&D[ScatterStride * 2], v[2]); + MlasStoreFloat32x4(&D[ScatterStride * 3], v[3]); +#elif defined(MLAS_LSX_INTRINSICS) + + MLAS_FLOAT32X4 v[4]; + MLAS_FLOAT32X4 t[4]; + + v[0] = MlasLoadFloat32x4(&S[GatherStride * 0]); + v[1] = MlasLoadFloat32x4(&S[GatherStride * 1]); + v[2] = MlasLoadFloat32x4(&S[GatherStride * 2]); + v[3] = MlasLoadFloat32x4(&S[GatherStride * 3]); + + t[0] = (__m128)__lsx_vilvl_w((__m128i)v[1], (__m128i)v[0]); + t[2] = (__m128)__lsx_vilvh_w((__m128i)v[1], (__m128i)v[0]); + t[1] = (__m128)__lsx_vilvl_w((__m128i)v[3], (__m128i)v[2]); + t[3] = (__m128)__lsx_vilvh_w((__m128i)v[3], (__m128i)v[2]); + + + v[0] = (__m128)__lsx_vpickev_d((__m128i) t[1],(__m128i) t[0]); + v[1] = (__m128)__lsx_vpickod_d((__m128i) t[1],(__m128i) t[0]); + v[2] = (__m128)__lsx_vpickev_d((__m128i) t[3],(__m128i) t[2]); + v[3] = (__m128)__lsx_vpickod_d((__m128i) t[3],(__m128i) t[2]); + MlasStoreFloat32x4(&D[ScatterStride * 0], v[0]); MlasStoreFloat32x4(&D[ScatterStride * 1], v[1]); 
MlasStoreFloat32x4(&D[ScatterStride * 2], v[2]); @@ -456,7 +481,6 @@ Return Value: &TaskStart, &TasksRemaining); size_t TaskEnd = TaskStart + TasksRemaining; - // // Rebase the pointers to the source and destination buffers for this thread. // @@ -567,18 +591,17 @@ Return Value: WorkBlock.S = S; WorkBlock.D = D; - WorkBlock.OutputChannels = size_t(OutputShape[1]); WorkBlock.OutputSize = size_t(OutputShape[2]) * size_t(OutputShape[3]); const size_t BlockSize = MlasNchwcGetBlockSize(); const size_t TasksPerBatch = size_t(ceil(((float)WorkBlock.OutputChannels) / BlockSize)); const size_t BatchCount = size_t(OutputShape[0]); - const size_t TasksCount = BatchCount * TasksPerBatch; + const size_t TasksCount = BatchCount * TasksPerBatch; WorkBlock.TasksCount = TasksCount; // - // Schedule the operation across a set of worker threads if the output + // Schedule the operation across a set of worker threads if the output // tensor is sufficienly large. Limit the number of threads to at least // the number of available tasks. // @@ -590,7 +613,7 @@ Return Value: if (size_t(TargetThreadCount) > TasksCount) { TargetThreadCount = ptrdiff_t(TasksCount); } - } + } WorkBlock.TargetThreadCount = TargetThreadCount; MlasExecuteThreaded(MlasReorderOutputNchwThreaded, &WorkBlock, TargetThreadCount, ThreadPool); diff --git a/onnxruntime/core/mlas/lib/sgemm.cpp b/onnxruntime/core/mlas/lib/sgemm.cpp index 1ce64712d63dc..4d7a1ceb4eee7 100644 --- a/onnxruntime/core/mlas/lib/sgemm.cpp +++ b/onnxruntime/core/mlas/lib/sgemm.cpp @@ -472,7 +472,7 @@ Return Value: const float* b = B; size_t x = CountX; -#if defined(MLAS_TARGET_AMD64) +#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) MLAS_SGEMM_TRANSPOSE_PACKB_BLOCK_ROUTINE* SgemmTransposePackB16x4Routine = GetMlasPlatform().TransposePackB16x4Routine; @@ -1061,7 +1061,7 @@ Return Value: size_t RowsHandled; -#if defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER) +#if defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER) || defined(MLAS_TARGET_LARCH64) RowsHandled = GetMlasPlatform().GemmFloatKernel(A, B, C, CountK, CountM, CountN, lda, ldc, alpha, ZeroMode); #else if (ZeroMode) { diff --git a/onnxruntime/core/mlas/lib/snchwc.cpp b/onnxruntime/core/mlas/lib/snchwc.cpp index 74d65f934aaf5..f9cf1605787aa 100644 --- a/onnxruntime/core/mlas/lib/snchwc.cpp +++ b/onnxruntime/core/mlas/lib/snchwc.cpp @@ -101,7 +101,7 @@ Return Value: --*/ { -#if defined(MLAS_TARGET_AMD64) +#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) return GetMlasPlatform().NchwcBlockSize; #else return 1; @@ -674,7 +674,7 @@ struct MLAS_NCHWC_CONV_NCHWC_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM const size_t BlockedOutputWidth = BlockSize * OutputWidth; -#if defined(MLAS_TARGET_AMD64) +#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) MLAS_CONV_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvNchwcFloatKernel; #else MLAS_CONV_FLOAT_KERNEL* Kernel = MlasConvNchwcFloatKernel; @@ -784,7 +784,7 @@ struct MLAS_NCHWC_CONV_NCHW_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM const size_t BlockedOutputWidth = BlockSize * OutputWidth; -#if defined(MLAS_TARGET_AMD64) +#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) MLAS_CONV_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvNchwFloatKernel; #else MLAS_CONV_FLOAT_KERNEL* Kernel = MlasConvNchwFloatKernel; @@ -879,7 +879,7 @@ struct MLAS_NCHWC_CONV_POINTWISE_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM const size_t FilterStrideBytes = BlockSize * InputChannels * sizeof(float); const size_t 
OutputStrideBytes = BlockSize * OutputSize * sizeof(float); -#if defined(MLAS_TARGET_AMD64) +#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) MLAS_CONV_POINTWISE_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvPointwiseFloatKernel; #else MLAS_CONV_POINTWISE_FLOAT_KERNEL* Kernel = MlasConvPointwiseFloatKernel; @@ -1016,7 +1016,7 @@ struct MLAS_NCHWC_CONV_DEPTHWISE_ALGORITHM : MLAS_NCHWC_CONV_ALGORITHM const size_t BlockedOutputWidth = BlockSize * OutputWidth; -#if defined(MLAS_TARGET_AMD64) +#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) MLAS_CONV_DEPTHWISE_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvDepthwiseFloatKernel; #else MLAS_CONV_DEPTHWISE_FLOAT_KERNEL* Kernel = MlasConvDepthwiseFloatKernel; @@ -1093,7 +1093,7 @@ struct MLAS_NCHWC_CONV_DEPTHWISE_ALGORITHM : MLAS_NCHWC_CONV_ALGORITHM struct MLAS_NCHWC_POOL_ALGORITHM : MLAS_NCHWC_NN_ALGORITHM { -#if !defined(MLAS_TARGET_AMD64) +#if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) static MLAS_POOL_FLOAT_KERNEL* const PoolKernels[]; #endif @@ -1131,7 +1131,7 @@ struct MLAS_NCHWC_POOL_ALGORITHM : MLAS_NCHWC_NN_ALGORITHM const size_t DilatedInputWidthBytes = BlockSize * DilationHeight * InputWidth * sizeof(float); const size_t InputStrideBytes = DilatedInputWidthBytes - KernelWidth * DilationWidthBytes; -#if defined(MLAS_TARGET_AMD64) +#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) MLAS_POOL_FLOAT_KERNEL* Kernel = GetMlasPlatform().PoolFloatKernel[WorkBlock->PoolingKind]; #else MLAS_POOL_FLOAT_KERNEL* Kernel = PoolKernels[WorkBlock->PoolingKind]; @@ -1197,7 +1197,7 @@ struct MLAS_NCHWC_POOL_ALGORITHM : MLAS_NCHWC_NN_ALGORITHM } }; -#if !defined(MLAS_TARGET_AMD64) +#if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) MLAS_POOL_FLOAT_KERNEL* const MLAS_NCHWC_POOL_ALGORITHM::PoolKernels[] = { @@ -1621,7 +1621,7 @@ Return Value: } } -#if !defined(MLAS_TARGET_AMD64) +#if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) // // Convolution and pooling kernel stubs for architectures that do not yet have diff --git a/onnxruntime/core/mlas/lib/transpose.cpp b/onnxruntime/core/mlas/lib/transpose.cpp index 86b0897bb91ec..a758a0e59fb4f 100644 --- a/onnxruntime/core/mlas/lib/transpose.cpp +++ b/onnxruntime/core/mlas/lib/transpose.cpp @@ -371,6 +371,121 @@ MlasTranspose16x16Block( vec_vsx_st(e0, 0, &Output[OutputStride * 14]); vec_vsx_st(e1, 0, &Output[OutputStride * 15]); } + +#elif defined(MLAS_LSX_INTRINSICS) + +MLAS_FORCEINLINE +void +MlasTranspose4x4Block( + const uint32_t* Input, + size_t InputStride, + uint32_t* Output, + size_t OutputStride + ) +{ + __m128i a0 = __lsx_vld((const __m128i*)&Input[InputStride * 0], 0); + __m128i a1 = __lsx_vld((const __m128i*)&Input[InputStride * 1], 0); + __m128i a2 = __lsx_vld((const __m128i*)&Input[InputStride * 2], 0); + __m128i a3 = __lsx_vld((const __m128i*)&Input[InputStride * 3], 0); + + __m128i b0 = __lsx_vilvl_w(a2, a0); + __m128i b1 = __lsx_vilvh_w(a2, a0); + __m128i b2 = __lsx_vilvl_w(a3, a1); + __m128i b3 = __lsx_vilvh_w(a3, a1); + __m128i c0 = __lsx_vilvl_w(b2, b0); + __m128i c1 = __lsx_vilvh_w(b2, b0); + __m128i c2 = __lsx_vilvl_w(b3, b1); + __m128i c3 = __lsx_vilvh_w(b3, b1); + + __lsx_vst(c0, (__m128i*)&Output[OutputStride * 0], 0); + __lsx_vst(c1, (__m128i*)&Output[OutputStride * 1], 0); + __lsx_vst(c2, (__m128i*)&Output[OutputStride * 2], 0); + __lsx_vst(c3, (__m128i*)&Output[OutputStride * 3], 0); +} + +MLAS_FORCEINLINE +void +MlasTranspose4x4Block( + const uint16_t* Input, + size_t InputStride, + uint16_t* Output, + 
size_t OutputStride + ) +{ + __m128i a0 = __lsx_vld((const __m128i*)&Input[InputStride * 0], 0); + __lsx_vinsgr2vr_d(a0, 0 , 1); + __m128i a1 = __lsx_vld((const __m128i*)&Input[InputStride * 1], 0); + __lsx_vinsgr2vr_d(a1, 0 , 1); + __m128i a2 = __lsx_vld((const __m128i*)&Input[InputStride * 2], 0); + __lsx_vinsgr2vr_d(a2, 0 , 1); + __m128i a3 = __lsx_vld((const __m128i*)&Input[InputStride * 3], 0); + __lsx_vinsgr2vr_d(a3, 0 , 1); + + __m128i b0 = __lsx_vilvl_h(a2, a0); + __m128i b1 = __lsx_vilvl_h(a3, a1); + __m128i c0 = __lsx_vilvl_h(b1, b0); + __m128i c1 = __lsx_vilvh_h(b1, b0); + + __lsx_vst(__lsx_vinsgr2vr_d(__lsx_vld((__m128i *)&Output[OutputStride * 0], 0), __lsx_vpickve2gr_d(c0, 0), 0), (__m128i *)&Output[OutputStride * 0], 0); + __lsx_vst(__lsx_vinsgr2vr_d(__lsx_vld((__m128i *)&Output[OutputStride * 1], 0), __lsx_vpickve2gr_d(c0, 1), 0), (__m128i *)&Output[OutputStride * 1], 0); + __lsx_vst(__lsx_vinsgr2vr_d(__lsx_vld((__m128i *)&Output[OutputStride * 2], 0), __lsx_vpickve2gr_d(c1, 0), 0), (__m128i *)&Output[OutputStride * 2], 0); + __lsx_vst(__lsx_vinsgr2vr_d(__lsx_vld((__m128i *)&Output[OutputStride * 3], 0), __lsx_vpickve2gr_d(c1, 1), 0), (__m128i *)&Output[OutputStride * 3], 0); +} + +MLAS_FORCEINLINE +void +MlasTranspose8x8Block( + const uint8_t* Input, + size_t InputStride, + uint8_t* Output, + size_t OutputStride + ) +{ + __m128i a0 = __lsx_vld((const __m128i*)&Input[InputStride * 0], 0); + __lsx_vinsgr2vr_d(a0, 0, 1); + __m128i a1 = __lsx_vld((const __m128i*)&Input[InputStride * 1], 0); + __lsx_vinsgr2vr_d(a1, 0, 1); + __m128i b0 = __lsx_vilvl_b(a1, a0); + + __m128i a2 = __lsx_vld((const __m128i*)&Input[InputStride * 2], 0); + __lsx_vinsgr2vr_d(a2, 0, 1); + __m128i a3 = __lsx_vld((const __m128i*)&Input[InputStride * 3], 0); + __lsx_vinsgr2vr_d(a3, 0, 1); + __m128i b1 = __lsx_vilvl_b(a3, a2); + + __m128i a4 = __lsx_vld((const __m128i*)&Input[InputStride * 4], 0); + __lsx_vinsgr2vr_d(a4, 0, 1); + __m128i a5 = __lsx_vld((const __m128i*)&Input[InputStride * 5], 0); + __lsx_vinsgr2vr_d(a5, 0, 1); + __m128i b2 = __lsx_vilvl_b(a5, a4); + + __m128i a6 = __lsx_vld((const __m128i*)&Input[InputStride * 6], 0); + __lsx_vinsgr2vr_d(a6, 0, 1); + __m128i a7 = __lsx_vld((const __m128i*)&Input[InputStride * 7], 0); + __lsx_vinsgr2vr_d(a7, 0, 1); + __m128i b3 = __lsx_vilvl_b(a7, a6); + __m128i c0 = __lsx_vilvl_h(b1, b0); + __m128i c1 = __lsx_vilvh_h(b1, b0); + __m128i c2 = __lsx_vilvl_h(b3, b2); + __m128i c3 = __lsx_vilvh_h(b3, b2); + + __m128 d0 = (__m128)(__lsx_vilvl_w(c2, c0)); + __lsx_vst(__lsx_vinsgr2vr_d(__lsx_vld((__m128i *)&Output[OutputStride * 0], 0), __lsx_vpickve2gr_d(d0, 0), 0), (__m128i *)&Output[OutputStride * 0], 0); + __lsx_vst(__lsx_vinsgr2vr_d(__lsx_vld((__m128i *)&Output[OutputStride * 1], 0), __lsx_vpickve2gr_d(d0, 1), 0), (__m128i *)&Output[OutputStride * 1], 0); + + __m128 d1 = (__m128)(__lsx_vilvh_w(c2, c0)); + __lsx_vst(__lsx_vinsgr2vr_d(__lsx_vld((__m128i *)&Output[OutputStride * 2], 0), __lsx_vpickve2gr_d(d1, 0), 0), (__m128i *)&Output[OutputStride * 2], 0); + __lsx_vst(__lsx_vinsgr2vr_d(__lsx_vld((__m128i *)&Output[OutputStride * 3], 0), __lsx_vpickve2gr_d(d1, 1), 0), (__m128i *)&Output[OutputStride * 3], 0); + + __m128 d2 = (__m128)(__lsx_vilvl_w(c3, c1)); + __lsx_vst(__lsx_vinsgr2vr_d(__lsx_vld((__m128i *)&Output[OutputStride * 4], 0), __lsx_vpickve2gr_d(d2, 0), 0), (__m128i *)&Output[OutputStride * 4], 0); + __lsx_vst(__lsx_vinsgr2vr_d(__lsx_vld((__m128i *)&Output[OutputStride * 5], 0), __lsx_vpickve2gr_d(d2, 1), 0), (__m128i *)&Output[OutputStride * 5], 0); + 
+ __m128 d3 = (__m128)(__lsx_vilvh_w(c3, c1)); + __lsx_vst(__lsx_vinsgr2vr_d(__lsx_vld((__m128i *)&Output[OutputStride * 6], 0), __lsx_vpickve2gr_d(d3, 0), 0), (__m128i *)&Output[OutputStride * 6], 0); + __lsx_vst(__lsx_vinsgr2vr_d(__lsx_vld((__m128i *)&Output[OutputStride * 7], 0), __lsx_vpickve2gr_d(d3, 1), 0), (__m128i *)&Output[OutputStride * 7], 0); +} + #endif template @@ -472,7 +587,8 @@ Return Value: uint32_t* d = Output; size_t m = M; -#if defined(MLAS_SSE2_INTRINSICS) || defined(MLAS_NEON_INTRINSICS) || defined(MLAS_TARGET_POWER) +#if defined(MLAS_SSE2_INTRINSICS) || defined(MLAS_NEON_INTRINSICS) || defined(MLAS_TARGET_POWER) || \ + defined(MLAS_LSX_INTRINSICS) while (m >= 4) { @@ -597,7 +713,7 @@ Return Value: uint16_t* d = Output; size_t m = M; -#if defined(MLAS_SSE2_INTRINSICS) || defined(MLAS_NEON_INTRINSICS) +#if defined(MLAS_SSE2_INTRINSICS) || defined(MLAS_NEON_INTRINSICS) || defined(MLAS_LSX_INTRINSICS) while (m >= 4) { @@ -734,7 +850,7 @@ Return Value: uint8_t* d = Output; size_t m = M; -#if defined(MLAS_SSE2_INTRINSICS) || defined(MLAS_NEON_INTRINSICS) +#if defined(MLAS_SSE2_INTRINSICS) || defined(MLAS_NEON_INTRINSICS) || defined(MLAS_LSX_INTRINSICS) while (m >= 8) {
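The MlasTranspose8x8Block routine above uses byte, halfword, and word interleaves to transpose an 8x8 tile of bytes. As a plain scalar reference for the intended data movement (a sketch only; the helper name Transpose8x8BlockRef is not part of the patch):

```
#include <cstddef>
#include <cstdint>

// Scalar reference for the 8x8 byte-transpose tile implemented with
// vilvl_b / vilvl_h / vilvl_w / vilvh_w in the LSX kernel.
static void Transpose8x8BlockRef(
    const uint8_t* Input, size_t InputStride,
    uint8_t* Output, size_t OutputStride)
{
    for (size_t row = 0; row < 8; ++row) {
        for (size_t col = 0; col < 8; ++col) {
            // Element (row, col) of the input tile lands at (col, row) of the output tile.
            Output[col * OutputStride + row] = Input[row * InputStride + col];
        }
    }
}
```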