Skip to content

Commit

Permalink
[mlas] add loongarch lsx and lasx optimize code (microsoft#17937)
Browse files Browse the repository at this point in the history
### Description
Hello, we (@lixing-star) are developers on the Loongson team.

We add 128-bit (LSX) and 256-bit (LASX) vector optimization code for the
LoongArch architecture.


[100% tests passed, 0 tests failed out of
7](https://cloud.a-boat.cn:2021/api/public/dl/6831z1Bi?inline=true)

### Development Environment
```
CPU: 
    Loongson-3C5000L
uname -a:  
    Linux localhost.localdomain 4.19.190-6.4.lns8.loongarch64 #1 SMP Thu Jul 14 12:08:04 CST 2022 loongarch64 loongarch64 loongarch64 GNU/Linux

```
### LoongArch Documents
- [LoongArch Reference Manual - Volume 1: Basic Architecture: This
manual describes the basic part of the LoongArch
architecture.](https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html)
- [LoongArch ELF psABI: This manual describes the LoongArch ELF
psABI.](https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html)
-
[more](https://loongson.github.io/LoongArch-Documentation/README-EN.html)
  • Loading branch information
junchao-loongson authored Dec 7, 2023
1 parent a045be3 commit 4abec97
Show file tree
Hide file tree
Showing 41 changed files with 7,696 additions and 34 deletions.
22 changes: 22 additions & 0 deletions cmake/onnxruntime_mlas.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,8 @@ else()
set(X86 TRUE)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|amd64)$")
set(X86_64 TRUE)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^loongarch64.*")
set(LOONGARCH64 TRUE)
endif()
endif()

Expand Down Expand Up @@ -575,6 +577,26 @@ else()
set(MLAS_SOURCE_IS_NOT_SET 0)
endif()
endif()
# LoongArch64: select the LSX/LASX optimized MLAS sources, unless an earlier
# platform branch has already chosen the source list.
if(LOONGARCH64 AND MLAS_SOURCE_IS_NOT_SET)
  # C++ kernel.
  set(mlas_platform_srcs ${MLAS_SRC_DIR}/qgemm_kernel_lsx.cpp)

  # Hand-written assembly kernels (order kept as originally listed).
  list(APPEND mlas_platform_srcs
    ${MLAS_SRC_DIR}/loongarch64/SgemmKernelLasx.S
    ${MLAS_SRC_DIR}/loongarch64/DgemmKernelLsx.S
    ${MLAS_SRC_DIR}/loongarch64/DgemmKernelLasx.S
    ${MLAS_SRC_DIR}/loongarch64/SgemmKernelLsx.S
    ${MLAS_SRC_DIR}/loongarch64/SconvKernelLsx.S
    ${MLAS_SRC_DIR}/loongarch64/SconvKernelLasx.S
    ${MLAS_SRC_DIR}/loongarch64/SpoolKernelLSX.S
    ${MLAS_SRC_DIR}/loongarch64/SpoolKernelLasx.S
    ${MLAS_SRC_DIR}/loongarch64/SgemmTransposePackB16x4LSX.S
    ${MLAS_SRC_DIR}/loongarch64/SgemmTransposePackB16x4Lasx.S
    ${MLAS_SRC_DIR}/loongarch64/SoftmaxKernelLasx.S
  )

  # The LSX and LASX options are appended to the global CMAKE_CXX_FLAGS
  # variable rather than being scoped to a single target.
  string(APPEND CMAKE_CXX_FLAGS " -mlsx -mlasx")

  # In a single-arch build, record that the platform sources are now set so
  # that later checks of MLAS_SOURCE_IS_NOT_SET evaluate to false.
  if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH)
    set(MLAS_SOURCE_IS_NOT_SET 0)
  endif()
endif()
if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH AND MLAS_SOURCE_IS_NOT_SET)
file(GLOB_RECURSE mlas_platform_srcs
"${MLAS_SRC_DIR}/scalar/*.cpp")
Expand Down
11 changes: 7 additions & 4 deletions onnxruntime/core/mlas/inc/mlas.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ Module Name:
#endif
#endif

//
// Define MLAS_TARGET_LARCH64 when the __loongarch64 macro is defined.
// NOTE(review): __loongarch64 is presumably predefined by the toolchain for
// 64-bit LoongArch targets; confirm against the compiler documentation.
//
#if defined(__loongarch64)
#define MLAS_TARGET_LARCH64
#endif
//
// Define the support levels for the target architecture.
//
Expand All @@ -87,7 +90,7 @@ Module Name:

#define MLAS_F16VEC_INTRINSICS_SUPPORTED

#endif //
#endif //
#endif // ARM64
#endif // Visual Studio 16 or earlier does not support fp16 intrinsic

Expand Down Expand Up @@ -1619,7 +1622,7 @@ MlasHalfGemmConvertPackB(
* @param Channels # of input channels
* @param OutputCount # of output pixels
* @param KernelSize # kernel size
* @return
* @return
*/
void
MLASCALL
Expand Down Expand Up @@ -1657,7 +1660,7 @@ MlasTranspose(
* @param Channels C in NHWC
* @param OutputCount Number of output pixels
* @param KernelSize Size of the kernel
* @return
* @return
*/
void
MLASCALL
Expand All @@ -1676,7 +1679,7 @@ MlasNhwcMaxPool(
* @param Channels C in NHWC
* @param OutputCount Number of output pixels
* @param KernelSize size of the kernel
* @return
* @return
*/
void
MLASCALL
Expand Down
2 changes: 2 additions & 0 deletions onnxruntime/core/mlas/lib/activate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,8 @@ struct MLAS_ACTIVATION_FUNCTION<MlasLeakyReluActivation>
return MlasBlendFloat32x4(ValueTimesAlpha, Value, _mm_cmple_ps(ZeroFloat32x4, Value));
#elif defined(MLAS_VSX_INTRINSICS)
return vec_sel(ValueTimesAlpha, Value, vec_cmple(ZeroFloat32x4, Value));
#elif defined(MLAS_LSX_INTRINSICS)
return MlasBlendFloat32x4(ValueTimesAlpha, Value, (__m128)__lsx_vfcmp_cle_s(ZeroFloat32x4, Value));
#else
return MlasBlendFloat32x4(ValueTimesAlpha, Value, ZeroFloat32x4 < Value);
#endif
Expand Down
13 changes: 10 additions & 3 deletions onnxruntime/core/mlas/lib/compute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,9 @@ Return Value:
// instead.
normal = _mm_min_epi16(normal, MaximumExponent);
normal = _mm_max_epi16(normal, MinimumExponent);
#elif defined(MLAS_LSX_INTRINSICS)
normal = __lsx_vmin_h(normal, MaximumExponent);
normal = __lsx_vmax_h(normal, MinimumExponent);
#else
normal = MlasMinimumInt32x4(normal, MaximumExponent);
normal = MlasMaximumInt32x4(normal, MinimumExponent);
Expand Down Expand Up @@ -215,6 +218,8 @@ Return Value:
// N.B. SSE2 lacks a broadcast load instruction, so avoid a shuffle
// and use zeroes for the upper elements.
Vector = _mm_load_ss(Input);
#elif defined(MLAS_LSX_INTRINSICS)
Vector = (MLAS_FLOAT32X4)__lsx_vldrepl_w(Input, 0);
#else
Vector = MlasBroadcastFloat32x4(Input);
#endif
Expand Down Expand Up @@ -467,6 +472,8 @@ Return Value:
// N.B. SSE2 lacks a broadcast load instruction, so avoid a shuffle and
// use zeroes for the upper elements.
MLAS_FLOAT32X4 Vector = _mm_load_ss(Input);
#elif defined(MLAS_LSX_INTRINSICS)
MLAS_FLOAT32X4 Vector = (MLAS_FLOAT32X4)__lsx_vldrepl_w(Input, 0);
#else
MLAS_FLOAT32X4 Vector = MlasBroadcastFloat32x4(Input);
#endif
Expand Down Expand Up @@ -849,7 +856,7 @@ Return Value:
// Find the maximum value for the row.
//

#if defined(MLAS_TARGET_AMD64)
#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64)
float Maximum = GetMlasPlatform().ReduceMaximumF32Kernel(Input, D);
#else
float Maximum = MlasReduceMaximumF32Kernel(Input, D);
Expand All @@ -874,7 +881,7 @@ Return Value:

float Parameters[] = { NegativeMaximum, std::log(Accumulation)};

#if defined(MLAS_TARGET_AMD64)
#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64)
GetMlasPlatform().ComputeLogSoftmaxOutputF32Kernel(Input, Output, D, Parameters);
#else
MlasComputeLogSoftmaxOutputF32Kernel(Input, Output, D, Parameters);
Expand All @@ -899,7 +906,7 @@ Return Value:

float Parameters[] = { 1.0f / Accumulation };

#if defined(MLAS_TARGET_AMD64)
#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64)
GetMlasPlatform().ComputeSoftmaxOutputF32Kernel(Output, D, Parameters);
#else
MlasComputeSoftmaxOutputF32Kernel(Output, D, Parameters);
Expand Down
2 changes: 1 addition & 1 deletion onnxruntime/core/mlas/lib/dgemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -530,7 +530,7 @@ Return Value:

size_t RowsHandled;

#if defined(MLAS_TARGET_AMD64_IX86) || defined (MLAS_TARGET_POWER)
#if defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER) || defined(MLAS_TARGET_LARCH64)
RowsHandled = GetMlasPlatform().GemmDoubleKernel(A, B, C, CountK, CountM, CountN, lda, ldc, alpha, ZeroMode);
#else
if (ZeroMode) {
Expand Down
27 changes: 27 additions & 0 deletions onnxruntime/core/mlas/lib/loongarch64/DgemmKernelCommon.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/*++
Copyright (C) 2023 Loongson Technology Corporation Limited. All rights reserved.
Licensed under the MIT License.
Module Name:
DgemmKernelCommon.h
Abstract:
This module contains common kernel macros and structures for the double
precision matrix/matrix multiply operation (DGEMM).
--*/

// Element geometry for the double precision (DGEMM) kernels: the shift is
// log2 of the element size in bytes, so each element is 1 << 3 = 8 bytes.
#define LFgemmElementShift 3
#define LFgemmElementSize (1 << LFgemmElementShift)
// Number of elements that fit in a 32-byte span (32 / 8 = 4).
// NOTE(review): 32 bytes presumably corresponds to one 256-bit LASX vector
// register; confirm.
#define LFgemmYmmElementCount (32/LFgemmElementSize)

#include "FgemmKernelCommon.h"

// Bind the generic instruction names to their ".d" suffixed forms.
// NOTE(review): FGEMM_TYPED_INSTRUCTION is presumably provided by
// FgemmKernelCommon.h, and the ".d" suffix presumably selects the double
// precision variant of each instruction; confirm.
FGEMM_TYPED_INSTRUCTION(xvfadd, xvfadd.d)
FGEMM_TYPED_INSTRUCTION(xvfmadd, xvfmadd.d)
FGEMM_TYPED_INSTRUCTION(xvldrepl, xvldrepl.d)
FGEMM_TYPED_INSTRUCTION(xvfmul, xvfmul.d)
32 changes: 32 additions & 0 deletions onnxruntime/core/mlas/lib/loongarch64/DgemmKernelLasx.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/*++
Copyright (C) 2023 Loongson Technology Corporation Limited. All rights reserved.
Licensed under the MIT License.
Module Name:
DgemmKernelLasx.S
Abstract:
This module implements the kernels for the double precision matrix/matrix
multiply operation (DGEMM).
This implementation uses Lasx instructions.
--*/

#include "asmmacro.h"
#include "DgemmKernelCommon.h"
#include "FgemmKernelLasxCommon.h"

// Place the generated code in the .text section.
.text

//
// Generate the GEMM kernel.
//
// NOTE(review): FgemmKernelLasxFunction is presumably a macro supplied by
// FgemmKernelLasxCommon.h that expands to the kernel body under the symbol
// name given as its argument, specialized by the definitions from
// DgemmKernelCommon.h; confirm.
//

FgemmKernelLasxFunction MlasGemmDoubleKernelLasx

// Marks the end of the assembly source.
.end
Loading

0 comments on commit 4abec97

Please sign in to comment.