diff --git a/onnxruntime/core/mlas/lib/amd64/cvtfp16Avx.asm b/onnxruntime/core/mlas/lib/amd64/cvtfp16Avx.asm index c7f6342c527bf..800863c77a230 100644 --- a/onnxruntime/core/mlas/lib/amd64/cvtfp16Avx.asm +++ b/onnxruntime/core/mlas/lib/amd64/cvtfp16Avx.asm @@ -54,7 +54,7 @@ HIGH_SELECTOR equ 00110001b LEAF_ENTRY MlasCastF16ToF32KernelAvx, _TEXT - test r8, r8 ; Check if we have any elements to convert + test r8, r8 ; Check if we have any elements to convert jz ExitRoutine cmp r8, 8 jb ConvertMaskedVectors @@ -80,6 +80,8 @@ Convert256Vectors: jz ExitRoutine ; If we are done, exit cmp r8, 16 ; If the vector is big enough, we go again jae Convert256Vectors + cmp r8, 8 ; Check if we have enough elements to convert + jb ConvertMaskedVectors diff --git a/onnxruntime/core/mlas/lib/x86_64/cvtfp16Avx.S b/onnxruntime/core/mlas/lib/x86_64/cvtfp16Avx.S index 1a70061460e50..a4d730fa513ab 100644 --- a/onnxruntime/core/mlas/lib/x86_64/cvtfp16Avx.S +++ b/onnxruntime/core/mlas/lib/x86_64/cvtfp16Avx.S @@ -51,8 +51,6 @@ FUNCTION_ENTRY MlasCastF16ToF32KernelAvx test rdx, rdx // Check if we have any elements to convert jz ExitRoutine - -AVX_NE_CONVERT: cmp rdx, 8 jb ConvertMaskedVectors cmp rdx, 16 @@ -75,6 +73,8 @@ Convert256Vectors: jz ExitRoutine // If we are done, exit cmp rdx, 16 // If the vector is big enough, we go again jae Convert256Vectors + cmp rdx, 8 // Check if we have enough elements to convert + jb ConvertMaskedVectors