From 9eca8cc83e0dfd740d45adcdc9f62e2040bd195f Mon Sep 17 00:00:00 2001 From: Katherine Whitlock Date: Fri, 13 Dec 2024 03:41:32 -0500 Subject: [PATCH] Fix usage of vfma on platforms that don't have it (#221) --- Source/MatrixFunctions/arm_mat_cholesky_f32.c | 15 +++++++++++---- .../arm_mat_solve_lower_triangular_f32.c | 4 ++++ .../arm_mat_solve_upper_triangular_f32.c | 4 ++++ Source/StatisticsFunctions/arm_mse_f32.c | 5 ++++- 4 files changed, 23 insertions(+), 5 deletions(-) diff --git a/Source/MatrixFunctions/arm_mat_cholesky_f32.c b/Source/MatrixFunctions/arm_mat_cholesky_f32.c index e1ad00d17..6b6de661f 100755 --- a/Source/MatrixFunctions/arm_mat_cholesky_f32.c +++ b/Source/MatrixFunctions/arm_mat_cholesky_f32.c @@ -258,12 +258,17 @@ ARM_DSP_ATTRIBUTE arm_status arm_mat_cholesky_f32( vecGj1=vld1q_f32(&pG[(j + 1) * n + k]); vecGj2=vld1q_f32(&pG[(j + 2) * n + k]); vecGj3=vld1q_f32(&pG[(j + 3) * n + k]); - +#if defined(__ARM_FEATURE_FMA) acc0 = vfmaq_f32(acc0, vecGi, vecGj0); acc1 = vfmaq_f32(acc1, vecGi, vecGj1); acc2 = vfmaq_f32(acc2, vecGi, vecGj2); acc3 = vfmaq_f32(acc3, vecGi, vecGj3); - +#else + acc0 = vmlaq_f32(acc0, vecGi, vecGj0); + acc1 = vmlaq_f32(acc1, vecGi, vecGj1); + acc2 = vmlaq_f32(acc2, vecGi, vecGj2); + acc3 = vmlaq_f32(acc3, vecGi, vecGj3); +#endif kCnt--; k+=4; } @@ -319,9 +324,11 @@ ARM_DSP_ATTRIBUTE arm_status arm_mat_cholesky_f32( vecGi=vld1q_f32(&pG[i * n + k]); vecGj=vld1q_f32(&pG[j * n + k]); - +#if defined(__ARM_FEATURE_FMA) acc = vfmaq_f32(acc, vecGi, vecGj); - +#else + acc = vmlaq_f32(acc, vecGi, vecGj); +#endif kCnt--; k+=4; } diff --git a/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f32.c b/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f32.c index c7cdecc3f..6b83d0a5e 100755 --- a/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f32.c +++ b/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f32.c @@ -209,7 +209,11 @@ for(k=0; k < i; k++) { vecX = vld1q_f32(&pX[cols*k+j]); +#if defined(__ARM_FEATURE_FMA) vecA = 
vfmsq_f32(vecA,vdupq_n_f32(pLT[n*i + k]),vecX); +#else + vecA = vmlsq_f32(vecA,vdupq_n_f32(pLT[n*i + k]),vecX); +#endif } if (pLT[n*i + i]==0.0f) diff --git a/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f32.c b/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f32.c index f58dfbd9f..6fe69a300 100755 --- a/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f32.c +++ b/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f32.c @@ -197,7 +197,11 @@ arm_status status; /* status of matrix inverse */ for(k=n-1; k > i; k--) { vecX = vld1q_f32(&pX[cols*k+j]); +#if defined(__ARM_FEATURE_FMA) vecA = vfmsq_f32(vecA,vdupq_n_f32(pUT[n*i + k]),vecX); +#else + vecA = vmlsq_f32(vecA,vdupq_n_f32(pUT[n*i + k]),vecX); +#endif } if (pUT[n*i + i]==0.0f) diff --git a/Source/StatisticsFunctions/arm_mse_f32.c b/Source/StatisticsFunctions/arm_mse_f32.c index b4c676159..d95a90fdd 100755 --- a/Source/StatisticsFunctions/arm_mse_f32.c +++ b/Source/StatisticsFunctions/arm_mse_f32.c @@ -132,8 +132,11 @@ ARM_DSP_ATTRIBUTE void arm_mse_f32( pSrcB += 4; vecA = vsubq_f32(vecA, vecB); - +#if defined(__ARM_FEATURE_FMA) vecSum = vfmaq_f32(vecSum, vecA, vecA); +#else + vecSum = vmlaq_f32(vecSum, vecA, vecA); +#endif /* * Decrement the blockSize loop counter */