From 0ea72783a3792ed7d9fa2571f8f2f66676467099 Mon Sep 17 00:00:00 2001
From: Alberto Mardegan
Date: Fri, 3 May 2024 09:03:16 +0300
Subject: [PATCH] gu: add a function to multiply 4x4 matrices (#172)

Add guMtx44Concat() in both C and PS variants. The PS variant is more
than 3 times faster than the C one; it's written in a separate file (and
not in gu_psasm.S) because it uses a different naming scheme for the
matrix registers.

These functions are especially useful when porting programs written for
OpenGL, which uses 4x4 matrices.
---
 Makefile                |   2 +-
 gc/ogc/gu.h             |   6 ++
 libogc/gu.c             |  34 +++++++++++
 libogc/gu_ps_concat44.S | 123 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 164 insertions(+), 1 deletion(-)
 create mode 100644 libogc/gu_ps_concat44.S

diff --git a/Makefile b/Makefile
index a5901bc7..cf7b7eb9 100644
--- a/Makefile
+++ b/Makefile
@@ -143,7 +143,7 @@ OGCOBJ := \
 	exception_handler.o exception.o irq.o irq_handler.o semaphore.o \
 	video_asm.o video.o pad.o dvd.o exi.o mutex.o arqueue.o arqmgr.o \
 	cache_asm.o system.o system_asm.o cond.o \
-	gx.o gu.o gu_psasm.o audio.o cache.o decrementer.o \
+	gx.o gu.o gu_psasm.o gu_ps_concat44.o audio.o cache.o decrementer.o \
 	message.o card.o aram.o depackrnc.o decrementer_handler.o \
 	depackrnc1.o dsp.o si.o tpl.o ipc.o ogc_crt0.o \
 	console_font_8x16.o timesupp.o lock_supp.o usbgecko.o usbmouse.o \
diff --git a/gc/ogc/gu.h b/gc/ogc/gu.h
index 3f95a8c5..0ce89497 100644
--- a/gc/ogc/gu.h
+++ b/gc/ogc/gu.h
@@ -397,6 +397,7 @@ void c_guMtxRotTrig(Mtx mt,const char axis,f32 sinA,f32 cosA);
 void c_guMtxRotAxisRad(Mtx mt,guVector *axis,f32 rad);
 void c_guMtxReflect(Mtx m,const guVector *p,const guVector *n);
 void c_guMtxQuat(Mtx m,const guQuaternion *a);
+void c_guMtx44Concat(const Mtx44 a,const Mtx44 b,Mtx44 ab);
 
 #ifdef GEKKO
 void ps_guMtxIdentity( Mtx mt);
@@ -415,6 +416,7 @@ void ps_guMtxRotRad( Mtx mt, const char axis, f32 rad);
 void ps_guMtxRotTrig( Mtx mt, const char axis, f32 sinA, f32 cosA);
 void ps_guMtxRotAxisRad( Mtx mt, guVector *axis, f32 tmp0);
 void ps_guMtxReflect( Mtx m, const guVector *p, const guVector *n);
+void ps_guMtx44Concat(const Mtx44 a, const Mtx44 b, Mtx44 ab);
 #endif //GEKKO
 
 void guMtx44Identity(Mtx44 mt);
@@ -457,6 +459,8 @@ u32 guMtx44Inverse(const Mtx44 src,Mtx44 inv);
 #define guMtxReflect c_guMtxReflect
 #define guMtxQuat c_guMtxQuat
 
+#define guMtx44Concat c_guMtx44Concat
+
 #else //MTX_USE_C
 
 #define guVecAdd ps_guVecAdd
@@ -491,6 +495,8 @@ u32 guMtx44Inverse(const Mtx44 src,Mtx44 inv);
 #define guMtxRotAxisRad ps_guMtxRotAxisRad
 #define guMtxReflect ps_guMtxReflect
 
+#define guMtx44Concat ps_guMtx44Concat
+
 #endif //MTX_USE_PS
 
 #define guMtxRotDeg(mt,axis,deg) guMtxRotRad(mt,axis,DegToRad(deg))
diff --git a/libogc/gu.c b/libogc/gu.c
index 9d3e4aea..1c6b301a 100644
--- a/libogc/gu.c
+++ b/libogc/gu.c
@@ -112,6 +112,40 @@ void guMtx44Copy(const Mtx44 src,Mtx44 dst)
 	dst[3][0] = src[3][0]; dst[3][1] = src[3][1]; dst[3][2] = src[3][2]; dst[3][3] = src[3][3];
 }
 
+void c_guMtx44Concat(const Mtx44 a,const Mtx44 b,Mtx44 ab)
+{
+	Mtx44 tmp;
+	Mtx44P m;
+
+	if(ab==b || ab==a)
+		m = tmp;
+	else
+		m = ab;
+
+	m[0][0] = a[0][0]*b[0][0] + a[0][1]*b[1][0] + a[0][2]*b[2][0] + a[0][3]*b[3][0];
+	m[0][1] = a[0][0]*b[0][1] + a[0][1]*b[1][1] + a[0][2]*b[2][1] + a[0][3]*b[3][1];
+	m[0][2] = a[0][0]*b[0][2] + a[0][1]*b[1][2] + a[0][2]*b[2][2] + a[0][3]*b[3][2];
+	m[0][3] = a[0][0]*b[0][3] + a[0][1]*b[1][3] + a[0][2]*b[2][3] + a[0][3]*b[3][3];
+
+	m[1][0] = a[1][0]*b[0][0] + a[1][1]*b[1][0] + a[1][2]*b[2][0] + a[1][3]*b[3][0];
+	m[1][1] = a[1][0]*b[0][1] + a[1][1]*b[1][1] + a[1][2]*b[2][1] + a[1][3]*b[3][1];
+	m[1][2] = a[1][0]*b[0][2] + a[1][1]*b[1][2] + a[1][2]*b[2][2] + a[1][3]*b[3][2];
+	m[1][3] = a[1][0]*b[0][3] + a[1][1]*b[1][3] + a[1][2]*b[2][3] + a[1][3]*b[3][3];
+
+	m[2][0] = a[2][0]*b[0][0] + a[2][1]*b[1][0] + a[2][2]*b[2][0] + a[2][3]*b[3][0];
+	m[2][1] = a[2][0]*b[0][1] + a[2][1]*b[1][1] + a[2][2]*b[2][1] + a[2][3]*b[3][1];
+	m[2][2] = a[2][0]*b[0][2] + a[2][1]*b[1][2] + a[2][2]*b[2][2] + a[2][3]*b[3][2];
+	m[2][3] = a[2][0]*b[0][3] + a[2][1]*b[1][3] + a[2][2]*b[2][3] + a[2][3]*b[3][3];
+
+	m[3][0] = a[3][0]*b[0][0] + a[3][1]*b[1][0] + a[3][2]*b[2][0] + a[3][3]*b[3][0];
+	m[3][1] = a[3][0]*b[0][1] + a[3][1]*b[1][1] + a[3][2]*b[2][1] + a[3][3]*b[3][1];
+	m[3][2] = a[3][0]*b[0][2] + a[3][1]*b[1][2] + a[3][2]*b[2][2] + a[3][3]*b[3][2];
+	m[3][3] = a[3][0]*b[0][3] + a[3][1]*b[1][3] + a[3][2]*b[2][3] + a[3][3]*b[3][3];
+
+	if(m==tmp)
+		guMtx44Copy(tmp,ab);
+}
+
 u32 guMtx44Inverse(const Mtx44 src,Mtx44 inv)
 {
 	f32 det;
diff --git a/libogc/gu_ps_concat44.S b/libogc/gu_ps_concat44.S
new file mode 100644
index 00000000..f3bf0e0e
--- /dev/null
+++ b/libogc/gu_ps_concat44.S
@@ -0,0 +1,123 @@
+#include <asm.h>
+
+/* We can use up to 32 registers, but registers starting from fr14 need to
+ * be saved and restored at the end of the function call, so we try to
+ * avoid them. We try to optimize register usage in the following way:
+ *
+ * The Dxx_Dxx registers are used to accumulate the values of the resulting
+ * matrix. We compute them in row-major order, after which the register is
+ * stored to the destination variable and can be reused. That's why we only
+ * need to keep two D registers per row.
+ *
+ * The Axx_Axx values can also be discarded once a row has been computed,
+ * so two registers per row would also be enough. However, to prevent data
+ * hazards while processing row N, we preload the two A registers used in
+ * row N+1. So, during the processing of a row, we can have up to four
+ * active A registers.
+ *
+ * The only values that are needed throughout the computation are the
+ * elements of the B matrix, so to avoid unnecessary re-loading we keep
+ * them in registers all the time. We therefore don't reuse B registers. */
+
+#define A00_A01 fr10
+#define A02_A03 fr11
+#define A10_A11 fr12
+#define A12_A13 fr13
+#define A20_A21 fr10
+#define A22_A23 fr11
+#define A30_A31 fr12
+#define A32_A33 fr13
+
+#define B00_B01 fr0
+#define B02_B03 fr1
+#define B10_B11 fr2
+#define B12_B13 fr3
+#define B20_B21 fr4
+#define B22_B23 fr5
+#define B30_B31 fr6
+#define B32_B33 fr7
+
+#define D00_D01 fr8
+#define D02_D03 fr9
+#define D10_D11 fr8
+#define D12_D13 fr9
+#define D20_D21 fr8
+#define D22_D23 fr9
+#define D30_D31 fr8
+#define D32_D33 fr9
+
+	.globl ps_guMtx44Concat
+	//r3 = mtxA, r4 = mtxB, r5 = mtxAB
+ps_guMtx44Concat:
+	/* First row. This block is longer than the others below because we
+	 * must also load the whole B matrix into registers.
+	 * The code is less readable than it could be because we interleave
+	 * the instructions in order to avoid data hazards.
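+	 *
+	 * As a reminder of the paired-single semantics used below: psq_l
+	 * loads two adjacent singles into the two slots of one register,
+	 * ps_muls0 multiplies both slots of a register by slot 0 of another,
+	 * and ps_madds0/ps_madds1 do the same with slot 0/slot 1 while also
+	 * adding a third register. Each ps_muls0/ps_madds* step therefore
+	 * accumulates two elements of a result row at a time.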
+	 */
+	psq_l		A00_A01,0(r3),0,0
+	psq_l		B00_B01,0(r4),0,0
+	psq_l		B02_B03,8(r4),0,0
+	psq_l		B10_B11,16(r4),0,0
+	ps_muls0	D00_D01,B00_B01,A00_A01
+	psq_l		A02_A03,8(r3),0,0
+	ps_muls0	D02_D03,B02_B03,A00_A01
+	psq_l		B12_B13,24(r4),0,0
+	ps_madds1	D00_D01,B10_B11,A00_A01,D00_D01
+	psq_l		B20_B21,32(r4),0,0
+	ps_madds1	D02_D03,B12_B13,A00_A01,D02_D03
+	psq_l		B22_B23,40(r4),0,0
+	ps_madds0	D00_D01,B20_B21,A02_A03,D00_D01
+	psq_l		B30_B31,48(r4),0,0
+	ps_madds0	D02_D03,B22_B23,A02_A03,D02_D03
+	psq_l		B32_B33,56(r4),0,0
+	ps_madds1	D00_D01,B30_B31,A02_A03,D00_D01
+	psq_l		A10_A11,16(r3),0,0
+	ps_madds1	D02_D03,B32_B33,A02_A03,D02_D03
+	psq_st		D00_D01,0(r5),0,0
+	psq_l		A12_A13,24(r3),0,0
+	psq_st		D02_D03,8(r5),0,0
+
+	// Second row
+	ps_muls0	D10_D11,B00_B01,A10_A11
+	ps_muls0	D12_D13,B02_B03,A10_A11
+	ps_madds0	D10_D11,B20_B21,A12_A13,D10_D11
+	ps_madds0	D12_D13,B22_B23,A12_A13,D12_D13
+	ps_madds1	D10_D11,B10_B11,A10_A11,D10_D11
+	ps_madds1	D12_D13,B12_B13,A10_A11,D12_D13
+	psq_l		A20_A21,32(r3),0,0
+	ps_madds1	D10_D11,B30_B31,A12_A13,D10_D11
+	psq_l		A22_A23,40(r3),0,0
+	ps_madds1	D12_D13,B32_B33,A12_A13,D12_D13
+	psq_st		D10_D11,16(r5),0,0
+	psq_st		D12_D13,24(r5),0,0
+
+	// Third row
+	ps_muls0	D20_D21,B00_B01,A20_A21
+	ps_muls0	D22_D23,B02_B03,A20_A21
+	ps_madds0	D20_D21,B20_B21,A22_A23,D20_D21
+	ps_madds0	D22_D23,B22_B23,A22_A23,D22_D23
+	ps_madds1	D20_D21,B10_B11,A20_A21,D20_D21
+	ps_madds1	D22_D23,B12_B13,A20_A21,D22_D23
+	psq_l		A30_A31,48(r3),0,0
+	ps_madds1	D20_D21,B30_B31,A22_A23,D20_D21
+	psq_l		A32_A33,56(r3),0,0
+	ps_madds1	D22_D23,B32_B33,A22_A23,D22_D23
+	psq_st		D20_D21,32(r5),0,0
+	psq_st		D22_D23,40(r5),0,0
+
+	// Fourth row
+	ps_muls0	D30_D31,B00_B01,A30_A31
+	ps_muls0	D32_D33,B02_B03,A30_A31
+	ps_madds0	D30_D31,B20_B21,A32_A33,D30_D31
+	ps_madds0	D32_D33,B22_B23,A32_A33,D32_D33
+	ps_madds1	D30_D31,B10_B11,A30_A31,D30_D31
+	ps_madds1	D32_D33,B12_B13,A30_A31,D32_D33
+	ps_madds1	D30_D31,B30_B31,A32_A33,D30_D31
+	ps_madds1	D32_D33,B32_B33,A32_A33,D32_D33
+	psq_st		D30_D31,48(r5),0,0
+	psq_st		D32_D33,56(r5),0,0
+
+	blr
+
+	.section .sdata
+	.balign 16
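
Usage note, not part of the patch itself: guMtx44Concat computes ab = a*b
on row-major 4x4 matrices, and the C variant shows that passing the same
matrix as an input and as the output is supported, since it detects
aliasing and works through a temporary. A minimal sketch of how a port
from OpenGL might call it; mult_matrix44 and setup_projection are
hypothetical names for illustration, while guMtx44Concat, guMtx44Identity
and guPerspective are existing libogc calls:

	#include <gccore.h>

	/* Hypothetical helper mirroring OpenGL's glMultMatrixf: post-multiply
	 * 'current' by 'm' in place. Passing 'current' as both an input and
	 * the output relies on guMtx44Concat's aliasing handling. */
	static void mult_matrix44(Mtx44 current, const Mtx44 m)
	{
		guMtx44Concat(current, m, current);
	}

	/* Build a projection matrix, then compose it with a second
	 * transform the way a GL port would with glMultMatrixf. */
	void setup_projection(Mtx44 proj)
	{
		Mtx44 flip;

		guPerspective(proj, 45.0f, 4.0f/3.0f, 0.1f, 300.0f);

		guMtx44Identity(flip);
		flip[0][0] = -1.0f;   /* e.g. mirror the X axis */
		mult_matrix44(proj, flip);
	}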