gu: add a function to multiply 4x4 matrices (#172)

Add guMtx44Concat() in both C and PS variants. The PS variant is more than 3 times faster than the C one; it's written in a separate file (and not in gu_psasm.S) because it uses a different naming of the matrix registers. These functions are especially useful when porting programs written for OpenGL, which uses 4x4 matrices.
devkitPro · May 3, 2024 · 0ea7278 · 0ea7278
1 parent f1c3747
commit 0ea7278
Show file tree

Hide file tree

Showing 4 changed files with 164 additions and 1 deletion.
diff --git a/Makefile b/Makefile
@@ -143,7 +143,7 @@ OGCOBJ		:=	\
 			exception_handler.o exception.o irq.o irq_handler.o semaphore.o \
 			video_asm.o video.o pad.o dvd.o exi.o mutex.o arqueue.o	arqmgr.o	\
 			cache_asm.o system.o system_asm.o cond.o			\
-			gx.o gu.o gu_psasm.o audio.o cache.o decrementer.o			\
+			gx.o gu.o gu_psasm.o gu_ps_concat44.o audio.o cache.o decrementer.o	\
 			message.o card.o aram.o depackrnc.o decrementer_handler.o	\
 			depackrnc1.o dsp.o si.o tpl.o ipc.o ogc_crt0.o \
 			console_font_8x16.o timesupp.o lock_supp.o usbgecko.o usbmouse.o \

diff --git a/gc/ogc/gu.h b/gc/ogc/gu.h
@@ -397,6 +397,7 @@ void c_guMtxRotTrig(Mtx mt,const char axis,f32 sinA,f32 cosA);
 void c_guMtxRotAxisRad(Mtx mt,guVector *axis,f32 rad);
 void c_guMtxReflect(Mtx m,const guVector *p,const guVector *n);
 void c_guMtxQuat(Mtx m,const guQuaternion *a);
+void c_guMtx44Concat(const Mtx44 a,const Mtx44 b,Mtx44 ab);
 
 #ifdef GEKKO
 void ps_guMtxIdentity( Mtx mt);
@@ -415,6 +416,7 @@ void ps_guMtxRotRad( Mtx mt, const char axis, f32 rad);
 void ps_guMtxRotTrig( Mtx mt, const char axis, f32 sinA, f32 cosA);
 void ps_guMtxRotAxisRad( Mtx mt, guVector *axis, f32 tmp0);
 void ps_guMtxReflect( Mtx m, const guVector *p, const guVector *n);
+void ps_guMtx44Concat(const Mtx44 a, const Mtx44 b, Mtx44 ab);
 #endif	//GEKKO
 
 void guMtx44Identity(Mtx44 mt);
@@ -457,6 +459,8 @@ u32 guMtx44Inverse(const Mtx44 src,Mtx44 inv);
 #define guMtxReflect			c_guMtxReflect
 #define guMtxQuat				c_guMtxQuat
 
+#define guMtx44Concat			c_guMtx44Concat
+
 #else //MTX_USE_C
 
 #define guVecAdd				ps_guVecAdd
@@ -491,6 +495,8 @@ u32 guMtx44Inverse(const Mtx44 src,Mtx44 inv);
 #define guMtxRotAxisRad			ps_guMtxRotAxisRad
 #define guMtxReflect			ps_guMtxReflect
 
+#define guMtx44Concat			ps_guMtx44Concat
+
 #endif //MTX_USE_PS
 
 #define guMtxRotDeg(mt,axis,deg)		guMtxRotRad(mt,axis,DegToRad(deg))

diff --git a/libogc/gu.c b/libogc/gu.c
@@ -112,6 +112,40 @@ void guMtx44Copy(const Mtx44 src,Mtx44 dst)
 	dst[3][0] = src[3][0]; dst[3][1] = src[3][1]; dst[3][2] = src[3][2]; dst[3][3] = src[3][3];
 }
 
+/* Computes the 4x4 matrix product ab = a * b.
+ *
+ * a:    left operand, indexed [row][column].
+ * b:    right operand, indexed [row][column].
+ * ab:   destination. It may be the very same matrix as a or b: in that case
+ *       the product is built in a local scratch matrix and copied into ab at
+ *       the end, so no input element is overwritten before it has been read.
+ *       Only exact aliasing (ab==a or ab==b) is detected; partially
+ *       overlapping storage is not handled.
+ *
+ * Every element is the four-term sum below, added left to right:
+ *   ab[i][j] = a[i][0]*b[0][j] + a[i][1]*b[1][j] + a[i][2]*b[2][j] + a[i][3]*b[3][j]
+ */
+void c_guMtx44Concat(const Mtx44 a,const Mtx44 b,Mtx44 ab)
+{
+	Mtx44 scratch;
+	Mtx44P out;
+	int i,j;
+
+	/* write straight into ab unless it aliases one of the inputs */
+	out = (ab!=a && ab!=b) ? ab : scratch;
+
+	for(i=0;i<4;i++) {
+		for(j=0;j<4;j++) {
+			out[i][j] = a[i][0]*b[0][j] + a[i][1]*b[1][j]
+			          + a[i][2]*b[2][j] + a[i][3]*b[3][j];
+		}
+	}
+
+	/* the scratch matrix was used: copy the finished product into ab */
+	if(out!=ab)
+		guMtx44Copy(scratch,ab);
+}
+
 u32 guMtx44Inverse(const Mtx44 src,Mtx44 inv)
 {
     f32 det;

diff --git a/libogc/gu_ps_concat44.S b/libogc/gu_ps_concat44.S
@@ -0,0 +1,123 @@
+#include <asm.h>
+
+/* We can use up to 32 registers, but registers starting from fr14 need to be
+ * saved and restored at the end of the function call, so we try to avoid them.
+ * We try to optimize the registers usage in the following way:
+ *
+ * The Dxx_Dxx registers are used to accumulate the value of the resulting
+ * matrix. We compute them in row-major order, after which the register is
+ * stored to the destination variable and can be reused. That's why we just
+ * need to keep two D registers per row.
+ *
+ * The Axx_Axx values can also be disposed after a row has been computed, so
+ * two registers per row could also be enough. However, to prevent data hazards,
+ * while processing row N we preload the two A registers used in row N+1.
+ * So, during the processing of a row, we can have up to four active A
+ * registers.
+ *
+ * The only values that are needed throughout the computation are the
+ * elements of the B matrix, so to avoid unnecessary re-loading we keep
+ * them in registers all the time. We therefore don't reuse B registers. */
+
+#define A00_A01		fr10	/* A: one register per pair of adjacent row elements */
+#define A02_A03		fr11
+#define A10_A11		fr12
+#define A12_A13		fr13
+#define A20_A21		fr10	/* same register as A00_A01 (row 0 is finished by then) */
+#define A22_A23		fr11	/* same register as A02_A03 */
+#define A30_A31     fr12	/* same register as A10_A11 (row 1 is finished by then) */
+#define A32_A33     fr13	/* same register as A12_A13 */
+
+#define B00_B01		fr0	/* B: eight distinct registers fr0-fr7, never reused */
+#define B02_B03		fr1
+#define B10_B11		fr2
+#define B12_B13		fr3
+#define B20_B21		fr4
+#define B22_B23		fr5
+#define B30_B31     fr6
+#define B32_B33     fr7
+
+#define D00_D01		fr8	/* D: accumulator for output columns 0-1; all four rows map to fr8 */
+#define D02_D03		fr9	/* D: accumulator for output columns 2-3; all four rows map to fr9 */
+#define D10_D11		fr8
+#define D12_D13		fr9
+#define D20_D21		fr8
+#define D22_D23		fr9
+#define D30_D31		fr8
+#define D32_D33		fr9
+
+	.globl	ps_guMtx44Concat
+	//r3 = mtxA, r4 = mtxB, r5 = mtxAB; C prototype: void ps_guMtx44Concat(const Mtx44 a, const Mtx44 b, Mtx44 ab)
+ps_guMtx44Concat:
+	/* First row. This block is longer than the others below because it also
+	 * loads the whole B matrix into fr0-fr7. Every B pair is loaded before the
+	 * first psq_st, and each A row is loaded before the same row of mtxAB is
+	 * stored, so r5 may be the same pointer as r3 or r4. Loads and arithmetic
+	 * are interleaved to avoid data hazards, at some cost in readability. */
+	psq_l		A00_A01,0(r3),0,0	/* a[0][0], a[0][1] */
+	psq_l		B00_B01,0(r4),0,0	/* b[0][0], b[0][1] */
+	psq_l		B02_B03,8(r4),0,0	/* b[0][2], b[0][3] */
+	psq_l		B10_B11,16(r4),0,0	/* b[1][0], b[1][1] */
+	ps_muls0	D00_D01,B00_B01,A00_A01	/* D00_D01  = B00_B01 * a[0][0] */
+	psq_l		A02_A03,8(r3),0,0	/* a[0][2], a[0][3] */
+	ps_muls0	D02_D03,B02_B03,A00_A01	/* D02_D03  = B02_B03 * a[0][0] */
+	psq_l		B12_B13,24(r4),0,0	/* b[1][2], b[1][3] */
+	ps_madds1	D00_D01,B10_B11,A00_A01,D00_D01	/* D00_D01 += B10_B11 * a[0][1] */
+	psq_l		B20_B21,32(r4),0,0	/* b[2][0], b[2][1] */
+	ps_madds1	D02_D03,B12_B13,A00_A01,D02_D03	/* D02_D03 += B12_B13 * a[0][1] */
+	psq_l		B22_B23,40(r4),0,0	/* b[2][2], b[2][3] */
+	ps_madds0	D00_D01,B20_B21,A02_A03,D00_D01	/* D00_D01 += B20_B21 * a[0][2] */
+	psq_l		B30_B31,48(r4),0,0	/* b[3][0], b[3][1] */
+	ps_madds0	D02_D03,B22_B23,A02_A03,D02_D03	/* D02_D03 += B22_B23 * a[0][2] */
+	psq_l		B32_B33,56(r4),0,0	/* b[3][2], b[3][3]: B is now fully loaded */
+	ps_madds1	D00_D01,B30_B31,A02_A03,D00_D01	/* D00_D01 += B30_B31 * a[0][3] */
+	psq_l		A10_A11,16(r3),0,0	/* preload row 1: a[1][0], a[1][1] */
+	ps_madds1	D02_D03,B32_B33,A02_A03,D02_D03	/* D02_D03 += B32_B33 * a[0][3] */
+	psq_st		D00_D01,0(r5),0,0	/* ab[0][0], ab[0][1] */
+	psq_l		A12_A13,24(r3),0,0	/* preload row 1: a[1][2], a[1][3] */
+	psq_st		D02_D03,8(r5),0,0	/* ab[0][2], ab[0][3] */
+
+    // Second row (terms are accumulated in k = 0,2,1,3 order here; the first row uses 0,1,2,3)
+	ps_muls0	D10_D11,B00_B01,A10_A11	/* D10_D11  = B00_B01 * a[1][0] */
+	ps_muls0	D12_D13,B02_B03,A10_A11	/* D12_D13  = B02_B03 * a[1][0] */
+	ps_madds0	D10_D11,B20_B21,A12_A13,D10_D11	/* D10_D11 += B20_B21 * a[1][2] */
+	ps_madds0	D12_D13,B22_B23,A12_A13,D12_D13	/* D12_D13 += B22_B23 * a[1][2] */
+	ps_madds1	D10_D11,B10_B11,A10_A11,D10_D11	/* D10_D11 += B10_B11 * a[1][1] */
+	ps_madds1	D12_D13,B12_B13,A10_A11,D12_D13	/* D12_D13 += B12_B13 * a[1][1] */
+	psq_l		A20_A21,32(r3),0,0	/* preload row 2 into fr10 (row 0 no longer needs it) */
+	ps_madds1	D10_D11,B30_B31,A12_A13,D10_D11	/* D10_D11 += B30_B31 * a[1][3] */
+	psq_l		A22_A23,40(r3),0,0	/* preload row 2 into fr11 */
+	ps_madds1	D12_D13,B32_B33,A12_A13,D12_D13	/* D12_D13 += B32_B33 * a[1][3] */
+	psq_st		D10_D11,16(r5),0,0	/* ab[1][0], ab[1][1] */
+	psq_st		D12_D13,24(r5),0,0	/* ab[1][2], ab[1][3] */
+
+    // Third row (same instruction pattern as the second row)
+	ps_muls0	D20_D21,B00_B01,A20_A21	/* D20_D21  = B00_B01 * a[2][0] */
+	ps_muls0	D22_D23,B02_B03,A20_A21	/* D22_D23  = B02_B03 * a[2][0] */
+	ps_madds0	D20_D21,B20_B21,A22_A23,D20_D21	/* D20_D21 += B20_B21 * a[2][2] */
+	ps_madds0	D22_D23,B22_B23,A22_A23,D22_D23	/* D22_D23 += B22_B23 * a[2][2] */
+	ps_madds1	D20_D21,B10_B11,A20_A21,D20_D21	/* D20_D21 += B10_B11 * a[2][1] */
+	ps_madds1	D22_D23,B12_B13,A20_A21,D22_D23	/* D22_D23 += B12_B13 * a[2][1] */
+	psq_l		A30_A31,48(r3),0,0	/* preload row 3 into fr12 (row 1 no longer needs it) */
+	ps_madds1	D20_D21,B30_B31,A22_A23,D20_D21	/* D20_D21 += B30_B31 * a[2][3] */
+	psq_l		A32_A33,56(r3),0,0	/* preload row 3 into fr13 */
+	ps_madds1	D22_D23,B32_B33,A22_A23,D22_D23	/* D22_D23 += B32_B33 * a[2][3] */
+	psq_st		D20_D21,32(r5),0,0	/* ab[2][0], ab[2][1] */
+	psq_st		D22_D23,40(r5),0,0	/* ab[2][2], ab[2][3] */
+
+    // Fourth row (no further A rows to preload)
+	ps_muls0	D30_D31,B00_B01,A30_A31	/* D30_D31  = B00_B01 * a[3][0] */
+	ps_muls0	D32_D33,B02_B03,A30_A31	/* D32_D33  = B02_B03 * a[3][0] */
+	ps_madds0	D30_D31,B20_B21,A32_A33,D30_D31	/* D30_D31 += B20_B21 * a[3][2] */
+	ps_madds0	D32_D33,B22_B23,A32_A33,D32_D33	/* D32_D33 += B22_B23 * a[3][2] */
+	ps_madds1	D30_D31,B10_B11,A30_A31,D30_D31	/* D30_D31 += B10_B11 * a[3][1] */
+	ps_madds1	D32_D33,B12_B13,A30_A31,D32_D33	/* D32_D33 += B12_B13 * a[3][1] */
+	ps_madds1	D30_D31,B30_B31,A32_A33,D30_D31	/* D30_D31 += B30_B31 * a[3][3] */
+	ps_madds1	D32_D33,B32_B33,A32_A33,D32_D33	/* D32_D33 += B32_B33 * a[3][3] */
+	psq_st		D30_D31,48(r5),0,0	/* ab[3][0], ab[3][1] */
+	psq_st		D32_D33,56(r5),0,0	/* ab[3][2], ab[3][3] */
+
+	blr
+
+	.section .sdata
+	.balign 16