Skip to content

Commit

Permalink
gu: add a function to multiply 4x4 matrices (#172)
Browse files Browse the repository at this point in the history
Add guMtx44Concat() in both C and PS variants. The PS variant is more
than 3 times faster than the C one; it's written in a separate file (and
not in gu_psasm.S) because it uses a different naming of the matrix
registers.

These functions are especially useful when porting programs written for
OpenGL, which uses 4x4 matrices.
  • Loading branch information
mardy authored May 3, 2024
1 parent f1c3747 commit 0ea7278
Show file tree
Hide file tree
Showing 4 changed files with 164 additions and 1 deletion.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ OGCOBJ := \
exception_handler.o exception.o irq.o irq_handler.o semaphore.o \
video_asm.o video.o pad.o dvd.o exi.o mutex.o arqueue.o arqmgr.o \
cache_asm.o system.o system_asm.o cond.o \
gx.o gu.o gu_psasm.o audio.o cache.o decrementer.o \
gx.o gu.o gu_psasm.o gu_ps_concat44.o audio.o cache.o decrementer.o \
message.o card.o aram.o depackrnc.o decrementer_handler.o \
depackrnc1.o dsp.o si.o tpl.o ipc.o ogc_crt0.o \
console_font_8x16.o timesupp.o lock_supp.o usbgecko.o usbmouse.o \
Expand Down
6 changes: 6 additions & 0 deletions gc/ogc/gu.h
Original file line number Diff line number Diff line change
Expand Up @@ -397,6 +397,7 @@ void c_guMtxRotTrig(Mtx mt,const char axis,f32 sinA,f32 cosA);
void c_guMtxRotAxisRad(Mtx mt,guVector *axis,f32 rad);
void c_guMtxReflect(Mtx m,const guVector *p,const guVector *n);
void c_guMtxQuat(Mtx m,const guQuaternion *a);
void c_guMtx44Concat(const Mtx44 a,const Mtx44 b,Mtx44 ab);

#ifdef GEKKO
void ps_guMtxIdentity( Mtx mt);
Expand All @@ -415,6 +416,7 @@ void ps_guMtxRotRad( Mtx mt, const char axis, f32 rad);
void ps_guMtxRotTrig( Mtx mt, const char axis, f32 sinA, f32 cosA);
void ps_guMtxRotAxisRad( Mtx mt, guVector *axis, f32 tmp0);
void ps_guMtxReflect( Mtx m, const guVector *p, const guVector *n);
void ps_guMtx44Concat(const Mtx44 a, const Mtx44 b, Mtx44 ab);
#endif //GEKKO

void guMtx44Identity(Mtx44 mt);
Expand Down Expand Up @@ -457,6 +459,8 @@ u32 guMtx44Inverse(const Mtx44 src,Mtx44 inv);
#define guMtxReflect c_guMtxReflect
#define guMtxQuat c_guMtxQuat

#define guMtx44Concat c_guMtx44Concat

#else //MTX_USE_C

#define guVecAdd ps_guVecAdd
Expand Down Expand Up @@ -491,6 +495,8 @@ u32 guMtx44Inverse(const Mtx44 src,Mtx44 inv);
#define guMtxRotAxisRad ps_guMtxRotAxisRad
#define guMtxReflect ps_guMtxReflect

#define guMtx44Concat ps_guMtx44Concat

#endif //MTX_USE_PS

#define guMtxRotDeg(mt,axis,deg) guMtxRotRad(mt,axis,DegToRad(deg))
Expand Down
34 changes: 34 additions & 0 deletions libogc/gu.c
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,40 @@ void guMtx44Copy(const Mtx44 src,Mtx44 dst)
dst[3][0] = src[3][0]; dst[3][1] = src[3][1]; dst[3][2] = src[3][2]; dst[3][3] = src[3][3];
}

/*
 * Multiply two 4x4 matrices: ab = a * b.
 *
 * a, b : input matrices (not modified, unless one of them is also ab).
 * ab   : receives the product. It may be the same matrix as a or b; in that
 *        case the product is assembled in a local scratch matrix and copied
 *        out at the end, so no input element is overwritten before it has
 *        been read.
 */
void c_guMtx44Concat(const Mtx44 a,const Mtx44 b,Mtx44 ab)
{
	Mtx44 scratch;
	/* Write straight into ab unless it aliases one of the operands. */
	Mtx44P out = (ab==b || ab==a) ? scratch : ab;
	int row,col;

	/* Row-major walk; each element is the dot product of a's row with b's
	 * column, summed left to right over the four terms. */
	for(row=0;row<4;row++) {
		for(col=0;col<4;col++) {
			out[row][col] = a[row][0]*b[0][col] + a[row][1]*b[1][col] + a[row][2]*b[2][col] + a[row][3]*b[3][col];
		}
	}

	if(out==scratch)
		guMtx44Copy(scratch,ab);
}

u32 guMtx44Inverse(const Mtx44 src,Mtx44 inv)
{
f32 det;
Expand Down
123 changes: 123 additions & 0 deletions libogc/gu_ps_concat44.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
#include <asm.h>

/* We can use up to 32 registers, but registers starting from fr14 need to be
* saved and restored at the end of the function call, so we try to avoid them.
* We try to optimize register usage in the following way:
*
* The Dxx_Dxx registers are used to accumulate the value of the resulting
* matrix. We compute them in row-major order, after which the register is
* stored to the destination variable and can be reused. That's why we just
* need to keep two D registers per row.
*
* The Axx_Axx values can also be disposed of after a row has been computed, so
* two registers per row could also be enough. However, to prevent data hazards,
* while processing row N we preload the two A registers used in row N+1.
* So, during the processing of a row, we can have up to four active A
* registers.
*
* The only values that are needed throughout the computation are the
* elements of the B matrix, so to avoid unnecessary re-loading we keep
* them in registers all the time. We therefore don't reuse B registers. */

/* Register aliases. Every alias names one floating-point register that holds
 * two horizontally adjacent matrix elements, e.g. A00_A01 is a[0][0] (first
 * slot) and a[0][1] (second slot). */

/* Rows of A. Rows 0 and 2 share fr10/fr11, rows 1 and 3 share fr12/fr13, so
 * the next row can be loaded while the current one is still being used. */
#define A00_A01 fr10
#define A02_A03 fr11
#define A10_A11 fr12
#define A12_A13 fr13
#define A20_A21 fr10
#define A22_A23 fr11
#define A30_A31 fr12
#define A32_A33 fr13

/* All of B, one dedicated register per element pair (fr0-fr7); these are
 * never aliased to anything else. */
#define B00_B01 fr0
#define B02_B03 fr1
#define B10_B11 fr2
#define B12_B13 fr3
#define B20_B21 fr4
#define B22_B23 fr5
#define B30_B31 fr6
#define B32_B33 fr7

/* Result accumulators. Every row of the destination maps onto the same two
 * registers (fr8/fr9); the per-row names exist only for readability. */
#define D00_D01 fr8
#define D02_D03 fr9
#define D10_D11 fr8
#define D12_D13 fr9
#define D20_D21 fr8
#define D22_D23 fr9
#define D30_D31 fr8
#define D32_D33 fr9

/* ps_guMtx44Concat: 4x4 matrix product AB = A * B with paired-single
 * instructions.
 *
 * In:  r3 = mtxA, r4 = mtxB (only read)
 *      r5 = mtxAB (written; 8 stores of two elements each)
 *
 * Only fr0-fr13 are used and nothing is pushed on the stack.
 *
 * For every row i the two accumulators are built as
 *     Di0_Di1 = B00_B01*Ai0 + B10_B11*Ai1 + B20_B21*Ai2 + B30_B31*Ai3
 *     Di2_Di3 = B02_B03*Ai0 + B12_B13*Ai1 + B22_B23*Ai2 + B32_B33*Ai3
 * where ps_muls0 / ps_madds0 scale by the first slot of the A register and
 * ps_madds1 scales by its second slot.
 *
 * All of B is loaded before the first store, and each row of A is loaded
 * before the same row of AB is stored, so as written the routine gives the
 * right result when mtxAB is exactly mtxA or mtxB.
 *
 * NOTE(review): the trailing "0,0" on psq_l/psq_st selects a two-element
 * access through quantization register 0; this presumably relies on GQR0
 * being configured for plain (unquantized) floats - confirm against the
 * runtime setup.
 */
.globl ps_guMtx44Concat
//r3 = mtxA, r4 = mtxB, r5 = mtxAB
ps_guMtx44Concat:
/* First row. This block is longer than the others below due to the fact
* that we must also load all the B matrix into registers.
* The code is less readable than what it could be because we intertwine
* the instructions in order to avoid data hazards.
*/
psq_l A00_A01,0(r3),0,0
psq_l B00_B01,0(r4),0,0
psq_l B02_B03,8(r4),0,0
psq_l B10_B11,16(r4),0,0
// D00_D01 = B00_B01 * A00
ps_muls0 D00_D01,B00_B01,A00_A01
psq_l A02_A03,8(r3),0,0
// D02_D03 = B02_B03 * A00
ps_muls0 D02_D03,B02_B03,A00_A01
psq_l B12_B13,24(r4),0,0
// D00_D01 += B10_B11 * A01
ps_madds1 D00_D01,B10_B11,A00_A01,D00_D01
psq_l B20_B21,32(r4),0,0
// D02_D03 += B12_B13 * A01
ps_madds1 D02_D03,B12_B13,A00_A01,D02_D03
psq_l B22_B23,40(r4),0,0
// D00_D01 += B20_B21 * A02
ps_madds0 D00_D01,B20_B21,A02_A03,D00_D01
psq_l B30_B31,48(r4),0,0
// D02_D03 += B22_B23 * A02
ps_madds0 D02_D03,B22_B23,A02_A03,D02_D03
// Last piece of B: from here on mtxB is never read again.
psq_l B32_B33,56(r4),0,0
// D00_D01 += B30_B31 * A03
ps_madds1 D00_D01,B30_B31,A02_A03,D00_D01
// Preload row 1 of A (fr12/fr13) while row 0 finishes.
psq_l A10_A11,16(r3),0,0
// D02_D03 += B32_B33 * A03
ps_madds1 D02_D03,B32_B33,A02_A03,D02_D03
psq_st D00_D01,0(r5),0,0
psq_l A12_A13,24(r3),0,0
psq_st D02_D03,8(r5),0,0

// Second row
// Same sum as the first row, but the terms are added in the order
// k = 0, 2, 1, 3, alternating between the two A registers.
ps_muls0 D10_D11,B00_B01,A10_A11
ps_muls0 D12_D13,B02_B03,A10_A11
ps_madds0 D10_D11,B20_B21,A12_A13,D10_D11
ps_madds0 D12_D13,B22_B23,A12_A13,D12_D13
ps_madds1 D10_D11,B10_B11,A10_A11,D10_D11
ps_madds1 D12_D13,B12_B13,A10_A11,D12_D13
// Preload row 2 of A into fr10/fr11, which row 0 no longer needs.
psq_l A20_A21,32(r3),0,0
ps_madds1 D10_D11,B30_B31,A12_A13,D10_D11
psq_l A22_A23,40(r3),0,0
ps_madds1 D12_D13,B32_B33,A12_A13,D12_D13
psq_st D10_D11,16(r5),0,0
psq_st D12_D13,24(r5),0,0

// Third row
// Same pattern as the second row, reading A from fr10/fr11.
ps_muls0 D20_D21,B00_B01,A20_A21
ps_muls0 D22_D23,B02_B03,A20_A21
ps_madds0 D20_D21,B20_B21,A22_A23,D20_D21
ps_madds0 D22_D23,B22_B23,A22_A23,D22_D23
ps_madds1 D20_D21,B10_B11,A20_A21,D20_D21
ps_madds1 D22_D23,B12_B13,A20_A21,D22_D23
// Preload row 3 of A into fr12/fr13, which row 1 no longer needs.
psq_l A30_A31,48(r3),0,0
ps_madds1 D20_D21,B30_B31,A22_A23,D20_D21
psq_l A32_A33,56(r3),0,0
ps_madds1 D22_D23,B32_B33,A22_A23,D22_D23
psq_st D20_D21,32(r5),0,0
psq_st D22_D23,40(r5),0,0

// Fourth row
// Last row: nothing left to preload.
ps_muls0 D30_D31,B00_B01,A30_A31
ps_muls0 D32_D33,B02_B03,A30_A31
ps_madds0 D30_D31,B20_B21,A32_A33,D30_D31
ps_madds0 D32_D33,B22_B23,A32_A33,D32_D33
ps_madds1 D30_D31,B10_B11,A30_A31,D30_D31
ps_madds1 D32_D33,B12_B13,A30_A31,D32_D33
ps_madds1 D30_D31,B30_B31,A32_A33,D30_D31
ps_madds1 D32_D33,B32_B33,A32_A33,D32_D33
psq_st D30_D31,48(r5),0,0
psq_st D32_D33,56(r5),0,0

blr

// Switches to the small-data section with 16-byte alignment. No data
// follows in the visible part of this file.
// NOTE(review): possibly carried over from gu_psasm.S - confirm it is needed.
.section .sdata
.balign 16

0 comments on commit 0ea7278

Please sign in to comment.