From 869f7187bc3b803d435e992c35f5aa37c844f44b Mon Sep 17 00:00:00 2001 From: Joe Mattiello Date: Sun, 19 Sep 2021 03:04:18 -0400 Subject: [PATCH 01/34] Add Provenance mods to build for iOS --- src/dac.c | 4 ++-- src/m68000/inlines.h | 4 ++++ src/m68000/readcpu.c | 1 + 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/dac.c b/src/dac.c index 75f08bb1..a73cada2 100644 --- a/src/dac.c +++ b/src/dac.c @@ -53,7 +53,7 @@ #include "../libretro.h" -extern retro_audio_sample_batch_t audio_batch_cb; +//extern retro_audio_sample_batch_t audio_batch_cb; #define BUFFER_SIZE 0x10000 // Make the DAC buffers 64K x 16 bits #define DAC_AUDIO_RATE 48000 // Set the audio rate to 48 KHz @@ -188,7 +188,7 @@ void SDLSoundCallback(void * userdata, uint16_t * buffer, int length) HandleNextEvent(EVENT_JERRY); } while (!bufferDone); - audio_batch_cb((int16_t*)sampleBuffer, length / 2); +// audio_batch_cb((int16_t*)sampleBuffer, length / 2); } // LTXD/RTXD/SCLK/SMODE ($F1A148/4C/50/54) diff --git a/src/m68000/inlines.h b/src/m68000/inlines.h index 787b5207..a3dcda90 100644 --- a/src/m68000/inlines.h +++ b/src/m68000/inlines.h @@ -10,6 +10,10 @@ #ifndef __INLINES_H__ #define __INLINES_H__ +#ifndef INLINE +#define INLINE __inline__ +#endif /* INLINE */ + #include "cpudefs.h" #include "m68kinterface.h" diff --git a/src/m68000/readcpu.c b/src/m68000/readcpu.c index ab8a64f8..f218f5e2 100644 --- a/src/m68000/readcpu.c +++ b/src/m68000/readcpu.c @@ -22,6 +22,7 @@ #include #include +#include "inlines.h" #include "readcpu.h" int nr_cpuop_funcs; From 7e981306e99dc6d8d060b59eeef827123f693911 Mon Sep 17 00:00:00 2001 From: Joe Mattiello Date: Tue, 27 Mar 2018 18:54:34 -0400 Subject: [PATCH 02/34] Performance patches to dsp The bit shifting and masking is expensive on ARM64 for some reason. The unions seem to greatly reduce the performance hit of these common calls. 
--- src/dsp.c | 188 +++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 157 insertions(+), 31 deletions(-) diff --git a/src/dsp.c b/src/dsp.c index fdac9650..b29af591 100644 --- a/src/dsp.c +++ b/src/dsp.c @@ -266,16 +266,136 @@ static uint32_t dsp_flags; static uint32_t dsp_matrix_control; static uint32_t dsp_pointer_to_matrix; static uint32_t dsp_data_organization; -uint32_t dsp_control; + +typedef union Bits32 { + uint32_t WORD; + struct bits { +#ifdef LITTLE_ENDIAN + unsigned int b0: 1; + unsigned int b1: 1; + unsigned int b2: 1; + unsigned int b3: 1; + unsigned int b4: 1; + unsigned int b5: 1; + unsigned int b6: 1; + unsigned int b7: 1; + unsigned int b8: 1; + unsigned int b9: 1; + unsigned int b10: 1; + unsigned int b11: 1; + unsigned int b12: 1; + unsigned int b13: 1; + unsigned int b14: 1; + unsigned int b15: 1; + unsigned int b16: 1; + unsigned int b17: 1; + unsigned int b18: 1; + unsigned int b19: 1; + unsigned int b20: 1; + unsigned int b21: 1; + unsigned int b22: 1; + unsigned int b23: 1; + unsigned int b24: 1; + unsigned int b25: 1; + unsigned int b26: 1; + unsigned int b27: 1; + unsigned int b28: 1; + unsigned int b29: 1; + unsigned int b30: 1; + unsigned int b31: 1; +#else + // reverse the order of the bit fields. 
+ unsigned int b31: 1; + unsigned int b30: 1; + unsigned int b29: 1; + unsigned int b28: 1; + unsigned int b27: 1; + unsigned int b26: 1; + unsigned int b25: 1; + unsigned int b24: 1; + unsigned int b23: 1; + unsigned int b22: 1; + unsigned int b21: 1; + unsigned int b20: 1; + unsigned int b19: 1; + unsigned int b18: 1; + unsigned int b17: 1; + unsigned int b16: 1; + unsigned int b15: 1; + unsigned int b14: 1; + unsigned int b13: 1; + unsigned int b12: 1; + unsigned int b11: 1; + unsigned int b10: 1; + unsigned int b9: 1; + unsigned int b8: 1; + unsigned int b7: 1; + unsigned int b6: 1; + unsigned int b5: 1; + unsigned int b4: 1; + unsigned int b3: 1; + unsigned int b2: 1; + unsigned int b1: 1; + unsigned int b0: 1; +#endif + } bits; +} Bits32; + +typedef union OpCode { + uint16_t WORD; +#pragma pack(push, 1) + struct Codes { +#ifdef LITTLE_ENDIAN + unsigned int second : 5; + unsigned int first : 5; + unsigned int index : 6; +#else + unsigned int index : 6; + unsigned int first : 5; + unsigned int second : 5; +#endif + } Codes; +#pragma pack(pop) +} OpCode; + +typedef union Offset { + uint32_t LONG; +#pragma pack(push, 1) + struct Members { +#ifdef LITTLE_ENDIAN + unsigned int offset : 31; + unsigned int bit : 1; +#else + unsigned int bit : 1; + unsigned int offset : 31; +#endif + } Members; +#pragma pack(pop) +} Offset; + +typedef union DSPLong { + uint32_t LONG; + struct Data { +#ifdef LITTLE_ENDIAN + uint16_t LWORD; + uint16_t UWORD; +#else + uint16_t UWORD; + uint16_t LWORD; +#endif + } Data; +} DSPLong; + +Bits32 dsp_control; static uint32_t dsp_div_control; static uint8_t dsp_flag_z, dsp_flag_n, dsp_flag_c; static uint32_t * dsp_reg = NULL, * dsp_alternate_reg = NULL; uint32_t dsp_reg_bank_0[32], dsp_reg_bank_1[32]; -static uint32_t dsp_opcode_first_parameter; -static uint32_t dsp_opcode_second_parameter; +static uint8_t dsp_opcode_first_parameter; +static uint8_t dsp_opcode_second_parameter; -#define DSP_RUNNING (dsp_control & 0x01) +#define DSP_RUNNING 
(dsp_control.bits.b0) #define RM dsp_reg[dsp_opcode_first_parameter] #define RN dsp_reg[dsp_opcode_second_parameter] @@ -397,8 +517,10 @@ uint8_t DSPReadByte(uint32_t offset, uint32_t who/*=UNKNOWN*/) uint16_t DSPReadWord(uint32_t offset, uint32_t who/*=UNKNOWN*/) { - offset &= 0xFFFFFFFE; - + Offset offsett; + offsett.LONG = offset; + offset = offsett.Members.offset; + if (offset >= DSP_WORK_RAM_BASE && offset <= DSP_WORK_RAM_BASE+0x1FFF) { offset -= DSP_WORK_RAM_BASE; @@ -406,11 +528,14 @@ uint16_t DSPReadWord(uint32_t offset, uint32_t who/*=UNKNOWN*/) } else if ((offset>=DSP_CONTROL_RAM_BASE)&&(offset> 16; + DSPLong data; + data.LONG = DSPReadLong(offset & 0xFFFFFFFC, who); + + if (offset & 0x03) { + return data.Data.LWORD; + } else { + return data.Data.UWORD; + } } return JaguarReadWord(offset, who); @@ -442,7 +567,7 @@ uint32_t DSPReadLong(uint32_t offset, uint32_t who/*=UNKNOWN*/) case 0x10: return dsp_pc; case 0x14: - return dsp_control; + return dsp_control.WORD; case 0x18: return dsp_modulo; case 0x1C: @@ -551,8 +676,8 @@ void DSPWriteLong(uint32_t offset, uint32_t data, uint32_t who/*=UNKNOWN*/) dsp_flag_c = (dsp_flags >> 1) & 0x01; dsp_flag_n = (dsp_flags >> 2) & 0x01; DSPUpdateRegisterBanks(); - dsp_control &= ~((dsp_flags & CINT04FLAGS) >> 3); - dsp_control &= ~((dsp_flags & CINT5FLAG) >> 1); + dsp_control.WORD &= ~((dsp_flags & CINT04FLAGS) >> 3); + dsp_control.WORD &= ~((dsp_flags & CINT5FLAG) >> 1); break; } case 0x04: @@ -596,7 +721,7 @@ void DSPWriteLong(uint32_t offset, uint32_t data, uint32_t who/*=UNKNOWN*/) } // Protect writes to VERSION and the interrupt latches... mask = VERSION | INT_LAT0 | INT_LAT1 | INT_LAT2 | INT_LAT3 | INT_LAT4 | INT_LAT5; - dsp_control = (dsp_control & mask) | (data & ~mask); + dsp_control.WORD = (dsp_control.WORD & mask) | (data & ~mask); //CC only! //!!!!!!!! 
@@ -650,7 +775,7 @@ void DSPHandleIRQs(void) return; // Get the active interrupt bits (latches) & interrupt mask (enables) - bits = ((dsp_control >> 10) & 0x20) | ((dsp_control >> 6) & 0x1F); + bits = ((dsp_control.WORD >> 10) & 0x20) | ((dsp_control.WORD >> 6) & 0x1F), mask = ((dsp_flags >> 11) & 0x20) | ((dsp_flags >> 4) & 0x1F); bits &= mask; @@ -737,7 +862,7 @@ void DSPHandleIRQsNP(void) return; // Get the active interrupt bits (latches) & interrupt mask (enables) - bits = ((dsp_control >> 10) & 0x20) | ((dsp_control >> 6) & 0x1F); + bits = ((dsp_control.WORD >> 10) & 0x20) | ((dsp_control.WORD >> 6) & 0x1F); mask = ((dsp_flags >> 11) & 0x20) | ((dsp_flags >> 4) & 0x1F); bits &= mask; @@ -777,11 +902,11 @@ void DSPSetIRQLine(int irqline, int state) { //NOTE: This doesn't take INT_LAT5 into account. !!! FIX !!! uint32_t mask = INT_LAT0 << irqline; - dsp_control &= ~mask; // Clear the latch bit + dsp_control.WORD &= ~mask; // Clear the latch bit if (state) { - dsp_control |= mask; // Set the latch bit + dsp_control.WORD |= mask; // Set the latch bit DSPHandleIRQsNP(); } } @@ -809,7 +934,7 @@ void DSPReset(void) dsp_matrix_control = 0x00000000; dsp_pointer_to_matrix = 0x00000000; dsp_data_organization = 0xFFFFFFFF; - dsp_control = 0x00002000; // Report DSP version 2 + dsp_control.WORD = 0x00002000; // Report DSP version 2 dsp_div_control = 0x00000000; dsp_in_exec = 0; @@ -883,7 +1008,7 @@ void DSPDone(void) WriteLog("DSP: %sin interrupt handler\n", ((dsp_flags & IMASK) ? 
"" : "not ")); // get the active interrupt bits - bits = ((dsp_control >> 10) & 0x20) | ((dsp_control >> 6) & 0x1F); + bits = ((dsp_control.WORD >> 10) & 0x20) | ((dsp_control.WORD >> 6) & 0x1F); // get the interrupt mask mask = ((dsp_flags >> 11) & 0x20) | ((dsp_flags >> 4) & 0x1F); @@ -951,22 +1076,23 @@ void DSPExec(int32_t cycles) while (cycles > 0 && DSP_RUNNING) { - uint16_t opcode; - uint32_t index; - if (IMASKCleared) // If IMASK was cleared, { DSPHandleIRQsNP(); // See if any other interrupts are pending! IMASKCleared = false; } - opcode = DSPReadWord(dsp_pc, DSP); - index = opcode >> 10; - dsp_opcode_first_parameter = (opcode >> 5) & 0x1F; - dsp_opcode_second_parameter = opcode & 0x1F; - dsp_pc += 2; - dsp_opcode[index](); - dsp_opcode_use[index]++; + OpCode opcode; + opcode.WORD = DSPReadWord(dsp_pc, DSP); + uint8_t index = opcode.Codes.index; + uint8_t fp = opcode.Codes.first; + uint8_t sp = opcode.Codes.second; + dsp_opcode_first_parameter = fp; + dsp_opcode_second_parameter = sp; + dsp_pc += 2; + dsp_opcode[index](); +// Counter is not necessary and expensive -jm prov +// dsp_opcode_use[index]++; cycles -= dsp_opcode_cycles[index]; } From 18486895a9b4ef7f660fa7c720b33447d61f3732 Mon Sep 17 00:00:00 2001 From: Joe Mattiello Date: Tue, 27 Mar 2018 19:43:52 -0400 Subject: [PATCH 03/34] Fix a bad comma, remove superfulous slow logging --- src/dsp.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/dsp.c b/src/dsp.c index b29af591..6260dcf3 100644 --- a/src/dsp.c +++ b/src/dsp.c @@ -775,7 +775,7 @@ void DSPHandleIRQs(void) return; // Get the active interrupt bits (latches) & interrupt mask (enables) - bits = ((dsp_control.WORD >> 10) & 0x20) | ((dsp_control.WORD >> 6) & 0x1F), + bits = ((dsp_control.WORD >> 10) & 0x20) | ((dsp_control.WORD >> 6) & 0x1F); mask = ((dsp_flags >> 11) & 0x20) | ((dsp_flags >> 4) & 0x1F); bits &= mask; @@ -1051,11 +1051,11 @@ void DSPDone(void) WriteLog("DSP opcodes use:\n"); - for 
(i=0;i<64;i++) - { - if (dsp_opcode_use[i]) - WriteLog("\t%s %i\n", dsp_opcode_str[i], dsp_opcode_use[i]); - } +// for (i=0;i<64;i++) +// { +// if (dsp_opcode_use[i]) +// WriteLog("\t%s %i\n", dsp_opcode_str[i], dsp_opcode_use[i]); +// } } @@ -2157,7 +2157,7 @@ static void DSP_jr(void) }//*/ dsp_pc += 2; // For DSP_DIS_* accuracy DSPOpcode[pipeline[plPtrExec].opcode](); - dsp_opcode_use[pipeline[plPtrExec].opcode]++; +// dsp_opcode_use[pipeline[plPtrExec].opcode]++; pipeline[plPtrWrite] = pipeline[plPtrExec]; // Step 3: Flush pipeline & set new PC @@ -2233,7 +2233,7 @@ static void DSP_jump(void) } dsp_pc += 2; // For DSP_DIS_* accuracy DSPOpcode[pipeline[plPtrExec].opcode](); - dsp_opcode_use[pipeline[plPtrExec].opcode]++; +// dsp_opcode_use[pipeline[plPtrExec].opcode]++; pipeline[plPtrWrite] = pipeline[plPtrExec]; // Step 3: Flush pipeline & set new PC From a51f84b25d0781c041686bfb381af04be8899635 Mon Sep 17 00:00:00 2001 From: Joe Mattiello Date: Tue, 27 Mar 2018 22:00:27 -0400 Subject: [PATCH 04/34] Inline a bunch of stuff --- src/dsp.c | 623 ++++++++++++++++++++++-------------------------------- 1 file changed, 252 insertions(+), 371 deletions(-) diff --git a/src/dsp.c b/src/dsp.c index 6260dcf3..04670e8e 100644 --- a/src/dsp.c +++ b/src/dsp.c @@ -131,70 +131,70 @@ bool IMASKCleared = false; #define INT_LAT5 0x10000 // Is opcode 62 *really* a NOP? Seems like it... 
-static void dsp_opcode_abs(void); -static void dsp_opcode_add(void); -static void dsp_opcode_addc(void); -static void dsp_opcode_addq(void); -static void dsp_opcode_addqmod(void); -static void dsp_opcode_addqt(void); -static void dsp_opcode_and(void); -static void dsp_opcode_bclr(void); -static void dsp_opcode_bset(void); -static void dsp_opcode_btst(void); -static void dsp_opcode_cmp(void); -static void dsp_opcode_cmpq(void); -static void dsp_opcode_div(void); -static void dsp_opcode_imacn(void); -static void dsp_opcode_imult(void); -static void dsp_opcode_imultn(void); -static void dsp_opcode_jr(void); -static void dsp_opcode_jump(void); -static void dsp_opcode_load(void); -static void dsp_opcode_loadb(void); -static void dsp_opcode_loadw(void); -static void dsp_opcode_load_r14_indexed(void); -static void dsp_opcode_load_r14_ri(void); -static void dsp_opcode_load_r15_indexed(void); -static void dsp_opcode_load_r15_ri(void); -static void dsp_opcode_mirror(void); -static void dsp_opcode_mmult(void); -static void dsp_opcode_move(void); -static void dsp_opcode_movei(void); -static void dsp_opcode_movefa(void); -static void dsp_opcode_move_pc(void); -static void dsp_opcode_moveq(void); -static void dsp_opcode_moveta(void); -static void dsp_opcode_mtoi(void); -static void dsp_opcode_mult(void); -static void dsp_opcode_neg(void); -static void dsp_opcode_nop(void); -static void dsp_opcode_normi(void); -static void dsp_opcode_not(void); -static void dsp_opcode_or(void); -static void dsp_opcode_resmac(void); -static void dsp_opcode_ror(void); -static void dsp_opcode_rorq(void); -static void dsp_opcode_xor(void); -static void dsp_opcode_sat16s(void); -static void dsp_opcode_sat32s(void); -static void dsp_opcode_sh(void); -static void dsp_opcode_sha(void); -static void dsp_opcode_sharq(void); -static void dsp_opcode_shlq(void); -static void dsp_opcode_shrq(void); -static void dsp_opcode_store(void); -static void dsp_opcode_storeb(void); -static void dsp_opcode_storew(void); 
-static void dsp_opcode_store_r14_indexed(void); -static void dsp_opcode_store_r14_ri(void); -static void dsp_opcode_store_r15_indexed(void); -static void dsp_opcode_store_r15_ri(void); -static void dsp_opcode_sub(void); -static void dsp_opcode_subc(void); -static void dsp_opcode_subq(void); -static void dsp_opcode_subqmod(void); -static void dsp_opcode_subqt(void); -static void dsp_opcode_illegal(void); +INLINE static void dsp_opcode_abs(void); +INLINE static void dsp_opcode_add(void); +INLINE static void dsp_opcode_addc(void); +INLINE static void dsp_opcode_addq(void); +INLINE static void dsp_opcode_addqmod(void); +INLINE static void dsp_opcode_addqt(void); +INLINE static void dsp_opcode_and(void); +INLINE static void dsp_opcode_bclr(void); +INLINE static void dsp_opcode_bset(void); +INLINE static void dsp_opcode_btst(void); +INLINE static void dsp_opcode_cmp(void); +INLINE static void dsp_opcode_cmpq(void); +INLINE static void dsp_opcode_div(void); +INLINE static void dsp_opcode_imacn(void); +INLINE static void dsp_opcode_imult(void); +INLINE static void dsp_opcode_imultn(void); +INLINE static void dsp_opcode_jr(void); +INLINE static void dsp_opcode_jump(void); +INLINE static void dsp_opcode_load(void); +INLINE static void dsp_opcode_loadb(void); +INLINE static void dsp_opcode_loadw(void); +INLINE static void dsp_opcode_load_r14_indexed(void); +INLINE static void dsp_opcode_load_r14_ri(void); +INLINE static void dsp_opcode_load_r15_indexed(void); +INLINE static void dsp_opcode_load_r15_ri(void); +INLINE static void dsp_opcode_mirror(void); +INLINE static void dsp_opcode_mmult(void); +INLINE static void dsp_opcode_move(void); +INLINE static void dsp_opcode_movei(void); +INLINE static void dsp_opcode_movefa(void); +INLINE static void dsp_opcode_move_pc(void); +INLINE static void dsp_opcode_moveq(void); +INLINE static void dsp_opcode_moveta(void); +INLINE static void dsp_opcode_mtoi(void); +INLINE static void dsp_opcode_mult(void); +INLINE static void 
dsp_opcode_neg(void); +INLINE static void dsp_opcode_nop(void); +INLINE static void dsp_opcode_normi(void); +INLINE static void dsp_opcode_not(void); +INLINE static void dsp_opcode_or(void); +INLINE static void dsp_opcode_resmac(void); +INLINE static void dsp_opcode_ror(void); +INLINE static void dsp_opcode_rorq(void); +INLINE static void dsp_opcode_xor(void); +INLINE static void dsp_opcode_sat16s(void); +INLINE static void dsp_opcode_sat32s(void); +INLINE static void dsp_opcode_sh(void); +INLINE static void dsp_opcode_sha(void); +INLINE static void dsp_opcode_sharq(void); +INLINE static void dsp_opcode_shlq(void); +INLINE static void dsp_opcode_shrq(void); +INLINE static void dsp_opcode_store(void); +INLINE static void dsp_opcode_storeb(void); +INLINE static void dsp_opcode_storew(void); +INLINE static void dsp_opcode_store_r14_indexed(void); +INLINE static void dsp_opcode_store_r14_ri(void); +INLINE static void dsp_opcode_store_r15_indexed(void); +INLINE static void dsp_opcode_store_r15_ri(void); +INLINE static void dsp_opcode_sub(void); +INLINE static void dsp_opcode_subc(void); +INLINE static void dsp_opcode_subq(void); +INLINE static void dsp_opcode_subqmod(void); +INLINE static void dsp_opcode_subqt(void); +INLINE static void dsp_opcode_illegal(void); //Here's a QnD kludge... //This is wrong, wrong, WRONG, but it seems to work for the time being... 
@@ -267,125 +267,6 @@ static uint32_t dsp_matrix_control; static uint32_t dsp_pointer_to_matrix; static uint32_t dsp_data_organization; -typedef union Bits32 { - uint32_t WORD; - struct bits { -#ifdef LITTLE_ENDIAN - unsigned int b0: 1; - unsigned int b1: 1; - unsigned int b2: 1; - unsigned int b3: 1; - unsigned int b4: 1; - unsigned int b5: 1; - unsigned int b6: 1; - unsigned int b7: 1; - unsigned int b8: 1; - unsigned int b9: 1; - unsigned int b10: 1; - unsigned int b11: 1; - unsigned int b12: 1; - unsigned int b13: 1; - unsigned int b14: 1; - unsigned int b15: 1; - unsigned int b16: 1; - unsigned int b17: 1; - unsigned int b18: 1; - unsigned int b19: 1; - unsigned int b20: 1; - unsigned int b21: 1; - unsigned int b22: 1; - unsigned int b23: 1; - unsigned int b24: 1; - unsigned int b25: 1; - unsigned int b26: 1; - unsigned int b27: 1; - unsigned int b28: 1; - unsigned int b29: 1; - unsigned int b30: 1; - unsigned int b31: 1; -#else - // reverse the order of the bit fields. - unsigned int b31: 1; - unsigned int b30: 1; - unsigned int b29: 1; - unsigned int b28: 1; - unsigned int b27: 1; - unsigned int b26: 1; - unsigned int b25: 1; - unsigned int b24: 1; - unsigned int b23: 1; - unsigned int b22: 1; - unsigned int b21: 1; - unsigned int b20: 1; - unsigned int b19: 1; - unsigned int b18: 1; - unsigned int b17: 1; - unsigned int b16: 1; - unsigned int b15: 1; - unsigned int b14: 1; - unsigned int b13: 1; - unsigned int b12: 1; - unsigned int b11: 1; - unsigned int b10: 1; - unsigned int b9: 1; - unsigned int b8: 1; - unsigned int b7: 1; - unsigned int b6: 1; - unsigned int b5: 1; - unsigned int b4: 1; - unsigned int b3: 1; - unsigned int b2: 1; - unsigned int b1: 1; - unsigned int b0: 1; -#endif - } bits; -} Bits32; - -typedef union OpCode { - uint16_t WORD; -#pragma pack(push, 1) - struct Codes { -#ifdef LITTLE_ENDIAN - unsigned int second : 5; - unsigned int first : 5; - unsigned int index : 6; -#else - unsigned int index : 6; - unsigned int first : 5; - unsigned 
int second : 5; -#endif - } Codes; -#pragma pack(pop) -} OpCode; - -typedef union Offset { - uint32_t LONG; -#pragma pack(push, 1) - struct Members { -#ifdef LITTLE_ENDIAN - unsigned int offset : 31; - unsigned int bit : 1; -#else - unsigned int bit : 1; - unsigned int offset : 31; -#endif - } Members; -#pragma pack(pop) -} Offset; - -typedef union DSPLong { - uint32_t LONG; - struct Data { -#ifdef LITTLE_ENDIAN - uint16_t LWORD; - uint16_t UWORD; -#else - uint16_t UWORD; - uint16_t LWORD; -#endif - } Data; -} DSPLong; - Bits32 dsp_control; static uint32_t dsp_div_control; static uint8_t dsp_flag_z, dsp_flag_n, dsp_flag_c; @@ -524,8 +405,8 @@ uint16_t DSPReadWord(uint32_t offset, uint32_t who/*=UNKNOWN*/) if (offset >= DSP_WORK_RAM_BASE && offset <= DSP_WORK_RAM_BASE+0x1FFF) { offset -= DSP_WORK_RAM_BASE; - return GET16(dsp_ram_8, offset); - } + return GET16(dsp_ram_8, offset); + } else if ((offset>=DSP_CONTROL_RAM_BASE)&&(offset= DSP_WORK_RAM_BASE && RM <= (DSP_WORK_RAM_BASE + 0x1FFF)) DSPWriteLong(RM, RN & 0xFF, DSP); @@ -1312,7 +1193,7 @@ static void dsp_opcode_storeb(void) } -static void dsp_opcode_storew(void) +INLINE static void dsp_opcode_storew(void) { #ifdef DSP_CORRECT_ALIGNMENT_STORE if (RM >= DSP_WORK_RAM_BASE && RM <= (DSP_WORK_RAM_BASE + 0x1FFF)) @@ -1328,7 +1209,7 @@ static void dsp_opcode_storew(void) } -static void dsp_opcode_store(void) +INLINE static void dsp_opcode_store(void) { #ifdef DSP_CORRECT_ALIGNMENT_STORE DSPWriteLong(RM & 0xFFFFFFFC, RN, DSP); @@ -1338,7 +1219,7 @@ static void dsp_opcode_store(void) } -static void dsp_opcode_loadb(void) +INLINE static void dsp_opcode_loadb(void) { if (RM >= DSP_WORK_RAM_BASE && RM <= (DSP_WORK_RAM_BASE + 0x1FFF)) RN = DSPReadLong(RM, DSP) & 0xFF; @@ -1347,7 +1228,7 @@ static void dsp_opcode_loadb(void) } -static void dsp_opcode_loadw(void) +INLINE static void dsp_opcode_loadw(void) { #ifdef DSP_CORRECT_ALIGNMENT if (RM >= DSP_WORK_RAM_BASE && RM <= (DSP_WORK_RAM_BASE + 0x1FFF)) @@ -1363,7 +1244,7 @@ 
static void dsp_opcode_loadw(void) } -static void dsp_opcode_load(void) +INLINE static void dsp_opcode_load(void) { #ifdef DSP_CORRECT_ALIGNMENT RN = DSPReadLong(RM & 0xFFFFFFFC, DSP); @@ -1373,7 +1254,7 @@ static void dsp_opcode_load(void) } -static void dsp_opcode_load_r14_indexed(void) +INLINE static void dsp_opcode_load_r14_indexed(void) { #ifdef DSP_CORRECT_ALIGNMENT RN = DSPReadLong((dsp_reg[14] & 0xFFFFFFFC) + (dsp_convert_zero[IMM_1] << 2), DSP); @@ -1383,7 +1264,7 @@ static void dsp_opcode_load_r14_indexed(void) } -static void dsp_opcode_load_r15_indexed(void) +INLINE static void dsp_opcode_load_r15_indexed(void) { #ifdef DSP_CORRECT_ALIGNMENT RN = DSPReadLong((dsp_reg[15] & 0xFFFFFFFC) + (dsp_convert_zero[IMM_1] << 2), DSP); @@ -1393,7 +1274,7 @@ static void dsp_opcode_load_r15_indexed(void) } -static void dsp_opcode_movei(void) +INLINE static void dsp_opcode_movei(void) { // This instruction is followed by 32-bit value in LSW / MSW format... RN = (uint32_t)DSPReadWord(dsp_pc, DSP) | ((uint32_t)DSPReadWord(dsp_pc + 2, DSP) << 16); @@ -1401,51 +1282,51 @@ static void dsp_opcode_movei(void) } -static void dsp_opcode_moveta(void) +INLINE static void dsp_opcode_moveta(void) { ALTERNATE_RN = RM; } -static void dsp_opcode_movefa(void) +INLINE static void dsp_opcode_movefa(void) { RN = ALTERNATE_RM; } -static void dsp_opcode_move(void) +INLINE static void dsp_opcode_move(void) { RN = RM; } -static void dsp_opcode_moveq(void) +INLINE static void dsp_opcode_moveq(void) { RN = IMM_1; } -static void dsp_opcode_resmac(void) +INLINE static void dsp_opcode_resmac(void) { RN = (uint32_t)dsp_acc; } -static void dsp_opcode_imult(void) +INLINE static void dsp_opcode_imult(void) { RN = (int16_t)RN * (int16_t)RM; SET_ZN(RN); } -static void dsp_opcode_mult(void) +INLINE static void dsp_opcode_mult(void) { RN = (uint16_t)RM * (uint16_t)RN; SET_ZN(RN); } -static void dsp_opcode_bclr(void) +INLINE static void dsp_opcode_bclr(void) { uint32_t res = RN & ~(1 << IMM_1); RN = res; 
@@ -1453,13 +1334,13 @@ static void dsp_opcode_bclr(void) } -static void dsp_opcode_btst(void) +INLINE static void dsp_opcode_btst(void) { dsp_flag_z = (~RN >> IMM_1) & 1; } -static void dsp_opcode_bset(void) +INLINE static void dsp_opcode_bset(void) { uint32_t res = RN | (1 << IMM_1); RN = res; @@ -1467,19 +1348,19 @@ static void dsp_opcode_bset(void) } -static void dsp_opcode_subqt(void) +INLINE static void dsp_opcode_subqt(void) { RN -= dsp_convert_zero[IMM_1]; } -static void dsp_opcode_addqt(void) +INLINE static void dsp_opcode_addqt(void) { RN += dsp_convert_zero[IMM_1]; } -static void dsp_opcode_imacn(void) +INLINE static void dsp_opcode_imacn(void) { int32_t res = (int16_t)RM * (int16_t)RN; dsp_acc += (int64_t)res; @@ -1487,14 +1368,14 @@ static void dsp_opcode_imacn(void) } -static void dsp_opcode_mtoi(void) +INLINE static void dsp_opcode_mtoi(void) { RN = (((int32_t)RM >> 8) & 0xFF800000) | (RM & 0x007FFFFF); SET_ZN(RN); } -static void dsp_opcode_normi(void) +INLINE static void dsp_opcode_normi(void) { uint32_t _Rm = RM; uint32_t res = 0; @@ -1517,7 +1398,7 @@ static void dsp_opcode_normi(void) } -static void dsp_opcode_mmult(void) +INLINE static void dsp_opcode_mmult(void) { uint32_t res; unsigned i; @@ -1564,7 +1445,7 @@ static void dsp_opcode_mmult(void) } -static void dsp_opcode_abs(void) +INLINE static void dsp_opcode_abs(void) { uint32_t _Rn = RN; @@ -1582,7 +1463,7 @@ static void dsp_opcode_abs(void) } -static void dsp_opcode_div(void) +INLINE static void dsp_opcode_div(void) { unsigned i; // Real algorithm, courtesy of SCPCD: NYAN! @@ -1607,7 +1488,7 @@ static void dsp_opcode_div(void) } -static void dsp_opcode_imultn(void) +INLINE static void dsp_opcode_imultn(void) { // This is OK, since this multiply won't overflow 32 bits... 
int32_t res = (int32_t)((int16_t)RN * (int16_t)RM); @@ -1616,7 +1497,7 @@ static void dsp_opcode_imultn(void) } -static void dsp_opcode_neg(void) +INLINE static void dsp_opcode_neg(void) { uint32_t res = -RN; SET_ZNC_SUB(0, RN, res); @@ -1624,7 +1505,7 @@ static void dsp_opcode_neg(void) } -static void dsp_opcode_shlq(void) +INLINE static void dsp_opcode_shlq(void) { // NB: This instruction is the *only* one that does (32 - immediate data). int32_t r1 = 32 - IMM_1; @@ -1634,7 +1515,7 @@ static void dsp_opcode_shlq(void) } -static void dsp_opcode_shrq(void) +INLINE static void dsp_opcode_shrq(void) { int32_t r1 = dsp_convert_zero[IMM_1]; uint32_t res = RN >> r1; @@ -1643,7 +1524,7 @@ static void dsp_opcode_shrq(void) } -static void dsp_opcode_ror(void) +INLINE static void dsp_opcode_ror(void) { uint32_t r1 = RM & 0x1F; uint32_t res = (RN >> r1) | (RN << (32 - r1)); @@ -1652,7 +1533,7 @@ static void dsp_opcode_ror(void) } -static void dsp_opcode_rorq(void) +INLINE static void dsp_opcode_rorq(void) { uint32_t r1 = dsp_convert_zero[IMM_1 & 0x1F]; uint32_t r2 = RN; @@ -1662,7 +1543,7 @@ static void dsp_opcode_rorq(void) } -static void dsp_opcode_sha(void) +INLINE static void dsp_opcode_sha(void) { int32_t sRm=(int32_t)RM; uint32_t _Rn=RN; @@ -1694,7 +1575,7 @@ static void dsp_opcode_sha(void) } -static void dsp_opcode_sharq(void) +INLINE static void dsp_opcode_sharq(void) { uint32_t res = (int32_t)RN >> dsp_convert_zero[IMM_1]; SET_ZN(res); dsp_flag_c = RN & 0x01; @@ -1702,7 +1583,7 @@ static void dsp_opcode_sharq(void) } -static void dsp_opcode_sh(void) +INLINE static void dsp_opcode_sh(void) { int32_t sRm=(int32_t)RM; uint32_t _Rn=RN; @@ -1784,70 +1665,70 @@ void dsp_opcode_illegal(void) /* New pipelined DSP core */ -static void DSP_abs(void); -static void DSP_add(void); -static void DSP_addc(void); -static void DSP_addq(void); -static void DSP_addqmod(void); -static void DSP_addqt(void); -static void DSP_and(void); -static void DSP_bclr(void); -static void 
DSP_bset(void); -static void DSP_btst(void); -static void DSP_cmp(void); -static void DSP_cmpq(void); -static void DSP_div(void); -static void DSP_imacn(void); -static void DSP_imult(void); -static void DSP_imultn(void); -static void DSP_illegal(void); -static void DSP_jr(void); -static void DSP_jump(void); -static void DSP_load(void); -static void DSP_loadb(void); -static void DSP_loadw(void); -static void DSP_load_r14_i(void); -static void DSP_load_r14_r(void); -static void DSP_load_r15_i(void); -static void DSP_load_r15_r(void); -static void DSP_mirror(void); -static void DSP_mmult(void); -static void DSP_move(void); -static void DSP_movefa(void); -static void DSP_movei(void); -static void DSP_movepc(void); -static void DSP_moveq(void); -static void DSP_moveta(void); -static void DSP_mtoi(void); -static void DSP_mult(void); -static void DSP_neg(void); -static void DSP_nop(void); -static void DSP_normi(void); -static void DSP_not(void); -static void DSP_or(void); -static void DSP_resmac(void); -static void DSP_ror(void); -static void DSP_rorq(void); -static void DSP_sat16s(void); -static void DSP_sat32s(void); -static void DSP_sh(void); -static void DSP_sha(void); -static void DSP_sharq(void); -static void DSP_shlq(void); -static void DSP_shrq(void); -static void DSP_store(void); -static void DSP_storeb(void); -static void DSP_storew(void); -static void DSP_store_r14_i(void); -static void DSP_store_r14_r(void); -static void DSP_store_r15_i(void); -static void DSP_store_r15_r(void); -static void DSP_sub(void); -static void DSP_subc(void); -static void DSP_subq(void); -static void DSP_subqmod(void); -static void DSP_subqt(void); -static void DSP_xor(void); +INLINE static void DSP_abs(void); +INLINE static void DSP_add(void); +INLINE static void DSP_addc(void); +INLINE static void DSP_addq(void); +INLINE static void DSP_addqmod(void); +INLINE static void DSP_addqt(void); +INLINE static void DSP_and(void); +INLINE static void DSP_bclr(void); +INLINE static void 
DSP_bset(void); +INLINE static void DSP_btst(void); +INLINE static void DSP_cmp(void); +INLINE static void DSP_cmpq(void); +INLINE static void DSP_div(void); +INLINE static void DSP_imacn(void); +INLINE static void DSP_imult(void); +INLINE static void DSP_imultn(void); +INLINE static void DSP_illegal(void); +INLINE static void DSP_jr(void); +INLINE static void DSP_jump(void); +INLINE static void DSP_load(void); +INLINE static void DSP_loadb(void); +INLINE static void DSP_loadw(void); +INLINE static void DSP_load_r14_i(void); +INLINE static void DSP_load_r14_r(void); +INLINE static void DSP_load_r15_i(void); +INLINE static void DSP_load_r15_r(void); +INLINE static void DSP_mirror(void); +INLINE static void DSP_mmult(void); +INLINE static void DSP_move(void); +INLINE static void DSP_movefa(void); +INLINE static void DSP_movei(void); +INLINE static void DSP_movepc(void); +INLINE static void DSP_moveq(void); +INLINE static void DSP_moveta(void); +INLINE static void DSP_mtoi(void); +INLINE static void DSP_mult(void); +INLINE static void DSP_neg(void); +INLINE static void DSP_nop(void); +INLINE static void DSP_normi(void); +INLINE static void DSP_not(void); +INLINE static void DSP_or(void); +INLINE static void DSP_resmac(void); +INLINE static void DSP_ror(void); +INLINE static void DSP_rorq(void); +INLINE static void DSP_sat16s(void); +INLINE static void DSP_sat32s(void); +INLINE static void DSP_sh(void); +INLINE static void DSP_sha(void); +INLINE static void DSP_sharq(void); +INLINE static void DSP_shlq(void); +INLINE static void DSP_shrq(void); +INLINE static void DSP_store(void); +INLINE static void DSP_storeb(void); +INLINE static void DSP_storew(void); +INLINE static void DSP_store_r14_i(void); +INLINE static void DSP_store_r14_r(void); +INLINE static void DSP_store_r15_i(void); +INLINE static void DSP_store_r15_r(void); +INLINE static void DSP_sub(void); +INLINE static void DSP_subc(void); +INLINE static void DSP_subq(void); +INLINE static void DSP_subqmod(void); 
+INLINE static void DSP_subqt(void); +INLINE static void DSP_xor(void); void (* DSPOpcode[64])() = { @@ -1939,7 +1820,7 @@ static uint32_t prevR1; #define DSP_PPC dsp_pc - (pipeline[plPtrRead].opcode == 38 ? 6 : (pipeline[plPtrRead].opcode == PIPELINE_STALL ? 0 : 2)) - (pipeline[plPtrExec].opcode == 38 ? 6 : (pipeline[plPtrExec].opcode == PIPELINE_STALL ? 0 : 2)) #define WRITEBACK_ADDR pipeline[plPtrExec].writebackRegister = 0xFE -static void DSP_abs(void) +INLINE static void DSP_abs(void) { uint32_t _Rn = PRN; @@ -1953,14 +1834,14 @@ static void DSP_abs(void) } } -static void DSP_add(void) +INLINE static void DSP_add(void) { uint32_t res = PRN + PRM; SET_ZNC_ADD(PRN, PRM, res); PRES = res; } -static void DSP_addc(void) +INLINE static void DSP_addc(void) { uint32_t res = PRN + PRM + dsp_flag_c; uint32_t carry = dsp_flag_c; @@ -1968,7 +1849,7 @@ static void DSP_addc(void) PRES = res; } -static void DSP_addq(void) +INLINE static void DSP_addq(void) { uint32_t r1 = dsp_convert_zero[PIMM1]; uint32_t res = PRN + r1; @@ -1976,7 +1857,7 @@ static void DSP_addq(void) PRES = res; } -static void DSP_addqmod(void) +INLINE static void DSP_addqmod(void) { uint32_t r1 = dsp_convert_zero[PIMM1]; uint32_t r2 = PRN; @@ -1986,43 +1867,43 @@ static void DSP_addqmod(void) SET_ZNC_ADD(r2, r1, res); } -static void DSP_addqt(void) +INLINE static void DSP_addqt(void) { PRES = PRN + dsp_convert_zero[PIMM1]; } -static void DSP_and(void) +INLINE static void DSP_and(void) { PRES = PRN & PRM; SET_ZN(PRES); } -static void DSP_bclr(void) +INLINE static void DSP_bclr(void) { PRES = PRN & ~(1 << PIMM1); SET_ZN(PRES); } -static void DSP_bset(void) +INLINE static void DSP_bset(void) { PRES = PRN | (1 << PIMM1); SET_ZN(PRES); } -static void DSP_btst(void) +INLINE static void DSP_btst(void) { dsp_flag_z = (~PRN >> PIMM1) & 1; NO_WRITEBACK; } -static void DSP_cmp(void) +INLINE static void DSP_cmp(void) { uint32_t res = PRN - PRM; SET_ZNC_SUB(PRN, PRM, res); NO_WRITEBACK; } -static void DSP_cmpq(void) 
+INLINE static void DSP_cmpq(void) { static int32_t sqtable[32] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1 }; @@ -2032,7 +1913,7 @@ static void DSP_cmpq(void) NO_WRITEBACK; } -static void DSP_div(void) +INLINE static void DSP_div(void) { uint32_t _Rm = PRM, _Rn = PRN; @@ -2057,7 +1938,7 @@ static void DSP_div(void) PRES = 0xFFFFFFFF; } -static void DSP_imacn(void) +INLINE static void DSP_imacn(void) { int32_t res = (int16_t)PRM * (int16_t)PRN; dsp_acc += (int64_t)res; @@ -2065,13 +1946,13 @@ static void DSP_imacn(void) NO_WRITEBACK; } -static void DSP_imult(void) +INLINE static void DSP_imult(void) { PRES = (int16_t)PRN * (int16_t)PRM; SET_ZN(PRES); } -static void DSP_imultn(void) +INLINE static void DSP_imultn(void) { // This is OK, since this multiply won't overflow 32 bits... int32_t res = (int32_t)((int16_t)PRN * (int16_t)PRM); @@ -2080,7 +1961,7 @@ static void DSP_imultn(void) NO_WRITEBACK; } -static void DSP_illegal(void) +INLINE static void DSP_illegal(void) { NO_WRITEBACK; } @@ -2090,7 +1971,7 @@ static void DSP_illegal(void) // jump can execute... !!! FIX !!! // This can probably be solved by judicious coding in the pipeline execution core... // And should be fixed now... 
-static void DSP_jr(void) +INLINE static void DSP_jr(void) { // KLUDGE: Used by BRANCH_CONDITION macro uint32_t jaguar_flags = (dsp_flag_n << 2) | (dsp_flag_c << 1) | dsp_flag_z; @@ -2168,7 +2049,7 @@ static void DSP_jr(void) NO_WRITEBACK; } -static void DSP_jump(void) +INLINE static void DSP_jump(void) { // KLUDGE: Used by BRANCH_CONDITION macro uint32_t jaguar_flags = (dsp_flag_n << 2) | (dsp_flag_c << 1) | dsp_flag_z; @@ -2244,7 +2125,7 @@ static void DSP_jump(void) NO_WRITEBACK; } -static void DSP_load(void) +INLINE static void DSP_load(void) { #ifdef DSP_CORRECT_ALIGNMENT PRES = DSPReadLong(PRM & 0xFFFFFFFC, DSP); @@ -2253,7 +2134,7 @@ static void DSP_load(void) #endif } -static void DSP_loadb(void) +INLINE static void DSP_loadb(void) { if (PRM >= DSP_WORK_RAM_BASE && PRM <= (DSP_WORK_RAM_BASE + 0x1FFF)) PRES = DSPReadLong(PRM, DSP) & 0xFF; @@ -2261,7 +2142,7 @@ static void DSP_loadb(void) PRES = JaguarReadByte(PRM, DSP); } -static void DSP_loadw(void) +INLINE static void DSP_loadw(void) { #ifdef DSP_CORRECT_ALIGNMENT if (PRM >= DSP_WORK_RAM_BASE && PRM <= (DSP_WORK_RAM_BASE + 0x1FFF)) @@ -2276,7 +2157,7 @@ static void DSP_loadw(void) #endif } -static void DSP_load_r14_i(void) +INLINE static void DSP_load_r14_i(void) { #ifdef DSP_CORRECT_ALIGNMENT PRES = DSPReadLong((dsp_reg[14] & 0xFFFFFFFC) + (dsp_convert_zero[PIMM1] << 2), DSP); @@ -2285,7 +2166,7 @@ static void DSP_load_r14_i(void) #endif } -static void DSP_load_r14_r(void) +INLINE static void DSP_load_r14_r(void) { #ifdef DSP_CORRECT_ALIGNMENT PRES = DSPReadLong((dsp_reg[14] + PRM) & 0xFFFFFFFC, DSP); @@ -2294,7 +2175,7 @@ static void DSP_load_r14_r(void) #endif } -static void DSP_load_r15_i(void) +INLINE static void DSP_load_r15_i(void) { #ifdef DSP_CORRECT_ALIGNMENT PRES = DSPReadLong((dsp_reg[15] &0xFFFFFFFC) + (dsp_convert_zero[PIMM1] << 2), DSP); @@ -2303,7 +2184,7 @@ static void DSP_load_r15_i(void) #endif } -static void DSP_load_r15_r(void) +INLINE static void DSP_load_r15_r(void) { #ifdef 
DSP_CORRECT_ALIGNMENT PRES = DSPReadLong((dsp_reg[15] + PRM) & 0xFFFFFFFC, DSP); @@ -2312,14 +2193,14 @@ static void DSP_load_r15_r(void) #endif } -static void DSP_mirror(void) +INLINE static void DSP_mirror(void) { uint32_t r1 = PRN; PRES = (mirror_table[r1 & 0xFFFF] << 16) | mirror_table[r1 >> 16]; SET_ZN(PRES); } -static void DSP_mmult(void) +INLINE static void DSP_mmult(void) { uint32_t res; unsigned i; @@ -2366,64 +2247,64 @@ static void DSP_mmult(void) SET_ZN(PRES); } -static void DSP_move(void) +INLINE static void DSP_move(void) { PRES = PRM; } -static void DSP_movefa(void) +INLINE static void DSP_movefa(void) { PRES = dsp_alternate_reg[PIMM1]; } -static void DSP_movei(void) +INLINE static void DSP_movei(void) { // // This instruction is followed by 32-bit value in LSW / MSW format... } -static void DSP_movepc(void) +INLINE static void DSP_movepc(void) { //Need to fix this to take into account pipelining effects... !!! FIX !!! [DONE] //Account for pipeline effects... PRES = dsp_pc - 2 - (pipeline[plPtrRead].opcode == 38 ? 6 : (pipeline[plPtrRead].opcode == PIPELINE_STALL ? 
0 : 2)); } -static void DSP_moveq(void) +INLINE static void DSP_moveq(void) { PRES = PIMM1; } -static void DSP_moveta(void) +INLINE static void DSP_moveta(void) { dsp_alternate_reg[PIMM2] = PRM; NO_WRITEBACK; } -static void DSP_mtoi(void) +INLINE static void DSP_mtoi(void) { PRES = (((int32_t)PRM >> 8) & 0xFF800000) | (PRM & 0x007FFFFF); SET_ZN(PRES); } -static void DSP_mult(void) +INLINE static void DSP_mult(void) { PRES = (uint16_t)PRM * (uint16_t)PRN; SET_ZN(PRES); } -static void DSP_neg(void) +INLINE static void DSP_neg(void) { uint32_t res = -PRN; SET_ZNC_SUB(0, PRN, res); PRES = res; } -static void DSP_nop(void) +INLINE static void DSP_nop(void) { NO_WRITEBACK; } -static void DSP_normi(void) +INLINE static void DSP_normi(void) { uint32_t _Rm = PRM; uint32_t res = 0; @@ -2445,24 +2326,24 @@ static void DSP_normi(void) SET_ZN(PRES); } -static void DSP_not(void) +INLINE static void DSP_not(void) { PRES = ~PRN; SET_ZN(PRES); } -static void DSP_or(void) +INLINE static void DSP_or(void) { PRES = PRN | PRM; SET_ZN(PRES); } -static void DSP_resmac(void) +INLINE static void DSP_resmac(void) { PRES = (uint32_t)dsp_acc; } -static void DSP_ror(void) +INLINE static void DSP_ror(void) { uint32_t r1 = PRM & 0x1F; uint32_t res = (PRN >> r1) | (PRN << (32 - r1)); @@ -2470,7 +2351,7 @@ static void DSP_ror(void) PRES = res; } -static void DSP_rorq(void) +INLINE static void DSP_rorq(void) { uint32_t r1 = dsp_convert_zero[PIMM1 & 0x1F]; uint32_t r2 = PRN; @@ -2479,7 +2360,7 @@ static void DSP_rorq(void) SET_ZN(res); dsp_flag_c = (r2 >> 31) & 0x01; } -static void DSP_sat16s(void) +INLINE static void DSP_sat16s(void) { int32_t r2 = PRN; uint32_t res = (r2 < -32768) ? -32768 : (r2 > 32767) ? 
32767 : r2; @@ -2487,7 +2368,7 @@ static void DSP_sat16s(void) SET_ZN(res); } -static void DSP_sat32s(void) +INLINE static void DSP_sat32s(void) { int32_t r2 = (uint32_t)PRN; int32_t temp = dsp_acc >> 32; @@ -2496,7 +2377,7 @@ static void DSP_sat32s(void) SET_ZN(res); } -static void DSP_sh(void) +INLINE static void DSP_sh(void) { int32_t sRm = (int32_t)PRM; uint32_t _Rn = PRN; @@ -2536,7 +2417,7 @@ static void DSP_sh(void) SET_ZN(PRES); } -static void DSP_sha(void) +INLINE static void DSP_sha(void) { int32_t sRm = (int32_t)PRM; uint32_t _Rn = PRN; @@ -2576,14 +2457,14 @@ static void DSP_sha(void) SET_ZN(PRES); } -static void DSP_sharq(void) +INLINE static void DSP_sharq(void) { uint32_t res = (int32_t)PRN >> dsp_convert_zero[PIMM1]; SET_ZN(res); dsp_flag_c = PRN & 0x01; PRES = res; } -static void DSP_shlq(void) +INLINE static void DSP_shlq(void) { int32_t r1 = 32 - PIMM1; uint32_t res = PRN << r1; @@ -2591,7 +2472,7 @@ static void DSP_shlq(void) PRES = res; } -static void DSP_shrq(void) +INLINE static void DSP_shrq(void) { int32_t r1 = dsp_convert_zero[PIMM1]; uint32_t res = PRN >> r1; @@ -2599,7 +2480,7 @@ static void DSP_shrq(void) PRES = res; } -static void DSP_store(void) +INLINE static void DSP_store(void) { #ifdef DSP_CORRECT_ALIGNMENT_STORE pipeline[plPtrExec].address = PRM & 0xFFFFFFFC; @@ -2611,7 +2492,7 @@ static void DSP_store(void) WRITEBACK_ADDR; } -static void DSP_storeb(void) +INLINE static void DSP_storeb(void) { pipeline[plPtrExec].address = PRM; @@ -2629,7 +2510,7 @@ static void DSP_storeb(void) WRITEBACK_ADDR; } -static void DSP_storew(void) +INLINE static void DSP_storew(void) { #ifdef DSP_CORRECT_ALIGNMENT_STORE pipeline[plPtrExec].address = PRM & 0xFFFFFFFE; @@ -2650,7 +2531,7 @@ static void DSP_storew(void) WRITEBACK_ADDR; } -static void DSP_store_r14_i(void) +INLINE static void DSP_store_r14_i(void) { #ifdef DSP_CORRECT_ALIGNMENT_STORE pipeline[plPtrExec].address = (dsp_reg[14] & 0xFFFFFFFC) + (dsp_convert_zero[PIMM1] << 2); @@ -2662,7 
+2543,7 @@ static void DSP_store_r14_i(void) WRITEBACK_ADDR; } -static void DSP_store_r14_r(void) +INLINE static void DSP_store_r14_r(void) { #ifdef DSP_CORRECT_ALIGNMENT_STORE pipeline[plPtrExec].address = (dsp_reg[14] + PRM) & 0xFFFFFFFC; @@ -2674,7 +2555,7 @@ static void DSP_store_r14_r(void) WRITEBACK_ADDR; } -static void DSP_store_r15_i(void) +INLINE static void DSP_store_r15_i(void) { #ifdef DSP_CORRECT_ALIGNMENT_STORE pipeline[plPtrExec].address = (dsp_reg[15] & 0xFFFFFFFC) + (dsp_convert_zero[PIMM1] << 2); @@ -2686,7 +2567,7 @@ static void DSP_store_r15_i(void) WRITEBACK_ADDR; } -static void DSP_store_r15_r(void) +INLINE static void DSP_store_r15_r(void) { #ifdef DSP_CORRECT_ALIGNMENT_STORE pipeline[plPtrExec].address = (dsp_reg[15] + PRM) & 0xFFFFFFFC; @@ -2698,14 +2579,14 @@ static void DSP_store_r15_r(void) WRITEBACK_ADDR; } -static void DSP_sub(void) +INLINE static void DSP_sub(void) { uint32_t res = PRN - PRM; SET_ZNC_SUB(PRN, PRM, res); PRES = res; } -static void DSP_subc(void) +INLINE static void DSP_subc(void) { uint32_t res = PRN - PRM - dsp_flag_c; uint32_t borrow = dsp_flag_c; @@ -2713,7 +2594,7 @@ static void DSP_subc(void) PRES = res; } -static void DSP_subq(void) +INLINE static void DSP_subq(void) { uint32_t r1 = dsp_convert_zero[PIMM1]; uint32_t res = PRN - r1; @@ -2721,7 +2602,7 @@ static void DSP_subq(void) PRES = res; } -static void DSP_subqmod(void) +INLINE static void DSP_subqmod(void) { uint32_t r1 = dsp_convert_zero[PIMM1]; uint32_t r2 = PRN; @@ -2731,12 +2612,12 @@ static void DSP_subqmod(void) SET_ZNC_SUB(r2, r1, res); } -static void DSP_subqt(void) +INLINE static void DSP_subqt(void) { PRES = PRN - dsp_convert_zero[PIMM1]; } -static void DSP_xor(void) +INLINE static void DSP_xor(void) { PRES = PRN ^ PRM; SET_ZN(PRES); From 471fee91c3bd636d192572e2eb91687dcbb07899 Mon Sep 17 00:00:00 2001 From: Joe Mattiello Date: Tue, 27 Mar 2018 22:00:51 -0400 Subject: [PATCH 05/34] byte pack struct --- src/m68000/readcpu.h | 2 ++ 1 file changed, 2 
insertions(+) diff --git a/src/m68000/readcpu.h b/src/m68000/readcpu.h index 5fe0d495..721706cc 100644 --- a/src/m68000/readcpu.h +++ b/src/m68000/readcpu.h @@ -88,6 +88,7 @@ struct instr_def { extern const struct instr_def defs68k[]; extern int n_defs68k; +#pragma pack(push, 1) extern struct instr { long int handler; unsigned char dreg; @@ -110,6 +111,7 @@ extern struct instr { unsigned int isjmp:1; unsigned int unused2:4; } *table68k; +#pragma pack(pop) extern void read_table68k(void); extern void do_merges(void); From b906a885a02d82ad2c851b9a96c26cafd1b123d1 Mon Sep 17 00:00:00 2001 From: Joe Mattiello Date: Tue, 27 Mar 2018 22:01:08 -0400 Subject: [PATCH 06/34] Move memory structs to header --- src/vjag_memory.h | 148 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 147 insertions(+), 1 deletion(-) diff --git a/src/vjag_memory.h b/src/vjag_memory.h index 44749a92..45131210 100644 --- a/src/vjag_memory.h +++ b/src/vjag_memory.h @@ -13,6 +13,145 @@ extern "C" { #endif +typedef union Bits32 { + uint32_t WORD; + struct Words { +#ifdef LITTLE_ENDIAN + uint16_t LWORD; + uint16_t UWORD; +#else + uint16_t UWORD; + uint16_t LWORD; +#endif + } Words; + struct bits { +#ifdef LITTLE_ENDIAN + unsigned int b0: 1; + unsigned int b1: 1; + unsigned int b2: 1; + unsigned int b3: 1; + unsigned int b4: 1; + unsigned int b5: 1; + unsigned int b6: 1; + unsigned int b7: 1; + unsigned int b8: 1; + unsigned int b9: 1; + unsigned int b10: 1; + unsigned int b11: 1; + unsigned int b12: 1; + unsigned int b13: 1; + unsigned int b14: 1; + unsigned int b15: 1; + unsigned int b16: 1; + unsigned int b17: 1; + unsigned int b18: 1; + unsigned int b19: 1; + unsigned int b20: 1; + unsigned int b21: 1; + unsigned int b22: 1; + unsigned int b23: 1; + unsigned int b24: 1; + unsigned int b25: 1; + unsigned int b26: 1; + unsigned int b27: 1; + unsigned int b28: 1; + unsigned int b29: 1; + unsigned int b30: 1; + unsigned int b31: 1; +#else + // reverse the order of the bit fields. 
+ unsigned int b31: 1; + unsigned int b30: 1; + unsigned int b29: 1; + unsigned int b28: 1; + unsigned int b27: 1; + unsigned int b26: 1; + unsigned int b25: 1; + unsigned int b24: 1; + unsigned int b23: 1; + unsigned int b22: 1; + unsigned int b21: 1; + unsigned int b20: 1; + unsigned int b19: 1; + unsigned int b18: 1; + unsigned int b17: 1; + unsigned int b16: 1; + unsigned int b15: 1; + unsigned int b14: 1; + unsigned int b13: 1; + unsigned int b12: 1; + unsigned int b11: 1; + unsigned int b10: 1; + unsigned int b9: 1; + unsigned int b8: 1; + unsigned int b7: 1; + unsigned int b6: 1; + unsigned int b5: 1; + unsigned int b4: 1; + unsigned int b3: 1; + unsigned int b2: 1; + unsigned int b1: 1; + unsigned int b0: 1; +#endif + } bits; +} Bits32; + +#pragma pack(push, 1) +typedef union OpCode { + uint16_t WORD; + struct Bytes { +#ifdef LITTLE_ENDIAN + uint8_t LBYTE; + uint8_t UBYTE; +#else + uint8_t UBYTE; + uint8_t LBYTE; +#endif + } Bytes; + struct Codes { +#ifdef LITTLE_ENDIAN + unsigned int second : 5; + unsigned int first : 5; + unsigned int index : 6; +#else + unsigned int index : 6; + unsigned int first : 5; + unsigned int second : 5; +#endif + } Codes; +} OpCode; +#pragma pack(pop) + +typedef OpCode U16Union; + +typedef union Offset { + uint32_t LONG; +#pragma pack(push, 1) + struct Members { +#ifdef LITTLE_ENDIAN + unsigned int offset : 31; + unsigned int bit : 1; +#else + unsigned int bit : 1; + unsigned int offset : 31; +#endif + } Members; +#pragma pack(pop) +} Offset; + +typedef union DSPLong { + uint32_t LONG; + struct Data { +#ifdef LITTLE_ENDIAN + uint16_t LWORD; + uint16_t UWORD; +#else + uint16_t UWORD; + uint16_t LWORD; +#endif + } Data; +} DSPLong; + extern uint8_t jagMemSpace[]; extern uint8_t * jaguarMainRAM; @@ -76,7 +215,14 @@ extern const char * whoName[10]; r[(a)+2] = ((v) & 0x0000FF00) >> 8, r[(a)+3] = (v) & 0x000000FF #define GET32(r, a) ((r[(a)] << 24) | (r[(a)+1] << 16) | (r[(a)+2] << 8) | r[(a)+3]) #define SET16(r, a, v) r[(a)] = ((v) & 
0xFF00) >> 8, r[(a)+1] = (v) & 0xFF -#define GET16(r, a) ((r[(a)] << 8) | r[(a)+1]) + + +INLINE static uint16_t GET16(uint8_t* r,uint32_t a) { + U16Union u16; + u16.Bytes.UBYTE = r[a]; + u16.Bytes.LBYTE = r[a+1]; + return u16.WORD; +} #ifdef __cplusplus } From 52eeeb0cd4075819e2958500df8367260623ccb3 Mon Sep 17 00:00:00 2001 From: Joe Mattiello Date: Tue, 27 Mar 2018 22:11:36 -0400 Subject: [PATCH 07/34] Disable some of my speed hacks to test mem corruption --- src/dsp.c | 12 ++++++++++++ src/vjag_memory.h | 21 +++++++++++++++------ 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/src/dsp.c b/src/dsp.c index 04670e8e..39c460e7 100644 --- a/src/dsp.c +++ b/src/dsp.c @@ -963,6 +963,7 @@ void DSPExec(int32_t cycles) IMASKCleared = false; } +#ifdef USE_STRUCTS OpCode opcode; opcode.WORD = DSPReadWord(dsp_pc, DSP); uint8_t index = opcode.Codes.index; @@ -972,6 +973,17 @@ void DSPExec(int32_t cycles) dsp_opcode_second_parameter = sp; dsp_pc += 2; dsp_opcode[index](); +#else + uint16_t opcode; + uint32_t index; + opcode = DSPReadWord(dsp_pc, DSP); + index = opcode >> 10; + dsp_opcode_first_parameter = (opcode >> 5) & 0x1F; + dsp_opcode_second_parameter = opcode & 0x1F; + dsp_pc += 2; + dsp_opcode[index](); + dsp_opcode_use[index]++; +#endif // Counter is not necessary and expensive -jm prov // dsp_opcode_use[index]++; cycles -= dsp_opcode_cycles[index]; diff --git a/src/vjag_memory.h b/src/vjag_memory.h index 45131210..77a14f4a 100644 --- a/src/vjag_memory.h +++ b/src/vjag_memory.h @@ -96,6 +96,9 @@ typedef union Bits32 { } bits; } Bits32; +#ifdef USE_STRUCTS + + #pragma pack(push, 1) typedef union OpCode { uint16_t WORD; @@ -124,6 +127,8 @@ typedef union OpCode { typedef OpCode U16Union; +#endif + typedef union Offset { uint32_t LONG; #pragma pack(push, 1) @@ -217,12 +222,16 @@ extern const char * whoName[10]; #define SET16(r, a, v) r[(a)] = ((v) & 0xFF00) >> 8, r[(a)+1] = (v) & 0xFF -INLINE static uint16_t GET16(uint8_t* r,uint32_t a) { - U16Union u16; - 
u16.Bytes.UBYTE = r[a]; - u16.Bytes.LBYTE = r[a+1]; - return u16.WORD; -} +#ifdef USE_STRUCTS + INLINE static uint16_t GET16(uint8_t* r,uint32_t a) { + U16Union u16; + u16.Bytes.UBYTE = r[a]; + u16.Bytes.LBYTE = r[a+1]; + return u16.WORD; + } +#else + #define GET16(r, a) ((r[(a)] << 8) | r[(a)+1]) +#endif #ifdef __cplusplus } From fb695b1b0e1f104136428834b2b609c2139c7f8c Mon Sep 17 00:00:00 2001 From: Joe Mattiello Date: Tue, 27 Mar 2018 23:08:40 -0400 Subject: [PATCH 08/34] Fix memory corruption issue in joystick code The values for offset0 and offset1 were coming out to 63 when they should be no more than 3. I think the divide should have been a modulus? I wrote out the code with more vars to figure out what was going on --- src/joystick.c | 62 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 17 deletions(-) diff --git a/src/joystick.c b/src/joystick.c index 51d2c966..08d2acbf 100644 --- a/src/joystick.c +++ b/src/joystick.c @@ -75,10 +75,10 @@ void JoystickDone(void) uint16_t JoystickReadWord(uint32_t offset) { /* E, D, B, 7 */ - uint8_t joypad0Offset[16] = { + const uint8_t joypad0Offset[16] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0C, 0xFF, 0xFF, 0xFF, 0x08, 0xFF, 0x04, 0x00, 0xFF }; - uint8_t joypad1Offset[16] = { + const uint8_t joypad1Offset[16] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0xFF, 0xFF, 0xFF, 0x04, 0xFF, 0x08, 0x0C, 0xFF }; @@ -90,8 +90,9 @@ uint16_t JoystickReadWord(uint32_t offset) uint8_t offset0, offset1; uint16_t data = 0xFFFF; - if (!joysticksEnabled) + if (!joysticksEnabled) { return 0xFFFF; + } // Joystick data returns active low for buttons pressed, high for non- // pressed. @@ -134,25 +135,52 @@ uint16_t JoystickReadWord(uint32_t offset) // Joystick data returns active low for buttons pressed, high for non- // pressed. 
- offset0 = joypad0Offset[joystick_ram[1] & 0x0F] / 4; - offset1 = joypad1Offset[(joystick_ram[1] >> 4) & 0x0F] / 4; + uint8_t jrmLow = joystick_ram[1] & 0x0F; + uint8_t jrmHigh = (joystick_ram[1] >> 4) & 0x0F; + uint8_t jp0offset = joypad0Offset[jrmLow]; + uint8_t jp1offset = joypad1Offset[jrmHigh]; - if (offset0 != 0xFF) - { - int8_t mask[4][2] = { { BUTTON_A, BUTTON_PAUSE }, { BUTTON_B, 0xFF }, { BUTTON_C, 0xFF }, { BUTTON_OPTION, 0xFF } }; - data &= (joypad0Buttons[mask[offset0][0]] ? 0xFFFD : 0xFFFF); + offset0 = jp0offset % 4; + offset1 = jp1offset % 4; - if (mask[offset0][1] != 0xFF) - data &= (joypad0Buttons[mask[offset0][1]] ? 0xFFFE : 0xFFFF); - } + const int8_t mask[4][2] = { + { BUTTON_A, BUTTON_PAUSE }, + { BUTTON_B, 0xFF }, + { BUTTON_C, 0xFF }, + { BUTTON_OPTION, 0xFF } }; - if (offset1 != 0xFF) + if (offset0 != 0xFF && offset0 < 4) { - int8_t mask[4][2] = { { BUTTON_A, BUTTON_PAUSE }, { BUTTON_B, 0xFF }, { BUTTON_C, 0xFF }, { BUTTON_OPTION, 0xFF } }; - data &= (joypad1Buttons[mask[offset1][0]] ? 0xFFF7 : 0xFFFF); + uint8_t i0 = mask[offset0][0]; + uint8_t i1 = mask[offset0][1]; + + uint8_t maskOffset00 = joypad0Buttons[i0]; + uint8_t maskOffset01 = joypad0Buttons[i1]; + + data &= (maskOffset00 ? 0xFFFD : 0xFFFF); + + if (maskOffset01 != 0xFF) { + uint8_t button = joypad0Buttons[maskOffset01]; + uint16_t dataMask = (button ? 0xFFFE : 0xFFFF); + data &= dataMask; + } + } - if (mask[offset1][1] != 0xFF) - data &= (joypad1Buttons[mask[offset1][1]] ? 0xFFFB : 0xFFFF); + if (offset1 != 0xFF && offset1 < 4) + { + uint8_t i0 = mask[offset1][0]; + uint8_t i1 = mask[offset1][1]; + + uint8_t maskOffset10 = joypad1Buttons[i0]; + uint8_t maskOffset11 = joypad1Buttons[i1]; + + data &= (maskOffset10 ? 0xFFF7 : 0xFFFF); + + if (maskOffset11 != 0xFF) { + uint8_t button = joypad1Buttons[maskOffset11]; + uint16_t dataMask = (button ? 
0xFFFB : 0xFFFF); + data &= dataMask; + } } return data; From 744adf2f8554b9e69312a5d1aa3741dc0f30c3a6 Mon Sep 17 00:00:00 2001 From: Joe Mattiello Date: Tue, 27 Mar 2018 23:09:18 -0400 Subject: [PATCH 09/34] Wrap more of my structs in ifdefs for testing --- src/dsp.c | 13 ++++++++++++- src/joystick.h | 4 ++-- src/vjag_memory.h | 49 +++++++++++++++++++++++------------------------ 3 files changed, 38 insertions(+), 28 deletions(-) diff --git a/src/dsp.c b/src/dsp.c index 39c460e7..9701964c 100644 --- a/src/dsp.c +++ b/src/dsp.c @@ -398,10 +398,13 @@ uint8_t DSPReadByte(uint32_t offset, uint32_t who/*=UNKNOWN*/) uint16_t DSPReadWord(uint32_t offset, uint32_t who/*=UNKNOWN*/) { +#ifdef USE_STRUCTS Offset offsett; offsett.LONG = offset; offset = offsett.Members.offset; - +#else + offset &= 0xFFFFFFFE; +#endif if (offset >= DSP_WORK_RAM_BASE && offset <= DSP_WORK_RAM_BASE+0x1FFF) { offset -= DSP_WORK_RAM_BASE; @@ -409,6 +412,7 @@ uint16_t DSPReadWord(uint32_t offset, uint32_t who/*=UNKNOWN*/) } else if ((offset>=DSP_CONTROL_RAM_BASE)&&(offset> 16; +#endif } return JaguarReadWord(offset, who); diff --git a/src/joystick.h b/src/joystick.h index 599afd2a..41cee209 100644 --- a/src/joystick.h +++ b/src/joystick.h @@ -13,7 +13,7 @@ extern "C" { #endif -enum +typedef enum BUTTON { BUTTON_FIRST = 0, BUTTON_U = 0, @@ -40,7 +40,7 @@ enum BUTTON_OPTION = 19, BUTTON_PAUSE = 20, BUTTON_LAST = 20 -}; +} BUTTON; void JoystickInit(void); void JoystickReset(void); diff --git a/src/vjag_memory.h b/src/vjag_memory.h index 77a14f4a..f50bb7ac 100644 --- a/src/vjag_memory.h +++ b/src/vjag_memory.h @@ -95,40 +95,38 @@ typedef union Bits32 { #endif } bits; } Bits32; - -#ifdef USE_STRUCTS - +#ifdef USE_STRUCTS #pragma pack(push, 1) -typedef union OpCode { - uint16_t WORD; - struct Bytes { + typedef union OpCode { + uint16_t WORD; + struct Bytes { #ifdef LITTLE_ENDIAN - uint8_t LBYTE; - uint8_t UBYTE; + uint8_t LBYTE; + uint8_t UBYTE; #else - uint8_t UBYTE; - uint8_t LBYTE; + uint8_t UBYTE; + 
uint8_t LBYTE; #endif - } Bytes; - struct Codes { + } Bytes; + struct Codes { #ifdef LITTLE_ENDIAN - unsigned int second : 5; - unsigned int first : 5; - unsigned int index : 6; + unsigned int second : 5; + unsigned int first : 5; + unsigned int index : 6; #else - unsigned int index : 6; - unsigned int first : 5; - unsigned int second : 5; + unsigned int index : 6; + unsigned int first : 5; + unsigned int second : 5; #endif - } Codes; -} OpCode; + } Codes; + } OpCode; #pragma pack(pop) - -typedef OpCode U16Union; - -#endif + typedef OpCode U16Union; +#endif //USE_STRUCTS + +#ifdef USE_STRUCTS typedef union Offset { uint32_t LONG; #pragma pack(push, 1) @@ -143,7 +141,8 @@ typedef union Offset { } Members; #pragma pack(pop) } Offset; - +#endif //USE_STRUCTS + typedef union DSPLong { uint32_t LONG; struct Data { From f4ebb99aa85369f774d41ba764a17a3e45a6889d Mon Sep 17 00:00:00 2001 From: Joe Mattiello Date: Tue, 27 Mar 2018 23:29:26 -0400 Subject: [PATCH 10/34] More joystick fixes copying official version looking at the official shamusworld repo the division was moved after the 0xFF checks --- src/joystick.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/joystick.c b/src/joystick.c index 08d2acbf..3a62f90e 100644 --- a/src/joystick.c +++ b/src/joystick.c @@ -140,8 +140,8 @@ uint16_t JoystickReadWord(uint32_t offset) uint8_t jp0offset = joypad0Offset[jrmLow]; uint8_t jp1offset = joypad1Offset[jrmHigh]; - offset0 = jp0offset % 4; - offset1 = jp1offset % 4; + offset0 = jp0offset; // % 4; + offset1 = jp1offset; // % 4; const int8_t mask[4][2] = { { BUTTON_A, BUTTON_PAUSE }, @@ -149,8 +149,10 @@ uint16_t JoystickReadWord(uint32_t offset) { BUTTON_C, 0xFF }, { BUTTON_OPTION, 0xFF } }; - if (offset0 != 0xFF && offset0 < 4) + if (offset0 != 0xFF) { + offset0 /= 4; + uint8_t i0 = mask[offset0][0]; uint8_t i1 = mask[offset0][1]; @@ -166,8 +168,10 @@ uint16_t JoystickReadWord(uint32_t offset) } } - if (offset1 != 0xFF && offset1 < 4) + if 
(offset1 != 0xFF) { + offset1 /= 4; + uint8_t i0 = mask[offset1][0]; uint8_t i1 = mask[offset1][1]; From 6ecfc51c5278fbf6c2a6df71c5b23fa11e2b40b9 Mon Sep 17 00:00:00 2001 From: Joe Mattiello Date: Wed, 28 Mar 2018 00:18:24 -0400 Subject: [PATCH 11/34] gpu_control code clarity and optimization The GPU_RUNNING macro was pretty slow on ARM for some reason. Bit-field structs are faster in my testing --- src/gpu.c | 21 +++++++++++---------- src/vjag_memory.h | 40 +++++++++++++++++++++++++++++----------- 2 files changed, 40 insertions(+), 21 deletions(-) diff --git a/src/gpu.c b/src/gpu.c index b816cfe8..cc642a18 100644 --- a/src/gpu.c +++ b/src/gpu.c @@ -180,7 +180,8 @@ static uint32_t gpu_flags; static uint32_t gpu_matrix_control; static uint32_t gpu_pointer_to_matrix; static uint32_t gpu_data_organization; -static uint32_t gpu_control; +static GPUControl gpu_control; + static uint32_t gpu_div_control; // There is a distinct advantage to having these separated out--there's no need to clear // a bit before writing a result. 
I.e., if the result of an operation leaves a zero in @@ -195,7 +196,7 @@ static uint32_t gpu_instruction; static uint32_t gpu_opcode_first_parameter; static uint32_t gpu_opcode_second_parameter; -#define GPU_RUNNING (gpu_control & 0x01) +#define GPU_RUNNING (gpu_control.bits.b0) #define RM gpu_reg[gpu_opcode_first_parameter] #define RN gpu_reg[gpu_opcode_second_parameter] @@ -386,7 +387,7 @@ uint32_t GPUReadLong(uint32_t offset, uint32_t who/*=UNKNOWN*/) case 0x10: return gpu_pc; case 0x14: - return gpu_control; + return gpu_control.WORD; case 0x18: return gpu_hidata; case 0x1C: @@ -499,7 +500,7 @@ void GPUWriteLong(uint32_t offset, uint32_t data, uint32_t who/*=UNKNOWN*/) gpu_flag_c = (gpu_flags & CARRY_FLAG) >> 1; gpu_flag_n = (gpu_flags & NEGA_FLAG) >> 2; GPUUpdateRegisterBanks(); - gpu_control &= ~((gpu_flags & CINT04FLAGS) >> 3); // Interrupt latch clear bits + gpu_control.WORD &= ~((gpu_flags & CINT04FLAGS) >> 3); // Interrupt latch clear bits //Writing here is only an interrupt enable--this approach is just plain wrong! // GPUHandleIRQs(); //This, however, is A-OK! 
;-) @@ -550,7 +551,7 @@ void GPUWriteLong(uint32_t offset, uint32_t data, uint32_t who/*=UNKNOWN*/) data &= ~0x04; } - gpu_control = (gpu_control & 0xF7C0) | (data & (~0xF7C0)); + gpu_control.WORD = (gpu_control.WORD & 0xF7C0) | (data & (~0xF7C0)); // if gpu wasn't running but is now running, execute a few cycles #ifdef GPU_SINGLE_STEPPING @@ -606,7 +607,7 @@ void GPUHandleIRQs(void) return; // Get the interrupt latch & enable bits - bits = (gpu_control >> 6) & 0x1F; + bits = gpu_control.gpuIRQ.irqMask; //(gpu_control >> 6) & 0x1F; mask = (gpu_flags >> 4) & 0x1F; // Bail out if latched interrupts aren't enabled @@ -645,11 +646,11 @@ void GPUHandleIRQs(void) void GPUSetIRQLine(int irqline, int state) { uint32_t mask = 0x0040 << irqline; - gpu_control &= ~mask; // Clear the interrupt latch + gpu_control.WORD &= ~mask; // Clear the interrupt latch if (state) { - gpu_control |= mask; // Assert the interrupt latch + gpu_control.WORD |= mask; // Assert the interrupt latch GPUHandleIRQs(); // And handle the interrupt... } } @@ -671,7 +672,7 @@ void GPUReset(void) gpu_pointer_to_matrix = 0x00000000; gpu_data_organization = 0xFFFFFFFF; gpu_pc = 0x00F03000; - gpu_control = 0x00002800; // Correctly sets this as TOM Rev. 2 + gpu_control.WORD = 0x00002800; // Correctly sets this as TOM Rev. 2 gpu_hidata = 0x00000000; gpu_remain = 0x00000000; // These two registers are RO/WO gpu_div_control = 0x00000000; @@ -764,7 +765,7 @@ void GPUDone(void) WriteLog("GPU: Stopped at PC=%08X (GPU %s running)\n", (unsigned int)gpu_pc, GPU_RUNNING ? 
"was" : "wasn't"); // Get the interrupt latch & enable bits - bits = (gpu_control >> 6) & 0x1F; + bits = gpu_control.gpuIRQ.irqMask; //(gpu_control >> 6) & 0x1F; mask = (gpu_flags >> 4) & 0x1F; WriteLog("GPU: Latch bits = %02X, enable bits = %02X\n", bits, mask); diff --git a/src/vjag_memory.h b/src/vjag_memory.h index f50bb7ac..0f90cadf 100644 --- a/src/vjag_memory.h +++ b/src/vjag_memory.h @@ -23,8 +23,8 @@ typedef union Bits32 { uint16_t UWORD; uint16_t LWORD; #endif - } Words; - struct bits { + } words; + struct Bits { #ifdef LITTLE_ENDIAN unsigned int b0: 1; unsigned int b1: 1; @@ -96,6 +96,24 @@ typedef union Bits32 { } bits; } Bits32; +typedef union GPUControl { + uint32_t WORD; + struct Words words; + struct Bits bits; + struct __attribute__ ((__packed__)) { +#ifdef LITTLE_ENDIAN + unsigned int : 6; + unsigned int irqMask: 5; + unsigned int : 21; +#else + unsigned int : 21; + unsigned int irqMask: 5; + unsigned int : 6; +#endif + } gpuIRQ; + +} GPUControl; + #ifdef USE_STRUCTS #pragma pack(push, 1) typedef union OpCode { @@ -221,16 +239,16 @@ extern const char * whoName[10]; #define SET16(r, a, v) r[(a)] = ((v) & 0xFF00) >> 8, r[(a)+1] = (v) & 0xFF -#ifdef USE_STRUCTS - INLINE static uint16_t GET16(uint8_t* r,uint32_t a) { - U16Union u16; - u16.Bytes.UBYTE = r[a]; - u16.Bytes.LBYTE = r[a+1]; - return u16.WORD; - } -#else +//#ifdef USE_STRUCTS +// INLINE static uint16_t GET16(uint8_t* r,uint32_t a) { +// U16Union u16; +// u16.Bytes.UBYTE = r[a]; +// u16.Bytes.LBYTE = r[a+1]; +// return u16.WORD; +// } +//#else #define GET16(r, a) ((r[(a)] << 8) | r[(a)+1]) -#endif +//#endif #ifdef __cplusplus } From de607f66960e9ea635a60a46b560d1024a274ce8 Mon Sep 17 00:00:00 2001 From: Joe Mattiello Date: Wed, 28 Mar 2018 00:52:36 -0400 Subject: [PATCH 12/34] Unroll loop --- src/blitter.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/blitter.c b/src/blitter.c index fb81bd98..c705e3c2 100644 --- a/src/blitter.c +++ b/src/blitter.c @@ -2470,12 
+2470,14 @@ void ADDARRAY(uint16_t * addq, uint8_t daddasel, uint8_t daddbsel, uint8_t daddm hicinh = ((daddmode & 0x03) == 0x03); //Note that the carry out is saved between calls to this function... - for( i=0; i<4; i++) - ADD16SAT(&addq[i], &co[i], adda[i], addb[i], cin[i], sat, eightbit, hicinh); + ADD16SAT(&addq[0], &co[0], adda[0], addb[0], cin[0], sat, eightbit, hicinh); + ADD16SAT(&addq[1], &co[1], adda[1], addb[1], cin[1], sat, eightbit, hicinh); + ADD16SAT(&addq[2], &co[2], adda[2], addb[2], cin[2], sat, eightbit, hicinh); + ADD16SAT(&addq[3], &co[3], adda[3], addb[3], cin[3], sat, eightbit, hicinh); } -void ADD16SAT(uint16_t *r, uint8_t *co, uint16_t a, uint16_t b, uint8_t cin, bool sat, bool eightbit, bool hicinh) +void ADD16SAT(uint16_t *r, uint8_t *co, uint16_t a, const uint16_t b, const uint8_t cin, const bool sat, const bool eightbit, const bool hicinh) { uint8_t carry[4]; uint8_t btop, ctop; From 37c34f4636b470077ec099aa0101c968edd6906b Mon Sep 17 00:00:00 2001 From: Joe Mattiello Date: Wed, 28 Mar 2018 00:52:48 -0400 Subject: [PATCH 13/34] Attempt to speedup gpu opcode calls --- src/gpu.c | 474 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 344 insertions(+), 130 deletions(-) diff --git a/src/gpu.c b/src/gpu.c index cc642a18..5ade7099 100644 --- a/src/gpu.c +++ b/src/gpu.c @@ -74,70 +74,72 @@ void GPUDumpDisassembly(void); void GPUDumpRegisters(void); void GPUDumpMemory(void); -static void gpu_opcode_add(void); -static void gpu_opcode_addc(void); -static void gpu_opcode_addq(void); -static void gpu_opcode_addqt(void); -static void gpu_opcode_sub(void); -static void gpu_opcode_subc(void); -static void gpu_opcode_subq(void); -static void gpu_opcode_subqt(void); -static void gpu_opcode_neg(void); -static void gpu_opcode_and(void); -static void gpu_opcode_or(void); -static void gpu_opcode_xor(void); -static void gpu_opcode_not(void); -static void gpu_opcode_btst(void); -static void gpu_opcode_bset(void); -static void 
gpu_opcode_bclr(void); -static void gpu_opcode_mult(void); -static void gpu_opcode_imult(void); -static void gpu_opcode_imultn(void); -static void gpu_opcode_resmac(void); -static void gpu_opcode_imacn(void); -static void gpu_opcode_div(void); -static void gpu_opcode_abs(void); -static void gpu_opcode_sh(void); -static void gpu_opcode_shlq(void); -static void gpu_opcode_shrq(void); -static void gpu_opcode_sha(void); -static void gpu_opcode_sharq(void); -static void gpu_opcode_ror(void); -static void gpu_opcode_rorq(void); -static void gpu_opcode_cmp(void); -static void gpu_opcode_cmpq(void); -static void gpu_opcode_sat8(void); -static void gpu_opcode_sat16(void); -static void gpu_opcode_move(void); -static void gpu_opcode_moveq(void); -static void gpu_opcode_moveta(void); -static void gpu_opcode_movefa(void); -static void gpu_opcode_movei(void); -static void gpu_opcode_loadb(void); -static void gpu_opcode_loadw(void); -static void gpu_opcode_load(void); -static void gpu_opcode_loadp(void); -static void gpu_opcode_load_r14_indexed(void); -static void gpu_opcode_load_r15_indexed(void); -static void gpu_opcode_storeb(void); -static void gpu_opcode_storew(void); -static void gpu_opcode_store(void); -static void gpu_opcode_storep(void); -static void gpu_opcode_store_r14_indexed(void); -static void gpu_opcode_store_r15_indexed(void); -static void gpu_opcode_move_pc(void); -static void gpu_opcode_jump(void); -static void gpu_opcode_jr(void); -static void gpu_opcode_mmult(void); -static void gpu_opcode_mtoi(void); -static void gpu_opcode_normi(void); -static void gpu_opcode_nop(void); -static void gpu_opcode_load_r14_ri(void); -static void gpu_opcode_load_r15_ri(void); -static void gpu_opcode_store_r14_ri(void); -static void gpu_opcode_store_r15_ri(void); -static void gpu_opcode_sat24(void); -static void gpu_opcode_pack(void); +INLINE static void gpu_opcode_add(void); +INLINE static void gpu_opcode_addc(void); +INLINE static void gpu_opcode_addq(void); +INLINE static void 
gpu_opcode_addqt(void); +INLINE static void gpu_opcode_sub(void); +INLINE static void gpu_opcode_subc(void); +INLINE static void gpu_opcode_subq(void); +INLINE static void gpu_opcode_subqt(void); +INLINE static void gpu_opcode_neg(void); +INLINE static void gpu_opcode_and(void); +INLINE static void gpu_opcode_or(void); +INLINE static void gpu_opcode_xor(void); +INLINE static void gpu_opcode_not(void); +INLINE static void gpu_opcode_btst(void); +INLINE static void gpu_opcode_bset(void); +INLINE static void gpu_opcode_bclr(void); +INLINE static void gpu_opcode_mult(void); +INLINE static void gpu_opcode_imult(void); +INLINE static void gpu_opcode_imultn(void); +INLINE static void gpu_opcode_resmac(void); +INLINE static void gpu_opcode_imacn(void); +INLINE static void gpu_opcode_div(void); +INLINE static void gpu_opcode_abs(void); +INLINE static void gpu_opcode_sh(void); +INLINE static void gpu_opcode_shlq(void); +INLINE static void gpu_opcode_shrq(void); +INLINE static void gpu_opcode_sha(void); +INLINE static void gpu_opcode_sharq(void); +INLINE static void gpu_opcode_ror(void); +INLINE static void gpu_opcode_rorq(void); +INLINE static void gpu_opcode_cmp(void); +INLINE static void gpu_opcode_cmpq(void); +INLINE static void gpu_opcode_sat8(void); +INLINE static void gpu_opcode_sat16(void); +INLINE static void gpu_opcode_move(void); +INLINE static void gpu_opcode_moveq(void); +INLINE static void gpu_opcode_moveta(void); +INLINE static void gpu_opcode_movefa(void); +INLINE static void gpu_opcode_movei(void); +INLINE static void gpu_opcode_loadb(void); +INLINE static void gpu_opcode_loadw(void); +INLINE static void gpu_opcode_load(void); +INLINE static void gpu_opcode_loadp(void); +INLINE static void gpu_opcode_load_r14_indexed(void); +INLINE static void gpu_opcode_load_r15_indexed(void); +INLINE static void gpu_opcode_storeb(void); +INLINE static void gpu_opcode_storew(void); +INLINE static void gpu_opcode_store(void); +INLINE static void gpu_opcode_storep(void); 
+INLINE static void gpu_opcode_store_r14_indexed(void); +INLINE static void gpu_opcode_store_r15_indexed(void); +INLINE static void gpu_opcode_move_pc(void); +INLINE static void gpu_opcode_jump(void); +INLINE static void gpu_opcode_jr(void); +INLINE static void gpu_opcode_mmult(void); +INLINE static void gpu_opcode_mtoi(void); +INLINE static void gpu_opcode_normi(void); +INLINE static void gpu_opcode_nop(void); +INLINE static void gpu_opcode_load_r14_ri(void); +INLINE static void gpu_opcode_load_r15_ri(void); +INLINE static void gpu_opcode_store_r14_ri(void); +INLINE static void gpu_opcode_store_r15_ri(void); +INLINE static void gpu_opcode_sat24(void); +INLINE static void gpu_opcode_pack(void); + +INLINE static void executeOpcode(uint32_t index); uint8_t gpu_opcode_cycles[64] = { @@ -229,7 +231,9 @@ uint32_t gpu_convert_zero[32] = uint8_t * branch_condition_table = 0; #define BRANCH_CONDITION(x) branch_condition_table[(x) + ((jaguar_flags & 7) << 5)] +#ifdef DEBUG uint32_t gpu_opcode_use[64]; +#endif const char * gpu_opcode_str[64]= { @@ -705,8 +709,10 @@ uint32_t GPUReadPC(void) void GPUResetStats(void) { unsigned i; +#ifdef DEBUG for(i=0; i<64; i++) gpu_opcode_use[i] = 0; +#endif WriteLog("--> GPU stats were reset!\n"); } @@ -772,6 +778,7 @@ void GPUDone(void) GPUDumpRegisters(); GPUDumpDisassembly(); +#ifdef DEBUG WriteLog("\nGPU opcodes use:\n"); for(i=0; i<64; i++) { @@ -779,6 +786,7 @@ void GPUDone(void) WriteLog("\t%17s %lu\n", gpu_opcode_str[i], gpu_opcode_use[i]); } WriteLog("\n"); +#endif } // Main GPU execution core @@ -805,7 +813,7 @@ void GPUExec(int32_t cycles) while (cycles > 0 && GPU_RUNNING) { uint16_t opcode; - uint32_t index; + uint8_t index; if (gpu_ram_8[0x054] == 0x98 && gpu_ram_8[0x055] == 0x0A && gpu_ram_8[0x056] == 0x03 && gpu_ram_8[0x057] == 0x00 && gpu_ram_8[0x058] == 0x00 && gpu_ram_8[0x059] == 0x00) @@ -825,14 +833,19 @@ void GPUExec(int32_t cycles) //$E400 -> 1110 01 -> $39 -> 57 //GPU #1 gpu_pc += 2; +#if 0 gpu_opcode[index](); - 
+#else + executeOpcode(index); +#endif // BIOS hacking //GPU: [00F03548] jr nz,00F03560 (0xd561) (RM=00F03114, RN=00000004) -> --> JR: Branch taken. //GPU: [00F0354C] jump nz,(r29) (0xd3a1) (RM=00F03314, RN=00000004) -> (RM=00F03314, RN=00000004) cycles -= gpu_opcode_cycles[index]; +#ifdef DEBUG gpu_opcode_use[index]++; +#endif if ((gpu_pc < 0xF03000 || gpu_pc > 0xF03FFF) && !tripwire) tripwire = true; } @@ -840,6 +853,207 @@ void GPUExec(int32_t cycles) gpu_in_exec--; } +INLINE static void executeOpcode(uint32_t index) { + switch (index) { + case 0: + gpu_opcode_add(); + break; + case 1: + gpu_opcode_addc(); + break; + case 2: + gpu_opcode_addq(); + break; + case 3: + gpu_opcode_addqt(); + break; + case 4: + gpu_opcode_sub(); + break; + case 5: + gpu_opcode_subc(); + break; + case 6: + gpu_opcode_subq(); + break; + case 7: + gpu_opcode_subqt(); + break; + case 8: + gpu_opcode_neg(); + break; + case 9: + gpu_opcode_and(); + break; + case 10: + gpu_opcode_or(); + break; + case 11: + gpu_opcode_xor(); + break; + case 12: + gpu_opcode_not(); + break; + case 13: + gpu_opcode_btst(); + break; + case 14: + gpu_opcode_bset(); + break; + case 15: + gpu_opcode_bclr(); + break; + case 16: + gpu_opcode_mult(); + break; + case 17: + gpu_opcode_imult(); + break; + case 18: + gpu_opcode_imultn(); + break; + case 19: + gpu_opcode_resmac(); + break; + case 20: + gpu_opcode_imacn(); + break; + case 21: + gpu_opcode_div(); + break; + case 22: + gpu_opcode_abs(); + break; + case 23: + gpu_opcode_sh(); + break; + case 24: + gpu_opcode_shlq(); + break; + case 25: + gpu_opcode_shrq(); + break; + case 26: + gpu_opcode_sha(); + break; + case 27: + gpu_opcode_sharq(); + break; + case 28: + gpu_opcode_ror(); + break; + case 29: + gpu_opcode_rorq(); + break; + case 30: + gpu_opcode_cmp(); + break; + case 31: + gpu_opcode_cmpq(); + break; + case 32: + gpu_opcode_sat8(); + break; + case 33: + gpu_opcode_sat16(); + break; + case 34: + gpu_opcode_move(); + break; + case 35: + gpu_opcode_moveq(); 
+ break; + case 36: + gpu_opcode_moveta(); + break; + case 37: + gpu_opcode_movefa(); + break; + case 38: + gpu_opcode_movei(); + break; + case 39: + gpu_opcode_loadb(); + break; + case 40: + gpu_opcode_loadw(); + break; + case 41: + gpu_opcode_load(); + break; + case 42: + gpu_opcode_loadp(); + break; + case 43: + gpu_opcode_load_r14_indexed(); + break; + case 44: + gpu_opcode_load_r15_indexed(); + break; + case 45: + gpu_opcode_storeb(); + break; + case 46: + gpu_opcode_storew(); + break; + case 47: + gpu_opcode_store(); + break; + case 48: + gpu_opcode_storep(); + break; + case 49: + gpu_opcode_store_r14_indexed(); + break; + case 50: + gpu_opcode_store_r15_indexed(); + break; + case 51: + gpu_opcode_move_pc(); + break; + case 52: + gpu_opcode_jump(); + break; + case 53: + gpu_opcode_jr(); + break; + case 54: + gpu_opcode_mmult(); + break; + case 55: + gpu_opcode_mtoi(); + break; + case 56: + gpu_opcode_normi(); + break; + case 57: + gpu_opcode_nop(); + break; + case 58: + gpu_opcode_load_r14_ri(); + break; + case 59: + gpu_opcode_load_r15_ri(); + break; + case 60: + + gpu_opcode_store_r14_ri(); + break; + case 61: + gpu_opcode_store_r15_ri(); + break; + case 62: + gpu_opcode_sat24(); + break; + case 63: + gpu_opcode_pack(); + break; + default: + WriteLog("\nUnknown opcode %i\n", index); + break; + } +} + // GPU opcodes /* @@ -878,7 +1092,7 @@ void GPUExec(int32_t cycles) */ -static void gpu_opcode_jump(void) +INLINE static void gpu_opcode_jump(void) { // normalize flags /* gpu_flag_c = (gpu_flag_c ? 
1 : 0); @@ -896,7 +1110,7 @@ static void gpu_opcode_jump(void) } -static void gpu_opcode_jr(void) +INLINE static void gpu_opcode_jr(void) { uint32_t jaguar_flags = (gpu_flag_n << 2) | (gpu_flag_c << 1) | gpu_flag_z; @@ -910,7 +1124,7 @@ static void gpu_opcode_jr(void) } -static void gpu_opcode_add(void) +INLINE static void gpu_opcode_add(void) { uint32_t res = RN + RM; CLR_ZNC; SET_ZNC_ADD(RN, RM, res); @@ -918,7 +1132,7 @@ static void gpu_opcode_add(void) } -static void gpu_opcode_addc(void) +INLINE static void gpu_opcode_addc(void) { uint32_t res = RN + RM + gpu_flag_c; uint32_t carry = gpu_flag_c; @@ -927,7 +1141,7 @@ static void gpu_opcode_addc(void) } -static void gpu_opcode_addq(void) +INLINE static void gpu_opcode_addq(void) { uint32_t r1 = gpu_convert_zero[IMM_1]; uint32_t res = RN + r1; @@ -936,13 +1150,13 @@ static void gpu_opcode_addq(void) } -static void gpu_opcode_addqt(void) +INLINE static void gpu_opcode_addqt(void) { RN += gpu_convert_zero[IMM_1]; } -static void gpu_opcode_sub(void) +INLINE static void gpu_opcode_sub(void) { uint32_t res = RN - RM; SET_ZNC_SUB(RN, RM, res); @@ -950,7 +1164,7 @@ static void gpu_opcode_sub(void) } -static void gpu_opcode_subc(void) +INLINE static void gpu_opcode_subc(void) { // This is how the GPU ALU does it--Two's complement with inverted carry uint64_t res = (uint64_t)RN + (uint64_t)(RM ^ 0xFFFFFFFF) + (gpu_flag_c ^ 1); @@ -961,7 +1175,7 @@ static void gpu_opcode_subc(void) } -static void gpu_opcode_subq(void) +INLINE static void gpu_opcode_subq(void) { uint32_t r1 = gpu_convert_zero[IMM_1]; uint32_t res = RN - r1; @@ -970,20 +1184,20 @@ static void gpu_opcode_subq(void) } -static void gpu_opcode_subqt(void) +INLINE static void gpu_opcode_subqt(void) { RN -= gpu_convert_zero[IMM_1]; } -static void gpu_opcode_cmp(void) +INLINE static void gpu_opcode_cmp(void) { uint32_t res = RN - RM; SET_ZNC_SUB(RN, RM, res); } -static void gpu_opcode_cmpq(void) +INLINE static void gpu_opcode_cmpq(void) { static int32_t sqtable[32] 
= { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1 }; @@ -993,35 +1207,35 @@ static void gpu_opcode_cmpq(void) } -static void gpu_opcode_and(void) +INLINE static void gpu_opcode_and(void) { RN = RN & RM; SET_ZN(RN); } -static void gpu_opcode_or(void) +INLINE static void gpu_opcode_or(void) { RN = RN | RM; SET_ZN(RN); } -static void gpu_opcode_xor(void) +INLINE static void gpu_opcode_xor(void) { RN = RN ^ RM; SET_ZN(RN); } -static void gpu_opcode_not(void) +INLINE static void gpu_opcode_not(void) { RN = ~RN; SET_ZN(RN); } -static void gpu_opcode_move_pc(void) +INLINE static void gpu_opcode_move_pc(void) { // Should be previous PC--this might not always be previous instruction! // Then again, this will point right at the *current* instruction, i.e., MOVE PC,R! @@ -1029,27 +1243,27 @@ static void gpu_opcode_move_pc(void) } -static void gpu_opcode_sat8(void) +INLINE static void gpu_opcode_sat8(void) { RN = ((int32_t)RN < 0 ? 0 : (RN > 0xFF ? 0xFF : RN)); SET_ZN(RN); } -static void gpu_opcode_sat16(void) +INLINE static void gpu_opcode_sat16(void) { RN = ((int32_t)RN < 0 ? 0 : (RN > 0xFFFF ? 0xFFFF : RN)); SET_ZN(RN); } -static void gpu_opcode_sat24(void) +INLINE static void gpu_opcode_sat24(void) { RN = ((int32_t)RN < 0 ? 0 : (RN > 0xFFFFFF ? 
0xFFFFFF : RN)); SET_ZN(RN); } -static void gpu_opcode_store_r14_indexed(void) +INLINE static void gpu_opcode_store_r14_indexed(void) { #ifdef GPU_CORRECT_ALIGNMENT uint32_t address = gpu_reg[14] + (gpu_convert_zero[IMM_1] << 2); @@ -1064,7 +1278,7 @@ static void gpu_opcode_store_r14_indexed(void) } -static void gpu_opcode_store_r15_indexed(void) +INLINE static void gpu_opcode_store_r15_indexed(void) { #ifdef GPU_CORRECT_ALIGNMENT uint32_t address = gpu_reg[15] + (gpu_convert_zero[IMM_1] << 2); @@ -1079,7 +1293,7 @@ static void gpu_opcode_store_r15_indexed(void) } -static void gpu_opcode_load_r14_ri(void) +INLINE static void gpu_opcode_load_r14_ri(void) { #ifdef GPU_CORRECT_ALIGNMENT uint32_t address = gpu_reg[14] + RM; @@ -1094,7 +1308,7 @@ static void gpu_opcode_load_r14_ri(void) } -static void gpu_opcode_load_r15_ri(void) +INLINE static void gpu_opcode_load_r15_ri(void) { #ifdef GPU_CORRECT_ALIGNMENT uint32_t address = gpu_reg[15] + RM; @@ -1109,7 +1323,7 @@ static void gpu_opcode_load_r15_ri(void) } -static void gpu_opcode_store_r14_ri(void) +INLINE static void gpu_opcode_store_r14_ri(void) { #ifdef GPU_CORRECT_ALIGNMENT uint32_t address = gpu_reg[14] + RM; @@ -1124,7 +1338,7 @@ static void gpu_opcode_store_r14_ri(void) } -static void gpu_opcode_store_r15_ri(void) +INLINE static void gpu_opcode_store_r15_ri(void) { #ifdef GPU_CORRECT_ALIGNMENT_STORE uint32_t address = gpu_reg[15] + RM; @@ -1139,12 +1353,12 @@ static void gpu_opcode_store_r15_ri(void) } -static void gpu_opcode_nop(void) +INLINE static void gpu_opcode_nop(void) { } -static void gpu_opcode_pack(void) +INLINE static void gpu_opcode_pack(void) { uint32_t val = RN; @@ -1155,7 +1369,7 @@ static void gpu_opcode_pack(void) } -static void gpu_opcode_storeb(void) +INLINE static void gpu_opcode_storeb(void) { //Is this right??? // Would appear to be so...! 
@@ -1166,7 +1380,7 @@ static void gpu_opcode_storeb(void) } -static void gpu_opcode_storew(void) +INLINE static void gpu_opcode_storew(void) { #ifdef GPU_CORRECT_ALIGNMENT if ((RM >= 0xF03000) && (RM <= 0xF03FFF)) @@ -1182,7 +1396,7 @@ static void gpu_opcode_storew(void) } -static void gpu_opcode_store(void) +INLINE static void gpu_opcode_store(void) { #ifdef GPU_CORRECT_ALIGNMENT if ((RM >= 0xF03000) && (RM <= 0xF03FFF)) @@ -1195,7 +1409,7 @@ static void gpu_opcode_store(void) } -static void gpu_opcode_storep(void) +INLINE static void gpu_opcode_storep(void) { #ifdef GPU_CORRECT_ALIGNMENT if ((RM >= 0xF03000) && (RM <= 0xF03FFF)) @@ -1214,7 +1428,7 @@ static void gpu_opcode_storep(void) #endif } -static void gpu_opcode_loadb(void) +INLINE static void gpu_opcode_loadb(void) { if ((RM >= 0xF03000) && (RM <= 0xF03FFF)) RN = GPUReadLong(RM, GPU) & 0xFF; @@ -1223,7 +1437,7 @@ static void gpu_opcode_loadb(void) } -static void gpu_opcode_loadw(void) +INLINE static void gpu_opcode_loadw(void) { #ifdef GPU_CORRECT_ALIGNMENT if ((RM >= 0xF03000) && (RM <= 0xF03FFF)) @@ -1256,7 +1470,7 @@ static void gpu_opcode_loadw(void) to test that. They seem to be stable, though, which would indicate such a mechanism. Sometimes, however, the off by 2 case returns $12345678! 
*/ -static void gpu_opcode_load(void) +INLINE static void gpu_opcode_load(void) { #ifdef GPU_CORRECT_ALIGNMENT RN = GPUReadLong(RM & 0xFFFFFFFC, GPU); @@ -1266,7 +1480,7 @@ static void gpu_opcode_load(void) } -static void gpu_opcode_loadp(void) +INLINE static void gpu_opcode_loadp(void) { #ifdef GPU_CORRECT_ALIGNMENT if ((RM >= 0xF03000) && (RM <= 0xF03FFF)) @@ -1286,7 +1500,7 @@ static void gpu_opcode_loadp(void) } -static void gpu_opcode_load_r14_indexed(void) +INLINE static void gpu_opcode_load_r14_indexed(void) { #ifdef GPU_CORRECT_ALIGNMENT uint32_t address = gpu_reg[14] + (gpu_convert_zero[IMM_1] << 2); @@ -1301,7 +1515,7 @@ static void gpu_opcode_load_r14_indexed(void) } -static void gpu_opcode_load_r15_indexed(void) +INLINE static void gpu_opcode_load_r15_indexed(void) { #ifdef GPU_CORRECT_ALIGNMENT uint32_t address = gpu_reg[15] + (gpu_convert_zero[IMM_1] << 2); @@ -1316,7 +1530,7 @@ static void gpu_opcode_load_r15_indexed(void) } -static void gpu_opcode_movei(void) +INLINE static void gpu_opcode_movei(void) { // This instruction is followed by 32-bit value in LSW / MSW format... 
RN = (uint32_t)GPUReadWord(gpu_pc, GPU) | ((uint32_t)GPUReadWord(gpu_pc + 2, GPU) << 16); @@ -1324,51 +1538,51 @@ static void gpu_opcode_movei(void) } -static void gpu_opcode_moveta(void) +INLINE static void gpu_opcode_moveta(void) { ALTERNATE_RN = RM; } -static void gpu_opcode_movefa(void) +INLINE static void gpu_opcode_movefa(void) { RN = ALTERNATE_RM; } -static void gpu_opcode_move(void) +INLINE static void gpu_opcode_move(void) { RN = RM; } -static void gpu_opcode_moveq(void) +INLINE static void gpu_opcode_moveq(void) { RN = IMM_1; } -static void gpu_opcode_resmac(void) +INLINE static void gpu_opcode_resmac(void) { RN = gpu_acc; } -static void gpu_opcode_imult(void) +INLINE static void gpu_opcode_imult(void) { RN = (int16_t)RN * (int16_t)RM; SET_ZN(RN); } -static void gpu_opcode_mult(void) +INLINE static void gpu_opcode_mult(void) { RN = (uint16_t)RM * (uint16_t)RN; SET_ZN(RN); } -static void gpu_opcode_bclr(void) +INLINE static void gpu_opcode_bclr(void) { uint32_t res = RN & ~(1 << IMM_1); RN = res; @@ -1376,13 +1590,13 @@ static void gpu_opcode_bclr(void) } -static void gpu_opcode_btst(void) +INLINE static void gpu_opcode_btst(void) { gpu_flag_z = (~RN >> IMM_1) & 1; } -static void gpu_opcode_bset(void) +INLINE static void gpu_opcode_bset(void) { uint32_t res = RN | (1 << IMM_1); RN = res; @@ -1390,14 +1604,14 @@ static void gpu_opcode_bset(void) } -static void gpu_opcode_imacn(void) +INLINE static void gpu_opcode_imacn(void) { uint32_t res = (int16_t)RM * (int16_t)(RN); gpu_acc += res; } -static void gpu_opcode_mtoi(void) +INLINE static void gpu_opcode_mtoi(void) { uint32_t _RM = RM; uint32_t res = RN = (((int32_t)_RM >> 8) & 0xFF800000) | (_RM & 0x007FFFFF); @@ -1405,7 +1619,7 @@ static void gpu_opcode_mtoi(void) } -static void gpu_opcode_normi(void) +INLINE static void gpu_opcode_normi(void) { uint32_t _RM = RM; uint32_t res = 0; @@ -1427,7 +1641,7 @@ static void gpu_opcode_normi(void) SET_ZN(res); } -static void gpu_opcode_mmult(void) +INLINE static void 
gpu_opcode_mmult(void) { unsigned i; int count = gpu_matrix_control & 0x0F; // Matrix width @@ -1473,7 +1687,7 @@ static void gpu_opcode_mmult(void) } -static void gpu_opcode_abs(void) +INLINE static void gpu_opcode_abs(void) { gpu_flag_c = RN >> 31; if (RN == 0x80000000) @@ -1488,7 +1702,7 @@ static void gpu_opcode_abs(void) } -static void gpu_opcode_div(void) // RN / RM +INLINE static void gpu_opcode_div(void) // RN / RM { unsigned i; // Real algorithm, courtesy of SCPCD: NYAN! @@ -1514,7 +1728,7 @@ static void gpu_opcode_div(void) // RN / RM } -static void gpu_opcode_imultn(void) +INLINE static void gpu_opcode_imultn(void) { uint32_t res = (int32_t)((int16_t)RN * (int16_t)RM); gpu_acc = (int32_t)res; @@ -1523,7 +1737,7 @@ static void gpu_opcode_imultn(void) } -static void gpu_opcode_neg(void) +INLINE static void gpu_opcode_neg(void) { uint32_t res = -RN; SET_ZNC_SUB(0, RN, res); @@ -1531,7 +1745,7 @@ static void gpu_opcode_neg(void) } -static void gpu_opcode_shlq(void) +INLINE static void gpu_opcode_shlq(void) { int32_t r1 = 32 - IMM_1; uint32_t res = RN << r1; @@ -1540,7 +1754,7 @@ static void gpu_opcode_shlq(void) } -static void gpu_opcode_shrq(void) +INLINE static void gpu_opcode_shrq(void) { int32_t r1 = gpu_convert_zero[IMM_1]; uint32_t res = RN >> r1; @@ -1549,7 +1763,7 @@ static void gpu_opcode_shrq(void) } -static void gpu_opcode_ror(void) +INLINE static void gpu_opcode_ror(void) { uint32_t r1 = RM & 0x1F; uint32_t res = (RN >> r1) | (RN << (32 - r1)); @@ -1558,7 +1772,7 @@ static void gpu_opcode_ror(void) } -static void gpu_opcode_rorq(void) +INLINE static void gpu_opcode_rorq(void) { uint32_t r1 = gpu_convert_zero[IMM_1 & 0x1F]; uint32_t r2 = RN; @@ -1568,7 +1782,7 @@ static void gpu_opcode_rorq(void) } -static void gpu_opcode_sha(void) +INLINE static void gpu_opcode_sha(void) { uint32_t res; @@ -1587,7 +1801,7 @@ static void gpu_opcode_sha(void) } -static void gpu_opcode_sharq(void) +INLINE static void gpu_opcode_sharq(void) { uint32_t res = 
(int32_t)RN >> gpu_convert_zero[IMM_1]; SET_ZN(res); gpu_flag_c = RN & 0x01; @@ -1595,7 +1809,7 @@ static void gpu_opcode_sharq(void) } -static void gpu_opcode_sh(void) +INLINE static void gpu_opcode_sh(void) { if (RM & 0x80000000) // Shift left { From b94e82c7e47ee751f2e965abf3c5ad5c55d5b3d8 Mon Sep 17 00:00:00 2001 From: Joe Mattiello Date: Wed, 28 Mar 2018 06:03:14 -0400 Subject: [PATCH 14/34] Try some blitter optimizations --- src/blitter.c | 179 +++++++++++++++++++++++++--------------------- src/vjag_memory.h | 26 +++++++ 2 files changed, 122 insertions(+), 83 deletions(-) diff --git a/src/blitter.c b/src/blitter.c index c705e3c2..490002d8 100644 --- a/src/blitter.c +++ b/src/blitter.c @@ -92,41 +92,41 @@ void BlitterMidsummer2(void); // Blitter command bits -#define SRCEN (cmd & 0x00000001) -#define SRCENZ (cmd & 0x00000002) -#define SRCENX (cmd & 0x00000004) -#define DSTEN (cmd & 0x00000008) -#define DSTENZ (cmd & 0x00000010) -#define DSTWRZ (cmd & 0x00000020) -#define CLIPA1 (cmd & 0x00000040) - -#define UPDA1F (cmd & 0x00000100) -#define UPDA1 (cmd & 0x00000200) -#define UPDA2 (cmd & 0x00000400) - -#define DSTA2 (cmd & 0x00000800) - -#define Z_OP_INF (cmd & 0x00040000) -#define Z_OP_EQU (cmd & 0x00080000) -#define Z_OP_SUP (cmd & 0x00100000) - -#define LFU_NAN (cmd & 0x00200000) -#define LFU_NA (cmd & 0x00400000) -#define LFU_AN (cmd & 0x00800000) -#define LFU_A (cmd & 0x01000000) - -#define CMPDST (cmd & 0x02000000) -#define BCOMPEN (cmd & 0x04000000) -#define DCOMPEN (cmd & 0x08000000) - -#define PATDSEL (cmd & 0x00010000) -#define ADDDSEL (cmd & 0x00020000) -#define TOPBEN (cmd & 0x00004000) -#define TOPNEN (cmd & 0x00008000) -#define BKGWREN (cmd & 0x10000000) -#define GOURD (cmd & 0x00001000) -#define GOURZ (cmd & 0x00002000) -#define SRCSHADE (cmd & 0x40000000) +#define SRCEN (cmd.bits.b0) +#define SRCENZ (cmd.bits.b1) +#define SRCENX (cmd.bits.b2) +#define DSTEN (cmd.bits.b3) +#define DSTENZ (cmd.bits.b4) +#define DSTWRZ (cmd.bits.b5) +#define 
CLIPA1 (cmd.bits.b6) + +#define UPDA1F (cmd.bits.b8) +#define UPDA1 (cmd.bits.b9) +#define UPDA2 (cmd.bits.b10) + +#define DSTA2 (cmd.bits.b11) + +#define Z_OP_INF (cmd.bits.b18) +#define Z_OP_EQU (cmd.bits.b19) +#define Z_OP_SUP (cmd.bits.b20) + +#define LFU_NAN (cmd.bits.b21) +#define LFU_NA (cmd.bits.b22) +#define LFU_AN (cmd.bits.b23) +#define LFU_A (cmd.bits.b24) + +#define CMPDST (cmd.bits.b25) +#define BCOMPEN (cmd.bits.b26) +#define DCOMPEN (cmd.bits.b27) + +#define PATDSEL (cmd.bits.b16) +#define ADDDSEL (cmd.bits.b17) +#define TOPBEN (cmd.bits.b14) +#define TOPNEN (cmd.bits.b15) +#define BKGWREN (cmd.bits.b28) +#define GOURD (cmd.bits.b12) +#define GOURZ (cmd.bits.b13) +#define SRCSHADE (cmd.bits.b30) #define XADDPHR 0 @@ -310,8 +310,11 @@ static int32_t a1_clip_x, a1_clip_y; // to optimize the blitter, then we may revisit it in the future... // Generic blit handler -void blitter_generic(uint32_t cmd) +void blitter_generic(uint32_t icmd) { + Bits32 cmd; + cmd.WORD = icmd; + uint32_t srcdata, srczdata, dstdata, dstzdata, writedata, inhibit; uint32_t bppSrc = (DSTA2 ? 1 << ((REG(A1_FLAGS) >> 3) & 0x07) : 1 << ((REG(A2_FLAGS) >> 3) & 0x07)); @@ -343,14 +346,14 @@ void blitter_generic(uint32_t cmd) if (SRCENZ) srczdata = READ_ZDATA(a2, REG(A2_FLAGS)); - else if (cmd & 0x0001C020) // PATDSEL | TOPBEN | TOPNEN | DSTWRZ + else if (cmd.WORD & 0x0001C020) // PATDSEL | TOPBEN | TOPNEN | DSTWRZ srczdata = READ_RDATA(SRCZINT, a2, REG(A2_FLAGS), a2_phrase_mode); } else // Use SRCDATA register... 
{ srcdata = READ_RDATA(SRCDATA, a2, REG(A2_FLAGS), a2_phrase_mode); - if (cmd & 0x0001C020) // PATDSEL | TOPBEN | TOPNEN | DSTWRZ + if (cmd.WORD & 0x0001C020) // PATDSEL | TOPBEN | TOPNEN | DSTWRZ srczdata = READ_RDATA(SRCZINT, a2, REG(A2_FLAGS), a2_phrase_mode); } @@ -521,13 +524,13 @@ void blitter_generic(uint32_t cmd) srcdata = READ_PIXEL(a1, REG(A1_FLAGS)); if (SRCENZ) srczdata = READ_ZDATA(a1, REG(A1_FLAGS)); - else if (cmd & 0x0001C020) // PATDSEL | TOPBEN | TOPNEN | DSTWRZ + else if (cmd.WORD & 0x0001C020) // PATDSEL | TOPBEN | TOPNEN | DSTWRZ srczdata = READ_RDATA(SRCZINT, a1, REG(A1_FLAGS), a1_phrase_mode); } else { srcdata = READ_RDATA(SRCDATA, a1, REG(A1_FLAGS), a1_phrase_mode); - if (cmd & 0x001C020) // PATDSEL | TOPBEN | TOPNEN | DSTWRZ + if (cmd.WORD & 0x001C020) // PATDSEL | TOPBEN | TOPNEN | DSTWRZ srczdata = READ_RDATA(SRCZINT, a1, REG(A1_FLAGS), a1_phrase_mode); } @@ -761,20 +764,23 @@ void blitter_generic(uint32_t cmd) WREG(A2_PIXEL, (a2_y & 0xFFFF0000) | ((a2_x >> 16) & 0xFFFF)); } -void blitter_blit(uint32_t cmd) +void blitter_blit(uint32_t cmdi) { + Bits32 cmd; + cmd.WORD = cmdi; + uint32_t m, e; uint32_t pitchValue[4] = { 0, 1, 3, 2 }; colour_index = 0; - src = cmd & 0x07; - dst = (cmd >> 3) & 0x07; - misc = (cmd >> 6) & 0x03; - a1ctl = (cmd >> 8) & 0x7; - mode = (cmd >> 11) & 0x07; - ity = (cmd >> 14) & 0x0F; - zop = (cmd >> 18) & 0x07; - op = (cmd >> 21) & 0x0F; - ctrl = (cmd >> 25) & 0x3F; + src = cmd.WORD & 0x07; + dst = (cmd.WORD >> 3) & 0x07; + misc = (cmd.WORD >> 6) & 0x03; + a1ctl = (cmd.WORD >> 8) & 0x7; + mode = (cmd.WORD >> 11) & 0x07; + ity = (cmd.WORD >> 14) & 0x0F; + zop = (cmd.WORD >> 18) & 0x07; + op = (cmd.WORD >> 21) & 0x0F; + ctrl = (cmd.WORD >> 25) & 0x3F; // Addresses in A1/2_BASE are *phrase* aligned, i.e., bottom three bits are ignored! // NOTE: This fixes Rayman's bad collision detection AND keeps T2K working! 
@@ -957,7 +963,7 @@ void blitter_blit(uint32_t cmd) gd_ca = 0xFFFFFF00 | gd_ca; } - blitter_generic(cmd); + blitter_generic(cmd.WORD); } #endif /******************************************************************************* @@ -1119,10 +1125,11 @@ void BlitterWriteWord(uint32_t offset, uint16_t data, uint32_t who/*=UNKNOWN*/) // I.e., the second write of 32-bit value--not convinced this is the best way to do this! // But then again, according to the Jaguar docs, this is correct...! { - if (vjs.useFastBlitter) - blitter_blit(GET32(blitter_ram, 0x38)); - else - BlitterMidsummer2(); + if (vjs.useFastBlitter) { + blitter_blit(GET32(blitter_ram, 0x38)); + } else { + BlitterMidsummer2(); + } } } //F02278,9,A,B @@ -1141,10 +1148,10 @@ void BlitterWriteLong(uint32_t offset, uint32_t data, uint32_t who) void ADDRGEN(uint32_t *, uint32_t *, bool, bool, uint16_t, uint16_t, uint32_t, uint8_t, uint8_t, uint8_t, uint8_t, uint16_t, uint16_t, uint32_t, uint8_t, uint8_t, uint8_t, uint8_t); -void ADDARRAY(uint16_t * addq, uint8_t daddasel, uint8_t daddbsel, uint8_t daddmode, - uint64_t dstd, uint32_t iinc, uint8_t initcin[], uint64_t initinc, uint16_t initpix, - uint32_t istep, uint64_t patd, uint64_t srcd, uint64_t srcz1, uint64_t srcz2, - uint32_t zinc, uint32_t zstep); +void ADDARRAY(const uint16_t * addq, const uint8_t daddasel, const uint8_t daddbsel, const uint8_t daddmode, + const uint64_t dstd, const uint32_t iinc, const uint8_t initcin[], const uint64_t initinc, const uint16_t initpix, + const uint32_t istep, const uint64_t patd, const uint64_t srcd, const uint64_t srcz1, const uint64_t srcz2, + const uint32_t zinc, const uint32_t zstep); void ADD16SAT(uint16_t *r, uint8_t *co, uint16_t a, uint16_t b, uint8_t cin, bool sat, bool eightbit, bool hicinh); void ADDAMUX(int16_t *adda_x, int16_t *adda_y, uint8_t addasel, int16_t a1_step_x, int16_t a1_step_y, int16_t a1_stepf_x, int16_t a1_stepf_y, int16_t a2_step_x, int16_t a2_step_y, @@ -1172,7 +1179,8 @@ void 
BlitterMidsummer2(void) //Will remove stuff that isn't in Jaguar I once fully described (stuff like texture won't //be described here at all)... - uint32_t cmd = GET32(blitter_ram, COMMAND); + Bits32 cmd; + cmd.WORD = GET32(blitter_ram, COMMAND); // Line states passed in via the command register @@ -1183,7 +1191,7 @@ void BlitterMidsummer2(void) patdsel = (PATDSEL), adddsel = (ADDDSEL), cmpdst = (CMPDST), bcompen = (BCOMPEN), dcompen = (DCOMPEN), bkgwren = (BKGWREN), srcshade = (SRCSHADE); - uint8_t zmode = (cmd & 0x01C0000) >> 18, lfufunc = (cmd & 0x1E00000) >> 21; + uint8_t zmode = (cmd.WORD & 0x01C0000) >> 18, lfufunc = (cmd.WORD & 0x1E00000) >> 21; //Missing: BUSHI //Where to find various lines: // clip_a1 -> inner @@ -2024,9 +2032,9 @@ A2ptrldi := NAN2 (a2ptrldi, a2update\, a2pldt);*/ if (pixsize == 3) dstart = (dstxp & 0x07) << 3; - if (pixsize == 4) + else if (pixsize == 4) dstart = (dstxp & 0x03) << 4; - if (pixsize == 5) + else if (pixsize == 5) dstart = (dstxp & 0x01) << 5; dstart = (phrase_mode ? dstart : pixAddr & 0x07); @@ -2041,9 +2049,9 @@ A2ptrldi := NAN2 (a2ptrldi, a2update\, a2pldt);*/ if (pixsize == 3) window_mask = (a1_win_x & 0x07) << 3; - if (pixsize == 4) + else if (pixsize == 4) window_mask = (a1_win_x & 0x03) << 4; - if (pixsize == 5) + else if (pixsize == 5) window_mask = (a1_win_x & 0x01) << 5; window_mask = (penden ? window_mask : 0); @@ -2053,9 +2061,9 @@ A2ptrldi := NAN2 (a2ptrldi, a2update\, a2pldt);*/ if (pixsize == 3) inner_mask = (icount & 0x07) << 3; - if (pixsize == 4) + else if (pixsize == 4) inner_mask = (icount & 0x03) << 4; - if (pixsize == 5) + else if (pixsize == 5) inner_mask = (icount & 0x01) << 5; if (!inner0) inner_mask = 0; @@ -2391,11 +2399,11 @@ void ADDRGEN(uint32_t *address, uint32_t *pixa, bool gena2, bool zaddr, // Here's an important bit: The source data adder logic. Need to track down the inputs!!! 
// //////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////// - -void ADDARRAY(uint16_t * addq, uint8_t daddasel, uint8_t daddbsel, uint8_t daddmode, - uint64_t dstd, uint32_t iinc, uint8_t initcin[], uint64_t initinc, uint16_t initpix, - uint32_t istep, uint64_t patd, uint64_t srcd, uint64_t srcz1, uint64_t srcz2, - uint32_t zinc, uint32_t zstep) +#include +void ADDARRAY(const uint16_t * addq, const uint8_t daddasel, const uint8_t daddbsel, const uint8_t daddmode, + const uint64_t dstd, const uint32_t iinc, const uint8_t initcin[], const uint64_t initinc, const uint16_t initpix, + const uint32_t istep, const uint64_t patd, const uint64_t srcd, const uint64_t srcz1, const uint64_t srcz2, + const uint32_t zinc, const uint32_t zstep) { unsigned i; uint16_t adda[4]; @@ -2847,7 +2855,7 @@ Patdhi := JOIN (patdhi, patd[32..63]);*/ uint8_t dech38el[2][8] = { { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } }; int en; - uint64_t cmpd; + Bits64 cmpd; uint8_t dbinht; uint16_t addq[4]; uint8_t initcin[4] = { 0, 0, 0, 0 }; @@ -2871,23 +2879,23 @@ Zstep := JOIN (zstep, zstep[0..31]);*/ /*Datacomp := DATACOMP (dcomp[0..7], cmpdst, dstdlo, dstdhi, patdlo, patdhi, srcdlo, srcdhi);*/ ////////////////////////////////////// C++ CODE ////////////////////////////////////// *dcomp = 0; - cmpd = *patd ^ (cmpdst ? dstd : srcd); + cmpd.DATA = *patd ^ (cmpdst ? 
dstd : srcd); - if ((cmpd & 0x00000000000000FFLL) == 0) + if (cmpd.bytes.b0 == 0) *dcomp |= 0x01; - if ((cmpd & 0x000000000000FF00LL) == 0) + if (cmpd.bytes.b1 == 0) *dcomp |= 0x02; - if ((cmpd & 0x0000000000FF0000LL) == 0) + if (cmpd.bytes.b2 == 0) *dcomp |= 0x04; - if ((cmpd & 0x00000000FF000000LL) == 0) + if (cmpd.bytes.b3 == 0) *dcomp |= 0x08; - if ((cmpd & 0x000000FF00000000LL) == 0) + if (cmpd.bytes.b4 == 0) *dcomp |= 0x10; - if ((cmpd & 0x0000FF0000000000LL) == 0) + if (cmpd.bytes.b5 == 0) *dcomp |= 0x20; - if ((cmpd & 0x00FF000000000000LL) == 0) + if (cmpd.bytes.b6 == 0) *dcomp |= 0x40; - if ((cmpd & 0xFF00000000000000LL) == 0) + if (cmpd.bytes.b7 == 0) *dcomp |= 0x80; ////////////////////////////////////////////////////////////////////////////////////// @@ -2905,7 +2913,7 @@ with srcshift bits 4 & 5 selecting the start position */ //So... basically what we have here is: *zcomp = 0; - + // TODO: Byte and bit this -jm provenance if ((((*srcz & 0x000000000000FFFFLL) < (dstz & 0x000000000000FFFFLL)) && (zmode & 0x01)) || (((*srcz & 0x000000000000FFFFLL) == (dstz & 0x000000000000FFFFLL)) && (zmode & 0x02)) || (((*srcz & 0x000000000000FFFFLL) > (dstz & 0x000000000000FFFFLL)) && (zmode & 0x04))) @@ -3036,6 +3044,8 @@ Sfine := DECH38EL (s_fine[0..7], dstart[0..2], sfen\);*/ /*Maskt[0] := BUF1 (maskt[0], s_fine[0]); Maskt[1-7] := OAN1P (maskt[1-7], maskt[0-6], s_fine[1-7], e_fine\[1-7]);*/ ////////////////////////////////////// C++ CODE ////////////////////////////////////// + // TODO: Byte and bit this -jm provenance + maskt = s_fine & 0x0001; maskt |= (((maskt & 0x0001) || (s_fine & 0x02)) && (e_fine & 0x02) ? 0x0002 : 0x0000); maskt |= (((maskt & 0x0002) || (s_fine & 0x04)) && (e_fine & 0x04) ? 0x0004 : 0x0000); @@ -3052,6 +3062,8 @@ masktla = s_coarse[0] . 
/e_coarse[0] */ Maskt[8] := OAN1P (maskt[8], masktla, s_coarse[1], e_coarse\[1]); Maskt[9-14] := OAN1P (maskt[9-14], maskt[8-13], s_coarse[2-7], e_coarse\[2-7]);*/ ////////////////////////////////////// C++ CODE ////////////////////////////////////// + // TODO: Byte and bit this -jm provenance + maskt |= (((s_coarse & e_coarse & 0x01) || (s_coarse & 0x02)) && (e_coarse & 0x02) ? 0x0100 : 0x0000); maskt |= (((maskt & 0x0100) || (s_coarse & 0x04)) && (e_coarse & 0x04) ? 0x0200 : 0x0000); maskt |= (((maskt & 0x0200) || (s_coarse & 0x08)) && (e_coarse & 0x08) ? 0x0400 : 0x0000); @@ -3089,6 +3101,7 @@ Masku[14] := MX2 (masku[14], maskt[14], maskt[0], mir_byte);*/ mir_bit = true/*big_pix*/ && !phrase_mode; mir_byte = true/*big_pix*/ && phrase_mode; masku = maskt; + // TODO: Byte and bit this -jm provenance if (mir_bit) { diff --git a/src/vjag_memory.h b/src/vjag_memory.h index 0f90cadf..c96a86b1 100644 --- a/src/vjag_memory.h +++ b/src/vjag_memory.h @@ -13,6 +13,32 @@ extern "C" { #endif +#pragma pack(push, 1) + typedef union Bits64 { + uint64_t DATA; + struct Bytes8 { +#ifdef LITTLE_ENDIAN + uint8_t b0; + uint8_t b1; + uint8_t b2; + uint8_t b3; + uint8_t b4; + uint8_t b5; + uint8_t b6; + uint8_t b7; +#else + uint8_t b7; + uint8_t b6; + uint8_t b5; + uint8_t b4; + uint8_t b3; + uint8_t b2; + uint8_t b1; + uint8_t b0; +#endif + } bytes; + } Bits64; +#pragma pack(pop) typedef union Bits32 { uint32_t WORD; struct Words { From f85f748510a1304b231ddd3ec005ac71d7d2203f Mon Sep 17 00:00:00 2001 From: Joe Mattiello Date: Wed, 28 Mar 2018 06:03:50 -0400 Subject: [PATCH 15/34] Try some GPU optimizations and clarity --- src/gpu.c | 75 +++++++++++++++++++++++++++++++++++++---------- src/vjag_memory.h | 35 +++++++++++++++++++--- 2 files changed, 90 insertions(+), 20 deletions(-) diff --git a/src/gpu.c b/src/gpu.c index 5ade7099..3568d45d 100644 --- a/src/gpu.c +++ b/src/gpu.c @@ -325,14 +325,21 @@ uint8_t GPUReadByte(uint32_t offset, uint32_t who/*=UNKNOWN*/) } // GPU word access 
(read) -uint16_t GPUReadWord(uint32_t offset, uint32_t who/*=UNKNOWN*/) +INLINE uint16_t GPUReadWord(uint32_t offset, uint32_t who/*=UNKNOWN*/) { if ((offset >= GPU_WORK_RAM_BASE) && (offset < GPU_WORK_RAM_BASE+0x1000)) { - uint16_t data; - offset &= 0xFFF; + offset &= 0xFFF; +#ifdef USE_STRUCTS + OpCode data; + data.Bytes.UBYTE = (uint16_t)gpu_ram_8[offset]; + data.Bytes.LBYTE = (uint16_t)gpu_ram_8[offset+1]; + return data.WORD; +#else + uint16_t data; data = ((uint16_t)gpu_ram_8[offset] << 8) | (uint16_t)gpu_ram_8[offset+1]; return data; +#endif } else if ((offset >= GPU_CONTROL_RAM_BASE) && (offset < GPU_CONTROL_RAM_BASE+0x20)) { @@ -355,7 +362,7 @@ uint16_t GPUReadWord(uint32_t offset, uint32_t who/*=UNKNOWN*/) } // GPU dword access (read) -uint32_t GPUReadLong(uint32_t offset, uint32_t who/*=UNKNOWN*/) +INLINE uint32_t GPUReadLong(uint32_t offset, uint32_t who/*=UNKNOWN*/) { if (offset >= 0xF02000 && offset <= 0xF020FF) { @@ -841,8 +848,11 @@ void GPUExec(int32_t cycles) // BIOS hacking //GPU: [00F03548] jr nz,00F03560 (0xd561) (RM=00F03114, RN=00000004) -> --> JR: Branch taken. //GPU: [00F0354C] jump nz,(r29) (0xd3a1) (RM=00F03314, RN=00000004) -> (RM=00F03314, RN=00000004) - +#if 0 cycles -= gpu_opcode_cycles[index]; +#else + cycles -= 1; +#endif #ifdef DEBUG gpu_opcode_use[index]++; #endif @@ -1702,30 +1712,63 @@ INLINE static void gpu_opcode_abs(void) } +#ifdef USE_STRUCTS INLINE static void gpu_opcode_div(void) // RN / RM { + unsigned i; // Real algorithm, courtesy of SCPCD: NYAN! 
- uint32_t q = RN; - uint32_t r = 0; + Bits32 q; + q.WORD = RN; + + Bits32 r; + r.WORD = 0; // If 16.16 division, stuff top 16 bits of RN into remainder and put the // bottom 16 of RN in top 16 of quotient - if (gpu_div_control & 0x01) - q <<= 16, r = RN >> 16; + if (gpu_div_control & 0x01) { + r.WORD = q.words.UWORD; + q.words.UWORD = q.words.LWORD; + q.words.LWORD = 0; + } for(i=0; i<32; i++) { - uint32_t sign = r & 0x80000000; - r = (r << 1) | ((q >> 31) & 0x01); - r += (sign ? RM : -RM); - q = (q << 1) | (((~r) >> 31) & 0x01); + uint32_t sign = r.bits.b31; + r.WORD = (r.WORD << 1) | q.bits.b31; + r.WORD += (sign ? RM : -RM); + q.WORD = (q.WORD << 1) | !r.bits.b31; // (((~r) >> 31) & 0x01); } - RN = q; - gpu_remain = r; - + RN = q.WORD; + gpu_remain = r.WORD; +} +#else +INLINE static void gpu_opcode_div(void) // RN / RM +{ + unsigned i; + // Real algorithm, courtesy of SCPCD: NYAN! + uint32_t q = RN; + uint32_t r = 0; + + // If 16.16 division, stuff top 16 bits of RN into remainder and put the + // bottom 16 of RN in top 16 of quotient + if (gpu_div_control & 0x01) + q <<= 16, r = RN >> 16; + + for(i=0; i<32; i++) + { + uint32_t sign = r & 0x80000000; + r = (r << 1) | ((q >> 31) & 0x01); + r += (sign ? 
RM : -RM); + q = (q << 1) | (((~r) >> 31) & 0x01); + } + + RN = q; + gpu_remain = r; + } +#endif INLINE static void gpu_opcode_imultn(void) diff --git a/src/vjag_memory.h b/src/vjag_memory.h index c96a86b1..3b1cbb48 100644 --- a/src/vjag_memory.h +++ b/src/vjag_memory.h @@ -39,6 +39,8 @@ extern "C" { } bytes; } Bits64; #pragma pack(pop) + +#pragma pack(push, 1) typedef union Bits32 { uint32_t WORD; struct Words { @@ -50,6 +52,28 @@ typedef union Bits32 { uint16_t LWORD; #endif } words; + struct Bytes4 { +#ifdef LITTLE_ENDIAN + uint8_t LL; + uint8_t LU; + uint8_t UL; + uint8_t UU; // Upper upper [UU, UL, LU, LL] +#else + uint8_t UU; // Upper upper [UU, UL, LU, LL] + uint8_t UL; + uint8_t LU; + uint8_t LL; +#endif + } bytes; + struct TopThreeOne { +#ifdef LITTLE_ENDIAN + unsigned int : 1; + uint32_t value : 31; +#else + uint32_t value : 31; + unsigned int : 1; +#endif + } topThreeOne; struct Bits { #ifdef LITTLE_ENDIAN unsigned int b0: 1; @@ -121,7 +145,9 @@ typedef union Bits32 { #endif } bits; } Bits32; - +#pragma pack(pop) + +#pragma pack(push, 1) typedef union GPUControl { uint32_t WORD; struct Words words; @@ -136,15 +162,16 @@ typedef union GPUControl { unsigned int irqMask: 5; unsigned int : 6; #endif - } gpuIRQ; - +} gpuIRQ; +#pragma pack(pop) + } GPUControl; #ifdef USE_STRUCTS #pragma pack(push, 1) typedef union OpCode { uint16_t WORD; - struct Bytes { + struct Bytes2 { #ifdef LITTLE_ENDIAN uint8_t LBYTE; uint8_t UBYTE; From 681a1f3ab40c60833e886f0f461b3d47d1ee4524 Mon Sep 17 00:00:00 2001 From: Joe Mattiello Date: Wed, 28 Mar 2018 06:04:07 -0400 Subject: [PATCH 16/34] Refactor joystick, getting odd bugs still --- src/joystick.c | 134 ++++++++++++++++++++++++------------------------- 1 file changed, 67 insertions(+), 67 deletions(-) diff --git a/src/joystick.c b/src/joystick.c index 3a62f90e..60b93fbc 100644 --- a/src/joystick.c +++ b/src/joystick.c @@ -84,111 +84,111 @@ uint16_t JoystickReadWord(uint32_t offset) offset &= 0x03; - if (offset == 0) - { - 
unsigned i; - uint8_t offset0, offset1; - uint16_t data = 0xFFFF; - + if (offset == 0) + { + uint8_t offset0, offset1; + uint16_t data = 0xFFFF; + if (!joysticksEnabled) { - return 0xFFFF; + return 0xFFFF; } - - // Joystick data returns active low for buttons pressed, high for non- - // pressed. - offset0 = joypad0Offset[joystick_ram[1] & 0x0F]; - offset1 = joypad1Offset[(joystick_ram[1] >> 4) & 0x0F]; - - if (offset0 != 0xFF) - { - uint16_t mask[4] = { 0xFEFF, 0xFDFF, 0xFBFF, 0xF7FF }; - uint16_t msk2[4] = { 0xFFFF, 0xFFFD, 0xFFFB, 0xFFF7 }; - - for(i = 0; i < 4; i++) - data &= (joypad0Buttons[offset0 + i] ? mask[i] : 0xFFFF); - - data &= msk2[offset0 / 4]; - } - - if (offset1 != 0xFF) - { - uint16_t mask[4] = { 0xEFFF, 0xDFFF, 0xBFFF, 0x7FFF }; - uint16_t msk2[4] = { 0xFF7F, 0xFFBF, 0xFFDF, 0xFFEF }; - - for(i = 0; i < 4; i++) - data &= (joypad1Buttons[offset1 + i] ? mask[i] : 0xFFFF); - - data &= msk2[offset1 / 4]; - } - - return data; - } - else if (offset == 2) - { - uint8_t offset0, offset1; - // Hardware ID returns NTSC/PAL identification bit here - // N.B.: On real H/W, bit 7 is *always* zero...! - uint16_t data = 0xFF6F | (vjs.hardwareTypeNTSC ? 0x10 : 0x00); - - if (!joysticksEnabled) - return data; - - // Joystick data returns active low for buttons pressed, high for non- - // pressed. + + // Joystick data returns active low for buttons pressed, high for non- + // pressed. + offset0 = joypad0Offset[joystick_ram[1] & 0x0F]; + offset1 = joypad1Offset[(joystick_ram[1] >> 4) & 0x0F]; + + if (offset0 != 0xFF) + { + uint16_t mask[4] = { 0xFEFF, 0xFDFF, 0xFBFF, 0xF7FF }; + uint16_t msk2[4] = { 0xFFFF, 0xFFFD, 0xFFFB, 0xFFF7 }; + + for(uint8_t i = 0; i < 4; i++) { + data &= (joypad0Buttons[offset0 + i] ? 
mask[i] : 0xFFFF); + } + + data &= msk2[offset0 / 4]; + } + + if (offset1 != 0xFF) + { + uint16_t mask[4] = { 0xEFFF, 0xDFFF, 0xBFFF, 0x7FFF }; + uint16_t msk2[4] = { 0xFF7F, 0xFFBF, 0xFFDF, 0xFFEF }; + + for(uint8_t i = 0; i < 4; i++) { + data &= (joypad1Buttons[offset1 + i] ? mask[i] : 0xFFFF); + } + data &= msk2[offset1 / 4]; + } + + return data; + } + else if (offset == 2) + { + uint8_t offset0, offset1; + // Hardware ID returns NTSC/PAL identification bit here + // N.B.: On real H/W, bit 7 is *always* zero...! + uint16_t data = 0xFF6F | (vjs.hardwareTypeNTSC ? 0x10 : 0x00); + + if (!joysticksEnabled) + return data; + + // Joystick data returns active low for buttons pressed, high for non- + // pressed. uint8_t jrmLow = joystick_ram[1] & 0x0F; uint8_t jrmHigh = (joystick_ram[1] >> 4) & 0x0F; uint8_t jp0offset = joypad0Offset[jrmLow]; uint8_t jp1offset = joypad1Offset[jrmHigh]; - + offset0 = jp0offset; // % 4; offset1 = jp1offset; // % 4; - + const int8_t mask[4][2] = { { BUTTON_A, BUTTON_PAUSE }, { BUTTON_B, 0xFF }, { BUTTON_C, 0xFF }, { BUTTON_OPTION, 0xFF } }; - - if (offset0 != 0xFF) - { + + if (offset0 != 0xFF) + { offset0 /= 4; uint8_t i0 = mask[offset0][0]; uint8_t i1 = mask[offset0][1]; - + uint8_t maskOffset00 = joypad0Buttons[i0]; uint8_t maskOffset01 = joypad0Buttons[i1]; - data &= (maskOffset00 ? 0xFFFD : 0xFFFF); - + data &= (maskOffset00 ? 0xFFFD : 0xFFFF); + if (maskOffset01 != 0xFF) { uint8_t button = joypad0Buttons[maskOffset01]; uint16_t dataMask = (button ? 0xFFFE : 0xFFFF); data &= dataMask; } - } - + } + if (offset1 != 0xFF) - { + { offset1 /= 4; - + uint8_t i0 = mask[offset1][0]; uint8_t i1 = mask[offset1][1]; uint8_t maskOffset10 = joypad1Buttons[i0]; uint8_t maskOffset11 = joypad1Buttons[i1]; - data &= (maskOffset10 ? 0xFFF7 : 0xFFFF); - + data &= (maskOffset10 ? 0xFFF7 : 0xFFFF); + if (maskOffset11 != 0xFF) { uint8_t button = joypad1Buttons[maskOffset11]; uint16_t dataMask = (button ? 
0xFFFB : 0xFFFF); data &= dataMask; } - } - - return data; - } + } + + return data; + } return 0xFFFF; } From 2bf8ead1a0dbad293bd693d20d889eedf46bf85d Mon Sep 17 00:00:00 2001 From: Joe Mattiello Date: Wed, 28 Mar 2018 06:05:08 -0400 Subject: [PATCH 17/34] Try inlining DSPExec --- src/dsp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dsp.c b/src/dsp.c index 9701964c..eef5bf0c 100644 --- a/src/dsp.c +++ b/src/dsp.c @@ -954,7 +954,7 @@ void DSPDone(void) /* DSP execution core */ -void DSPExec(int32_t cycles) +INLINE void DSPExec(int32_t cycles) { #ifdef DSP_SINGLE_STEPPING if (dsp_control & 0x18) From bbc8cfe66bd2c3afc7d55090056ebafc146e8f4d Mon Sep 17 00:00:00 2001 From: Joe Mattiello Date: Wed, 28 Mar 2018 06:16:56 -0400 Subject: [PATCH 18/34] Found my mistake in joystick causing wrong presses --- src/joystick.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/joystick.c b/src/joystick.c index 60b93fbc..695695f7 100644 --- a/src/joystick.c +++ b/src/joystick.c @@ -161,7 +161,8 @@ uint16_t JoystickReadWord(uint32_t offset) data &= (maskOffset00 ? 0xFFFD : 0xFFFF); - if (maskOffset01 != 0xFF) { + // Something about this is making pause active when pressing up + if (i1 != 0xFF) { uint8_t button = joypad0Buttons[maskOffset01]; uint16_t dataMask = (button ? 0xFFFE : 0xFFFF); data &= dataMask; @@ -176,12 +177,12 @@ uint16_t JoystickReadWord(uint32_t offset) uint8_t i1 = mask[offset1][1]; uint8_t maskOffset10 = joypad1Buttons[i0]; - uint8_t maskOffset11 = joypad1Buttons[i1]; data &= (maskOffset10 ? 0xFFF7 : 0xFFFF); - if (maskOffset11 != 0xFF) { - uint8_t button = joypad1Buttons[maskOffset11]; + if (i1 != 0xFF) { + uint8_t maskOffset11 = joypad1Buttons[i1]; + uint8_t button = maskOffset11; uint16_t dataMask = (button ? 
0xFFFB : 0xFFFF); data &= dataMask; } From 34ca42f3339dc83d1fcf68ed21802db2cd4c1df5 Mon Sep 17 00:00:00 2001 From: Joe Mattiello Date: Wed, 28 Mar 2018 06:23:20 -0400 Subject: [PATCH 19/34] Really fixed joystick this time --- src/joystick.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/joystick.c b/src/joystick.c index 695695f7..f5bf236a 100644 --- a/src/joystick.c +++ b/src/joystick.c @@ -161,11 +161,8 @@ uint16_t JoystickReadWord(uint32_t offset) data &= (maskOffset00 ? 0xFFFD : 0xFFFF); - // Something about this is making pause active when pressing up if (i1 != 0xFF) { - uint8_t button = joypad0Buttons[maskOffset01]; - uint16_t dataMask = (button ? 0xFFFE : 0xFFFF); - data &= dataMask; + data &= (joypad0Buttons[i1] ? 0xFFFE : 0xFFFF); } } From a41dc2229d5a4591cd3fe33ab981e589bfa07584 Mon Sep 17 00:00:00 2001 From: Joseph Mattiello Date: Wed, 4 Apr 2018 16:44:32 -0400 Subject: [PATCH 20/34] Testing compiletime asserts --- src/vjag_memory.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/vjag_memory.h b/src/vjag_memory.h index 3b1cbb48..9246f520 100644 --- a/src/vjag_memory.h +++ b/src/vjag_memory.h @@ -8,6 +8,7 @@ #define __MEMORY_H__ #include +#include #ifdef __cplusplus extern "C" { @@ -39,7 +40,8 @@ extern "C" { } bytes; } Bits64; #pragma pack(pop) - + _Static_assert( sizeof(Bits64) == sizeof(uint64_t), "Pack error"); + #pragma pack(push, 1) typedef union Bits32 { uint32_t WORD; @@ -146,6 +148,7 @@ typedef union Bits32 { } bits; } Bits32; #pragma pack(pop) + _Static_assert( sizeof(Bits32) == sizeof(uint32_t), "Pack error"); #pragma pack(push, 1) typedef union GPUControl { From f2f37d12d9ff9980d6c42536804138c4a458c7a4 Mon Sep 17 00:00:00 2001 From: Joe Mattiello Date: Tue, 14 Sep 2021 09:41:24 -0400 Subject: [PATCH 21/34] blitter sign fixes Signed-off-by: Joe Mattiello --- src/blitter.c | 54 +++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git 
a/src/blitter.c b/src/blitter.c index 490002d8..8138d0db 100644 --- a/src/blitter.c +++ b/src/blitter.c @@ -2882,21 +2882,21 @@ Zstep := JOIN (zstep, zstep[0..31]);*/ cmpd.DATA = *patd ^ (cmpdst ? dstd : srcd); if (cmpd.bytes.b0 == 0) - *dcomp |= 0x01; + *dcomp |= 0x01u; if (cmpd.bytes.b1 == 0) - *dcomp |= 0x02; + *dcomp |= 0x02u; if (cmpd.bytes.b2 == 0) - *dcomp |= 0x04; + *dcomp |= 0x04u; if (cmpd.bytes.b3 == 0) - *dcomp |= 0x08; + *dcomp |= 0x08u; if (cmpd.bytes.b4 == 0) - *dcomp |= 0x10; + *dcomp |= 0x10u; if (cmpd.bytes.b5 == 0) - *dcomp |= 0x20; + *dcomp |= 0x20u; if (cmpd.bytes.b6 == 0) - *dcomp |= 0x40; + *dcomp |= 0x40u; if (cmpd.bytes.b7 == 0) - *dcomp |= 0x80; + *dcomp |= 0x80u; ////////////////////////////////////////////////////////////////////////////////////// // Zed comparator for Z-buffer operations @@ -2914,25 +2914,25 @@ with srcshift bits 4 & 5 selecting the start position //So... basically what we have here is: *zcomp = 0; // TODO: Byte and bit this -jm provenance - if ((((*srcz & 0x000000000000FFFFLL) < (dstz & 0x000000000000FFFFLL)) && (zmode & 0x01)) - || (((*srcz & 0x000000000000FFFFLL) == (dstz & 0x000000000000FFFFLL)) && (zmode & 0x02)) - || (((*srcz & 0x000000000000FFFFLL) > (dstz & 0x000000000000FFFFLL)) && (zmode & 0x04))) - *zcomp |= 0x01; - - if ((((*srcz & 0x00000000FFFF0000LL) < (dstz & 0x00000000FFFF0000LL)) && (zmode & 0x01)) - || (((*srcz & 0x00000000FFFF0000LL) == (dstz & 0x00000000FFFF0000LL)) && (zmode & 0x02)) - || (((*srcz & 0x00000000FFFF0000LL) > (dstz & 0x00000000FFFF0000LL)) && (zmode & 0x04))) - *zcomp |= 0x02; - - if ((((*srcz & 0x0000FFFF00000000LL) < (dstz & 0x0000FFFF00000000LL)) && (zmode & 0x01)) - || (((*srcz & 0x0000FFFF00000000LL) == (dstz & 0x0000FFFF00000000LL)) && (zmode & 0x02)) - || (((*srcz & 0x0000FFFF00000000LL) > (dstz & 0x0000FFFF00000000LL)) && (zmode & 0x04))) - *zcomp |= 0x04; - - if ((((*srcz & 0xFFFF000000000000LL) < (dstz & 0xFFFF000000000000LL)) && (zmode & 0x01)) - || (((*srcz & 
0xFFFF000000000000LL) == (dstz & 0xFFFF000000000000LL)) && (zmode & 0x02)) - || (((*srcz & 0xFFFF000000000000LL) > (dstz & 0xFFFF000000000000LL)) && (zmode & 0x04))) - *zcomp |= 0x08; + if ((((*srcz & 0x000000000000FFFFLL) < (dstz & 0x000000000000FFFFLL)) && (zmode & 0x01u)) + || (((*srcz & 0x000000000000FFFFLL) == (dstz & 0x000000000000FFFFLL)) && (zmode & 0x02u)) + || (((*srcz & 0x000000000000FFFFLL) > (dstz & 0x000000000000FFFFLL)) && (zmode & 0x04u))) + *zcomp |= 0x01u; + + if ((((*srcz & 0x00000000FFFF0000LL) < (dstz & 0x00000000FFFF0000LL)) && (zmode & 0x01u)) + || (((*srcz & 0x00000000FFFF0000LL) == (dstz & 0x00000000FFFF0000LL)) && (zmode & 0x02u)) + || (((*srcz & 0x00000000FFFF0000LL) > (dstz & 0x00000000FFFF0000LL)) && (zmode & 0x04u))) + *zcomp |= 0x02u; + + if ((((*srcz & 0x0000FFFF00000000LL) < (dstz & 0x0000FFFF00000000LL)) && (zmode & 0x01u)) + || (((*srcz & 0x0000FFFF00000000LL) == (dstz & 0x0000FFFF00000000LL)) && (zmode & 0x02u)) + || (((*srcz & 0x0000FFFF00000000LL) > (dstz & 0x0000FFFF00000000LL)) && (zmode & 0x04u))) + *zcomp |= 0x04u; + + if ((((*srcz & 0xFFFF000000000000LL) < (dstz & 0xFFFF000000000000LL)) && (zmode & 0x01u)) + || (((*srcz & 0xFFFF000000000000LL) == (dstz & 0xFFFF000000000000LL)) && (zmode & 0x02u)) + || (((*srcz & 0xFFFF000000000000LL) > (dstz & 0xFFFF000000000000LL)) && (zmode & 0x04u))) + *zcomp |= 0x08u; //TEMP, TO TEST IF ZCOMP IS THE CULPRIT... //Nope, this is NOT the problem... From b93bcbc99b7327bc195505565e18c6a1c94acd76 Mon Sep 17 00:00:00 2001 From: Joe Mattiello Date: Tue, 14 Sep 2021 11:03:21 -0400 Subject: [PATCH 22/34] Comment out unused simd import Signed-off-by: Joe Mattiello --- src/blitter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blitter.c b/src/blitter.c index 8138d0db..037d510c 100644 --- a/src/blitter.c +++ b/src/blitter.c @@ -2399,7 +2399,7 @@ void ADDRGEN(uint32_t *address, uint32_t *pixa, bool gena2, bool zaddr, // Here's an important bit: The source data adder logic. 
Need to track down the inputs!!! // //////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////// -#include + void ADDARRAY(const uint16_t * addq, const uint8_t daddasel, const uint8_t daddbsel, const uint8_t daddmode, const uint64_t dstd, const uint32_t iinc, const uint8_t initcin[], const uint64_t initinc, const uint16_t initpix, const uint32_t istep, const uint64_t patd, const uint64_t srcd, const uint64_t srcz1, const uint64_t srcz2, From ada5c15dfd80619766325c42114d21a9991ffc1a Mon Sep 17 00:00:00 2001 From: Joe Mattiello Date: Sun, 19 Sep 2021 02:28:11 -0400 Subject: [PATCH 23/34] re-add audio_batch_cb Signed-off-by: Joe Mattiello --- src/dac.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/dac.c b/src/dac.c index a73cada2..de2453c9 100644 --- a/src/dac.c +++ b/src/dac.c @@ -53,7 +53,7 @@ #include "../libretro.h" -//extern retro_audio_sample_batch_t audio_batch_cb; +extern retro_audio_sample_batch_t audio_batch_cb; #define BUFFER_SIZE 0x10000 // Make the DAC buffers 64K x 16 bits #define DAC_AUDIO_RATE 48000 // Set the audio rate to 48 KHz @@ -183,12 +183,12 @@ void SDLSoundCallback(void * userdata, uint16_t * buffer, int length) { double timeToNextEvent = GetTimeToNextEvent(EVENT_JERRY); - DSPExec(USEC_TO_RISC_CYCLES(timeToNextEvent)); + DSPExec(USEC_TO_RISC_CYCLES(timeToNextEvent)); - HandleNextEvent(EVENT_JERRY); - } - while (!bufferDone); -// audio_batch_cb((int16_t*)sampleBuffer, length / 2); + HandleNextEvent(EVENT_JERRY); + } + while (!bufferDone); + audio_batch_cb((int16_t*)sampleBuffer, length / 2); } // LTXD/RTXD/SCLK/SMODE ($F1A148/4C/50/54) From 306edb3562e7fc368008b537a22146024c0d57d5 Mon Sep 17 00:00:00 2001 From: Joseph Mattello Date: Thu, 14 Oct 2021 01:35:04 -0400 Subject: [PATCH 24/34] readcpu.h pack struct Signed-off-by: Joseph Mattello --- src/m68000/readcpu.h | 2 ++ 1 file changed, 
2 insertions(+) diff --git a/src/m68000/readcpu.h b/src/m68000/readcpu.h index 5fe0d495..721706cc 100644 --- a/src/m68000/readcpu.h +++ b/src/m68000/readcpu.h @@ -88,6 +88,7 @@ struct instr_def { extern const struct instr_def defs68k[]; extern int n_defs68k; +#pragma pack(push, 1) extern struct instr { long int handler; unsigned char dreg; @@ -110,6 +111,7 @@ extern struct instr { unsigned int isjmp:1; unsigned int unused2:4; } *table68k; +#pragma pack(pop) extern void read_table68k(void); extern void do_merges(void); From 56ac06cd443fa4b6f5c3079a4c41ee2ee1e25ffb Mon Sep 17 00:00:00 2001 From: Joseph Mattello Date: Thu, 14 Oct 2021 01:41:07 -0400 Subject: [PATCH 25/34] Performance patches to dsp The bit shifting and masking is expensive on ARM64 for some reason. The unions seem to greatly reduce the perfomance hit of these common calls. byte pack struct Move memory structs to header Disable some of my speed hacks to test mem corruption Wrap more of my structs in ifdefs for testing Signed-off-by: Joseph Mattello --- src/dsp.c | 90 +++++++++++++++++++---------- src/vjag_memory.h | 143 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 203 insertions(+), 30 deletions(-) diff --git a/src/dsp.c b/src/dsp.c index 26760c5d..f985aa08 100644 --- a/src/dsp.c +++ b/src/dsp.c @@ -264,16 +264,17 @@ static uint32_t dsp_flags; static uint32_t dsp_matrix_control; static uint32_t dsp_pointer_to_matrix; static uint32_t dsp_data_organization; -uint32_t dsp_control; + +Bits32 dsp_control; static uint32_t dsp_div_control; static uint8_t dsp_flag_z, dsp_flag_n, dsp_flag_c; static uint32_t * dsp_reg = NULL, * dsp_alternate_reg = NULL; uint32_t dsp_reg_bank_0[32], dsp_reg_bank_1[32]; -static uint32_t dsp_opcode_first_parameter; -static uint32_t dsp_opcode_second_parameter; +static uint8_t dsp_opcode_first_parameter; +static uint8_t dsp_opcode_second_parameter; -#define DSP_RUNNING (dsp_control & 0x01) +#define DSP_RUNNING (dsp_control.bits.b0) #define RM 
dsp_reg[dsp_opcode_first_parameter] #define RN dsp_reg[dsp_opcode_second_parameter] @@ -393,8 +394,13 @@ uint8_t DSPReadByte(uint32_t offset, uint32_t who/*=UNKNOWN*/) uint16_t DSPReadWord(uint32_t offset, uint32_t who/*=UNKNOWN*/) { - offset &= 0xFFFFFFFE; - +#ifdef USE_STRUCTS + Offset offsett; + offsett.LONG = offset; + offset = offsett.Members.offset; +#else + offset &= 0xFFFFFFFE; +#endif if (offset >= DSP_WORK_RAM_BASE && offset <= DSP_WORK_RAM_BASE+0x1FFF) { offset -= DSP_WORK_RAM_BASE; @@ -402,11 +408,22 @@ uint16_t DSPReadWord(uint32_t offset, uint32_t who/*=UNKNOWN*/) } else if ((offset>=DSP_CONTROL_RAM_BASE)&&(offset> 16; +#ifdef USE_STRUCTS + DSPLong data; + data.LONG = DSPReadLong(offset & 0xFFFFFFFC, who); + + if (offset & 0x03) { + return data.Data.LWORD; + } else { + return data.Data.UWORD; + } +#else + uint32_t data = DSPReadLong(offset & 0xFFFFFFFC, who); + + if (offset & 0x03) + return data & 0xFFFF; + return data >> 16; +#endif } return JaguarReadWord(offset, who); @@ -438,7 +455,7 @@ uint32_t DSPReadLong(uint32_t offset, uint32_t who/*=UNKNOWN*/) case 0x10: return dsp_pc; case 0x14: - return dsp_control; + return dsp_control.WORD; case 0x18: return dsp_modulo; case 0x1C: @@ -547,8 +564,8 @@ void DSPWriteLong(uint32_t offset, uint32_t data, uint32_t who/*=UNKNOWN*/) dsp_flag_c = (dsp_flags >> 1) & 0x01; dsp_flag_n = (dsp_flags >> 2) & 0x01; DSPUpdateRegisterBanks(); - dsp_control &= ~((dsp_flags & CINT04FLAGS) >> 3); - dsp_control &= ~((dsp_flags & CINT5FLAG) >> 1); + dsp_control.WORD &= ~((dsp_flags & CINT04FLAGS) >> 3); + dsp_control.WORD &= ~((dsp_flags & CINT5FLAG) >> 1); break; } case 0x04: @@ -592,7 +609,7 @@ void DSPWriteLong(uint32_t offset, uint32_t data, uint32_t who/*=UNKNOWN*/) } // Protect writes to VERSION and the interrupt latches... 
mask = VERSION | INT_LAT0 | INT_LAT1 | INT_LAT2 | INT_LAT3 | INT_LAT4 | INT_LAT5; - dsp_control = (dsp_control & mask) | (data & ~mask); + dsp_control.WORD = (dsp_control.WORD & mask) | (data & ~mask); //CC only! //!!!!!!!! @@ -646,7 +663,7 @@ void DSPHandleIRQs(void) return; // Get the active interrupt bits (latches) & interrupt mask (enables) - bits = ((dsp_control >> 10) & 0x20) | ((dsp_control >> 6) & 0x1F); + bits = ((dsp_control.WORD >> 10) & 0x20) | ((dsp_control.WORD >> 6) & 0x1F), mask = ((dsp_flags >> 11) & 0x20) | ((dsp_flags >> 4) & 0x1F); bits &= mask; @@ -733,7 +750,7 @@ void DSPHandleIRQsNP(void) return; // Get the active interrupt bits (latches) & interrupt mask (enables) - bits = ((dsp_control >> 10) & 0x20) | ((dsp_control >> 6) & 0x1F); + bits = ((dsp_control.WORD >> 10) & 0x20) | ((dsp_control.WORD >> 6) & 0x1F); mask = ((dsp_flags >> 11) & 0x20) | ((dsp_flags >> 4) & 0x1F); bits &= mask; @@ -773,11 +790,11 @@ void DSPSetIRQLine(int irqline, int state) { //NOTE: This doesn't take INT_LAT5 into account. !!! FIX !!! uint32_t mask = INT_LAT0 << irqline; - dsp_control &= ~mask; // Clear the latch bit + dsp_control.WORD &= ~mask; // Clear the latch bit if (state) { - dsp_control |= mask; // Set the latch bit + dsp_control.WORD |= mask; // Set the latch bit DSPHandleIRQsNP(); } } @@ -805,7 +822,7 @@ void DSPReset(void) dsp_matrix_control = 0x00000000; dsp_pointer_to_matrix = 0x00000000; dsp_data_organization = 0xFFFFFFFF; - dsp_control = 0x00002000; // Report DSP version 2 + dsp_control.WORD = 0x00002000; // Report DSP version 2 dsp_div_control = 0x00000000; dsp_in_exec = 0; @@ -845,22 +862,35 @@ INLINE void DSPExec(int32_t cycles) while (cycles > 0 && DSP_RUNNING) { - uint16_t opcode; - uint32_t index; - if (IMASKCleared) // If IMASK was cleared, { DSPHandleIRQsNP(); // See if any other interrupts are pending! 
IMASKCleared = false; } - opcode = DSPReadWord(dsp_pc, DSP); - index = opcode >> 10; - dsp_opcode_first_parameter = (opcode >> 5) & 0x1F; - dsp_opcode_second_parameter = opcode & 0x1F; - dsp_pc += 2; - dsp_opcode[index](); - dsp_opcode_use[index]++; +#ifdef USE_STRUCTS + OpCode opcode; + opcode.WORD = DSPReadWord(dsp_pc, DSP); + uint8_t index = opcode.Codes.index; + uint8_t fp = opcode.Codes.first; + uint8_t sp = opcode.Codes.second; + dsp_opcode_first_parameter = fp; + dsp_opcode_second_parameter = sp; + dsp_pc += 2; + dsp_opcode[index](); +#else + uint16_t opcode; + uint32_t index; + opcode = DSPReadWord(dsp_pc, DSP); + index = opcode >> 10; + dsp_opcode_first_parameter = (opcode >> 5) & 0x1F; + dsp_opcode_second_parameter = opcode & 0x1F; + dsp_pc += 2; + dsp_opcode[index](); + dsp_opcode_use[index]++; +#endif +// Counter is not necessary and expensive -jm prov +// dsp_opcode_use[index]++; cycles -= dsp_opcode_cycles[index]; } diff --git a/src/vjag_memory.h b/src/vjag_memory.h index 44749a92..10f28d6c 100644 --- a/src/vjag_memory.h +++ b/src/vjag_memory.h @@ -13,6 +13,149 @@ extern "C" { #endif +typedef union Bits32 { + uint32_t WORD; + struct Words { +#ifdef LITTLE_ENDIAN + uint16_t LWORD; + uint16_t UWORD; +#else + uint16_t UWORD; + uint16_t LWORD; +#endif + } Words; + struct bits { +#ifdef LITTLE_ENDIAN + unsigned int b0: 1; + unsigned int b1: 1; + unsigned int b2: 1; + unsigned int b3: 1; + unsigned int b4: 1; + unsigned int b5: 1; + unsigned int b6: 1; + unsigned int b7: 1; + unsigned int b8: 1; + unsigned int b9: 1; + unsigned int b10: 1; + unsigned int b11: 1; + unsigned int b12: 1; + unsigned int b13: 1; + unsigned int b14: 1; + unsigned int b15: 1; + unsigned int b16: 1; + unsigned int b17: 1; + unsigned int b18: 1; + unsigned int b19: 1; + unsigned int b20: 1; + unsigned int b21: 1; + unsigned int b22: 1; + unsigned int b23: 1; + unsigned int b24: 1; + unsigned int b25: 1; + unsigned int b26: 1; + unsigned int b27: 1; + unsigned int b28: 1; + unsigned 
int b29: 1; + unsigned int b30: 1; + unsigned int b31: 1; +#else + // reverse the order of the bit fields. + unsigned int b31: 1; + unsigned int b30: 1; + unsigned int b29: 1; + unsigned int b28: 1; + unsigned int b27: 1; + unsigned int b26: 1; + unsigned int b25: 1; + unsigned int b24: 1; + unsigned int b23: 1; + unsigned int b22: 1; + unsigned int b21: 1; + unsigned int b20: 1; + unsigned int b19: 1; + unsigned int b18: 1; + unsigned int b17: 1; + unsigned int b16: 1; + unsigned int b15: 1; + unsigned int b14: 1; + unsigned int b13: 1; + unsigned int b12: 1; + unsigned int b11: 1; + unsigned int b10: 1; + unsigned int b9: 1; + unsigned int b8: 1; + unsigned int b7: 1; + unsigned int b6: 1; + unsigned int b5: 1; + unsigned int b4: 1; + unsigned int b3: 1; + unsigned int b2: 1; + unsigned int b1: 1; + unsigned int b0: 1; +#endif + } bits; +} Bits32; + +#ifdef USE_STRUCTS +#pragma pack(push, 1) + typedef union OpCode { + uint16_t WORD; + struct Bytes { +#ifdef LITTLE_ENDIAN + uint8_t LBYTE; + uint8_t UBYTE; +#else + uint8_t UBYTE; + uint8_t LBYTE; +#endif + } Bytes; + struct Codes { +#ifdef LITTLE_ENDIAN + unsigned int second : 5; + unsigned int first : 5; + unsigned int index : 6; +#else + unsigned int index : 6; + unsigned int first : 5; + unsigned int second : 5; +#endif + } Codes; + } OpCode; +#pragma pack(pop) + + typedef OpCode U16Union; +#endif //USE_STRUCTS + +#ifdef USE_STRUCTS +typedef union Offset { + uint32_t LONG; +#pragma pack(push, 1) + struct Members { +#ifdef LITTLE_ENDIAN + unsigned int offset : 31; + unsigned int bit : 1; +#else + unsigned int bit : 1; + unsigned int offset : 31; +#endif + } Members; +#pragma pack(pop) +} Offset; +#endif //USE_STRUCTS + +typedef union DSPLong { + uint32_t LONG; + struct Data { +#ifdef LITTLE_ENDIAN + uint16_t LWORD; + uint16_t UWORD; +#else + uint16_t UWORD; + uint16_t LWORD; +#endif + } Data; +} DSPLong; + extern uint8_t jagMemSpace[]; extern uint8_t * jaguarMainRAM; From 554f1e1311d6eb0400f7e95b566b51cd92cf7df9 
Mon Sep 17 00:00:00 2001 From: Joseph Mattello Date: Thu, 14 Oct 2021 01:35:35 -0400 Subject: [PATCH 26/34] Joystick.h typdef enum BUTTON Signed-off-by: Joseph Mattello --- src/joystick.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/joystick.h b/src/joystick.h index 599afd2a..41cee209 100644 --- a/src/joystick.h +++ b/src/joystick.h @@ -13,7 +13,7 @@ extern "C" { #endif -enum +typedef enum BUTTON { BUTTON_FIRST = 0, BUTTON_U = 0, @@ -40,7 +40,7 @@ enum BUTTON_OPTION = 19, BUTTON_PAUSE = 20, BUTTON_LAST = 20 -}; +} BUTTON; void JoystickInit(void); void JoystickReset(void); From da1bc2e87666679e9cc9717e5c70282c769d77b6 Mon Sep 17 00:00:00 2001 From: Joseph Mattello Date: Thu, 14 Oct 2021 02:07:29 -0400 Subject: [PATCH 27/34] dsp.c remove structs ifdef GPU_RUNNING running macro was pretty slow on ARM for some reason. Bitswise structs are faster in my testing Try some blitter optimizations blitter sign fixes Signed-off-by: Joe Mattiello Comment out unused simd import Signed-off-by: Joe Mattiello Try some GPU optimizations and clarity post rebase cleanup Signed-off-by: Joseph Mattello Remove USE_STRUCTS ifdef Signed-off-by: Joseph Mattello gpu.c Fix bad return Signed-off-by: Joseph Mattello Fix gpu opcode bad merge Signed-off-by: Joseph Mattello remove duplicate struct defs Signed-off-by: Joseph Mattello --- src/dsp.c | 29 ++--------------------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/src/dsp.c b/src/dsp.c index f985aa08..ed420f8a 100644 --- a/src/dsp.c +++ b/src/dsp.c @@ -394,13 +394,9 @@ uint8_t DSPReadByte(uint32_t offset, uint32_t who/*=UNKNOWN*/) uint16_t DSPReadWord(uint32_t offset, uint32_t who/*=UNKNOWN*/) { -#ifdef USE_STRUCTS Offset offsett; offsett.LONG = offset; offset = offsett.Members.offset; -#else - offset &= 0xFFFFFFFE; -#endif if (offset >= DSP_WORK_RAM_BASE && offset <= DSP_WORK_RAM_BASE+0x1FFF) { offset -= DSP_WORK_RAM_BASE; @@ -408,7 +404,6 @@ uint16_t DSPReadWord(uint32_t offset, uint32_t 
who/*=UNKNOWN*/) } else if ((offset>=DSP_CONTROL_RAM_BASE)&&(offset> 16; -#endif } return JaguarReadWord(offset, who); @@ -868,7 +856,6 @@ INLINE void DSPExec(int32_t cycles) IMASKCleared = false; } -#ifdef USE_STRUCTS OpCode opcode; opcode.WORD = DSPReadWord(dsp_pc, DSP); uint8_t index = opcode.Codes.index; @@ -878,20 +865,8 @@ INLINE void DSPExec(int32_t cycles) dsp_opcode_second_parameter = sp; dsp_pc += 2; dsp_opcode[index](); -#else - uint16_t opcode; - uint32_t index; - opcode = DSPReadWord(dsp_pc, DSP); - index = opcode >> 10; - dsp_opcode_first_parameter = (opcode >> 5) & 0x1F; - dsp_opcode_second_parameter = opcode & 0x1F; - dsp_pc += 2; - dsp_opcode[index](); - dsp_opcode_use[index]++; -#endif -// Counter is not necessary and expensive -jm prov -// dsp_opcode_use[index]++; - cycles -= dsp_opcode_cycles[index]; + + cycles -= dsp_opcode_cycles[index]; } dsp_in_exec--; From 8a0a2dffee239fea52e609bd7eb6d1964864d105 Mon Sep 17 00:00:00 2001 From: Joseph Mattello Date: Thu, 14 Oct 2021 02:07:47 -0400 Subject: [PATCH 28/34] dsp.c delete dsp_opcode_use Signed-off-by: Joseph Mattello --- src/dsp.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/dsp.c b/src/dsp.c index ed420f8a..27182822 100644 --- a/src/dsp.c +++ b/src/dsp.c @@ -1930,7 +1930,6 @@ INLINE static void DSP_jr(void) }//*/ dsp_pc += 2; // For DSP_DIS_* accuracy DSPOpcode[pipeline[plPtrExec].opcode](); - dsp_opcode_use[pipeline[plPtrExec].opcode]++; pipeline[plPtrWrite] = pipeline[plPtrExec]; // Step 3: Flush pipeline & set new PC From 13d9abfda53163b2e68fee42f4bfa2dd940c75bf Mon Sep 17 00:00:00 2001 From: Joseph Mattello Date: Thu, 14 Oct 2021 02:08:57 -0400 Subject: [PATCH 29/34] vjag_memory.h add more struct definitions Signed-off-by: Joseph Mattello --- src/vjag_memory.h | 81 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 74 insertions(+), 7 deletions(-) diff --git a/src/vjag_memory.h b/src/vjag_memory.h index 10f28d6c..ad33cbd6 100644 --- a/src/vjag_memory.h +++ 
b/src/vjag_memory.h @@ -13,6 +13,34 @@ extern "C" { #endif +#pragma pack(push, 1) + typedef union Bits64 { + uint64_t DATA; + struct Bytes8 { +#ifdef LITTLE_ENDIAN + uint8_t b0; + uint8_t b1; + uint8_t b2; + uint8_t b3; + uint8_t b4; + uint8_t b5; + uint8_t b6; + uint8_t b7; +#else + uint8_t b7; + uint8_t b6; + uint8_t b5; + uint8_t b4; + uint8_t b3; + uint8_t b2; + uint8_t b1; + uint8_t b0; +#endif + } bytes; + } Bits64; +#pragma pack(pop) + +#pragma pack(push, 1) typedef union Bits32 { uint32_t WORD; struct Words { @@ -23,8 +51,30 @@ typedef union Bits32 { uint16_t UWORD; uint16_t LWORD; #endif - } Words; - struct bits { + } words; + struct Bytes4 { +#ifdef LITTLE_ENDIAN + uint8_t LL; + uint8_t LU; + uint8_t UL; + uint8_t UU; // Upper upper [UU, UL, LU, LL] +#else + uint8_t UU; // Upper upper [UU, UL, LU, LL] + uint8_t UL; + uint8_t LU; + uint8_t LL; +#endif + } bytes; + struct TopThreeOne { +#ifdef LITTLE_ENDIAN + unsigned int : 1; + uint32_t value : 31; +#else + uint32_t value : 31; + unsigned int : 1; +#endif + } topThreeOne; + struct Bits { #ifdef LITTLE_ENDIAN unsigned int b0: 1; unsigned int b1: 1; @@ -95,12 +145,32 @@ typedef union Bits32 { #endif } bits; } Bits32; +#pragma pack(pop) + +#pragma pack(push, 1) +typedef union GPUControl { + uint32_t WORD; + struct Words words; + struct Bits bits; + struct __attribute__ ((__packed__)) { +#ifdef LITTLE_ENDIAN + unsigned int : 6; + unsigned int irqMask: 5; + unsigned int : 21; +#else + unsigned int : 21; + unsigned int irqMask: 5; + unsigned int : 6; +#endif +} gpuIRQ; +#pragma pack(pop) + +} GPUControl; -#ifdef USE_STRUCTS #pragma pack(push, 1) typedef union OpCode { uint16_t WORD; - struct Bytes { + struct Bytes2 { #ifdef LITTLE_ENDIAN uint8_t LBYTE; uint8_t UBYTE; @@ -124,9 +194,7 @@ typedef union Bits32 { #pragma pack(pop) typedef OpCode U16Union; -#endif //USE_STRUCTS -#ifdef USE_STRUCTS typedef union Offset { uint32_t LONG; #pragma pack(push, 1) @@ -141,7 +209,6 @@ typedef union Offset { } Members; #pragma 
pack(pop) } Offset; -#endif //USE_STRUCTS typedef union DSPLong { uint32_t LONG; From 3f6c66a526bb1966004c63bbdae6d95028048d18 Mon Sep 17 00:00:00 2001 From: Joseph Mattello Date: Thu, 14 Oct 2021 02:10:04 -0400 Subject: [PATCH 30/34] blitter.c use structs, const and u-ints GPU_RUNNING running macro was pretty slow on ARM for some reason. Bitswise structs are faster in my testing Try some blitter optimizations blitter sign fixes Signed-off-by: Joe Mattiello Comment out unused simd import Signed-off-by: Joe Mattiello Try some GPU optimizations and clarity post rebase cleanup Signed-off-by: Joseph Mattello Remove USE_STRUCTS ifdef Signed-off-by: Joseph Mattello gpu.c Fix bad return Signed-off-by: Joseph Mattello Fix gpu opcode bad merge Signed-off-by: Joseph Mattello remove duplicate struct defs Signed-off-by: Joseph Mattello --- src/blitter.c | 188 +++++++++++++++++++++++++++----------------------- 1 file changed, 100 insertions(+), 88 deletions(-) diff --git a/src/blitter.c b/src/blitter.c index d51ae7b8..c51a6562 100644 --- a/src/blitter.c +++ b/src/blitter.c @@ -87,41 +87,41 @@ void BlitterMidsummer2(void); // Blitter command bits -#define SRCEN (cmd & 0x00000001) -#define SRCENZ (cmd & 0x00000002) -#define SRCENX (cmd & 0x00000004) -#define DSTEN (cmd & 0x00000008) -#define DSTENZ (cmd & 0x00000010) -#define DSTWRZ (cmd & 0x00000020) -#define CLIPA1 (cmd & 0x00000040) - -#define UPDA1F (cmd & 0x00000100) -#define UPDA1 (cmd & 0x00000200) -#define UPDA2 (cmd & 0x00000400) - -#define DSTA2 (cmd & 0x00000800) - -#define Z_OP_INF (cmd & 0x00040000) -#define Z_OP_EQU (cmd & 0x00080000) -#define Z_OP_SUP (cmd & 0x00100000) - -#define LFU_NAN (cmd & 0x00200000) -#define LFU_NA (cmd & 0x00400000) -#define LFU_AN (cmd & 0x00800000) -#define LFU_A (cmd & 0x01000000) - -#define CMPDST (cmd & 0x02000000) -#define BCOMPEN (cmd & 0x04000000) -#define DCOMPEN (cmd & 0x08000000) - -#define PATDSEL (cmd & 0x00010000) -#define ADDDSEL (cmd & 0x00020000) -#define TOPBEN (cmd & 
0x00004000) -#define TOPNEN (cmd & 0x00008000) -#define BKGWREN (cmd & 0x10000000) -#define GOURD (cmd & 0x00001000) -#define GOURZ (cmd & 0x00002000) -#define SRCSHADE (cmd & 0x40000000) +#define SRCEN (cmd.bits.b0) +#define SRCENZ (cmd.bits.b1) +#define SRCENX (cmd.bits.b2) +#define DSTEN (cmd.bits.b3) +#define DSTENZ (cmd.bits.b4) +#define DSTWRZ (cmd.bits.b5) +#define CLIPA1 (cmd.bits.b6) + +#define UPDA1F (cmd.bits.b8) +#define UPDA1 (cmd.bits.b9) +#define UPDA2 (cmd.bits.b10) + +#define DSTA2 (cmd.bits.b11) + +#define Z_OP_INF (cmd.bits.b18) +#define Z_OP_EQU (cmd.bits.b19) +#define Z_OP_SUP (cmd.bits.b20) + +#define LFU_NAN (cmd.bits.b21) +#define LFU_NA (cmd.bits.b22) +#define LFU_AN (cmd.bits.b23) +#define LFU_A (cmd.bits.b24) + +#define CMPDST (cmd.bits.b25) +#define BCOMPEN (cmd.bits.b26) +#define DCOMPEN (cmd.bits.b27) + +#define PATDSEL (cmd.bits.b16) +#define ADDDSEL (cmd.bits.b17) +#define TOPBEN (cmd.bits.b14) +#define TOPNEN (cmd.bits.b15) +#define BKGWREN (cmd.bits.b28) +#define GOURD (cmd.bits.b12) +#define GOURZ (cmd.bits.b13) +#define SRCSHADE (cmd.bits.b30) #define XADDPHR 0 @@ -305,8 +305,11 @@ static int32_t a1_clip_x, a1_clip_y; // to optimize the blitter, then we may revisit it in the future... // Generic blit handler -void blitter_generic(uint32_t cmd) +void blitter_generic(uint32_t cmdi) { + Bits32 cmd; + cmd.WORD = cmdi; + uint32_t srcdata, srczdata, dstdata, dstzdata, writedata, inhibit; uint32_t bppSrc = (DSTA2 ? 1 << ((REG(A1_FLAGS) >> 3) & 0x07) : 1 << ((REG(A2_FLAGS) >> 3) & 0x07)); @@ -338,14 +341,14 @@ void blitter_generic(uint32_t cmd) if (SRCENZ) srczdata = READ_ZDATA(a2, REG(A2_FLAGS)); - else if (cmd & 0x0001C020) // PATDSEL | TOPBEN | TOPNEN | DSTWRZ + else if (cmd.WORD & 0x0001C020) // PATDSEL | TOPBEN | TOPNEN | DSTWRZ srczdata = READ_RDATA(SRCZINT, a2, REG(A2_FLAGS), a2_phrase_mode); } else // Use SRCDATA register... 
{ srcdata = READ_RDATA(SRCDATA, a2, REG(A2_FLAGS), a2_phrase_mode); - if (cmd & 0x0001C020) // PATDSEL | TOPBEN | TOPNEN | DSTWRZ + if (cmd.WORD & 0x0001C020) // PATDSEL | TOPBEN | TOPNEN | DSTWRZ srczdata = READ_RDATA(SRCZINT, a2, REG(A2_FLAGS), a2_phrase_mode); } @@ -516,13 +519,13 @@ void blitter_generic(uint32_t cmd) srcdata = READ_PIXEL(a1, REG(A1_FLAGS)); if (SRCENZ) srczdata = READ_ZDATA(a1, REG(A1_FLAGS)); - else if (cmd & 0x0001C020) // PATDSEL | TOPBEN | TOPNEN | DSTWRZ + else if (cmd.WORD & 0x0001C020) // PATDSEL | TOPBEN | TOPNEN | DSTWRZ srczdata = READ_RDATA(SRCZINT, a1, REG(A1_FLAGS), a1_phrase_mode); } else { srcdata = READ_RDATA(SRCDATA, a1, REG(A1_FLAGS), a1_phrase_mode); - if (cmd & 0x001C020) // PATDSEL | TOPBEN | TOPNEN | DSTWRZ + if (cmd.WORD & 0x001C020) // PATDSEL | TOPBEN | TOPNEN | DSTWRZ srczdata = READ_RDATA(SRCZINT, a1, REG(A1_FLAGS), a1_phrase_mode); } @@ -756,20 +759,23 @@ void blitter_generic(uint32_t cmd) WREG(A2_PIXEL, (a2_y & 0xFFFF0000) | ((a2_x >> 16) & 0xFFFF)); } -void blitter_blit(uint32_t cmd) +void blitter_blit(uint32_t cmdi) { + Bits32 cmd; + cmd.WORD = cmdi; + uint32_t m, e; uint32_t pitchValue[4] = { 0, 1, 3, 2 }; colour_index = 0; - src = cmd & 0x07; - dst = (cmd >> 3) & 0x07; - misc = (cmd >> 6) & 0x03; - a1ctl = (cmd >> 8) & 0x7; - mode = (cmd >> 11) & 0x07; - ity = (cmd >> 14) & 0x0F; - zop = (cmd >> 18) & 0x07; - op = (cmd >> 21) & 0x0F; - ctrl = (cmd >> 25) & 0x3F; + src = cmd.WORD & 0x07; + dst = (cmd.WORD >> 3) & 0x07; + misc = (cmd.WORD >> 6) & 0x03; + a1ctl = (cmd.WORD >> 8) & 0x7; + mode = (cmd.WORD >> 11) & 0x07; + ity = (cmd.WORD >> 14) & 0x0F; + zop = (cmd.WORD >> 18) & 0x07; + op = (cmd.WORD >> 21) & 0x0F; + ctrl = (cmd.WORD >> 25) & 0x3F; // Addresses in A1/2_BASE are *phrase* aligned, i.e., bottom three bits are ignored! // NOTE: This fixes Rayman's bad collision detection AND keeps T2K working! 
@@ -952,7 +958,7 @@ void blitter_blit(uint32_t cmd) gd_ca = 0xFFFFFF00 | gd_ca; } - blitter_generic(cmd); + blitter_generic(cmd.WORD); } #endif /******************************************************************************* @@ -1113,10 +1119,11 @@ void BlitterWriteWord(uint32_t offset, uint16_t data, uint32_t who/*=UNKNOWN*/) // I.e., the second write of 32-bit value--not convinced this is the best way to do this! // But then again, according to the Jaguar docs, this is correct...! { - if (vjs.useFastBlitter) - blitter_blit(GET32(blitter_ram, 0x38)); - else - BlitterMidsummer2(); + if (vjs.useFastBlitter) { + blitter_blit(GET32(blitter_ram, 0x38)); + } else { + BlitterMidsummer2(); + } } } //F02278,9,A,B @@ -1135,10 +1142,10 @@ void BlitterWriteLong(uint32_t offset, uint32_t data, uint32_t who) void ADDRGEN(uint32_t *, uint32_t *, bool, bool, uint16_t, uint16_t, uint32_t, uint8_t, uint8_t, uint8_t, uint8_t, uint16_t, uint16_t, uint32_t, uint8_t, uint8_t, uint8_t, uint8_t); -void ADDARRAY(uint16_t * addq, uint8_t daddasel, uint8_t daddbsel, uint8_t daddmode, - uint64_t dstd, uint32_t iinc, uint8_t initcin[], uint64_t initinc, uint16_t initpix, - uint32_t istep, uint64_t patd, uint64_t srcd, uint64_t srcz1, uint64_t srcz2, - uint32_t zinc, uint32_t zstep); +void ADDARRAY(const uint16_t * addq, const uint8_t daddasel, const uint8_t daddbsel, const uint8_t daddmode, + const uint64_t dstd, const uint32_t iinc, const uint8_t initcin[], const uint64_t initinc, const uint16_t initpix, + const uint32_t istep, const uint64_t patd, const uint64_t srcd, const uint64_t srcz1, const uint64_t srcz2, + const uint32_t zinc, const uint32_t zstep); void ADD16SAT(uint16_t *r, uint8_t *co, uint16_t a, uint16_t b, uint8_t cin, bool sat, bool eightbit, bool hicinh); void ADDAMUX(int16_t *adda_x, int16_t *adda_y, uint8_t addasel, int16_t a1_step_x, int16_t a1_step_y, int16_t a1_stepf_x, int16_t a1_stepf_y, int16_t a2_step_x, int16_t a2_step_y, @@ -1166,7 +1173,8 @@ void 
BlitterMidsummer2(void) //Will remove stuff that isn't in Jaguar I once fully described (stuff like texture won't //be described here at all)... - uint32_t cmd = GET32(blitter_ram, COMMAND); + Bits32 cmd; + cmd.WORD = GET32(blitter_ram, COMMAND); // Line states passed in via the command register @@ -1177,7 +1185,7 @@ void BlitterMidsummer2(void) patdsel = (PATDSEL), adddsel = (ADDDSEL), cmpdst = (CMPDST), bcompen = (BCOMPEN), dcompen = (DCOMPEN), bkgwren = (BKGWREN), srcshade = (SRCSHADE); - uint8_t zmode = (cmd & 0x01C0000) >> 18, lfufunc = (cmd & 0x1E00000) >> 21; + uint8_t zmode = (cmd.WORD & 0x01C0000) >> 18, lfufunc = (cmd.WORD & 0x1E00000) >> 21; //Missing: BUSHI //Where to find various lines: // clip_a1 -> inner @@ -2397,10 +2405,10 @@ void ADDRGEN(uint32_t *address, uint32_t *pixa, bool gena2, bool zaddr, //////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////// -void ADDARRAY(uint16_t * addq, uint8_t daddasel, uint8_t daddbsel, uint8_t daddmode, - uint64_t dstd, uint32_t iinc, uint8_t initcin[], uint64_t initinc, uint16_t initpix, - uint32_t istep, uint64_t patd, uint64_t srcd, uint64_t srcz1, uint64_t srcz2, - uint32_t zinc, uint32_t zstep) +void ADDARRAY(const uint16_t * addq, const uint8_t daddasel, const uint8_t daddbsel, const uint8_t daddmode, + const uint64_t dstd, const uint32_t iinc, const uint8_t initcin[], const uint64_t initinc, const uint16_t initpix, + const uint32_t istep, const uint64_t patd, const uint64_t srcd, const uint64_t srcz1, const uint64_t srcz2, + const uint32_t zinc, const uint32_t zstep) { unsigned i; uint16_t adda[4]; @@ -2851,7 +2859,7 @@ Patdhi := JOIN (patdhi, patd[32..63]);*/ uint8_t dech38el[2][8] = { { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } }; int en; - uint64_t cmpd; + Bits64 cmpd; uint8_t dbinht; uint16_t addq[4]; uint8_t initcin[4] 
= { 0, 0, 0, 0 }; @@ -2875,23 +2883,23 @@ Zstep := JOIN (zstep, zstep[0..31]);*/ /*Datacomp := DATACOMP (dcomp[0..7], cmpdst, dstdlo, dstdhi, patdlo, patdhi, srcdlo, srcdhi);*/ ////////////////////////////////////// C++ CODE ////////////////////////////////////// *dcomp = 0; - cmpd = *patd ^ (cmpdst ? dstd : srcd); + cmpd.DATA = *patd ^ (cmpdst ? dstd : srcd); - if ((cmpd & 0x00000000000000FFLL) == 0) + if (cmpd.bytes.b0 == 0) *dcomp |= 0x01u; - if ((cmpd & 0x000000000000FF00LL) == 0) + if (cmpd.bytes.b1 == 0) *dcomp |= 0x02u; - if ((cmpd & 0x0000000000FF0000LL) == 0) + if (cmpd.bytes.b2 == 0) *dcomp |= 0x04u; - if ((cmpd & 0x00000000FF000000LL) == 0) + if (cmpd.bytes.b3 == 0) *dcomp |= 0x08u; - if ((cmpd & 0x000000FF00000000LL) == 0) + if (cmpd.bytes.b4 == 0) *dcomp |= 0x10u; - if ((cmpd & 0x0000FF0000000000LL) == 0) + if (cmpd.bytes.b5 == 0) *dcomp |= 0x20u; - if ((cmpd & 0x00FF000000000000LL) == 0) + if (cmpd.bytes.b6 == 0) *dcomp |= 0x40u; - if ((cmpd & 0xFF00000000000000LL) == 0) + if (cmpd.bytes.b7 == 0) *dcomp |= 0x80u; ////////////////////////////////////////////////////////////////////////////////////// @@ -2909,25 +2917,25 @@ with srcshift bits 4 & 5 selecting the start position */ //So... 
basically what we have here is: *zcomp = 0; - - if ((((*srcz & 0x000000000000FFFFLL) < (dstz & 0x000000000000FFFFLL)) && (zmode & 0x01)) - || (((*srcz & 0x000000000000FFFFLL) == (dstz & 0x000000000000FFFFLL)) && (zmode & 0x02)) - || (((*srcz & 0x000000000000FFFFLL) > (dstz & 0x000000000000FFFFLL)) && (zmode & 0x04))) + // TODO: Byte and bit this - @joematt provenance + if ((((*srcz & 0x000000000000FFFFLL) < (dstz & 0x000000000000FFFFLL)) && (zmode & 0x01u)) + || (((*srcz & 0x000000000000FFFFLL) == (dstz & 0x000000000000FFFFLL)) && (zmode & 0x02u)) + || (((*srcz & 0x000000000000FFFFLL) > (dstz & 0x000000000000FFFFLL)) && (zmode & 0x04u))) *zcomp |= 0x01u; - if ((((*srcz & 0x00000000FFFF0000LL) < (dstz & 0x00000000FFFF0000LL)) && (zmode & 0x01)) - || (((*srcz & 0x00000000FFFF0000LL) == (dstz & 0x00000000FFFF0000LL)) && (zmode & 0x02)) - || (((*srcz & 0x00000000FFFF0000LL) > (dstz & 0x00000000FFFF0000LL)) && (zmode & 0x04))) + if ((((*srcz & 0x00000000FFFF0000LL) < (dstz & 0x00000000FFFF0000LL)) && (zmode & 0x01u)) + || (((*srcz & 0x00000000FFFF0000LL) == (dstz & 0x00000000FFFF0000LL)) && (zmode & 0x02u)) + || (((*srcz & 0x00000000FFFF0000LL) > (dstz & 0x00000000FFFF0000LL)) && (zmode & 0x04u))) *zcomp |= 0x02u; - if ((((*srcz & 0x0000FFFF00000000LL) < (dstz & 0x0000FFFF00000000LL)) && (zmode & 0x01)) - || (((*srcz & 0x0000FFFF00000000LL) == (dstz & 0x0000FFFF00000000LL)) && (zmode & 0x02)) - || (((*srcz & 0x0000FFFF00000000LL) > (dstz & 0x0000FFFF00000000LL)) && (zmode & 0x04))) + if ((((*srcz & 0x0000FFFF00000000LL) < (dstz & 0x0000FFFF00000000LL)) && (zmode & 0x01u)) + || (((*srcz & 0x0000FFFF00000000LL) == (dstz & 0x0000FFFF00000000LL)) && (zmode & 0x02u)) + || (((*srcz & 0x0000FFFF00000000LL) > (dstz & 0x0000FFFF00000000LL)) && (zmode & 0x04u))) *zcomp |= 0x04u; - if ((((*srcz & 0xFFFF000000000000LL) < (dstz & 0xFFFF000000000000LL)) && (zmode & 0x01)) - || (((*srcz & 0xFFFF000000000000LL) == (dstz & 0xFFFF000000000000LL)) && (zmode & 0x02)) - || (((*srcz & 
0xFFFF000000000000LL) > (dstz & 0xFFFF000000000000LL)) && (zmode & 0x04))) + if ((((*srcz & 0xFFFF000000000000LL) < (dstz & 0xFFFF000000000000LL)) && (zmode & 0x01u)) + || (((*srcz & 0xFFFF000000000000LL) == (dstz & 0xFFFF000000000000LL)) && (zmode & 0x02u)) + || (((*srcz & 0xFFFF000000000000LL) > (dstz & 0xFFFF000000000000LL)) && (zmode & 0x04u))) *zcomp |= 0x08u; //TEMP, TO TEST IF ZCOMP IS THE CULPRIT... @@ -3040,6 +3048,8 @@ Sfine := DECH38EL (s_fine[0..7], dstart[0..2], sfen\);*/ /*Maskt[0] := BUF1 (maskt[0], s_fine[0]); Maskt[1-7] := OAN1P (maskt[1-7], maskt[0-6], s_fine[1-7], e_fine\[1-7]);*/ ////////////////////////////////////// C++ CODE ////////////////////////////////////// + // TODO: Byte and bit this - @joematt provenance + maskt = s_fine & 0x0001; maskt |= (((maskt & 0x0001) || (s_fine & 0x02u)) && (e_fine & 0x02u) ? 0x0002 : 0x0000); maskt |= (((maskt & 0x0002) || (s_fine & 0x04u)) && (e_fine & 0x04u) ? 0x0004 : 0x0000); @@ -3051,6 +3061,7 @@ Maskt[1-7] := OAN1P (maskt[1-7], maskt[0-6], s_fine[1-7], e_fine\[1-7]);*/ ////////////////////////////////////////////////////////////////////////////////////// /* Produce a look-ahead on the ripple carry */ + // TODO: Byte and bit this - @joematt provenance maskt |= (((s_coarse & e_coarse & 0x01u) || (s_coarse & 0x02u)) && (e_coarse & 0x02u) ? 0x0100 : 0x0000); maskt |= (((maskt & 0x0100) || (s_coarse & 0x04u)) && (e_coarse & 0x04u) ? 0x0200 : 0x0000); maskt |= (((maskt & 0x0200) || (s_coarse & 0x08u)) && (e_coarse & 0x08u) ? 
0x0400 : 0x0000); @@ -3087,6 +3098,7 @@ Masku[14] := MX2 (masku[14], maskt[14], maskt[0], mir_byte);*/ mir_bit = true/*big_pix*/ && !phrase_mode; mir_byte = true/*big_pix*/ && phrase_mode; masku = maskt; + // TODO: Byte and bit this - @joematt provenance if (mir_bit) { From 465d588dea5754309477cb73d11630d2648d0394 Mon Sep 17 00:00:00 2001 From: Joseph Mattello Date: Thu, 14 Oct 2021 02:19:23 -0400 Subject: [PATCH 31/34] gpu.c gpucontrol as union GPU_RUNNING running macro was pretty slow on ARM for some reason. Bitswise structs are faster in my testing Signed-off-by: Joseph Mattello --- src/gpu.c | 72 +++++++++++++++++++++++++++---------------------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/src/gpu.c b/src/gpu.c index 66959e52..05ec6fac 100644 --- a/src/gpu.c +++ b/src/gpu.c @@ -177,7 +177,8 @@ static uint32_t gpu_flags; static uint32_t gpu_matrix_control; static uint32_t gpu_pointer_to_matrix; static uint32_t gpu_data_organization; -static uint32_t gpu_control; +static GPUControl gpu_control; + static uint32_t gpu_div_control; // There is a distinct advantage to having these separated out--there's no need to clear // a bit before writing a result. I.e., if the result of an operation leaves a zero in @@ -192,7 +193,7 @@ static uint32_t gpu_instruction; static uint32_t gpu_opcode_first_parameter; static uint32_t gpu_opcode_second_parameter; -#define GPU_RUNNING (gpu_control & 0x01) +#define GPU_RUNNING (gpu_control.bits.b0) #define RM gpu_reg[gpu_opcode_first_parameter] #define RN gpu_reg[gpu_opcode_second_parameter] @@ -342,34 +343,33 @@ uint32_t GPUReadLong(uint32_t offset, uint32_t who/*=UNKNOWN*/) { offset &= 0x1F; switch (offset) - { - case 0x00: - gpu_flag_c = (gpu_flag_c ? 1 : 0); - gpu_flag_z = (gpu_flag_z ? 1 : 0); - gpu_flag_n = (gpu_flag_n ? 
1 : 0); - - gpu_flags = (gpu_flags & 0xFFFFFFF8) | (gpu_flag_n << 2) | (gpu_flag_c << 1) | gpu_flag_z; - - return gpu_flags & 0xFFFFC1FF; - case 0x04: - return gpu_matrix_control; - case 0x08: - return gpu_pointer_to_matrix; - case 0x0C: - return gpu_data_organization; - case 0x10: - return gpu_pc; - case 0x14: - return gpu_control; - case 0x18: - return gpu_hidata; - case 0x1C: - return gpu_remain; - default: // unaligned long read - break; - } - - return 0; + { + case 0x00: + gpu_flag_c = (gpu_flag_c ? 1 : 0); + gpu_flag_z = (gpu_flag_z ? 1 : 0); + gpu_flag_n = (gpu_flag_n ? 1 : 0); + + gpu_flags = (gpu_flags & 0xFFFFFFF8) | (gpu_flag_n << 2) | (gpu_flag_c << 1) | gpu_flag_z; + + return gpu_flags & 0xFFFFC1FF; + case 0x04: + return gpu_matrix_control; + case 0x08: + return gpu_pointer_to_matrix; + case 0x0C: + return gpu_data_organization; + case 0x10: + return gpu_pc; + case 0x14: + return gpu_control.WORD; + case 0x18: + return gpu_hidata; + case 0x1C: + return gpu_remain; + default: // unaligned long read + break; + } + return 0; } return (JaguarReadWord(offset, who) << 16) | JaguarReadWord(offset + 2, who); @@ -473,7 +473,7 @@ void GPUWriteLong(uint32_t offset, uint32_t data, uint32_t who/*=UNKNOWN*/) gpu_flag_c = (gpu_flags & CARRY_FLAG) >> 1; gpu_flag_n = (gpu_flags & NEGA_FLAG) >> 2; GPUUpdateRegisterBanks(); - gpu_control &= ~((gpu_flags & CINT04FLAGS) >> 3); // Interrupt latch clear bits + gpu_control.WORD &= ~((gpu_flags & CINT04FLAGS) >> 3); // Interrupt latch clear bits //Writing here is only an interrupt enable--this approach is just plain wrong! // GPUHandleIRQs(); //This, however, is A-OK! 
;-) @@ -523,7 +523,7 @@ void GPUWriteLong(uint32_t offset, uint32_t data, uint32_t who/*=UNKNOWN*/) data &= ~0x04; } - gpu_control = (gpu_control & 0xF7C0) | (data & (~0xF7C0)); + gpu_control.WORD = (gpu_control.WORD & 0xF7C0) | (data & (~0xF7C0)); // if gpu wasn't running but is now running, execute a few cycles #ifdef GPU_SINGLE_STEPPING @@ -579,7 +579,7 @@ void GPUHandleIRQs(void) return; // Get the interrupt latch & enable bits - bits = (gpu_control >> 6) & 0x1F; + bits = gpu_control.gpuIRQ.irqMask; //(gpu_control >> 6) & 0x1F; mask = (gpu_flags >> 4) & 0x1F; // Bail out if latched interrupts aren't enabled @@ -618,11 +618,11 @@ void GPUHandleIRQs(void) void GPUSetIRQLine(int irqline, int state) { uint32_t mask = 0x0040 << irqline; - gpu_control &= ~mask; // Clear the interrupt latch + gpu_control.WORD &= ~mask; // Clear the interrupt latch if (state) { - gpu_control |= mask; // Assert the interrupt latch + gpu_control.WORD |= mask; // Assert the interrupt latch GPUHandleIRQs(); // And handle the interrupt... } } @@ -644,7 +644,7 @@ void GPUReset(void) gpu_pointer_to_matrix = 0x00000000; gpu_data_organization = 0xFFFFFFFF; gpu_pc = 0x00F03000; - gpu_control = 0x00002800; // Correctly sets this as TOM Rev. 2 + gpu_control.WORD = 0x00002800; // Correctly sets this as TOM Rev. 
2 gpu_hidata = 0x00000000; gpu_remain = 0x00000000; // These two registers are RO/WO gpu_div_control = 0x00000000; From 7bd6fc93227e0a0a9e9e1ce0d8c21ccfeb87605e Mon Sep 17 00:00:00 2001 From: Joseph Mattello Date: Thu, 14 Oct 2021 02:19:51 -0400 Subject: [PATCH 32/34] gpu.c inline some things Signed-off-by: Joseph Mattello --- src/gpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gpu.c b/src/gpu.c index 05ec6fac..f0dc274d 100644 --- a/src/gpu.c +++ b/src/gpu.c @@ -296,7 +296,7 @@ uint8_t GPUReadByte(uint32_t offset, uint32_t who/*=UNKNOWN*/) } // GPU word access (read) -uint16_t GPUReadWord(uint32_t offset, uint32_t who/*=UNKNOWN*/) +INLINE uint16_t GPUReadWord(uint32_t offset, uint32_t who/*=UNKNOWN*/) { if ((offset >= GPU_WORK_RAM_BASE) && (offset < GPU_WORK_RAM_BASE+0x1000)) { @@ -325,7 +325,7 @@ uint16_t GPUReadWord(uint32_t offset, uint32_t who/*=UNKNOWN*/) } // GPU dword access (read) -uint32_t GPUReadLong(uint32_t offset, uint32_t who/*=UNKNOWN*/) +INLINE uint32_t GPUReadLong(uint32_t offset, uint32_t who/*=UNKNOWN*/) { if (offset >= 0xF02000 && offset <= 0xF020FF) { From 84f36d8bb4cb82d109c293845399216652637da8 Mon Sep 17 00:00:00 2001 From: Joseph Mattello Date: Thu, 14 Oct 2021 02:20:45 -0400 Subject: [PATCH 33/34] gpu.c use opcode in... 
Signed-off-by: Joseph Mattello --- src/gpu.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/gpu.c b/src/gpu.c index f0dc274d..930d1771 100644 --- a/src/gpu.c +++ b/src/gpu.c @@ -300,10 +300,11 @@ INLINE uint16_t GPUReadWord(uint32_t offset, uint32_t who/*=UNKNOWN*/) { if ((offset >= GPU_WORK_RAM_BASE) && (offset < GPU_WORK_RAM_BASE+0x1000)) { - uint16_t data; - offset &= 0xFFF; - data = ((uint16_t)gpu_ram_8[offset] << 8) | (uint16_t)gpu_ram_8[offset+1]; - return data; + offset &= 0xFFF; + OpCode data; + data.Bytes.UBYTE = (uint16_t)gpu_ram_8[offset]; + data.Bytes.LBYTE = (uint16_t)gpu_ram_8[offset+1]; + return data.WORD; } else if ((offset >= GPU_CONTROL_RAM_BASE) && (offset < GPU_CONTROL_RAM_BASE+0x20)) { From 84a08084c289f0b287ceab4e686ab67a74477028 Mon Sep 17 00:00:00 2001 From: Joseph Mattello Date: Thu, 14 Oct 2021 19:13:50 -0400 Subject: [PATCH 34/34] gpu.c gpu_opcode_div use structs Signed-off-by: Joseph Mattello --- src/gpu.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/src/gpu.c b/src/gpu.c index 930d1771..01e61f84 100644 --- a/src/gpu.c +++ b/src/gpu.c @@ -1573,30 +1573,35 @@ INLINE static void gpu_opcode_abs(void) INLINE static void gpu_opcode_div(void) // RN / RM { + unsigned i; // Real algorithm, courtesy of SCPCD: NYAN! - uint32_t q = RN; - uint32_t r = 0; + Bits32 q; + q.WORD = RN; + + Bits32 r; + r.WORD = 0; // If 16.16 division, stuff top 16 bits of RN into remainder and put the // bottom 16 of RN in top 16 of quotient - if (gpu_div_control & 0x01) - q <<= 16, r = RN >> 16; + if (gpu_div_control & 0x01) { + r.WORD = q.words.UWORD; + q.words.UWORD = q.words.LWORD; + q.words.LWORD = 0; + } for(i=0; i<32; i++) { - uint32_t sign = r & 0x80000000; - r = (r << 1) | ((q >> 31) & 0x01); - r += (sign ? RM : -RM); - q = (q << 1) | (((~r) >> 31) & 0x01); + uint32_t sign = r.bits.b31; + r.WORD = (r.WORD << 1) | q.bits.b31; + r.WORD += (sign ? 
RM : -RM); + q.WORD = (q.WORD << 1) | !r.bits.b31; // (((~r) >> 31) & 0x01); } - RN = q; - gpu_remain = r; - + RN = q.WORD; + gpu_remain = r.WORD; } - INLINE static void gpu_opcode_imultn(void) { uint32_t res = (int32_t)((int16_t)RN * (int16_t)RM);