From 04cfb77caba22b326fcad3533741f90c77083b8c Mon Sep 17 00:00:00 2001 From: refractionpcsx2 Date: Tue, 3 Oct 2023 02:34:27 +0100 Subject: [PATCH] GS: Further transfer optimizations --- pcsx2/GS/GSRegs.h | 90 +++++++++++++++---------------- pcsx2/GS/GSState.cpp | 126 ++++++++++++++----------------------------- 2 files changed, 83 insertions(+), 133 deletions(-) diff --git a/pcsx2/GS/GSRegs.h b/pcsx2/GS/GSRegs.h index cf09bdd4d65f9b..d6cd4a7fe4dacc 100644 --- a/pcsx2/GS/GSRegs.h +++ b/pcsx2/GS/GSRegs.h @@ -85,15 +85,15 @@ enum GIF_REG enum GIF_REG_COMPLEX { - GIF_REG_STQRGBAXYZF2 = 0x00, - GIF_REG_STQRGBAXYZ2 = 0x01, - GIF_REG_UVRGBAXYZ2 = 0x02, + GIF_REG_RGBAXYZF2 = 0x00, + GIF_REG_STQXYZF2 = 0x01, + GIF_REG_UVXYZF2 = 0x02, GIF_REG_RGBAXYZ2 = 0x03, - GIF_REG_RGBAXYZF2 = 0x04, - GIF_REG_STQXYZ2 = 0x05, - GIF_REG_STQXYZF2 = 0x06, - GIF_REG_UVXYZ2 = 0x07, - GIF_REG_UVXYZF2 = 0x08, + GIF_REG_STQXYZ2 = 0x04, + GIF_REG_UVXYZ2 = 0x05, + GIF_REG_STQRGBAXYZF2 = 0x06, + GIF_REG_STQRGBAXYZ2 = 0x07, + GIF_REG_UVRGBAXYZ2 = 0x08, GIF_REG_RGBAUVXYZF2 = 0x09, GIF_REG_UVRGBAXYZF2 = 0x0A }; @@ -1132,19 +1132,19 @@ struct alignas(32) GIFPath enum { - TYPE_UNKNOWN, - TYPE_ADONLY, - TYPE_STQRGBAXYZF2, - TYPE_STQRGBAXYZ2, - TYPE_UVRGBAXYZ2, TYPE_RGBAXYZF2, - TYPE_UVXYZF2, TYPE_STQXYZF2, + TYPE_UVXYZF2, TYPE_RGBAXYZ2, - TYPE_UVXYZ2, TYPE_STQXYZ2, + TYPE_UVXYZ2, + TYPE_STQRGBAXYZF2, + TYPE_STQRGBAXYZ2, + TYPE_UVRGBAXYZ2, TYPE_RGBAUVXYZF2, TYPE_UVRGBAXYZF2, + TYPE_ADONLY, + TYPE_UNKNOWN, }; __forceinline void SetTag(const void* mem) @@ -1153,20 +1153,18 @@ struct alignas(32) GIFPath // the compiler has a hard time not reloading every time a field of src is accessed - u32 a = src->U32[0]; - u32 b = src->U32[1]; + const u64 a = src->U64[0]; - tag.U32[0] = a; - tag.U32[1] = b; + tag.U64[0] = a; nloop = a & 0x7fff; if (nloop == 0) return; - GSVector4i v = GSVector4i::loadl(&src->REGS); // REGS not stored to tag.REGS, only into this->regs, restored before saving the state though + const 
GSVector4i v = GSVector4i::loadl(&src->REGS); // REGS not stored to tag.REGS, only into this->regs, restored before saving the state though - nreg = (b & 0xf0000000) ? (b >> 28) : 16; // src->NREG + nreg = (a & 0xf000000000000000ULL) ? (a >> 60) : 16; // src->NREG regs = v.upl8(v >> 4) & GSVector4i::x0f(nreg); reg = 0; @@ -1178,49 +1176,47 @@ struct alignas(32) GIFPath { type = TYPE_ADONLY; } - else + else if(nloop > 4) { switch (nreg) { case 1: break; case 2: - if (regs.eq8(GSVector4i::cxpr(0x00000401)).mask() == (1 << 2) - 1) - type = TYPE_RGBAXYZF2; - else if (regs.eq8(GSVector4i::cxpr(0x0000040)).mask() == (1 << 2) - 1) - type = TYPE_STQXYZF2; - else if (regs.eq8(GSVector4i::cxpr(0x00000402)).mask() == (1 << 2) - 13) - type = TYPE_UVXYZF2; - else if (regs.eq8(GSVector4i::cxpr(0x00000501)).mask() == (1 << 2) - 1) - type = TYPE_RGBAXYZ2; - else if (regs.eq8(GSVector4i::cxpr(0x00000502)).mask() == (1 << 2) - 1) - type = TYPE_STQXYZ2; - else if (regs.eq8(GSVector4i::cxpr(0x00000503)).mask() == (1 << 2) - 1) - type = TYPE_UVXYZ2; + { + const u32 val = regs.U32[0]; + if ((val >> 8) == 0x04) + { + type = (val >= 0x401 && val <= 0x403) ? (TYPE_RGBAXYZF2 + (val & 0xf) - 1) : type; + } + else if ((val >> 8) == 0x05) + { + type = (val >= 0x501 && val <= 0x503) ? 
(TYPE_RGBAXYZ2 + (val & 0xf) - 1) : type; + } + } break; case 3: // many games, TODO: formats mixed with NOPs (xeno2: 040f010f02, 04010f020f, mgs3: 04010f0f02, 0401020f0f, 04010f020f) - if (regs.eq8(GSVector4i::cxpr(0x00040102)).mask() == (1 << 3) - 1) + if (regs.U32[0] == 0x00040102) type = TYPE_STQRGBAXYZF2; // GoW (has other crazy formats, like ...030503050103) - else if (regs.eq8(GSVector4i::cxpr(0x00050102)).mask() == (1 << 3) - 1) + else if (regs.U32[0] == 0x00050102) type = TYPE_STQRGBAXYZ2; - // TODO: common types with UV instead - else if (regs.eq8(GSVector4i::cxpr(0x00050103)).mask() == (1 << 3) - 1) + else if (regs.U32[0] == 0x00050103) type = TYPE_UVRGBAXYZ2; - else if (regs.eq8(GSVector4i::cxpr(0x00040103)).mask() == (1 << 3) - 1) + else if (regs.U32[0] == 0x00040103) type = TYPE_UVRGBAXYZF2; - else if (regs.eq8(GSVector4i::cxpr(0x00040301)).mask() == (1 << 3) - 1) + else if (regs.U32[0] == 0x00040301) type = TYPE_RGBAUVXYZF2; break; case 4: - if (regs.eq8(GSVector4i(0x04030403)).mask() == (1 << 4) - 1) + if (regs.U32[0] == 0x04030403) { type = TYPE_UVXYZF2; nreg = 2; nloop *= 2; } - else if (regs.eq8(GSVector4i::cxpr(0x05030503)).mask() == (1 << 4) - 1) + else if (regs.U32[0] == 0x05030503) { type = TYPE_UVXYZ2; nreg = 2; @@ -1230,19 +1226,19 @@ struct alignas(32) GIFPath case 5: break; case 6: - if (regs.U32[0] == 0x03040103 && regs.U32[1] == 0x00000401) + if (regs.U64[0] == 0x0000040103040103ULL) { type = TYPE_UVRGBAXYZF2; nreg = 3; nloop *= 2; } - else if (regs.U32[0] == 0x01040301 && regs.U32[1] == 0x00000403) + else if (regs.U64[0] == 0x0000040301040301ULL) { type = TYPE_RGBAUVXYZF2; nreg = 3; nloop *= 2; } - else if (regs.U32[0] == 0x03050103 && regs.U32[1] == 0x00000501) + else if (regs.U64[0] == 0x0000050103050103ULL) { type = TYPE_UVRGBAXYZ2; nreg = 3; @@ -1255,7 +1251,7 @@ struct alignas(32) GIFPath break; case 9: // ffx - if (regs.U32[0] == 0x02040102 && regs.U32[1] == 0x01020401 && regs.U32[2] == 0x00000004) + if (regs.U64[0] == 
0x0102040102040102ULL && regs.U32[2] == 0x00000004) { type = TYPE_STQRGBAXYZF2; nreg = 3; @@ -1268,7 +1264,7 @@ struct alignas(32) GIFPath break; case 12: // dq8 (not many, mostly 040102) - if (regs.U32[0] == 0x02040102 && regs.U32[1] == 0x01020401 && regs.U32[2] == 0x04010204) + if (regs.U64[0] == 0x0102040102040102ULL && regs.U32[2] == 0x04010204) { type = TYPE_STQRGBAXYZF2; nreg = 3; diff --git a/pcsx2/GS/GSState.cpp b/pcsx2/GS/GSState.cpp index dde14aeffa0be6..732a760cfe7bd1 100644 --- a/pcsx2/GS/GSState.cpp +++ b/pcsx2/GS/GSState.cpp @@ -661,12 +661,14 @@ void GSState::GIFPackedRegHandlerSTQRGBAXYZF2(const GIFPackedReg* RESTRICT r, u3 } const GIFPackedReg* RESTRICT r_end = r + size; + const GSVector4i ff_mask = GSVector4i::cxpr(0x000000ff); + const GSVector4i ffffff_mask = GSVector4i::x00ffffff().upl32(ff_mask); while (r < r_end) { const GSVector4i st = GSVector4i::loadl(&r[0].U64[0]); GSVector4i q = GSVector4i::loadl(&r[0].U64[1]); - const GSVector4i rgba = (GSVector4i::load(&r[1]) & GSVector4i::x000000ff()).ps32().pu16(); + const GSVector4i rgba = (GSVector4i::load(&r[1]) & ff_mask).ps32().pu16(); q = q.blend8(GSVector4i::cast(GSVector4::m_one), q == GSVector4i::zero()); // see GIFPackedRegHandlerSTQ @@ -675,7 +677,7 @@ void GSState::GIFPackedRegHandlerSTQRGBAXYZF2(const GIFPackedReg* RESTRICT r, u3 GSVector4i xy = GSVector4i::loadl(&r[2].U64[0]); GSVector4i zf = GSVector4i::loadl(&r[2].U64[1]); xy = xy.upl16(xy.srl<4>()).upl32(GSVector4i::load((int)m_v.UV)); - zf = zf.srl32(4) & GSVector4i::x00ffffff().upl32(GSVector4i::x000000ff()); + zf = zf.srl32(4) & ffffff_mask; m_v.m[1] = xy.upl32(zf); // TODO: only store the last one @@ -706,12 +708,13 @@ void GSState::GIFPackedRegHandlerSTQRGBAXYZ2(const GIFPackedReg* RESTRICT r, u32 } const GIFPackedReg* RESTRICT r_end = r + size; + const GSVector4i ff_mask = GSVector4i::cxpr(0x000000ff); while (r < r_end) { const GSVector4i st = GSVector4i::loadl(&r[0].U64[0]); GSVector4i q = GSVector4i::loadl(&r[0].U64[1]); - 
const GSVector4i rgba = (GSVector4i::load(&r[1]) & GSVector4i::x000000ff()).ps32().pu16(); + const GSVector4i rgba = (GSVector4i::load(&r[1]) & ff_mask).ps32().pu16(); q = q.blend8(GSVector4i::cast(GSVector4::m_one), q == GSVector4i::zero()); // see GIFPackedRegHandlerSTQ @@ -753,10 +756,12 @@ void GSState::GIFPackedRegHandlerUVRGBAXYZ2(const GIFPackedReg* RESTRICT r, u32 const GSVector4i st = GSVector4i::loadl(&m_v.ST); const GSVector4i q = GSVector4i::loadl(&m_v.RGBAQ.U32[1]).blend8(GSVector4i::cast(GSVector4::m_one), q == GSVector4i::zero()); + const GSVector4i ff_mask = GSVector4i::cxpr(0x000000ff); + const GSVector4i mask_3fff = GSVector4i::cxpr(0x00003fff); while (r < r_end) { - const GSVector4i rgba = (GSVector4i::load(&r[1]) & GSVector4i::x000000ff()).ps32().pu16(); + const GSVector4i rgba = (GSVector4i::load(&r[1]) & ff_mask).ps32().pu16(); m_v.m[0] = st.upl64(rgba.upl32(q)); @@ -764,7 +769,7 @@ void GSState::GIFPackedRegHandlerUVRGBAXYZ2(const GIFPackedReg* RESTRICT r, u32 const GSVector4i z = GSVector4i::loadl(&r[2].U64[1]); const GSVector4i xyz = xy.upl16(xy.srl<4>()).upl32(z); - const GSVector4i uv = GSVector4i::loadl(&r[0]) & GSVector4i::x00003fff(); + const GSVector4i uv = GSVector4i::loadl(&r[0]) & mask_3fff; m_v.m[1] = xyz.upl64(uv.ps32(uv)); @@ -796,10 +801,11 @@ void GSState::GIFPackedRegHandlerRGBAXYZ2(const GIFPackedReg* RESTRICT r, u32 si const GSVector4i st = GSVector4i::loadl(&m_v.ST); const GSVector4i q = GSVector4i(m_v.RGBAQ.U32[1]).blend8(GSVector4i::cast(GSVector4::m_one), q == GSVector4i::zero()); // see GIFPackedRegHandlerSTQ + const GSVector4i ff_mask = GSVector4i::cxpr(0x000000ff); while (r < r_end) { - const GSVector4i rgba = (GSVector4i::load(&r[0]) & GSVector4i::x000000ff()).ps32().pu16(); + const GSVector4i rgba = (GSVector4i::load(&r[0]) & ff_mask).ps32().pu16(); m_v.m[0] = st.upl64(rgba.upl32(q)); @@ -837,17 +843,19 @@ void GSState::GIFPackedRegHandlerRGBAXYZF2(const GIFPackedReg* RESTRICT r, u32 s const GSVector4i st = 
GSVector4i::loadl(&m_v.ST); const GSVector4i q = GSVector4i(m_v.RGBAQ.U32[1]).blend8(GSVector4i::cast(GSVector4::m_one), q == GSVector4i::zero()); // see GIFPackedRegHandlerSTQ + const GSVector4i ff_mask = GSVector4i::cxpr(0x000000ff); + const GSVector4i ffffff_mask = GSVector4i::x00ffffff().upl32(ff_mask); while (r < r_end) { - const GSVector4i rgba = (GSVector4i::load(&r[0]) & GSVector4i::x000000ff()).ps32().pu16(); + const GSVector4i rgba = (GSVector4i::load(&r[0]) & ff_mask).ps32().pu16(); m_v.m[0] = st.upl64(rgba.upl32(q)); GSVector4i xy = GSVector4i::loadl(&r[1].U64[0]); GSVector4i zf = GSVector4i::loadl(&r[1].U64[1]); xy = xy.upl16(xy.srl<4>()).upl32(GSVector4i::load((int)m_v.UV)); - zf = zf.srl32(4) & GSVector4i::x00ffffff().upl32(GSVector4i::x000000ff()); + zf = zf.srl32(4) & ffffff_mask; m_v.m[1] = xy.upl32(zf); @@ -879,20 +887,23 @@ void GSState::GIFPackedRegHandlerUVRGBAXYZF2(const GIFPackedReg* RESTRICT r, u32 const GSVector4i st = GSVector4i::loadl(&m_v.ST); const GSVector4i q = GSVector4i(m_v.RGBAQ.U32[1]).blend8(GSVector4i::cast(GSVector4::m_one), q == GSVector4i::zero()); // see GIFPackedRegHandlerSTQ + const GSVector4i ff_mask = GSVector4i::cxpr(0x000000ff); + const GSVector4i mask_3fff = GSVector4i::cxpr(0x00003fff); + const GSVector4i ffffff_mask = GSVector4i::x00ffffff().upl32(ff_mask); while (r < r_end) { - const GSVector4i rgba = (GSVector4i::load(&r[1]) & GSVector4i::x000000ff()).ps32().pu16(); + const GSVector4i rgba = (GSVector4i::load(&r[1]) & ff_mask).ps32().pu16(); m_v.m[0] = st.upl64(rgba.upl32(q)); GSVector4i xy = GSVector4i::loadl(&r[2].U64[0]); GSVector4i zf = GSVector4i::loadl(&r[2].U64[1]); - const GSVector4i uv = GSVector4i::loadl(&r[0]) & GSVector4i::x00003fff(); + const GSVector4i uv = GSVector4i::loadl(&r[0]) & mask_3fff; xy = xy.upl16(xy.srl<4>()).upl32(uv.ps32(uv)); - zf = zf.srl32(4) & GSVector4i::x00ffffff().upl32(GSVector4i::x000000ff()); + zf = zf.srl32(4) & ffffff_mask; m_v.m[1] = xy.upl32(zf); @@ -924,20 +935,23 @@ 
void GSState::GIFPackedRegHandlerRGBAUVXYZF2(const GIFPackedReg* RESTRICT r, u32 const GSVector4i st = GSVector4i::loadl(&m_v.ST); const GSVector4i q = GSVector4i(m_v.RGBAQ.U32[1]).blend8(GSVector4i::cast(GSVector4::m_one), q == GSVector4i::zero()); // see GIFPackedRegHandlerSTQ + const GSVector4i ff_mask = GSVector4i::cxpr(0x000000ff); + const GSVector4i mask_3fff = GSVector4i::cxpr(0x00003fff); + const GSVector4i ffffff_mask = GSVector4i::x00ffffff().upl32(ff_mask); while (r < r_end) { - const GSVector4i rgba = (GSVector4i::load(&r[0]) & GSVector4i::x000000ff()).ps32().pu16(); + const GSVector4i rgba = (GSVector4i::load(&r[0]) & ff_mask).ps32().pu16(); m_v.m[0] = st.upl64(rgba.upl32(q)); // TODO: only store the last one GSVector4i xy = GSVector4i::loadl(&r[2].U64[0]); GSVector4i zf = GSVector4i::loadl(&r[2].U64[1]); - const GSVector4i uv = GSVector4i::loadl(&r[1]) & GSVector4i::x00003fff(); + const GSVector4i uv = GSVector4i::loadl(&r[1]) & mask_3fff; xy = xy.upl16(xy.srl<4>()).upl32(uv.ps32(uv)); - zf = zf.srl32(4) & GSVector4i::x00ffffff().upl32(GSVector4i::x000000ff()); + zf = zf.srl32(4) & ffffff_mask; m_v.m[1] = xy.upl32(zf); @@ -1006,6 +1020,8 @@ void GSState::GIFPackedRegHandlerSTQXYZF2(const GIFPackedReg* RESTRICT r, u32 si } const GIFPackedReg* RESTRICT r_end = r + size; + const GSVector4i ff_mask = GSVector4i::cxpr(0x000000ff); + const GSVector4i ffffff_mask = GSVector4i::x00ffffff().upl32(ff_mask); while (r < r_end) { @@ -1015,7 +1031,7 @@ void GSState::GIFPackedRegHandlerSTQXYZF2(const GIFPackedReg* RESTRICT r, u32 si GSVector4i xy = GSVector4i::loadl(&r[1].U64[0]); GSVector4i zf = GSVector4i::loadl(&r[1].U64[1]); xy = xy.upl16(xy.srl<4>()).upl32(GSVector4i::load((int)m_v.UV)); - zf = zf.srl32(4) & GSVector4i::x00ffffff().upl32(GSVector4i::x000000ff()); + zf = zf.srl32(4) & ffffff_mask; m_v.m[1] = xy.upl32(zf); // TODO: only store the last one @@ -1046,13 +1062,14 @@ void GSState::GIFPackedRegHandlerUVXYZ2(const GIFPackedReg* RESTRICT r, u32 size } 
const GIFPackedReg* RESTRICT r_end = r + size; + const GSVector4i mask_3fff = GSVector4i::cxpr(0x00003fff); while (r < r_end) { const GSVector4i xy = GSVector4i::loadl(&r[1].U64[0]); const GSVector4i z = GSVector4i::loadl(&r[1].U64[1]); const GSVector4i xyz = xy.upl16(xy.srl<4>()).upl32(z); - const GSVector4i uv = GSVector4i::loadl(&r[0]) & GSVector4i::x00003fff(); + const GSVector4i uv = GSVector4i::loadl(&r[0]) & mask_3fff; m_v.m[1] = xyz.upl64(uv.ps32(uv)); // TODO: only store the last one @@ -1084,6 +1101,8 @@ void GSState::GIFPackedRegHandlerUVXYZF2(const GIFPackedReg* RESTRICT r, u32 siz } const GIFPackedReg* RESTRICT r_end = r + size; + const GSVector4i mask_3fff = GSVector4i::cxpr(0x00003fff); + const GSVector4i ffffff_mask = GSVector4i::x00ffffff().upl32(GSVector4i::x000000ff()); while (r < r_end) { @@ -1091,7 +1110,7 @@ void GSState::GIFPackedRegHandlerUVXYZF2(const GIFPackedReg* RESTRICT r, u32 siz GSVector4i zf = GSVector4i::loadl(&r[1].U64[1]); - const GSVector4i uv = GSVector4i::loadl(&r[0]) & GSVector4i::x00003fff(); + const GSVector4i uv = GSVector4i::loadl(&r[0]) & mask_3fff; xy = xy.upl16(xy.srl<4>()).upl32(uv.ps32(uv)); - zf = zf.srl32(4) & GSVector4i::x00ffffff().upl32(GSVector4i::x000000ff()); + zf = zf.srl32(4) & ffffff_mask; m_v.m[1] = xy.upl32(zf); // TODO: only store the last one @@ -2760,77 +2779,12 @@ void GSState::Transfer(const u8* mem, u32 size) mem += sizeof(GIFPackedReg); } while (--total > 0); - - break; - case GIFPath::TYPE_STQRGBAXYZF2: // majority of the vertices are formatted like this - (this->*m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZF2])((GIFPackedReg*)mem, total); - - mem += total * sizeof(GIFPackedReg); - - break; - case GIFPath::TYPE_STQRGBAXYZ2: - (this->*m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZ2])((GIFPackedReg*)mem, total); - - mem += total * sizeof(GIFPackedReg); - - break; - case GIFPath::TYPE_UVRGBAXYZ2: - (this->*m_fpGIFPackedRegHandlersC[GIF_REG_UVRGBAXYZ2])((GIFPackedReg*)mem, total); - - mem += total * sizeof(GIFPackedReg); - - break; - case GIFPath::TYPE_RGBAXYZF2: - 
(this->*m_fpGIFPackedRegHandlersC[GIF_REG_RGBAXYZF2])((GIFPackedReg*)mem, total); - - mem += total * sizeof(GIFPackedReg); - - break; - case GIFPath::TYPE_STQXYZF2: - (this->*m_fpGIFPackedRegHandlersC[GIF_REG_STQXYZF2])((GIFPackedReg*)mem, total); - - mem += total * sizeof(GIFPackedReg); - - break; - case GIFPath::TYPE_RGBAXYZ2: - (this->*m_fpGIFPackedRegHandlersC[GIF_REG_RGBAXYZ2])((GIFPackedReg*)mem, total); - - mem += total * sizeof(GIFPackedReg); - - break; - case GIFPath::TYPE_STQXYZ2: - (this->*m_fpGIFPackedRegHandlersC[GIF_REG_STQXYZ2])((GIFPackedReg*)mem, total); - - mem += total * sizeof(GIFPackedReg); - - break; - case GIFPath::TYPE_UVRGBAXYZF2: - (this->*m_fpGIFPackedRegHandlersC[GIF_REG_UVRGBAXYZF2])((GIFPackedReg*)mem, total); - - mem += total * sizeof(GIFPackedReg); - - break; - case GIFPath::TYPE_RGBAUVXYZF2: - (this->*m_fpGIFPackedRegHandlersC[GIF_REG_RGBAUVXYZF2])((GIFPackedReg*)mem, total); - - mem += total * sizeof(GIFPackedReg); - - break; - case GIFPath::TYPE_UVXYZ2: - (this->*m_fpGIFPackedRegHandlersC[GIF_REG_UVXYZ2])((GIFPackedReg*)mem, total); - - mem += total * sizeof(GIFPackedReg); - - break; - case GIFPath::TYPE_UVXYZF2: - (this->*m_fpGIFPackedRegHandlersC[GIF_REG_UVXYZF2])((GIFPackedReg*)mem, total); + break; + default: + (this->*m_fpGIFPackedRegHandlersC[path.type])((GIFPackedReg*)mem, total); mem += total * sizeof(GIFPackedReg); - - break; - - default: - __assume(0); + break; } path.nloop = 0;