Skip to content

Commit

Permalink
GS: Use templates for shift immediates
Browse files Browse the repository at this point in the history
Also removes the __m128 overloads. They are too easy to mistake for a
per-element variable shift (which doesn't exist in SSE4); in reality they take
a single shift amount from the lowest 32 bits of the vector operand.
  • Loading branch information
stenzek committed Dec 29, 2023
1 parent 6d2f6b4 commit c9e61d2
Show file tree
Hide file tree
Showing 9 changed files with 523 additions and 608 deletions.
8 changes: 4 additions & 4 deletions pcsx2/GS/GSClut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -837,8 +837,8 @@ void GSClut::Expand16(const u16* RESTRICT src, u32* RESTRICT dst, int w, const G
c = s[i];
cl = c.upl16(c);
ch = c.uph16(c);
d[i * 2 + 0] = ((cl & rm) << 3) | ((cl & gm) << 6) | ((cl & bm) << 9) | TA0.blend8(TA1, cl.sra16(15));
d[i * 2 + 1] = ((ch & rm) << 3) | ((ch & gm) << 6) | ((ch & bm) << 9) | TA0.blend8(TA1, ch.sra16(15));
d[i * 2 + 0] = ((cl & rm) << 3) | ((cl & gm) << 6) | ((cl & bm) << 9) | TA0.blend8(TA1, cl.sra16<15>());
d[i * 2 + 1] = ((ch & rm) << 3) | ((ch & gm) << 6) | ((ch & bm) << 9) | TA0.blend8(TA1, ch.sra16<15>());
}
}
else
Expand All @@ -848,8 +848,8 @@ void GSClut::Expand16(const u16* RESTRICT src, u32* RESTRICT dst, int w, const G
c = s[i];
cl = c.upl16(c);
ch = c.uph16(c);
d[i * 2 + 0] = ((cl & rm) << 3) | ((cl & gm) << 6) | ((cl & bm) << 9) | TA0.blend8(TA1, cl.sra16(15)).andnot(cl == GSVector4i::zero());
d[i * 2 + 1] = ((ch & rm) << 3) | ((ch & gm) << 6) | ((ch & bm) << 9) | TA0.blend8(TA1, ch.sra16(15)).andnot(ch == GSVector4i::zero());
d[i * 2 + 0] = ((cl & rm) << 3) | ((cl & gm) << 6) | ((cl & bm) << 9) | TA0.blend8(TA1, cl.sra16<15>()).andnot(cl == GSVector4i::zero());
d[i * 2 + 1] = ((ch & rm) << 3) | ((ch & gm) << 6) | ((ch & bm) << 9) | TA0.blend8(TA1, ch.sra16<15>()).andnot(ch == GSVector4i::zero());
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion pcsx2/GS/GSDrawingContext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ void GSDrawingContext::UpdateScissor()
scissor.in = rscissor + GSVector4i::cxpr(0, 0, 1, 1);

// Fixed-point scissor min/max, used for rejecting primitives which are entirely outside.
scissor.cull = rscissor.sll32(4);
scissor.cull = rscissor.sll32<4>();

// Offset applied to vertices for culling, zw is for native resolution culling
// We want to round subpixels down, because at least one pixel gets filled per scanline.
Expand Down
1 change: 0 additions & 1 deletion pcsx2/GS/GSRegs.h
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,6 @@ union name \
#define REG128_SET(name) \
union name \
{ \
__m128i m128; \
u64 U64[2]; \
u32 U32[4];

Expand Down
8 changes: 4 additions & 4 deletions pcsx2/GS/GSState.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -594,7 +594,7 @@ void GSState::GIFPackedRegHandlerXYZF2(const GIFPackedReg* RESTRICT r)
GSVector4i zf = xy.zwzw();

xy = xy.upl16(xy.srl<4>()).upl32(GSVector4i::load((int)m_v.UV));
zf = zf.srl32(4) & GSVector4i::x00ffffff().upl32(GSVector4i::x000000ff());
zf = zf.srl32<4>() & GSVector4i::x00ffffff().upl32(GSVector4i::x000000ff());

m_v.m[1] = xy.upl32(zf);

Expand Down Expand Up @@ -654,7 +654,7 @@ void GSState::GIFPackedRegHandlerSTQRGBAXYZF2(const GIFPackedReg* RESTRICT r, u3
GSVector4i xy = GSVector4i::loadl(&r[2].U64[0]);
GSVector4i zf = GSVector4i::loadl(&r[2].U64[1]);
xy = xy.upl16(xy.srl<4>()).upl32(GSVector4i::load((int)m_v.UV));
zf = zf.srl32(4) & GSVector4i::x00ffffff().upl32(GSVector4i::x000000ff());
zf = zf.srl32<4>() & GSVector4i::x00ffffff().upl32(GSVector4i::x000000ff());

m_v.m[1] = xy.upl32(zf); // TODO: only store the last one

Expand Down Expand Up @@ -784,7 +784,7 @@ void GSState::GIFRegHandlerXYZF2(const GIFReg* RESTRICT r)

const GSVector4i xyzf = GSVector4i::loadl(&r->XYZF);
const GSVector4i xyz = xyzf & (GSVector4i::xffffffff().upl32(GSVector4i::x00ffffff()));
const GSVector4i uvf = GSVector4i::load((int)m_v.UV).upl32(xyzf.srl32(24).srl<4>());
const GSVector4i uvf = GSVector4i::load((int)m_v.UV).upl32(xyzf.srl32<24>().srl<4>());

m_v.m[1] = xyz.upl64(uvf);

Expand Down Expand Up @@ -3363,7 +3363,7 @@ __forceinline void GSState::VertexKick(u32 skip)
// integer coordinates for culling at native resolution, and the fixed point for all others. The XY offset has to be
// applied, then we split it into the fixed/integer portions.
const GSVector4i xy_ofs = new_v1.xxxx().u16to32().sub32(m_xyof);
const GSVector4i xy = xy_ofs.blend32<12>(xy_ofs.sra32(4));
const GSVector4i xy = xy_ofs.blend32<12>(xy_ofs.sra32<4>());
m_vertex.xy[xy_tail & 3] = xy;

// Backup head for triangle fans so we can read it later, otherwise it'll get lost after the 4th vertex.
Expand Down
4 changes: 2 additions & 2 deletions pcsx2/GS/GSVector4.h
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ class alignas(16) GSVector4
{
GSVector4i v((int)u);

*this = GSVector4(v) + (m_x4f800000 & GSVector4::cast(v.sra32(31)));
*this = GSVector4(v) + (m_x4f800000 & GSVector4::cast(v.sra32<31>()));
}

__forceinline explicit GSVector4(const GSVector4i& v);
Expand Down Expand Up @@ -643,7 +643,7 @@ GSVector.h:2973:15: error: shadows template parm 'int i'
{
GSVector4i v = GSVector4i::load((int)u);

return GSVector4(v) + (m_x4f800000 & GSVector4::cast(v.sra32(31)));
return GSVector4(v) + (m_x4f800000 & GSVector4::cast(v.sra32<31>()));
}

template <bool aligned>
Expand Down
Loading

0 comments on commit c9e61d2

Please sign in to comment.