GS: Use templates for shift immediates

Also removes the __m128 overloads - they are too easy to mistake for a per-element variable shift (which doesn't exist in SSE4); instead, they take a single shift amount from the lowest 32 bits of the vector argument.
PCSX2 · Dec 29, 2023 · c9e61d2 · c9e61d2
1 parent 6d2f6b4
commit c9e61d2
Show file tree

Hide file tree

Showing 9 changed files with 523 additions and 608 deletions.
diff --git a/pcsx2/GS/GSClut.cpp b/pcsx2/GS/GSClut.cpp
@@ -837,8 +837,8 @@ void GSClut::Expand16(const u16* RESTRICT src, u32* RESTRICT dst, int w, const G
 			c = s[i];
 			cl = c.upl16(c);
 			ch = c.uph16(c);
-			d[i * 2 + 0] = ((cl & rm) << 3) | ((cl & gm) << 6) | ((cl & bm) << 9) | TA0.blend8(TA1, cl.sra16(15));
-			d[i * 2 + 1] = ((ch & rm) << 3) | ((ch & gm) << 6) | ((ch & bm) << 9) | TA0.blend8(TA1, ch.sra16(15));
+			d[i * 2 + 0] = ((cl & rm) << 3) | ((cl & gm) << 6) | ((cl & bm) << 9) | TA0.blend8(TA1, cl.sra16<15>());
+			d[i * 2 + 1] = ((ch & rm) << 3) | ((ch & gm) << 6) | ((ch & bm) << 9) | TA0.blend8(TA1, ch.sra16<15>());
 		}
 	}
 	else
@@ -848,8 +848,8 @@ void GSClut::Expand16(const u16* RESTRICT src, u32* RESTRICT dst, int w, const G
 			c = s[i];
 			cl = c.upl16(c);
 			ch = c.uph16(c);
-			d[i * 2 + 0] = ((cl & rm) << 3) | ((cl & gm) << 6) | ((cl & bm) << 9) | TA0.blend8(TA1, cl.sra16(15)).andnot(cl == GSVector4i::zero());
-			d[i * 2 + 1] = ((ch & rm) << 3) | ((ch & gm) << 6) | ((ch & bm) << 9) | TA0.blend8(TA1, ch.sra16(15)).andnot(ch == GSVector4i::zero());
+			d[i * 2 + 0] = ((cl & rm) << 3) | ((cl & gm) << 6) | ((cl & bm) << 9) | TA0.blend8(TA1, cl.sra16<15>()).andnot(cl == GSVector4i::zero());
+			d[i * 2 + 1] = ((ch & rm) << 3) | ((ch & gm) << 6) | ((ch & bm) << 9) | TA0.blend8(TA1, ch.sra16<15>()).andnot(ch == GSVector4i::zero());
 		}
 	}
 }

diff --git a/pcsx2/GS/GSDrawingContext.cpp b/pcsx2/GS/GSDrawingContext.cpp
@@ -97,7 +97,7 @@ void GSDrawingContext::UpdateScissor()
 	scissor.in = rscissor + GSVector4i::cxpr(0, 0, 1, 1);
 
 	// Fixed-point scissor min/max, used for rejecting primitives which are entirely outside.
-	scissor.cull = rscissor.sll32(4);
+	scissor.cull = rscissor.sll32<4>();
 
 	// Offset applied to vertices for culling, zw is for native resolution culling
 	// We want to round subpixels down, because at least one pixel gets filled per scanline.

diff --git a/pcsx2/GS/GSRegs.h b/pcsx2/GS/GSRegs.h
@@ -262,7 +262,6 @@ union name          \
 #define REG128_SET(name) \
 	union name           \
 	{                    \
-		__m128i m128;    \
 		u64 U64[2];   \
 		u32 U32[4];
 

diff --git a/pcsx2/GS/GSState.cpp b/pcsx2/GS/GSState.cpp
@@ -594,7 +594,7 @@ void GSState::GIFPackedRegHandlerXYZF2(const GIFPackedReg* RESTRICT r)
 	GSVector4i zf = xy.zwzw();
 
 	xy = xy.upl16(xy.srl<4>()).upl32(GSVector4i::load((int)m_v.UV));
-	zf = zf.srl32(4) & GSVector4i::x00ffffff().upl32(GSVector4i::x000000ff());
+	zf = zf.srl32<4>() & GSVector4i::x00ffffff().upl32(GSVector4i::x000000ff());
 
 	m_v.m[1] = xy.upl32(zf);
 
@@ -654,7 +654,7 @@ void GSState::GIFPackedRegHandlerSTQRGBAXYZF2(const GIFPackedReg* RESTRICT r, u3
 		GSVector4i xy = GSVector4i::loadl(&r[2].U64[0]);
 		GSVector4i zf = GSVector4i::loadl(&r[2].U64[1]);
 		xy = xy.upl16(xy.srl<4>()).upl32(GSVector4i::load((int)m_v.UV));
-		zf = zf.srl32(4) & GSVector4i::x00ffffff().upl32(GSVector4i::x000000ff());
+		zf = zf.srl32<4>() & GSVector4i::x00ffffff().upl32(GSVector4i::x000000ff());
 
 		m_v.m[1] = xy.upl32(zf); // TODO: only store the last one
 
@@ -784,7 +784,7 @@ void GSState::GIFRegHandlerXYZF2(const GIFReg* RESTRICT r)
 
 	const GSVector4i xyzf = GSVector4i::loadl(&r->XYZF);
 	const GSVector4i xyz = xyzf & (GSVector4i::xffffffff().upl32(GSVector4i::x00ffffff()));
-	const GSVector4i uvf = GSVector4i::load((int)m_v.UV).upl32(xyzf.srl32(24).srl<4>());
+	const GSVector4i uvf = GSVector4i::load((int)m_v.UV).upl32(xyzf.srl32<24>().srl<4>());
 
 	m_v.m[1] = xyz.upl64(uvf);
 
@@ -3363,7 +3363,7 @@ __forceinline void GSState::VertexKick(u32 skip)
 	// integer coordinates for culling at native resolution, and the fixed point for all others. The XY offset has to be
 	// applied, then we split it into the fixed/integer portions.
 	const GSVector4i xy_ofs = new_v1.xxxx().u16to32().sub32(m_xyof);
-	const GSVector4i xy = xy_ofs.blend32<12>(xy_ofs.sra32(4));
+	const GSVector4i xy = xy_ofs.blend32<12>(xy_ofs.sra32<4>());
 	m_vertex.xy[xy_tail & 3] = xy;
 
 	// Backup head for triangle fans so we can read it later, otherwise it'll get lost after the 4th vertex.

diff --git a/pcsx2/GS/GSVector4.h b/pcsx2/GS/GSVector4.h
@@ -153,7 +153,7 @@ class alignas(16) GSVector4
 	{
 		GSVector4i v((int)u);
 
-		*this = GSVector4(v) + (m_x4f800000 & GSVector4::cast(v.sra32(31)));
+		*this = GSVector4(v) + (m_x4f800000 & GSVector4::cast(v.sra32<31>()));
 	}
 
 	__forceinline explicit GSVector4(const GSVector4i& v);
@@ -643,7 +643,7 @@ GSVector.h:2973:15: error:  shadows template parm 'int i'
 	{
 		GSVector4i v = GSVector4i::load((int)u);
 
-		return GSVector4(v) + (m_x4f800000 & GSVector4::cast(v.sra32(31)));
+		return GSVector4(v) + (m_x4f800000 & GSVector4::cast(v.sra32<31>()));
 	}
 
 	template <bool aligned>