Skip to content

Commit

Permalink
GS/HW: Improve channel + texture shuffle detection and processing
Browse files Browse the repository at this point in the history
  • Loading branch information
refractionpcsx2 committed Mar 24, 2024
1 parent 9e42bf7 commit 4ba43b8
Show file tree
Hide file tree
Showing 14 changed files with 335 additions and 160 deletions.
67 changes: 49 additions & 18 deletions bin/resources/shaders/dx11/tfx.fx
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@
#define FMT_24 1
#define FMT_16 2

#define SHUFFLE_READ 1
#define SHUFFLE_WRITE 2
#define SHUFFLE_READWRITE 3

#ifndef VS_TME
#define VS_IIP 0
#define VS_TME 1
Expand Down Expand Up @@ -41,7 +45,9 @@
#define PS_REGION_RECT 0
#define PS_SHUFFLE 0
#define PS_SHUFFLE_SAME 0
#define PS_READ_BA 0
#define PS_PROCESS_BA 0
#define PS_PROCESS_RG 0
#define PS_SHUFFLE_ACROSS 0
#define PS_READ16_SRC 0
#define PS_DST_FMT 0
#define PS_DEPTH_FMT 0
Expand Down Expand Up @@ -761,10 +767,10 @@ float4 ps_color(PS_INPUT input)
float4 T = sample_color(st, input.t.w);
#endif

if (PS_SHUFFLE && !PS_SHUFFLE_SAME && !PS_READ16_SRC)
if (SW_BLEND && PS_SHUFFLE && !PS_SHUFFLE_SAME && !PS_READ16_SRC && (PS_SHUFFLE_ACROSS || PS_PROCESS_BA == SHUFFLE_READWRITE || PS_PROCESS_RG == SHUFFLE_READWRITE))
{
uint4 denorm_c_before = uint4(T);
if (PS_READ_BA)
if (PS_PROCESS_BA & SHUFFLE_READ)
{
T.r = float((denorm_c_before.b << 3) & 0xF8);
T.g = float(((denorm_c_before.b >> 2) & 0x38) | ((denorm_c_before.a << 6) & 0xC0));
Expand Down Expand Up @@ -1028,10 +1034,10 @@ PS_OUTPUT ps_main(PS_INPUT input)

if (PS_SHUFFLE)
{
if (!PS_SHUFFLE_SAME && !PS_READ16_SRC)
if (SW_BLEND && PS_SHUFFLE && !PS_SHUFFLE_SAME && !PS_READ16_SRC && (PS_SHUFFLE_ACROSS || PS_PROCESS_BA == SHUFFLE_READWRITE || PS_PROCESS_RG == SHUFFLE_READWRITE))
{
uint4 denorm_c_after = uint4(C);
if (PS_READ_BA)
if (PS_PROCESS_BA & SHUFFLE_READ)
{
C.b = float(((denorm_c_after.r >> 3) & 0x1F) | ((denorm_c_after.g << 2) & 0xE0));
C.a = float(((denorm_c_after.g >> 6) & 0x3) | ((denorm_c_after.b >> 1) & 0x7C) | (denorm_c_after.a & 0x80));
Expand All @@ -1049,7 +1055,7 @@ PS_OUTPUT ps_main(PS_INPUT input)
// Special case for 32bit input and 16bit output, shuffle used by The Godfather
if (PS_SHUFFLE_SAME)
{
if (PS_READ_BA)
if (PS_PROCESS_BA & SHUFFLE_READ)
C = (float4)(float((denorm_c.b & 0x7Fu) | (denorm_c.a & 0x80u)));
else
C.ga = C.rg;
Expand All @@ -1063,23 +1069,48 @@ PS_OUTPUT ps_main(PS_INPUT input)
else
C.ga = (float2)float((denorm_c.g >> 6) | ((denorm_c.b >> 3) << 2) | (denorm_TA.x & 0x80u));
}
// Write RB part. Mask will take care of the correct destination
else if (PS_READ_BA)
else if (PS_SHUFFLE_ACROSS)
{
C.rb = C.bb;
if (denorm_c.a & 0x80u)
C.ga = (float2)(float((denorm_c.a & 0x7Fu) | (denorm_TA.y & 0x80u)));
if (PS_PROCESS_BA == SHUFFLE_READWRITE && PS_PROCESS_RG == SHUFFLE_READWRITE)
{
C.rb = C.br;
if ((denorm_c.a & 0x80u) != 0u)
C.g = float((denorm_c.a & 0x7Fu) | (denorm_TA.y & 0x80u));
else
C.g = float((denorm_c.a & 0x7Fu) | (denorm_TA.x & 0x80u));

if ((denorm_c.g & 0x80u) != 0u)
C.a = float((denorm_c.g & 0x7Fu) | (denorm_TA.y & 0x80u));
else
C.a = float((denorm_c.g & 0x7Fu) | (denorm_TA.x & 0x80u));
}
else if(PS_PROCESS_BA & SHUFFLE_READ)
{
C.rb = C.bb;
if ((denorm_c.a & 0x80u) != 0u)
C.ga = (float2)(float((denorm_c.a & 0x7Fu) | (denorm_TA.y & 0x80u)));
else
C.ga = (float2)(float((denorm_c.a & 0x7Fu) | (denorm_TA.x & 0x80u)));
}
else
C.ga = (float2)(float((denorm_c.a & 0x7Fu) | (denorm_TA.x & 0x80u)));
{
C.rb = C.rr;
if ((denorm_c.g & 0x80u) != 0u)
C.ga = (float2)(float((denorm_c.g & 0x7Fu) | (denorm_TA.y & 0x80u)));
else
C.ga = (float2)(float((denorm_c.g & 0x7Fu) | (denorm_TA.x & 0x80u)));
}
}
else
else // Basically a direct copy but a shuffle of both pairs of channels, so green and alpha get modified by TEXA
{
C.rb = C.rr;
if (denorm_c.g & 0x80u)
C.ga = (float2)(float((denorm_c.g & 0x7Fu) | (denorm_TA.y & 0x80u)));

if ((denorm_c.g & 0x80u) != 0u)
C.g = float((denorm_c.g & 0x7Fu) | (denorm_TA.y & 0x80u));
else
C.g = float((denorm_c.g & 0x7Fu) | (denorm_TA.x & 0x80u));
if ((denorm_c.a & 0x80u) != 0u)
C.a = float((denorm_c.a & 0x7Fu) | (denorm_TA.y & 0x80u));
else
C.ga = (float2)(float((denorm_c.g & 0x7Fu) | (denorm_TA.x & 0x80u)));
C.a = float((denorm_c.a & 0x7Fu) | (denorm_TA.x & 0x80u));
}
}

Expand Down
80 changes: 43 additions & 37 deletions bin/resources/shaders/opengl/tfx_fs.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
#define FMT_24 1
#define FMT_16 2

#define SHUFFLE_READ 1
#define SHUFFLE_WRITE 2
#define SHUFFLE_READWRITE 3

// TEX_COORD_DEBUG outputs the UV coordinate as the color. It is useful
// for detecting bad sampling caused by upscaling.
//#define TEX_COORD_DEBUG
Expand Down Expand Up @@ -695,9 +699,9 @@ vec4 ps_color()
vec4 T = sample_color(st);
#endif

#if PS_SHUFFLE && !PS_SHUFFLE_SAME && !PS_READ16_SRC
#if SW_BLEND && PS_SHUFFLE && !PS_SHUFFLE_SAME && !PS_READ16_SRC && (PS_SHUFFLE_ACROSS || PS_PROCESS_BA == SHUFFLE_READWRITE || PS_PROCESS_RG == SHUFFLE_READWRITE)
uvec4 denorm_c_before = uvec4(T);
#if PS_READ_BA
#if (PS_PROCESS_BA & SHUFFLE_READ)
T.r = float((denorm_c_before.b << 3) & 0xF8);
T.g = float(((denorm_c_before.b >> 2) & 0x38) | ((denorm_c_before.a << 6) & 0xC0));
T.b = float((denorm_c_before.a << 1) & 0xF8);
Expand Down Expand Up @@ -1027,9 +1031,9 @@ void ps_main()


#if PS_SHUFFLE
#if !PS_SHUFFLE_SAME && !PS_READ16_SRC
#if SW_BLEND && PS_SHUFFLE && !PS_SHUFFLE_SAME && !PS_READ16_SRC && (PS_SHUFFLE_ACROSS || PS_PROCESS_BA == SHUFFLE_READWRITE || PS_PROCESS_RG == SHUFFLE_READWRITE)
uvec4 denorm_c_after = uvec4(C);
#if PS_READ_BA
#if (PS_PROCESS_BA & SHUFFLE_READ)
C.b = float(((denorm_c_after.r >> 3) & 0x1F) | ((denorm_c_after.g << 2) & 0xE0));
C.a = float(((denorm_c_after.g >> 6) & 0x3) | ((denorm_c_after.b >> 1) & 0x7C) | (denorm_c_after.a & 0x80));
#else
Expand All @@ -1043,7 +1047,7 @@ void ps_main()

// Special case for 32bit input and 16bit output, shuffle used by The Godfather
#if PS_SHUFFLE_SAME
#if (PS_READ_BA)
#if (PS_PROCESS_BA & SHUFFLE_READ)
C = vec4(float((denorm_c.b & 0x7Fu) | (denorm_c.a & 0x80u)));
#else
C.ga = C.rg;
Expand All @@ -1055,40 +1059,42 @@ void ps_main()
C.ga = vec2(float((denorm_c.g >> 6) | ((denorm_c.b >> 3) << 2) | (denorm_TA.y & 0x80u)));
else
C.ga = vec2(float((denorm_c.g >> 6) | ((denorm_c.b >> 3) << 2) | (denorm_TA.x & 0x80u)));
// Write RB part. Mask will take care of the correct destination
#elif PS_READ_BA
C.rb = C.bb;
// FIXME precompute my_TA & 0x80

// Write GA part. Mask will take care of the correct destination
// Note: GLSL 4.50/GL_EXT_shader_integer_mix supports a mix instruction to select a component.
// However, Nvidia emulates it with an if (at least on the Kepler arch) ...

// bit field operation requires GL4 HW. Could be nice to merge it with step/mix below
// uint my_ta = (bool(bitfieldExtract(denorm_c.a, 7, 1))) ? denorm_TA.y : denorm_TA.x;
// denorm_c.a = bitfieldInsert(denorm_c.a, bitfieldExtract(my_ta, 7, 1), 7, 1);
// c.ga = vec2(float(denorm_c.a));

if (bool(denorm_c.a & 0x80u))
C.ga = vec2(float((denorm_c.a & 0x7Fu) | (denorm_TA.y & 0x80u)));
#elif PS_SHUFFLE_ACROSS
#if(PS_PROCESS_BA == SHUFFLE_READWRITE && PS_PROCESS_RG == SHUFFLE_READWRITE)
C.rb = C.br;
if ((denorm_c.a & 0x80u) != 0u)
C.g = float((denorm_c.a & 0x7Fu) | (denorm_TA.y & 0x80u));
else
C.g = float((denorm_c.a & 0x7Fu) | (denorm_TA.x & 0x80u));

if ((denorm_c.g & 0x80u) != 0u)
C.a = float((denorm_c.g & 0x7Fu) | (denorm_TA.y & 0x80u));
else
C.a = float((denorm_c.g & 0x7Fu) | (denorm_TA.x & 0x80u));

#elif(PS_PROCESS_BA & SHUFFLE_READ)
C.rb = C.bb;
if ((denorm_c.a & 0x80u) != 0u)
C.ga = vec2(float((denorm_c.a & 0x7Fu) | (denorm_TA.y & 0x80u)));
else
C.ga = vec2(float((denorm_c.a & 0x7Fu) | (denorm_TA.x & 0x80u)));
#else
C.rb = C.rr;
if ((denorm_c.g & 0x80u) != 0u)
C.ga = vec2(float((denorm_c.g & 0x7Fu) | (denorm_TA.y & 0x80u)));
else
C.ga = vec2(float((denorm_c.g & 0x7Fu) | (denorm_TA.x & 0x80u)));
#endif // PS_PROCESS_BA
#else // PS_SHUFFLE_ACROSS
if ((denorm_c.g & 0x80u) != 0u)
C.g = float((denorm_c.g & 0x7Fu) | (denorm_TA.y & 0x80u));
else
C.ga = vec2(float((denorm_c.a & 0x7Fu) | (denorm_TA.x & 0x80u)));

#else
C.rb = C.rr;
if (bool(denorm_c.g & 0x80u))
C.ga = vec2(float((denorm_c.g & 0x7Fu) | (denorm_TA.y & 0x80u)));
C.g = float((denorm_c.g & 0x7Fu) | (denorm_TA.x & 0x80u));
if ((denorm_c.a & 0x80u) != 0u)
C.a = float((denorm_c.a & 0x7Fu) | (denorm_TA.y & 0x80u));
else
C.ga = vec2(float((denorm_c.g & 0x7Fu) | (denorm_TA.x & 0x80u)));

// Nice idea but step/mix requires 4 instructions
// set / trunc / I2F / Mad
//
// float sel = step(128.0f, c.g);
// vec2 c_shuffle = vec2((denorm_c.gg & 0x7Fu) | (denorm_TA & 0x80u));
// c.ga = mix(c_shuffle.xx, c_shuffle.yy, sel);

#endif // PS_SHUFFLE_SAME
C.a = float((denorm_c.a & 0x7Fu) | (denorm_TA.x & 0x80u));
#endif // PS_SHUFFLE_ACROSS
#endif // PS_SHUFFLE

ps_dither(C.rgb, alpha_blend.a);
Expand Down
63 changes: 46 additions & 17 deletions bin/resources/shaders/vulkan/tfx.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,10 @@ void main()
#define FMT_24 1
#define FMT_16 2

#define SHUFFLE_READ 1
#define SHUFFLE_WRITE 2
#define SHUFFLE_READWRITE 3

#ifndef VS_TME
#define VS_TME 1
#define VS_FST 1
Expand Down Expand Up @@ -266,7 +270,9 @@ void main()
#define PS_POINT_SAMPLER 0
#define PS_SHUFFLE 0
#define PS_SHUFFLE_SAME 0
#define PS_READ_BA 0
#define PS_PROCESS_BA 0
#define PS_PROCESS_RG 0
#define PS_SHUFFLE_ACROSS 0
#define PS_WRITE_RG 0
#define PS_READ16_SRC 0
#define PS_DST_FMT 0
Expand Down Expand Up @@ -945,9 +951,9 @@ vec4 ps_color()
vec4 T = sample_color(st);
#endif

#if PS_SHUFFLE && !PS_SHUFFLE_SAME && !PS_READ16_SRC
#if SW_BLEND && PS_SHUFFLE && !PS_SHUFFLE_SAME && !PS_READ16_SRC && (PS_SHUFFLE_ACROSS || PS_PROCESS_BA == SHUFFLE_READWRITE || PS_PROCESS_RG == SHUFFLE_READWRITE)
uvec4 denorm_c_before = uvec4(T);
#if PS_READ_BA
#if (PS_PROCESS_BA & SHUFFLE_READ)
T.r = float((denorm_c_before.b << 3) & 0xF8);
T.g = float(((denorm_c_before.b >> 2) & 0x38) | ((denorm_c_before.a << 6) & 0xC0));
T.b = float((denorm_c_before.a << 1) & 0xF8);
Expand Down Expand Up @@ -1277,9 +1283,9 @@ void main()
ps_blend(C, alpha_blend);

#if PS_SHUFFLE
#if !PS_SHUFFLE_SAME && !PS_READ16_SRC
#if SW_BLEND && !PS_SHUFFLE_SAME && !PS_READ16_SRC && (PS_SHUFFLE_ACROSS || PS_PROCESS_BA == SHUFFLE_READWRITE || PS_PROCESS_RG == SHUFFLE_READWRITE)
uvec4 denorm_c_after = uvec4(C);
#if PS_READ_BA
#if (PS_PROCESS_BA & SHUFFLE_READ)
C.b = float(((denorm_c_after.r >> 3) & 0x1F) | ((denorm_c_after.g << 2) & 0xE0));
C.a = float(((denorm_c_after.g >> 6) & 0x3) | ((denorm_c_after.b >> 1) & 0x7C) | (denorm_c_after.a & 0x80));
#else
Expand All @@ -1293,7 +1299,7 @@ void main()

// Special case for 32bit input and 16bit output, shuffle used by The Godfather
#if PS_SHUFFLE_SAME
#if (PS_READ_BA)
#if (PS_PROCESS_BA & SHUFFLE_READ)
C = vec4(float((denorm_c.b & 0x7Fu) | (denorm_c.a & 0x80u)));
#else
C.ga = C.rg;
Expand All @@ -1306,19 +1312,42 @@ void main()
else
C.ga = vec2(float((denorm_c.g >> 6) | ((denorm_c.b >> 3) << 2) | (denorm_TA.x & 0x80u)));
// Write RB part. Mask will take care of the correct destination
#elif PS_READ_BA
C.rb = C.bb;
if ((denorm_c.a & 0x80u) != 0u)
C.ga = vec2(float((denorm_c.a & 0x7Fu) | (denorm_TA.y & 0x80u)));
else
C.ga = vec2(float((denorm_c.a & 0x7Fu) | (denorm_TA.x & 0x80u)));
#else
C.rb = C.rr;
#elif PS_SHUFFLE_ACROSS
#if(PS_PROCESS_BA == SHUFFLE_READWRITE && PS_PROCESS_RG == SHUFFLE_READWRITE)
C.rb = C.br;
if ((denorm_c.a & 0x80u) != 0u)
C.g = float((denorm_c.a & 0x7Fu) | (denorm_TA.y & 0x80u));
else
C.g = float((denorm_c.a & 0x7Fu) | (denorm_TA.x & 0x80u));

if ((denorm_c.g & 0x80u) != 0u)
C.a = float((denorm_c.g & 0x7Fu) | (denorm_TA.y & 0x80u));
else
C.a = float((denorm_c.g & 0x7Fu) | (denorm_TA.x & 0x80u));

#elif(PS_PROCESS_BA & SHUFFLE_READ)
C.rb = C.bb;
if ((denorm_c.a & 0x80u) != 0u)
C.ga = vec2(float((denorm_c.a & 0x7Fu) | (denorm_TA.y & 0x80u)));
else
C.ga = vec2(float((denorm_c.a & 0x7Fu) | (denorm_TA.x & 0x80u)));
#else
C.rb = C.rr;
if ((denorm_c.g & 0x80u) != 0u)
C.ga = vec2(float((denorm_c.g & 0x7Fu) | (denorm_TA.y & 0x80u)));
else
C.ga = vec2(float((denorm_c.g & 0x7Fu) | (denorm_TA.x & 0x80u)));
#endif // PS_PROCESS_BA
#else // PS_SHUFFLE_ACROSS
if ((denorm_c.g & 0x80u) != 0u)
C.ga = vec2(float((denorm_c.g & 0x7Fu) | (denorm_TA.y & 0x80u)));
C.g = float((denorm_c.g & 0x7Fu) | (denorm_TA.y & 0x80u));
else
C.g = float((denorm_c.g & 0x7Fu) | (denorm_TA.x & 0x80u));
if ((denorm_c.a & 0x80u) != 0u)
C.a = float((denorm_c.a & 0x7Fu) | (denorm_TA.y & 0x80u));
else
C.ga = vec2(float((denorm_c.g & 0x7Fu) | (denorm_TA.x & 0x80u)));
#endif // PS_SHUFFLE_SAME
C.a = float((denorm_c.a & 0x7Fu) | (denorm_TA.x & 0x80u));
#endif // PS_SHUFFLE_ACROSS
#endif // PS_SHUFFLE

ps_dither(C.rgb, alpha_blend.a);
Expand Down
4 changes: 3 additions & 1 deletion pcsx2/GS/Renderers/Common/GSDevice.h
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,9 @@ struct alignas(16) GSHWDrawConfig
u32 shuffle : 1;
u32 shuffle_same : 1;
u32 real16src: 1;
u32 read_ba : 1;
u32 process_ba : 2;
u32 process_rg : 2;
u32 shuffle_across : 1;
u32 write_rg : 1;
u32 fbmask : 1;

Expand Down
4 changes: 3 additions & 1 deletion pcsx2/GS/Renderers/DX11/GSDevice11.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1679,7 +1679,9 @@ void GSDevice11::SetupPS(const PSSelector& sel, const GSHWDrawConfig::PSConstant
sm.AddMacro("PS_REGION_RECT", sel.region_rect);
sm.AddMacro("PS_SHUFFLE", sel.shuffle);
sm.AddMacro("PS_SHUFFLE_SAME", sel.shuffle_same);
sm.AddMacro("PS_READ_BA", sel.read_ba);
sm.AddMacro("PS_PROCESS_BA", sel.process_ba);
sm.AddMacro("PS_PROCESS_RG", sel.process_rg);
sm.AddMacro("PS_SHUFFLE_ACROSS", sel.shuffle_across);
sm.AddMacro("PS_READ16_SRC", sel.real16src);
sm.AddMacro("PS_CHANNEL_FETCH", sel.channel);
sm.AddMacro("PS_TALES_OF_ABYSS_HLE", sel.tales_of_abyss_hle);
Expand Down
4 changes: 3 additions & 1 deletion pcsx2/GS/Renderers/DX12/GSDevice12.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2833,7 +2833,9 @@ const ID3DBlob* GSDevice12::GetTFXPixelShader(const GSHWDrawConfig::PSSelector&
sm.AddMacro("PS_REGION_RECT", sel.region_rect);
sm.AddMacro("PS_SHUFFLE", sel.shuffle);
sm.AddMacro("PS_SHUFFLE_SAME", sel.shuffle_same);
sm.AddMacro("PS_READ_BA", sel.read_ba);
sm.AddMacro("PS_PROCESS_BA", sel.process_ba);
sm.AddMacro("PS_PROCESS_RG", sel.process_rg);
sm.AddMacro("PS_SHUFFLE_ACROSS", sel.shuffle_across);
sm.AddMacro("PS_READ16_SRC", sel.real16src);
sm.AddMacro("PS_CHANNEL_FETCH", sel.channel);
sm.AddMacro("PS_TALES_OF_ABYSS_HLE", sel.tales_of_abyss_hle);
Expand Down
Loading

0 comments on commit 4ba43b8

Please sign in to comment.