Skip to content

Commit

Permalink
Simd optimize quantization
Browse files Browse the repository at this point in the history
  • Loading branch information
SaiyansKing committed Sep 8, 2024
1 parent 29af87c commit 4046b4e
Show file tree
Hide file tree
Showing 4 changed files with 237 additions and 184 deletions.
193 changes: 193 additions & 0 deletions D3D11Engine/DLLMain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,12 @@ extern "C" {
_declspec(dllexport) DWORD AmdPowerXpressRequestHighPerformance = 0x00000001;
}

ZQuantizeHalfFloat QuantizeHalfFloat;
ZQuantizeHalfFloat_X4 QuantizeHalfFloat_X4;
ZUnquantizeHalfFloat UnquantizeHalfFloat;
ZUnquantizeHalfFloat_X4 UnquantizeHalfFloat_X4;
ZUnquantizeHalfFloat_X4 UnquantizeHalfFloat_X8;

static HINSTANCE hLThis = 0;

typedef void (WINAPI* DirectDrawSimple)();
Expand All @@ -38,6 +44,170 @@ WinMainFunc originalWinMain = reinterpret_cast<WinMainFunc>(GothicMemoryLocation
bool FeatureLevel10Compatibility = false;
bool GMPModeActive = false;

unsigned short QuantizeHalfFloat_Scalar( float input )
{
union { float f; unsigned int ui; } u = { input };
unsigned int ui = u.ui;

int s = ( ui >> 16 ) & 0x8000;
int em = ui & 0x7fffffff;

int h = ( em - ( 112 << 23 ) + ( 1 << 12 ) ) >> 13;
h = ( em < ( 113 << 23 ) ) ? 0 : h;
h = ( em >= ( 143 << 23 ) ) ? 0x7c00 : h;
h = ( em > ( 255 << 23 ) ) ? 0x7e00 : h;
return static_cast<unsigned short>(s | h);
}

void QuantizeHalfFloats_X4_SSE2( float* input, unsigned short* output )
{
__m128i v = _mm_castps_si128( _mm_load_ps( input ) );
__m128i s = _mm_and_si128( _mm_srli_epi32( v, 16 ), _mm_set1_epi32( 0x8000 ) );
__m128i em = _mm_and_si128( v, _mm_set1_epi32( 0x7FFFFFFF ) );
__m128i h = _mm_srli_epi32( _mm_sub_epi32( em, _mm_set1_epi32( 0x37FFF000 ) ), 13 );

__m128i mask = _mm_cmplt_epi32( em, _mm_set1_epi32( 0x38800000 ) );
h = _mm_or_si128( _mm_and_si128( mask, _mm_setzero_si128() ), _mm_andnot_si128( mask, h ) );

mask = _mm_cmpgt_epi32( em, _mm_set1_epi32( 0x47800000 - 1 ) );
h = _mm_or_si128( _mm_and_si128( mask, _mm_set1_epi32( 0x7C00 ) ), _mm_andnot_si128( mask, h ) );

mask = _mm_cmpgt_epi32( em, _mm_set1_epi32( 0x7F800000 ) );
h = _mm_or_si128( _mm_and_si128( mask, _mm_set1_epi32( 0x7E00 ) ), _mm_andnot_si128( mask, h ) );

// We need to stay in int16_t range due to signed saturation
__m128i halfs = _mm_sub_epi32( _mm_or_si128( s, h ), _mm_set1_epi32( 32768 ) );
_mm_store_sd( reinterpret_cast<double*>(output), _mm_castsi128_pd( _mm_add_epi16( _mm_packs_epi32( halfs, halfs ), _mm_set1_epi16( 32768 ) ) ) );
}

void QuantizeHalfFloats_X4_SSE41( float* input, unsigned short* output )
{
__m128i v = _mm_castps_si128( _mm_load_ps( input ) );
__m128i s = _mm_and_si128( _mm_srli_epi32( v, 16 ), _mm_set1_epi32( 0x8000 ) );
__m128i em = _mm_and_si128( v, _mm_set1_epi32( 0x7FFFFFFF ) );
__m128i h = _mm_srli_epi32( _mm_sub_epi32( em, _mm_set1_epi32( 0x37FFF000 ) ), 13 );

__m128i mask = _mm_cmplt_epi32( em, _mm_set1_epi32( 0x38800000 ) );
h = _mm_blendv_epi8( h, _mm_setzero_si128(), mask );

mask = _mm_cmpgt_epi32( em, _mm_set1_epi32( 0x47800000 - 1 ) );
h = _mm_blendv_epi8( h, _mm_set1_epi32( 0x7C00 ), mask );

mask = _mm_cmpgt_epi32( em, _mm_set1_epi32( 0x7F800000 ) );
h = _mm_blendv_epi8( h, _mm_set1_epi32( 0x7E00 ), mask );

__m128i halfs = _mm_or_si128( s, h );
_mm_store_sd( reinterpret_cast<double*>(output), _mm_castsi128_pd( _mm_packus_epi32( halfs, halfs ) ) );
}

#ifdef _XM_AVX_INTRINSICS_
unsigned short QuantizeHalfFloat_F16C( float input )
{
return static_cast<unsigned short>(_mm_cvtsi128_si32( _mm_cvtps_ph( _mm_set_ss( input ), _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC ) ));
}

void QuantizeHalfFloats_X4_F16C( float* input, unsigned short* output )
{
_mm_store_sd( reinterpret_cast<double*>(output), _mm_castsi128_pd( _mm_cvtps_ph( _mm_load_ps( input ), _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC ) ) );
}
#endif

float UnquantizeHalfFloat_Scalar( unsigned short input )
{
unsigned int s = input & 0x8000;
unsigned int m = input & 0x03FF;
unsigned int e = input & 0x7C00;
e += 0x0001C000;

float out;
unsigned int r = (s << 16) | (m << 13) | (e << 13);
memcpy( &out, &r, sizeof( float ) );
return out;
}

void UnquantizeHalfFloat_X4_SSE2( unsigned short* input, float* output )
{
const __m128i mask_zero = _mm_setzero_si128();
const __m128i mask_s = _mm_set1_epi16( 0x8000 );
const __m128i mask_m = _mm_set1_epi16( 0x03FF );
const __m128i mask_e = _mm_set1_epi16( 0x7C00 );
const __m128i bias_e = _mm_set1_epi32( 0x0001C000 );

__m128i halfs = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(input) );

__m128i s = _mm_and_si128( halfs, mask_s );
__m128i m = _mm_and_si128( halfs, mask_m );
__m128i e = _mm_and_si128( halfs, mask_e );

__m128i s4 = _mm_unpacklo_epi16( s, mask_zero );
s4 = _mm_slli_epi32( s4, 16 );

__m128i m4 = _mm_unpacklo_epi16( m, mask_zero );
m4 = _mm_slli_epi32( m4, 13 );

__m128i e4 = _mm_unpacklo_epi16( e, mask_zero );
e4 = _mm_add_epi32( e4, bias_e );
e4 = _mm_slli_epi32( e4, 13 );

_mm_store_si128( reinterpret_cast<__m128i*>(output), _mm_or_si128( s4, _mm_or_si128( e4, m4 ) ) );
}

void UnquantizeHalfFloat_X8_SSE2( unsigned short* input, float* output )
{
const __m128i mask_zero = _mm_setzero_si128();
const __m128i mask_s = _mm_set1_epi16( 0x8000 );
const __m128i mask_m = _mm_set1_epi16( 0x03FF );
const __m128i mask_e = _mm_set1_epi16( 0x7C00 );
const __m128i bias_e = _mm_set1_epi32( 0x0001C000 );

__m128i halfs = _mm_load_si128( reinterpret_cast<const __m128i*>(input) );

__m128i s = _mm_and_si128( halfs, mask_s );
__m128i m = _mm_and_si128( halfs, mask_m );
__m128i e = _mm_and_si128( halfs, mask_e );

__m128i s4 = _mm_unpacklo_epi16( s, mask_zero );
s4 = _mm_slli_epi32( s4, 16 );

__m128i m4 = _mm_unpacklo_epi16( m, mask_zero );
m4 = _mm_slli_epi32( m4, 13 );

__m128i e4 = _mm_unpacklo_epi16( e, mask_zero );
e4 = _mm_add_epi32( e4, bias_e );
e4 = _mm_slli_epi32( e4, 13 );

_mm_store_si128( reinterpret_cast<__m128i*>(output + 0), _mm_or_si128( s4, _mm_or_si128( e4, m4 ) ) );

s4 = _mm_unpackhi_epi16( s, mask_zero );
s4 = _mm_slli_epi32( s4, 16 );

m4 = _mm_unpackhi_epi16( m, mask_zero );
m4 = _mm_slli_epi32( m4, 13 );

e4 = _mm_unpackhi_epi16( e, mask_zero );
e4 = _mm_add_epi32( e4, bias_e );
e4 = _mm_slli_epi32( e4, 13 );

_mm_store_si128( reinterpret_cast<__m128i*>(output + 4), _mm_or_si128( s4, _mm_or_si128( e4, m4 ) ) );
}

#ifdef _XM_AVX_INTRINSICS_
float UnquantizeHalfFloat_F16C( unsigned short input )
{
return _mm_cvtss_f32( _mm_cvtph_ps( _mm_cvtsi32_si128( input ) ) );
}

void UnquantizeHalfFloat_X4_F16C( unsigned short* input, float* output )
{
_mm_store_ps( output, _mm_cvtph_ps( _mm_loadl_epi64( reinterpret_cast<const __m128i*>(input) ) ) );
}

void UnquantizeHalfFloat_X8_F16C( unsigned short* input, float* output )
{
_mm256_store_ps( output, _mm256_cvtph_ps( _mm_load_si128( reinterpret_cast<const __m128i*>(input) ) ) );
}
#endif

void SignalHandler( int signal ) {
LogInfo() << "Signal:" << signal;
throw "!Access Violation!";
Expand Down Expand Up @@ -206,6 +376,29 @@ void CheckPlatformSupport() {
#elif __SSE__
support_message( "SSE", InstructionSet::SSE() );
#endif

#ifdef _XM_AVX_INTRINSICS_
if ( InstructionSet::F16C() ) {
QuantizeHalfFloat = QuantizeHalfFloat_F16C;
QuantizeHalfFloat_X4 = QuantizeHalfFloats_X4_F16C;
UnquantizeHalfFloat = UnquantizeHalfFloat_F16C;
UnquantizeHalfFloat_X4 = UnquantizeHalfFloat_X4_F16C;
UnquantizeHalfFloat_X8 = UnquantizeHalfFloat_X8_F16C;
} else
#endif
if ( InstructionSet::SSE41() ) {
QuantizeHalfFloat = QuantizeHalfFloat_Scalar;
QuantizeHalfFloat_X4 = QuantizeHalfFloats_X4_SSE41;
UnquantizeHalfFloat = UnquantizeHalfFloat_Scalar;
UnquantizeHalfFloat_X4 = UnquantizeHalfFloat_X4_SSE2;
UnquantizeHalfFloat_X8 = UnquantizeHalfFloat_X8_SSE2;
} else {
QuantizeHalfFloat = QuantizeHalfFloat_Scalar;
QuantizeHalfFloat_X4 = QuantizeHalfFloats_X4_SSE2;
UnquantizeHalfFloat = UnquantizeHalfFloat_Scalar;
UnquantizeHalfFloat_X4 = UnquantizeHalfFloat_X4_SSE2;
UnquantizeHalfFloat_X8 = UnquantizeHalfFloat_X8_SSE2;
}
}

#if defined(BUILD_GOTHIC_2_6_fix)
Expand Down
45 changes: 25 additions & 20 deletions D3D11Engine/GothicAPI.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1929,27 +1929,32 @@ float3* GothicAPI::GetLowestLODPoly_SkeletalMesh( zCModel* model, const int poly
VERTEX_INDEX _polyId = mesh->Indices[polyIndex + i];
ExSkelVertexStruct& _polyVert = mesh->Vertices[_polyId];

alignas(32) float floats_0[8];
alignas(32) float floats_1[8];
alignas(16) unsigned short half2float_0[8] = { _polyVert.Position[0][0], _polyVert.Position[0][1], _polyVert.Position[0][2], _polyVert.weights[0],
_polyVert.Position[1][0], _polyVert.Position[1][1], _polyVert.Position[1][2], _polyVert.weights[1] };
alignas(16) unsigned short half2float_1[8] = { _polyVert.Position[2][0], _polyVert.Position[2][1], _polyVert.Position[2][2], _polyVert.weights[2],
_polyVert.Position[3][0], _polyVert.Position[3][1], _polyVert.Position[3][2], _polyVert.weights[3] };
UnquantizeHalfFloat_X8( half2float_0, floats_0 );
UnquantizeHalfFloat_X8( half2float_1, floats_1 );

XMVECTOR position = XMVectorZero();
position += XMVectorReplicate( unquantizeHalfFloat( _polyVert.weights[0] ) ) * XMVector3Transform(
XMVectorSet( unquantizeHalfFloat( _polyVert.Position[0][0] ),
unquantizeHalfFloat( _polyVert.Position[0][1] ),
unquantizeHalfFloat( _polyVert.Position[0][2] ), 1.f ), XMMatrixTranspose( XMLoadFloat4x4( &transforms[_polyVert.boneIndices[0]] ) ) );

position += XMVectorReplicate( unquantizeHalfFloat( _polyVert.weights[1] ) ) * XMVector3Transform(
XMVectorSet( unquantizeHalfFloat( _polyVert.Position[1][0] ),
unquantizeHalfFloat( _polyVert.Position[1][1] ),
unquantizeHalfFloat( _polyVert.Position[1][2] ), 1.f ), XMMatrixTranspose( XMLoadFloat4x4( &transforms[_polyVert.boneIndices[1]] ) ) );

position += XMVectorReplicate( unquantizeHalfFloat( _polyVert.weights[2] ) ) * XMVector3Transform(
XMVectorSet( unquantizeHalfFloat( _polyVert.Position[2][0] ),
unquantizeHalfFloat( _polyVert.Position[2][1] ),
unquantizeHalfFloat( _polyVert.Position[2][2] ), 1.f ), XMMatrixTranspose( XMLoadFloat4x4( &transforms[_polyVert.boneIndices[2]] ) ) );

position += XMVectorReplicate( unquantizeHalfFloat( _polyVert.weights[3] ) ) * XMVector3Transform(
XMVectorSet( unquantizeHalfFloat( _polyVert.Position[3][0] ),
unquantizeHalfFloat( _polyVert.Position[3][1] ),
unquantizeHalfFloat( _polyVert.Position[3][2] ), 1.f ), XMMatrixTranspose( XMLoadFloat4x4( &transforms[_polyVert.boneIndices[3]] ) ) );

position += XMVectorReplicate( floats_0[3] ) * XMVector3Transform(
XMVectorSet( floats_0[0], floats_0[1], floats_0[2], 1.f ),
XMMatrixTranspose( XMLoadFloat4x4( &transforms[_polyVert.boneIndices[0]] ) ) );

position += XMVectorReplicate( floats_0[7] ) * XMVector3Transform(
XMVectorSet( floats_0[4], floats_0[5], floats_0[6], 1.f ),
XMMatrixTranspose( XMLoadFloat4x4( &transforms[_polyVert.boneIndices[1]] ) ) );

position += XMVectorReplicate( floats_1[3] ) * XMVector3Transform(
XMVectorSet( floats_1[0], floats_1[1], floats_1[2], 1.f ),
XMMatrixTranspose( XMLoadFloat4x4( &transforms[_polyVert.boneIndices[2]] ) ) );

position += XMVectorReplicate( floats_1[7] ) * XMVector3Transform(
XMVectorSet( floats_1[4], floats_1[5], floats_1[6], 1.f ),
XMMatrixTranspose( XMLoadFloat4x4( &transforms[_polyVert.boneIndices[3]] ) ) );

position += XMVectorReplicate( fatness ) * XMLoadFloat3( reinterpret_cast<const XMFLOAT3*>(&_polyVert.BindPoseNormal) ) ;

// world matrix is applied later when particle calculate world position
Expand Down
13 changes: 9 additions & 4 deletions D3D11Engine/WorldConverter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -788,11 +788,16 @@ void WorldConverter::ExtractSkeletalMeshFromVob( zCModel* model, SkeletalMeshVis

// Get index and weight
if ( n < 4 ) {
vx.weights[n] = quantizeHalfFloat( weightEntry.Weight );
alignas(16) float floats[4] = { weightEntry.VertexPosition.x, weightEntry.VertexPosition.y,
weightEntry.VertexPosition.z, weightEntry.Weight };
alignas(16) unsigned short halfs[4];
QuantizeHalfFloat_X4( floats, halfs );

vx.weights[n] = halfs[3];
vx.boneIndices[n] = weightEntry.NodeIndex;
vx.Position[n][0] = quantizeHalfFloat( weightEntry.VertexPosition.x );
vx.Position[n][1] = quantizeHalfFloat( weightEntry.VertexPosition.y );
vx.Position[n][2] = quantizeHalfFloat( weightEntry.VertexPosition.z );
vx.Position[n][0] = halfs[0];
vx.Position[n][1] = halfs[1];
vx.Position[n][2] = halfs[2];
}
}

Expand Down
Loading

0 comments on commit 4046b4e

Please sign in to comment.