From ae9d70efaaeb7b2f8fe68093eeafeb5e1bcfca57 Mon Sep 17 00:00:00 2001 From: Adam Wieckowski Date: Mon, 24 Jun 2024 11:36:45 +0200 Subject: [PATCH] FGS improvements, clean SIMD separation --- CMakeLists.txt | 2 +- include/vvdec/vvdec.h | 25 +- source/App/vvdecapp/CmdLineParser.h | 392 +++++------ source/App/vvdecapp/vvdecapp.cpp | 14 +- source/Lib/CommonLib/LoopFilter.cpp | 143 ++-- source/Lib/CommonLib/LoopFilter.h | 2 +- source/Lib/CommonLib/UnitTools.cpp | 67 +- source/Lib/CommonLib/x86/CommonDefX86.h | 7 + source/Lib/DecoderLib/DecLib.h | 2 + source/Lib/FilmGrain/FilmGrain.cpp | 172 +++-- source/Lib/FilmGrain/FilmGrain.h | 32 +- source/Lib/FilmGrain/FilmGrainImpl.cpp | 206 +++--- source/Lib/FilmGrain/FilmGrainImpl.h | 57 +- source/Lib/FilmGrain/FilmGrainImplX86.h | 102 +++ source/Lib/FilmGrain/FilmGrainImpl_X86_SIMD.h | 609 ++++++++++++++++++ source/Lib/FilmGrain/FilmGrainImpl_avx2.cpp | 42 ++ source/Lib/FilmGrain/FilmGrainImpl_sse41.cpp | 42 ++ source/Lib/vvdec/CMakeLists.txt | 30 +- source/Lib/vvdec/vvdec.cpp | 37 +- source/Lib/vvdec/vvdecimpl.cpp | 149 +++-- source/Lib/vvdec/vvdecimpl.h | 9 +- 21 files changed, 1610 insertions(+), 531 deletions(-) create mode 100644 source/Lib/FilmGrain/FilmGrainImplX86.h create mode 100755 source/Lib/FilmGrain/FilmGrainImpl_X86_SIMD.h create mode 100644 source/Lib/FilmGrain/FilmGrainImpl_avx2.cpp create mode 100644 source/Lib/FilmGrain/FilmGrainImpl_sse41.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 869b3a6d..b391b73e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -77,7 +77,7 @@ set( VVDEC_ENABLE_X86_SIMD TRUE CACHE BOOL "enable x86 in set( VVDEC_ENABLE_ARM_SIMD ${VVDEC_ARM_SIMD_DEFAULT} CACHE BOOL "enable ARM intrinsics" ) set( VVDEC_ENABLE_TRACING FALSE CACHE BOOL "Compile in tracing functionality" ) -set( VVDEC_ENABLE_FILM_GRAIN FALSE CACHE BOOL "Build with film grain synthesis support" ) +set( VVDEC_ENABLE_FILM_GRAIN TRUE CACHE BOOL "Build with film grain synthesis support" ) include( vvdecCompilerSupport 
) diff --git a/include/vvdec/vvdec.h b/include/vvdec/vvdec.h index f85b5e58..480cffdc 100644 --- a/include/vvdec/vvdec.h +++ b/include/vvdec/vvdec.h @@ -437,17 +437,20 @@ typedef struct vvdecFrame */ typedef struct vvdecParams { - int threads; // thread count ( default: -1 ) - int parseDelay; // number of frames to parse in parallel ( default: -1 ) - vvdecRPRUpscaling upscaleOutput; // do internal upscaling of rpl pictures to dest. resolution ( default: 0 ) - vvdecLogLevel logLevel; // verbosity level - bool verifyPictureHash; // verify picture, if digest is available, true: check hash in SEI messages if available, false: ignore SEI message - bool removePadding; // copy output pictures to new buffer to remove padding (stride==width) ( default: false ) - vvdecSIMD_Extension simd; // set specific simd optimization (default: max. availalbe) - void *opaque; // opaque pointer for private user data ( can be used to carry application specific data or contexts ) - vvdecErrHandlingFlags errHandlingFlags; // set of flags defining how to handle bitstream errors - int parseThreads; // DEPRECATED. Use `parseDelay` instead. This will be removed in the future. Until then, this value is copied to parseDelay if set. - int padding2; // reserved space for future parameters + int threads; // thread count ( default: -1 ) + int parseDelay; // number of frames to parse in parallel ( default: -1 ) + vvdecRPRUpscaling upscaleOutput; // do internal upscaling of rpl pictures to dest. resolution ( default: 0 ) + vvdecLogLevel logLevel; // verbosity level + bool verifyPictureHash; // verify picture, if digest is available, true: check hash in SEI messages if available, false: ignore SEI message + bool removePadding; // copy output pictures to new buffer to remove padding (stride==width) ( default: false ) + vvdecSIMD_Extension simd; // set specific simd optimization (default: max. 
availalbe) + void *opaque; // opaque pointer for private user data ( can be used to carry application specific data or contexts ) + vvdecErrHandlingFlags errHandlingFlags; // set of flags defining how to handle bitstream errors + int parseThreads; // DEPRECATED. Use `parseDelay` instead. This will be removed in the future. Until then, this value is copied to parseDelay if set. + bool filmGrainSynthesis; // set film grain synthesis using Film Grain Charactersitics SEI ( default: true ) + int8_t padding2_1; // reserved space for future parameters + int8_t padding2_2; + int8_t padding2_3; int padding3; int padding4; } vvdecParams; diff --git a/source/App/vvdecapp/CmdLineParser.h b/source/App/vvdecapp/CmdLineParser.h index 3f5616e8..fc4a1806 100644 --- a/source/App/vvdecapp/CmdLineParser.h +++ b/source/App/vvdecapp/CmdLineParser.h @@ -1,7 +1,7 @@ /* ----------------------------------------------------------------------------- The copyright in this software is being made available under the Clear BSD -License, included below. No patent rights, trademark rights and/or -other Intellectual Property Rights other than the copyrights concerning +License, included below. No patent rights, trademark rights and/or +other Intellectual Property Rights other than the copyrights concerning the Software are granted under this license. The Clear BSD License @@ -42,28 +42,118 @@ POSSIBILITY OF SUCH DAMAGE. 
#pragma once +#include +#include +#include +#include #include #include -#include -#include #include -#include #include "vvdec/vvdec.h" -namespace vvdecoderapp { +namespace vvdecoderapp +{ class CmdLineParser { + int32_t m_iArg = 0; + int m_argc = 0; + char** m_argv = nullptr; + + // parse a parameter with the corresponding argument + template + bool parse_param( std::initializer_list paramNames, TOut& outputVar, bool argOptional = false, const TOut optionalDefault = {} ) + { + if( m_iArg >= m_argc ) + { + return false; + } + + const std::string currArg( m_argv[m_iArg] ); + if( std::any_of( paramNames.begin(), paramNames.end(), + [&]( const char* name ) { return currArg == name; } ) ) + { + ++m_iArg; + + if( m_iArg < m_argc && parse_param_arg( outputVar ) ) + { + return true; + } + if( argOptional ) + { + outputVar = optionalDefault; + return true; + } + if( std::is_same::value ) + { + outputVar = true; // default value for bool always true, if present + return true; + } + + fprintf( stderr, " - missing argument for: %s \n", m_argv[m_iArg - 1] ); + throw MissingArgumentException(); + } + return false; + } + + // parse boolean arguments + bool parse_param_arg( bool& outputVar ) + { + outputVar = true; // boolean always defaults to true + if( strlen( m_argv[m_iArg] ) >= 1 && std::isdigit( m_argv[m_iArg][0] ) ) + { + outputVar = !!atoi( m_argv[m_iArg] ); + ++m_iArg; + return true; + } + return true; + } + + // parse string arguments + bool parse_param_arg( std::string& outputVar ) + { + outputVar = std::string( m_argv[m_iArg] ); + ++m_iArg; + return true; + } + + // parse signed int arguments + bool parse_param_arg( int& outputVar ) + { + const size_t argStrLen = strlen( m_argv[m_iArg] ); + if( ( argStrLen >= 1 && std::isdigit( m_argv[m_iArg][0] ) ) // positive number + || ( argStrLen >= 2 && m_argv[m_iArg][0] == '-' && std::isdigit( m_argv[m_iArg][1] ) ) ) // negative number + { + outputVar = atoi( m_argv[m_iArg] ); + ++m_iArg; + return true; + } + + return false; + } 
+ + // parse unsigned int arguments + bool parse_param_arg( unsigned int& outputVar ) + { + if( strlen( m_argv[m_iArg] ) >= 1 && std::isdigit( m_argv[m_iArg][0] ) ) + { + outputVar = atoi( m_argv[m_iArg] ); + ++m_iArg; + return true; + } + return false; + } + public: /// Constructor - CmdLineParser(){} - + CmdLineParser() = default; /// Destructor - virtual ~CmdLineParser() {} + ~CmdLineParser() = default; static void print_usage( std::string cApp, vvdecParams& rcParams, bool fullHelp ) { + // clang-format off std::cout << std::endl; std::cout << " Usage: " << cApp << " [param1] [pararm2] [...]" << std::endl; std::cout << std::endl; @@ -78,6 +168,7 @@ class CmdLineParser if( fullHelp ) { std::cout << "\t\t [--upscale,-uo ] : set upscaling mode for RPR pictures(default: 0: off, 1: copy without rescaling, 2: rescale to target resolution)" << std::endl; + std::cout << "\t\t [--filmGrain,-fg ] : set film grain synthesis using Film Grain Charactersitics SEI (default: 1, off: 0, on: 1)" << std::endl; } std::cout << "\t\t [--y4m ] : force y4m output (for pipe output; auto enable for .y4m output file extension)" << std::endl; std::cout << std::endl; @@ -123,12 +214,22 @@ class CmdLineParser std::cout << "\t\t [--fullhelp ] : show full help including expert options" << std::endl; std::cout << std::endl; std::cout << std::endl; + // clang-format on } - - static int parse_command_line( int argc, char* argv[] , vvdecParams& rcParams, std::string& rcBitstreamFile, std::string& rcOutputFile, - int& riFrames, int& riLoops, std::string& rcExpectYuvMD5, bool& useY4mFormat, bool &useExternAllocator, - std::string& sTracingFile, std::string& sTracingRule, int& riPrintPicHash ) + int parse_command_line( int argc, + char* argv[], + vvdecParams& rcParams, + std::string& rcBitstreamFile, + std::string& rcOutputFile, + int& riFrames, + int& riLoops, + std::string& rcExpectYuvMD5, + bool& useY4mFormat, + bool& useExternAllocator, + std::string& sTracingFile, + std::string& sTracingRule, + 
int& riPrintPicHash ) { #ifndef ENABLE_TRACING // ignore unused variables @@ -136,221 +237,132 @@ class CmdLineParser (void) sTracingRule; #endif // !ENABLE_TRACING - int iRet = 0; /* Check command line parameters */ - int32_t i_arg = 1; + m_iArg = 1; + m_argc = argc; + m_argv = argv; /* Check general options first */ - while( i_arg < argc ) + while( m_iArg < argc ) { - if( (!strcmp( (const char*)argv[i_arg], "-v" )) || !strcmp( (const char*)argv[i_arg], "--verbosity" ) ) + bool _dummy = false; + unsigned logLevel = 0; + if( parse_param( { "-v", "--verbosity" }, logLevel ) ) { - if( i_arg == argc-1 ){ fprintf( stderr, " - missing argument for: %s \n", argv[i_arg] ); return -1; } - i_arg++; - int iLogLevel = atoi( argv[i_arg++] ); - if( iLogLevel < 0 ) iLogLevel = 0; - if( iLogLevel > (int)vvdecLogLevel::VVDEC_DETAILS ) iLogLevel = (int)vvdecLogLevel::VVDEC_DETAILS ; - rcParams.logLevel = (vvdecLogLevel)iLogLevel; + rcParams.logLevel = std::min( (vvdecLogLevel) logLevel, VVDEC_DETAILS ); if( rcParams.logLevel > VVDEC_VERBOSE ) { - std::string cll; - switch (rcParams.logLevel) + const char* cll; + switch( rcParams.logLevel ) { - case VVDEC_SILENT : cll = "SILENT"; break; - case VVDEC_ERROR : cll = "ERROR"; break; + // clang-format off + case VVDEC_SILENT : cll = "SILENT"; break; + case VVDEC_ERROR : cll = "ERROR"; break; case VVDEC_WARNING: cll = "WARNING"; break; - case VVDEC_INFO : cll = "INFO"; break; - case VVDEC_NOTICE : cll = "NOTICE"; break; + case VVDEC_INFO : cll = "INFO"; break; + case VVDEC_NOTICE : cll = "NOTICE"; break; case VVDEC_VERBOSE: cll = "VERBOSE"; break; case VVDEC_DETAILS: cll = "DETAILS"; break; - default: cll = "UNKNOWN"; break; + default: cll = "UNKNOWN"; break; + // clang-format on }; - fprintf( stdout, "[verbosity] : %d - %s\n", (int)rcParams.logLevel, cll.c_str() ); + fprintf( stdout, "[verbosity] : %d - %s\n", (int) rcParams.logLevel, cll ); } } - else if( (!strcmp( (const char*)argv[i_arg], "-h" )) || !strcmp( (const char*)argv[i_arg], 
"--help" ) ) + else if( parse_param( { "-h", "--help" }, _dummy ) ) { - i_arg++; - iRet = 2; - return iRet; + return 2; } - else if( !strcmp( ( const char* ) argv[i_arg], "--fullhelp" ) ) + else if( parse_param( { "--fullhelp", "--full-help" }, _dummy ) ) { - i_arg++; - iRet = 3; - return iRet; + return 3; } - else if( !strcmp( (const char*)argv[i_arg], "--version" ) ) + else if( parse_param( { "--version" }, _dummy ) ) { - i_arg++; - iRet = 4; - return iRet; + return 4; } else { - i_arg++; + m_iArg++; } } - - i_arg = 1; - while( i_arg < argc ) + // restart from the beginning to parse the remainig options + m_iArg = 1; + while( m_iArg < argc ) { - if( (!strcmp( (const char*)argv[i_arg], "-b" )) || !strcmp( (const char*)argv[i_arg], "--bitstream" ) ) /* In: input-file */ + int simd_arg = 0; + int err_handle_flags = 0; + int upscale_output = 0; + unsigned logLevel = 0; + if( parse_param( { "-b", "--bitstream" }, rcBitstreamFile ) ) /* In: input-file */ { - if( i_arg == argc-1 ){ fprintf( stderr, " - missing argument for: %s \n", argv[i_arg] ); return -1; } - i_arg++; if( rcParams.logLevel > VVDEC_VERBOSE ) - fprintf( stdout, "[bitstream] input-file: %s\n", argv[i_arg] ); - rcBitstreamFile = argv[i_arg++]; + fprintf( stdout, "[bitstream] input-file: %s\n", argv[m_iArg] ); } - else if( (!strcmp( (const char*)argv[i_arg], "-o" )) || !strcmp( (const char*)argv[i_arg], "--output" ) ) /* Out: bitstream-file */ + else if( parse_param( { "-o", "--output" }, rcOutputFile ) ) /* Out: bitstream-file */ { - if( i_arg == argc-1 ){ fprintf( stderr, " - missing argument for: %s \n", argv[i_arg] ); return -1; } - i_arg++; - if( i_arg < argc && strlen( argv[i_arg] ) > 0 ) - { - if( rcParams.logLevel > VVDEC_VERBOSE ) - fprintf( stdout, "[output] yuv-file: %s\n", argv[i_arg] ); - rcOutputFile = argv[i_arg++]; - } + if( rcParams.logLevel > VVDEC_VERBOSE ) + fprintf( stdout, "[output] yuv-file: %s\n", argv[m_iArg] ); } - else if( (!strcmp( (const char*)argv[i_arg], "-uo" )) || !strcmp( 
(const char*)argv[i_arg], "--upscale" ) ) /* In: upscale */ + else if( parse_param( { "-uo", "--upscale" }, upscale_output ) ) /* In: upscale */ { - i_arg++; - - rcParams.upscaleOutput = (vvdecRPRUpscaling) atoi( argv[i_arg++]); - + rcParams.upscaleOutput = vvdecRPRUpscaling( upscale_output ); if( rcParams.logLevel > VVDEC_VERBOSE ) { std::string scale; switch( rcParams.upscaleOutput ) { - case VVDEC_UPSCALING_OFF : scale = "OFF"; break; - case VVDEC_UPSCALING_COPY_ONLY: scale = "COPY_ONLY"; break; - case VVDEC_UPSCALING_RESCALE : scale = "RESCALE"; break; - default: scale = "UNKNOWN"; break; + // clang-format off + case VVDEC_UPSCALING_OFF : scale = "OFF"; break; + case VVDEC_UPSCALING_COPY_ONLY: scale = "COPY_ONLY"; break; + case VVDEC_UPSCALING_RESCALE : scale = "RESCALE"; break; + default : scale = "UNKNOWN"; break; + // clang-format on }; fprintf( stdout, "[upscale] : %s\n", scale.c_str() ); } } - else if( !strcmp( (const char*)argv[i_arg], "--y4m" ) ) - { - i_arg++; - useY4mFormat = true; - - if( i_arg < argc ) - { - if( std::isdigit(argv[i_arg][0])) - { - i_arg++; - } - } - } - else if( !strcmp( (const char*)argv[i_arg], "--extern" ) ) + else if( parse_param( { "-fg", "--filmGrain" }, rcParams.filmGrainSynthesis ) ) {} + else if( parse_param( { "--y4m" }, useY4mFormat ) ) {} + else if( parse_param( { "--extern" }, useExternAllocator ) ) {} + else if( parse_param( { "-f", "--frames" }, riFrames ) ) { - i_arg++; - useExternAllocator = true; - - if( i_arg < argc ) - { - if( std::isdigit(argv[i_arg][0])) - { - i_arg++; - } - } - } - else if( (!strcmp( (const char*)argv[i_arg], "-f" )) || !strcmp( (const char*)argv[i_arg], "--frames" ) ) - { - if( i_arg == argc-1 ){ fprintf( stderr, " - missing argument for: %s \n", argv[i_arg] ); return -1; } - i_arg++; - riFrames = atoi( argv[i_arg++] ); if( rcParams.logLevel > VVDEC_VERBOSE ) fprintf( stdout, "[frames] : %d\n", riFrames ); } - else if( (!strcmp( (const char*)argv[i_arg], "-t" )) || !strcmp( (const 
char*)argv[i_arg], "--threads" ) ) + else if( parse_param( { "-t", "--threads" }, rcParams.threads ) ) { - if( i_arg == argc-1 ){ fprintf( stderr, " - missing argument for: %s \n", argv[i_arg] ); return -1; } - i_arg++; - int iThreads = atoi( argv[i_arg++] ); if( rcParams.logLevel > VVDEC_VERBOSE ) - fprintf( stdout, "[threads] : %d\n", iThreads ); - rcParams.threads = iThreads; + fprintf( stdout, "[threads] : %d\n", rcParams.threads ); } - else if( (!strcmp( (const char*)argv[i_arg], "-p" )) || !strcmp( (const char*)argv[i_arg], "--parsedelay" ) ) + else if( parse_param( { "-p", "--parsedelay" }, rcParams.parseDelay ) ) { - if( i_arg == argc-1 ){ fprintf( stderr, " - missing argument for: %s \n", argv[i_arg] ); return -1; } - i_arg++; - int iDelay = atoi( argv[i_arg++] ); if( rcParams.logLevel > VVDEC_VERBOSE ) - fprintf( stdout, "[parsedelay] : %d\n", iDelay ); - rcParams.parseDelay = iDelay; + fprintf( stdout, "[parsedelay] : %d\n", rcParams.parseDelay ); } - else if( (!strcmp( (const char*)argv[i_arg], "-dph" )) || !strcmp( (const char*)argv[i_arg], "--SEIDecodedPictureHash" ) ) + else if( parse_param( { "-dph", "--SEIDecodedPictureHash" }, riPrintPicHash, true, 1 ) ) { - i_arg++; - if( i_arg < argc && std::isdigit( argv[i_arg][0] ) ) - { - riPrintPicHash = atoi( argv[i_arg] ); - i_arg++; - } - else - { - riPrintPicHash = 1; - } - - if( riPrintPicHash <= 1 ) + if( riPrintPicHash == 1 ) // dph levels > 11 print the DPH, but don't verify it (only 1 actually verifies) { + rcParams.verifyPictureHash = true; if( rcParams.logLevel > VVDEC_VERBOSE ) fprintf( stdout, "[SEIDecodedPictureHash] : true\n" ); - rcParams.verifyPictureHash = true; } } - else if( ( !strcmp( (const char*)argv[i_arg], "-md5" ) ) || !strcmp( (const char*)argv[i_arg], "--CheckYuvMD5" ) ) + else if( parse_param( { "-md5", "--CheckYuvMD5" }, rcExpectYuvMD5 ) ) { - if( i_arg >= argc - 1 ) { fprintf( stderr, " - missing argument for: %s \n", argv[i_arg] ); return -1; } - i_arg++; - if( strlen( 
argv[i_arg] ) != 32 ) - { - fprintf( stderr, " - the provided md5 hash to %s should be exactly 32 characters long\n", argv[i_arg - 1] ); - return -1; - } - - rcExpectYuvMD5 = std::string( argv[i_arg++] ); - if( rcParams.logLevel > VVDEC_VERBOSE ) fprintf( stdout, "[CheckYuvMD5] : %s\n", rcExpectYuvMD5.c_str() ); } - else if( (!strcmp( (const char*)argv[i_arg], "-L" )) || !strcmp( (const char*)argv[i_arg], "--loops" ) ) + else if( parse_param( { "-L", "--loops" }, riLoops ) ) { - if( i_arg == argc-1 ){ fprintf( stderr, " - missing argument for: %s \n", argv[i_arg] ); return -1; } - i_arg++; - riLoops = atoi( argv[i_arg++] ); if( rcParams.logLevel > VVDEC_VERBOSE ) fprintf( stdout, "[loops] : %d\n", riLoops ); } - else if( (!strcmp( (const char*)argv[i_arg], "-v" )) || !strcmp( (const char*)argv[i_arg], "--verbosity" ) ) - { - // already processed - i_arg++; - i_arg++; - } - else if( (!strcmp( (const char*)argv[i_arg], "-h" )) || !strcmp( (const char*)argv[i_arg], "--help" ) ) - { - // already processed - i_arg++; - } - else if( !strcmp( (const char*)argv[i_arg], "--version" ) ) + else if( parse_param( { "--simd" }, simd_arg ) ) { - // already processed - i_arg++; - } - else if( !strcmp( ( const char* ) argv[i_arg], "--simd" ) ) - { - if( i_arg == argc-1 ){ fprintf( stderr, " - missing argument for: %s \n", argv[i_arg] ); return -1; } - i_arg++; - const int simd_arg = atoi( argv[i_arg++] ); if( simd_arg < -1 || simd_arg > VVDEC_SIMD_MAX - 1 ) { fprintf( stderr, " - unsupported simd mode. 
Should be between -1 and %i inclusive.\n", VVDEC_SIMD_MAX - 1 ); @@ -363,31 +375,29 @@ class CmdLineParser const char* cll; switch( rcParams.simd ) { - case VVDEC_SIMD_DEFAULT: cll = "DEFAULT"; break; - case VVDEC_SIMD_SCALAR: cll = "SCALAR"; break; + // clang-format off + case VVDEC_SIMD_DEFAULT: cll = "DEFAULT"; break; + case VVDEC_SIMD_SCALAR: cll = "SCALAR"; break; #if VVDEC_ARCH_X86 - case VVDEC_SIMD_SSE41: cll = "SSE41"; break; - case VVDEC_SIMD_SSE42: cll = "SSE42"; break; - case VVDEC_SIMD_AVX: cll = "AVX"; break; - case VVDEC_SIMD_AVX2: cll = "AVX2"; break; + case VVDEC_SIMD_SSE41: cll = "SSE41"; break; + case VVDEC_SIMD_SSE42: cll = "SSE42"; break; + case VVDEC_SIMD_AVX: cll = "AVX"; break; + case VVDEC_SIMD_AVX2: cll = "AVX2"; break; #elif VVDEC_ARCH_ARM - case VVDEC_SIMD_NEON: cll = "NEON"; break; + case VVDEC_SIMD_NEON: cll = "NEON"; break; #elif VVDEC_ARCH_WASM - case VVDEC_SIMD_WASM: cll = "WASM-SIMD"; break; + case VVDEC_SIMD_WASM: cll = "WASM-SIMD"; break; #else - case VVDEC_SIMD_SIMDE_ANY:cll = "SIMDE-ANY"; break; + case VVDEC_SIMD_SIMDE_ANY: cll = "SIMDE-ANY"; break; #endif - default: return -1; + default: return -1; + // clang-format on }; fprintf( stdout, "[simd] : %s\n", cll ); } } - else if( (!strcmp( argv[i_arg], "-eh" )) || !strcmp( argv[i_arg], "--errHandling" ) ) + else if( parse_param( { "-eh", "--errHandling" }, err_handle_flags ) ) { - if( i_arg == argc-1 ){ fprintf( stderr, " - missing argument for: %s \n", argv[i_arg] ); return -1; } - i_arg++; - - const int err_handle_flags = atoi( argv[i_arg++] ); if( err_handle_flags < 0 || err_handle_flags > VVDEC_ERR_HANDLING_TRY_CONTINUE ) { fprintf( stderr, " - unsupported error handling flags. 
Should be between 0 and %i.\n", VVDEC_ERR_HANDLING_TRY_CONTINUE ); @@ -397,32 +407,26 @@ class CmdLineParser rcParams.errHandlingFlags = vvdecErrHandlingFlags( err_handle_flags ); } #ifdef ENABLE_TRACING - else if( !strcmp( (const char*)argv[i_arg], "--TraceFile" ) || !strcmp( (const char*)argv[i_arg], "-tf" ) ) - { - sTracingFile = argv[++i_arg]; - i_arg++; - } - else if( !strcmp( (const char*)argv[i_arg], "--TraceRule" ) || !strcmp( (const char*)argv[i_arg], "-tr" ) ) + else if( parse_param( { "-tf", "--TraceFile" }, sTracingFile ) ) {} + else if( parse_param( { "-tr", "--TraceRule" }, sTracingRule ) ) {} +#endif + else if( parse_param( { "-v", "--verbosity" }, logLevel ) ) // already processed. Parse again so we don't detect an unknown argument { - sTracingRule = argv[++i_arg]; - i_arg++; + assert( logLevel == rcParams.logLevel ); } -#endif // ENABLE_TRACING else { - fprintf( stderr, " - unknown argument: %s \n", argv[i_arg++] ); - iRet = -1; + fprintf( stderr, " - unknown argument: %s \n", argv[m_iArg++] ); + return -1; } } - return iRet; + return 0; } -private: - std::ofstream m_cOS; + struct MissingArgumentException : std::exception + { + }; }; - - -} // namespace - +} // namespace vvdecoderapp diff --git a/source/App/vvdecapp/vvdecapp.cpp b/source/App/vvdecapp/vvdecapp.cpp index 414772f8..16c9fce9 100644 --- a/source/App/vvdecapp/vvdecapp.cpp +++ b/source/App/vvdecapp/vvdecapp.cpp @@ -483,8 +483,16 @@ int main( int argc, char* argv[] ) return 0; } + int iRet = -1; + try { + vvdecoderapp::CmdLineParser cmdLineParser; + iRet = cmdLineParser.parse_command_line( argc, argv, params, cBitstreamFile, cOutputFile, iMaxFrames, iLoopCount, cExpectedYuvMD5, y4mOutput, externAllocator, sTracingFile, sTracingRule, iPrintPicHash ); + } + catch( std::exception& ) + { + iRet = -1; + } - int iRet = vvdecoderapp::CmdLineParser::parse_command_line( argc, argv, params, cBitstreamFile, cOutputFile, iMaxFrames, iLoopCount, cExpectedYuvMD5, y4mOutput, externAllocator, 
sTracingFile, sTracingRule, iPrintPicHash ); if( iRet != 0 ) { if( iRet == 2 ) @@ -1055,9 +1063,9 @@ static bool handle_frame( vvdecFrame* pcFrame, if( pcFrame->frameFormat == VVDEC_FF_PROGRESSIVE ) { - if( iPrintPicHash > 1 ) + if( iPrintPicHash >= 11 ) { - printPicHash( pcFrame, logStream, uiFrames-1, iPrintPicHash-11 ); + printPicHash( pcFrame, logStream, uiFrames - 1, iPrintPicHash - 11 ); } if( md5Stream ) diff --git a/source/Lib/CommonLib/LoopFilter.cpp b/source/Lib/CommonLib/LoopFilter.cpp index 011b2a05..2eb26aa7 100644 --- a/source/Lib/CommonLib/LoopFilter.cpp +++ b/source/Lib/CommonLib/LoopFilter.cpp @@ -610,6 +610,11 @@ void LoopFilter::calcFilterStrengths( const CodingUnit& cu ) const xSetMaxFilterLengthPQForCodingSubBlocks( cu, ctuData ); } +#if ENABLE_SIMD_DBLF && defined( TARGET_SIMD_X86 ) + const bool useSimd = read_x86_extension_flags() > x86_simd::SCALAR; +#else + const bool useSimd = false; +#endif const unsigned uiPelsInPartX = pcv.minCUWidth >> channelScaleX; const unsigned uiPelsInPartY = pcv.minCUHeight >> channelScaleY; const ptrdiff_t lfpPos = cu.cs->inCtuPos( area.pos(), cu.chType() ); @@ -630,7 +635,7 @@ void LoopFilter::calcFilterStrengths( const CodingUnit& cu ) const for( int x = 0; x < area.width; x += uiPelsInPartX ) { - if( lineLfpPtrV->filterEdge( cu.chType() ) ) xGetBoundaryStrengthSingle( *lineLfpPtrV, cu, Position{ area.x + x, area.y + y }, x ? cu : *cuP, ctuData, x ? true : pqCuSameCtuVer ); + if( lineLfpPtrV->filterEdge( cu.chType() ) ) xGetBoundaryStrengthSingle( *lineLfpPtrV, cu, Position{ area.x + x, area.y + y }, x ? cu : *cuP, ctuData, x ? true : pqCuSameCtuVer, useSimd ); lineLfpPtrV->bs &= ~BsSet( 3, MAX_NUM_COMPONENT ); @@ -656,7 +661,7 @@ void LoopFilter::calcFilterStrengths( const CodingUnit& cu ) const { cuP = ( y || ( cuP && cuP->blocks[chType].x + cuP->blocks[chType].width > area.x + x ) ) ? 
cuP : cu.cs->getCU( Position{ area.x + x, area.y - 1 }, chType ); - if( lineLfpPtrH->filterEdge( cu.chType() ) ) xGetBoundaryStrengthSingle( *lineLfpPtrH, cu, Position{ area.x + x, area.y + y }, y ? cu : *cuP, ctuData, y ? true : pqCuSameCtuHor ); + if( lineLfpPtrH->filterEdge( cu.chType() ) ) xGetBoundaryStrengthSingle( *lineLfpPtrH, cu, Position{ area.x + x, area.y + y }, y ? cu : *cuP, ctuData, y ? true : pqCuSameCtuHor, useSimd ); lineLfpPtrH->bs &= ~BsSet( 3, MAX_NUM_COMPONENT ); @@ -783,10 +788,15 @@ void LoopFilter::xSetMaxFilterLengthPQFromTransformSizes( const CodingUnit& cu, { const PreCalcValues &pcv = *cu.cs->pcv; - ChannelType start = CH_L; - ChannelType end = CH_C; + ChannelType start = CH_L; + ChannelType end = CH_C; - const bool dt = CU::isSepTree( cu ); + const bool dt = CU::isSepTree( cu ); +#if ENABLE_SIMD_DBLF && defined( TARGET_SIMD_X86 ) + const bool useSimd = read_x86_extension_flags() > x86_simd::SCALAR; +#else + const bool useSimd = false; +#endif if( dt ) { @@ -855,7 +865,7 @@ void LoopFilter::xSetMaxFilterLengthPQFromTransformSizes( const CodingUnit& cu, lfp.setFilterCMFL( ( sizeQSide >= 8 && sizePSide >= 8 ) ? 
1 : 0 ); if( bValue ) - xGetBoundaryStrengthSingle( lfp, cu, Position( ( area.x + edgeDir * d ) << csx, ( area.y + ( 1 - edgeDir ) * d ) << csy ), *cuPfstCh, ctuData, pqSameCtu ); + xGetBoundaryStrengthSingle( lfp, cu, Position( ( area.x + edgeDir * d ) << csx, ( area.y + ( 1 - edgeDir ) * d ) << csy ), *cuPfstCh, ctuData, pqSameCtu, useSimd ); lfp.bs &= ~BsSet( 3, MAX_NUM_COMPONENT ); if( !CU::isIntra( cu ) && !CU::isIntra( *cuP ) && cuP == cuPfstCh && cu.geoFlag() == false && cuP->geoFlag() == false ) @@ -928,7 +938,7 @@ void LoopFilter::xSetMaxFilterLengthPQFromTransformSizes( const CodingUnit& cu, } if( bValue ) - xGetBoundaryStrengthSingle( lfp, cu, Position( ( area.x + edgeDir * d ) << csx, ( area.y + ( 1 - edgeDir ) * d ) << csy ), *cuPfstCh, ctuData, pqSameCtu ); + xGetBoundaryStrengthSingle( lfp, cu, Position( ( area.x + edgeDir * d ) << csx, ( area.y + ( 1 - edgeDir ) * d ) << csy ), *cuPfstCh, ctuData, pqSameCtu, useSimd ); lfp.bs &= ~BsSet( 3, MAX_NUM_COMPONENT ); OFFSET( lfpPtr, lfpStride, edgeDir, ( 1 - edgeDir ) ); } @@ -1079,7 +1089,7 @@ LFCUParam LoopFilter::xGetLoopfilterParam( const CodingUnit& cu ) const } template -void LoopFilter::xGetBoundaryStrengthSingle( LoopFilterParam& lfp, const CodingUnit& cuQ, const Position &localPos, const CodingUnit& cuP, CtuData& ctuData, bool pqSameCtu ) const +void LoopFilter::xGetBoundaryStrengthSingle( LoopFilterParam& lfp, const CodingUnit& cuQ, const Position &localPos, const CodingUnit& cuP, CtuData& ctuData, bool pqSameCtu, bool useSimd ) const { const Slice &sliceQ = *cuQ.slice; const ChannelType chType = cuQ.chType(); @@ -1238,74 +1248,83 @@ void LoopFilter::xGetBoundaryStrengthSingle( LoopFilterParam& lfp, const CodingU if( ( piRefP0 == piRefQ0 && piRefP1 == piRefQ1 ) || ( piRefP0 == piRefQ1 && piRefP1 == piRefQ0 ) ) { #if defined( TARGET_SIMD_X86 ) && ENABLE_SIMD_DBLF - const __m128i xmvP = _mm_unpacklo_epi64( refP0valid ? 
_mm_loadu_si64( ( const __m128i* ) &miP.mv[0] ) : _mm_setzero_si128(), refP1valid ? _mm_loadu_si64( ( const __m128i* ) &miP.mv[1] ) : _mm_setzero_si128() ); - const __m128i xmvQ = _mm_unpacklo_epi64( refQ0valid ? _mm_loadu_si64( ( const __m128i* ) &miQ.mv[0] ) : _mm_setzero_si128(), refQ1valid ? _mm_loadu_si64( ( const __m128i* ) &miQ.mv[1] ) : _mm_setzero_si128() ); - const __m128i xth = _mm_set1_epi32( nThreshold - 1 ); -#else - Mv mvP[2] = { { 0, 0 }, { 0, 0 } }, mvQ[2] = { { 0, 0 }, { 0, 0 } }; - - if( refP0valid ) { mvP[0] = miP.mv[0]; } - if( refP1valid ) { mvP[1] = miP.mv[1]; } - if( refQ0valid ) { mvQ[0] = miQ.mv[0]; } - if( refQ1valid ) { mvQ[1] = miQ.mv[1]; } -#endif - if( piRefP0 != piRefP1 ) // Different L0 & L1 + if( useSimd ) { - if( piRefP0 == piRefQ0 ) + const __m128i xmvP = _mm_unpacklo_epi64( refP0valid ? _mm_loadu_si64( ( const __m128i* ) &miP.mv[0] ) : _mm_setzero_si128(), refP1valid ? _mm_loadu_si64( ( const __m128i* ) &miP.mv[1] ) : _mm_setzero_si128() ); + const __m128i xmvQ = _mm_unpacklo_epi64( refQ0valid ? _mm_loadu_si64( ( const __m128i* ) &miQ.mv[0] ) : _mm_setzero_si128(), refQ1valid ? _mm_loadu_si64( ( const __m128i* ) &miQ.mv[1] ) : _mm_setzero_si128() ); + const __m128i xth = _mm_set1_epi32( nThreshold - 1 ); + + if( piRefP0 != piRefP1 ) // Different L0 & L1 { -#if defined( TARGET_SIMD_X86 ) && ENABLE_SIMD_DBLF - __m128i - xdiff = _mm_sub_epi32 ( xmvQ, xmvP ); - xdiff = _mm_abs_epi32 ( xdiff ); - xdiff = _mm_cmpgt_epi32( xdiff, xth ); - uiBs = _mm_testz_si128( xdiff, xdiff ) ? 0 : 1; -#else - uiBs = ( ( abs( mvQ[0].getHor() - mvP[0].getHor() ) >= nThreshold ) || ( abs( mvQ[0].getVer() - mvP[0].getVer() ) >= nThreshold ) || - ( abs( mvQ[1].getHor() - mvP[1].getHor() ) >= nThreshold ) || ( abs( mvQ[1].getVer() - mvP[1].getVer() ) >= nThreshold ) ) - ? 
1 : 0; -#endif + if( piRefP0 == piRefQ0 ) + { + __m128i + xdiff = _mm_sub_epi32 ( xmvQ, xmvP ); + xdiff = _mm_abs_epi32 ( xdiff ); + xdiff = _mm_cmpgt_epi32( xdiff, xth ); + uiBs = _mm_testz_si128( xdiff, xdiff ) ? 0 : 1; + } + else + { + __m128i + xmvQ1 = _mm_shuffle_epi32( xmvQ, ( 2 << 0 ) + ( 3 << 2 ) + ( 0 << 4 ) + ( 1 << 6 ) ); + __m128i + xdiff = _mm_sub_epi32 ( xmvQ1, xmvP ); + xdiff = _mm_abs_epi32 ( xdiff ); + xdiff = _mm_cmpgt_epi32( xdiff, xth ); + uiBs = _mm_testz_si128( xdiff, xdiff ) ? 0 : 1; + } } else { -#if defined( TARGET_SIMD_X86 ) && ENABLE_SIMD_DBLF __m128i - xmvQ1 = _mm_shuffle_epi32( xmvQ, ( 2 << 0 ) + ( 3 << 2 ) + ( 0 << 4 ) + ( 1 << 6 ) ); + xmvQ1 = _mm_shuffle_epi32( xmvQ, ( 2 << 0 ) + ( 3 << 2 ) + ( 0 << 4 ) + ( 1 << 6 ) ); __m128i - xdiff = _mm_sub_epi32 ( xmvQ1, xmvP ); - xdiff = _mm_abs_epi32 ( xdiff ); + xdiff = _mm_sub_epi32( xmvQ1, xmvP ); + xdiff = _mm_abs_epi32( xdiff ); xdiff = _mm_cmpgt_epi32( xdiff, xth ); uiBs = _mm_testz_si128( xdiff, xdiff ) ? 0 : 1; -#else - uiBs = ( ( abs( mvQ[1].getHor() - mvP[0].getHor() ) >= nThreshold ) || ( abs( mvQ[1].getVer() - mvP[0].getVer() ) >= nThreshold ) || - ( abs( mvQ[0].getHor() - mvP[1].getHor() ) >= nThreshold ) || ( abs( mvQ[0].getVer() - mvP[1].getVer() ) >= nThreshold ) ) - ? 1 : 0; -#endif + + xdiff = _mm_sub_epi32( xmvQ, xmvP ); + xdiff = _mm_abs_epi32( xdiff ); + xdiff = _mm_cmpgt_epi32( xdiff, xth ); + uiBs &= _mm_testz_si128( xdiff, xdiff ) ? 0 : 1; } } - else // Same L0 & L1 + else +#endif { + Mv mvP[2] = { { 0, 0 }, { 0, 0 } }, mvQ[2] = { { 0, 0 }, { 0, 0 } }; -#if defined( TARGET_SIMD_X86 ) && ENABLE_SIMD_DBLF - __m128i - xmvQ1 = _mm_shuffle_epi32( xmvQ, ( 2 << 0 ) + ( 3 << 2 ) + ( 0 << 4 ) + ( 1 << 6 ) ); - __m128i - xdiff = _mm_sub_epi32( xmvQ1, xmvP ); - xdiff = _mm_abs_epi32( xdiff ); - xdiff = _mm_cmpgt_epi32( xdiff, xth ); - uiBs = _mm_testz_si128( xdiff, xdiff ) ? 
0 : 1; - - xdiff = _mm_sub_epi32( xmvQ, xmvP ); - xdiff = _mm_abs_epi32( xdiff ); - xdiff = _mm_cmpgt_epi32( xdiff, xth ); - uiBs &= _mm_testz_si128( xdiff, xdiff ) ? 0 : 1; -#else - uiBs = ( ( abs( mvQ[0].getHor() - mvP[0].getHor() ) >= nThreshold ) || ( abs( mvQ[0].getVer() - mvP[0].getVer() ) >= nThreshold ) || - ( abs( mvQ[1].getHor() - mvP[1].getHor() ) >= nThreshold ) || ( abs( mvQ[1].getVer() - mvP[1].getVer() ) >= nThreshold ) ) - && - ( ( abs( mvQ[1].getHor() - mvP[0].getHor() ) >= nThreshold ) || ( abs( mvQ[1].getVer() - mvP[0].getVer() ) >= nThreshold ) || - ( abs( mvQ[0].getHor() - mvP[1].getHor() ) >= nThreshold ) || ( abs( mvQ[0].getVer() - mvP[1].getVer() ) >= nThreshold ) ) + if( refP0valid ) { mvP[0] = miP.mv[0]; } + if( refP1valid ) { mvP[1] = miP.mv[1]; } + if( refQ0valid ) { mvQ[0] = miQ.mv[0]; } + if( refQ1valid ) { mvQ[1] = miQ.mv[1]; } + + if( piRefP0 != piRefP1 ) // Different L0 & L1 + { + if( piRefP0 == piRefQ0 ) + { + uiBs = ( ( abs( mvQ[0].getHor() - mvP[0].getHor() ) >= nThreshold ) || ( abs( mvQ[0].getVer() - mvP[0].getVer() ) >= nThreshold ) || + ( abs( mvQ[1].getHor() - mvP[1].getHor() ) >= nThreshold ) || ( abs( mvQ[1].getVer() - mvP[1].getVer() ) >= nThreshold ) ) + ? 1 : 0; + } + else + { + uiBs = ( ( abs( mvQ[1].getHor() - mvP[0].getHor() ) >= nThreshold ) || ( abs( mvQ[1].getVer() - mvP[0].getVer() ) >= nThreshold ) || + ( abs( mvQ[0].getHor() - mvP[1].getHor() ) >= nThreshold ) || ( abs( mvQ[0].getVer() - mvP[1].getVer() ) >= nThreshold ) ) + ? 
1 : 0; + } + } + else + { + uiBs = ( ( abs( mvQ[0].getHor() - mvP[0].getHor() ) >= nThreshold ) || ( abs( mvQ[0].getVer() - mvP[0].getVer() ) >= nThreshold ) || + ( abs( mvQ[1].getHor() - mvP[1].getHor() ) >= nThreshold ) || ( abs( mvQ[1].getVer() - mvP[1].getVer() ) >= nThreshold ) ) + && + ( ( abs( mvQ[1].getHor() - mvP[0].getHor() ) >= nThreshold ) || ( abs( mvQ[1].getVer() - mvP[0].getVer() ) >= nThreshold ) || + ( abs( mvQ[0].getHor() - mvP[1].getHor() ) >= nThreshold ) || ( abs( mvQ[0].getVer() - mvP[1].getVer() ) >= nThreshold ) ) ? 1 : 0; -#endif + } } } else // for all different Ref_Idx diff --git a/source/Lib/CommonLib/LoopFilter.h b/source/Lib/CommonLib/LoopFilter.h index 372d5464..aca6bb4c 100644 --- a/source/Lib/CommonLib/LoopFilter.h +++ b/source/Lib/CommonLib/LoopFilter.h @@ -76,7 +76,7 @@ class LoopFilter // filtering functions template - void xGetBoundaryStrengthSingle ( LoopFilterParam& lfp, const CodingUnit& cu, const Position &localPos, const CodingUnit &cuP, CtuData& ctuData, bool pqSameCtu ) const; + void xGetBoundaryStrengthSingle ( LoopFilterParam& lfp, const CodingUnit& cu, const Position &localPos, const CodingUnit &cuP, CtuData& ctuData, bool pqSameCtu, bool useSimd ) const; template void xSetEdgeFilterInsidePu ( const CodingUnit &cu, const Area &area, const bool bValue, CtuData& ctuData ) const; diff --git a/source/Lib/CommonLib/UnitTools.cpp b/source/Lib/CommonLib/UnitTools.cpp index b1b387fe..b40df742 100644 --- a/source/Lib/CommonLib/UnitTools.cpp +++ b/source/Lib/CommonLib/UnitTools.cpp @@ -2737,42 +2737,30 @@ void PU::setAllAffineMv( CodingUnit& cu, Mv affLT, Mv affRT, Mv affLB, RefPicLis height >>= MIN_CU_LOG2; #if ENABLE_SIMD_OPT && defined( TARGET_SIMD_X86 ) - __m128i xvbase = _mm_setr_epi32( mvScaleHor, mvScaleVer, mvScaleHor, mvScaleVer ); - __m128i xvdvxy = _mm_setr_epi32( deltaMvVerX, deltaMvVerY, deltaMvVerX, deltaMvVerY ); - __m128i xhdhxy = _mm_setr_epi32( deltaMvHorX, deltaMvHorY, deltaMvHorX, deltaMvHorY ); - -#endif - 
for( int h = 0; h < height; h++ ) + if( !subblkMVSpreadOverLimit && read_x86_extension_flags() > x86_simd::SCALAR ) { -#if ENABLE_SIMD_OPT && defined( TARGET_SIMD_X86 ) + __m128i xvbase = _mm_setr_epi32( mvScaleHor, mvScaleVer, mvScaleHor, mvScaleVer ); + __m128i xvdvxy = _mm_setr_epi32( deltaMvVerX, deltaMvVerY, deltaMvVerX, deltaMvVerY ); + __m128i xhdhxy = _mm_setr_epi32( deltaMvHorX, deltaMvHorY, deltaMvHorX, deltaMvHorY ); + + for( int h = 0; h < height; h++ ) + { __m128i xvoff = _mm_set1_epi32 ( halfBH + ( h << MIN_CU_LOG2 ) ); xvoff = _mm_mullo_epi32( xvoff, xvdvxy ); xvoff = _mm_add_epi32 ( xvoff, xvbase ); -#endif - if( subblkMVSpreadOverLimit ) - { - for( int w = 0; w < width; w++ ) - { - MotionInfo &mi = mb.at( w, h ); - mi.mv[eRefList] = flbMv; - } - } - else - { -#if ENABLE_SIMD_OPT && defined( TARGET_SIMD_X86 ) for( int w = 0; w < width; w += 2 ) { MotionInfo *mi = &mb.at( w, h ); __m128i - xhoff = _mm_set1_epi32 ( 2 + ( w << MIN_CU_LOG2 ) ); + xhoff = _mm_set1_epi32 ( 2 + ( w << MIN_CU_LOG2 ) ); xhoff = _mm_add_epi32 ( xhoff, _mm_setr_epi32( 0, 0, 1 << MIN_CU_LOG2, 1 << MIN_CU_LOG2 ) ); xhoff = _mm_mullo_epi32( xhoff, xhdhxy ); xhoff = _mm_add_epi32 ( xhoff, xvoff ); __m128i - xmv = _mm_add_epi32 ( xhoff, _mm_set1_epi32( 1 << ( shift - 1 ) ) ); + xmv = _mm_add_epi32 ( xhoff, _mm_set1_epi32( 1 << ( shift - 1 ) ) ); xmv = _mm_add_epi32 ( xmv, _mm_cmpgt_epi32( xhoff, _mm_set1_epi32( -1 ) ) ); xmv = _mm_srai_epi32 ( xmv, shift ); xmv = _mm_max_epi32 ( _mm_set1_epi32( -( 1 << 17 ) ), _mm_min_epi32( _mm_set1_epi32( ( 1 << 17 ) - 1 ), xmv ) ); @@ -2780,22 +2768,39 @@ void PU::setAllAffineMv( CodingUnit& cu, Mv affLT, Mv affRT, Mv affLB, RefPicLis _mm_storeu_si64( ( __m128i* ) &mi[0].mv[eRefList], xmv ); _mm_storeu_si64( ( __m128i* ) &mi[1].mv[eRefList], _mm_unpackhi_epi64( xmv, _mm_setzero_si128() ) ); } -#else - for( int w = 0; w < width; w++ ) + } + } + else +#endif + { + for( int h = 0; h < height; h++ ) + { + if( subblkMVSpreadOverLimit ) + { + for( int 
w = 0; w < width; w++ ) + { + MotionInfo &mi = mb.at( w, h ); + + mi.mv[eRefList] = flbMv; + } + } + else { - MotionInfo &mi = mb.at( w, h ); + for( int w = 0; w < width; w++ ) + { + MotionInfo &mi = mb.at( w, h ); - int mvHor = mvScaleHor + deltaMvHorX * ( 2 + ( w << MIN_CU_LOG2 ) ) + deltaMvVerX * ( halfBH + ( h << MIN_CU_LOG2 ) ); - int mvVer = mvScaleVer + deltaMvHorY * ( 2 + ( w << MIN_CU_LOG2 ) ) + deltaMvVerY * ( halfBH + ( h << MIN_CU_LOG2 ) ); + int mvHor = mvScaleHor + deltaMvHorX * ( 2 + ( w << MIN_CU_LOG2 ) ) + deltaMvVerX * ( halfBH + ( h << MIN_CU_LOG2 ) ); + int mvVer = mvScaleVer + deltaMvHorY * ( 2 + ( w << MIN_CU_LOG2 ) ) + deltaMvVerY * ( halfBH + ( h << MIN_CU_LOG2 ) ); - roundAffineMv( mvHor, mvVer, shift ); + roundAffineMv( mvHor, mvVer, shift ); - Mv rndMv( mvHor, mvVer ); - rndMv.clipToStorageBitDepth(); + Mv rndMv( mvHor, mvVer ); + rndMv.clipToStorageBitDepth(); - mi.mv[eRefList] = rndMv; + mi.mv[eRefList] = rndMv; + } } -#endif } } diff --git a/source/Lib/CommonLib/x86/CommonDefX86.h b/source/Lib/CommonLib/x86/CommonDefX86.h index f2386485..f9a6da37 100644 --- a/source/Lib/CommonLib/x86/CommonDefX86.h +++ b/source/Lib/CommonLib/x86/CommonDefX86.h @@ -85,6 +85,13 @@ POSSIBILITY OF SUCH DAMAGE. 
# include # endif +# if defined( REAL_TARGET_X86 ) \ + || ( defined( SIMD_EVERYWHERE_EXTENSION_LEVEL_ID ) && SIMD_EVERYWHERE_EXTENSION_LEVEL_ID >= X86_SIMD_AVX2 ) +# define ENABLE_AVX2_IMPLEMENTATIONS 1 +# else +# define ENABLE_AVX2_IMPLEMENTATIONS 0 +# endif + namespace vvdec { using namespace x86_simd; diff --git a/source/Lib/DecoderLib/DecLib.h b/source/Lib/DecoderLib/DecLib.h index c38e271a..707ecd42 100644 --- a/source/Lib/DecoderLib/DecLib.h +++ b/source/Lib/DecoderLib/DecLib.h @@ -120,6 +120,8 @@ class DecLib unsigned int getUpscaledOutput() { return m_upscaledOutput; } #endif + ThreadPool& getThreadPool() { return *m_decodeThreadPool; } + private: void reconPicture( Picture* pcPic ); #if JVET_R0270 diff --git a/source/Lib/FilmGrain/FilmGrain.cpp b/source/Lib/FilmGrain/FilmGrain.cpp index dacc3ff4..66dd63d0 100644 --- a/source/Lib/FilmGrain/FilmGrain.cpp +++ b/source/Lib/FilmGrain/FilmGrain.cpp @@ -62,6 +62,10 @@ POSSIBILITY OF SUCH DAMAGE. #include "CommonDef.h" +#if defined( TARGET_SIMD_X86 ) && defined( USE_SIMD ) +# include "FilmGrainImplX86.h" +#endif + namespace vvdec { @@ -547,19 +551,26 @@ static int same_pattern( fgs_sei* cfg, int32_t a, int32_t b ) return 1; } +void FilmGrain::set_seed( uint32_t seed ) +{ + m_line_rnd = m_line_rnd_up = seed; +} + /** Initialize "hardware" interface from FGS SEI parameters */ -void FilmGrain::init_sei( fgs_sei* cfg ) +void FilmGrain::init_sei() { - int8_t P[64 * 64]; - int8_t Lbuf[73 * 82]; - int8_t Cbuf[38 * 44]; - uint8_t slut[256]; - uint8_t plut[256]; - uint8_t intensities[VFGS_MAX_PATTERNS]; - uint32_t patterns[VFGS_MAX_PATTERNS]; - uint8_t np = 0; // number of patterns - uint8_t a, b, i; - int c, k; + int8_t P[64 * 64]; + int8_t Lbuf[73 * 82]; + int8_t Cbuf[38 * 44]; + uint8_t slut[256]; + uint8_t plut[256]; + uint8_t intensities[VFGS_MAX_PATTERNS]; + uint32_t patterns[VFGS_MAX_PATTERNS]; + uint8_t np = 0; // number of patterns + uint8_t a, b, i; + unsigned char all0 = 1; + + int c, k; for( c = 0; c < 3; c++ ) 
{ @@ -571,16 +582,16 @@ void FilmGrain::init_sei( fgs_sei* cfg ) memset( patterns, ~0, sizeof( patterns ) ); } // 1. Look for different patterns, up to max supported number - if( cfg->comp_model_present_flag[c] ) + if( fgs.comp_model_present_flag[c] ) { - for( k = 0; k < cfg->num_intensity_intervals[c]; k++ ) + for( k = 0; k < fgs.num_intensity_intervals[c]; k++ ) { - a = cfg->intensity_interval_lower_bound[c][k]; + a = fgs.intensity_interval_lower_bound[c][k]; uint32_t id = SEI_MAX_MODEL_VALUES * ( k + 256 * c ); for( i = 0; i < VFGS_MAX_PATTERNS; i++ ) { - if( same_pattern( cfg, patterns[i], id ) ) + if( same_pattern( &fgs, patterns[i], id ) ) { break; } @@ -613,51 +624,50 @@ void FilmGrain::init_sei( fgs_sei* cfg ) // 2. Register the patterns (with correct order) for( i = 0; i < np; i++ ) { - int16_t* coef = &cfg->comp_model_value[0][0][0] + patterns[i]; + int16_t* coef = &fgs.comp_model_value[0][0][0] + patterns[i]; if( c == 0 ) { - if( cfg->model_id ) + if( fgs.model_id ) { - make_ar_pattern( Lbuf, P, 64, coef, 6, 1, cfg->log2_scale_factor, Seed_LUT[0] ); + make_ar_pattern( Lbuf, P, 64, coef, 6, 1, fgs.log2_scale_factor, Seed_LUT[0] ); } else { make_sei_ff_pattern64( (int8_t( * )[64]) P, coef[1], coef[2] ); } - set_luma_pattern( i, P ); + m_impl->set_luma_pattern( i, P ); } else if( c == 2 ) { - if( cfg->model_id ) + if( fgs.model_id ) { - make_ar_pattern( Cbuf, P, 32, coef, 6, 1, cfg->log2_scale_factor, Seed_LUT[1] ); + make_ar_pattern( Cbuf, P, 32, coef, 6, 1, fgs.log2_scale_factor, Seed_LUT[1] ); } else { make_sei_ff_pattern32( (int8_t( * )[32]) P, coef[1], coef[2] ); } - - set_chroma_pattern( i, P ); + m_impl->set_chroma_pattern( i, P ); } } // 3. Fill up LUTs for( int cc = std::min( c, 1 ); cc <= c; cc++ ) { - if( cfg->comp_model_present_flag[cc] ) + if( fgs.comp_model_present_flag[cc] ) { memset( plut, 255, sizeof( plut ) ); // 3a. 
Fill valid patterns - for( k = 0; k < cfg->num_intensity_intervals[cc]; k++ ) + for( k = 0; k < fgs.num_intensity_intervals[cc]; k++ ) { - a = cfg->intensity_interval_lower_bound[cc][k]; - b = cfg->intensity_interval_upper_bound[cc][k]; + a = fgs.intensity_interval_lower_bound[cc][k]; + b = fgs.intensity_interval_upper_bound[cc][k]; uint32_t id = SEI_MAX_MODEL_VALUES * ( k + 256 * cc ); for( i = 0; i < VFGS_MAX_PATTERNS; i++ ) { - if( same_pattern( cfg, patterns[i], id ) ) + if( same_pattern( &fgs, patterns[i], id ) ) { break; } @@ -666,7 +676,7 @@ void FilmGrain::init_sei( fgs_sei* cfg ) for( int l = a; l <= b; l++ ) { - slut[l] = (uint8_t) cfg->comp_model_value[cc][k][0]; + slut[l] = (uint8_t) fgs.comp_model_value[cc][k][0]; if( i < VFGS_MAX_PATTERNS ) { plut[l] = i << 4; @@ -674,7 +684,8 @@ void FilmGrain::init_sei( fgs_sei* cfg ) } } // 3b. Fill holes (no interp. yet, just repeat last) - i = 0; + i = 0; + int tmp = 0; for( k = 0; k < 256; k++ ) { if( plut[k] == 255 ) @@ -685,25 +696,39 @@ void FilmGrain::init_sei( fgs_sei* cfg ) { i = plut[k]; } + tmp += plut[k]; + } + if( tmp != 0 ) + { + all0 = 0; } } else { memset( plut, 0, sizeof( plut ) ); + all0 = 1; } // 3c. Register LUTs - set_scale_lut( cc, slut ); - set_pattern_lut( cc, plut ); + m_impl->set_scale_lut( cc, slut ); + m_impl->set_pattern_lut( cc, plut, all0 ); } } } - set_scale_shift( cfg->log2_scale_factor - ( cfg->model_id ? 1 : 0 ) ); // -1 for grain shift in pattern generation (see above) + m_impl->set_scale_shift( fgs.log2_scale_factor - ( fgs.model_id ? 1 : 0 ) ); // -1 for grain shift in pattern generation (see above) +} + +FilmGrain::FilmGrain() +{ +#if defined( TARGET_SIMD_X86 ) && defined( USE_SIMD ) + m_impl = FilmGrainImplX86::makeFilmGrainImpl(); +#else + m_impl = std::make_unique(); +#endif } void FilmGrain::updateFGC( vvdecSEIFilmGrainCharacteristics* fgc ) { - fgs_sei fgs; // TODO: maybe make it a member ? 
(idea would be to re-seed patterns for each picture) // Copy SEI message in vfgs structure format // TODO: check some values and warn about unsupported stuff ? fgs.model_id = fgc->filmGrainModelId; @@ -758,12 +783,87 @@ void FilmGrain::updateFGC( vvdecSEIFilmGrainCharacteristics* fgc ) } } - init_sei( &fgs ); + init_sei(); // if (!m_bFgs) // // TODO: get something random // // TODO: make seed also impact the pattern gen - // vfgs_set_seed(uint32_t seed); + // set_seed(uint32_t seed); +} + +void FilmGrain::prepareBlockSeeds( int width, int height ) +{ + m_line_seeds.resize( ( height + 15 ) / 16 ); + + m_prev_frame_line_rnd_up = m_line_rnd_up; + + uint32_t rnd = 0; + for( int y = 0; y < m_line_seeds.size(); ++y ) + { + // Generate / backup / restore per-line random seeds (needed to make multi-line blocks) + if( y != 0 ) + { + // new line of blocks + m_line_rnd_up = m_line_rnd; + m_line_rnd = rnd; + } + + m_line_seeds[y] = m_line_rnd; + + // Crank random generator + rnd = m_line_rnd; + for( int x = 0; x < ( width + 15 ) / 16; ++x ) + { + rnd = prng( rnd ); + } + } +} + +void FilmGrain::setColorFormat( vvdecColorFormat fmt ) +{ + switch( fmt ) + { + // clang-format off + case VVDEC_CF_YUV400_PLANAR: m_impl->set_chroma_subsampling( 0, 0 ); break; + case VVDEC_CF_YUV420_PLANAR: m_impl->set_chroma_subsampling( 2, 2 ); break; + case VVDEC_CF_YUV422_PLANAR: m_impl->set_chroma_subsampling( 2, 1 ); break; + case VVDEC_CF_YUV444_PLANAR: m_impl->set_chroma_subsampling( 1, 1 ); break; + default: THROW_FATAL( "invalid color format: " ); + // clang-format on + } +} + +void FilmGrain::add_grain_line( void* Y, void* U, void* V, int y, int width ) +{ + uint32_t rnd_up = y < 16 ? 
m_prev_frame_line_rnd_up : m_line_seeds[y / 16 - 1]; + uint32_t rnd = m_line_seeds[y / 16]; + + int16_t grain[3][32]; + uint8_t scale[3][32]; + + // Process line + for( int x = 0; x < width; x += 16 ) + { + // Process pixels for each color component + if( fgs.comp_model_present_flag[0] ) + { + m_impl->add_grain_block( Y, 0, x, y, width, rnd, rnd_up, grain, scale ); + } + if( U && V ) + { + if( fgs.comp_model_present_flag[1] ) + { + m_impl->add_grain_block( U, 1, x, y, width, rnd, rnd_up, grain, scale ); + } + if( fgs.comp_model_present_flag[2] ) + { + m_impl->add_grain_block( V, 2, x, y, width, rnd, rnd_up, grain, scale ); + } + } + // Crank random generator + rnd = prng( rnd ); + rnd_up = prng( rnd_up ); // upper block (overlapping) + } } } // namespace vvdec diff --git a/source/Lib/FilmGrain/FilmGrain.h b/source/Lib/FilmGrain/FilmGrain.h index 73303d92..d03a71bd 100644 --- a/source/Lib/FilmGrain/FilmGrain.h +++ b/source/Lib/FilmGrain/FilmGrain.h @@ -59,8 +59,13 @@ POSSIBILITY OF SUCH DAMAGE. 
#include "FilmGrainImpl.h" #include +#include +#include #include "vvdec/sei.h" +#include "vvdec/vvdec.h" + +#define USE_SIMD namespace vvdec { @@ -81,18 +86,31 @@ struct fgs_sei int16_t comp_model_value[3][256][SEI_MAX_MODEL_VALUES]; }; -class FilmGrain : public FilmGrainImpl +class FilmGrain { + std::unique_ptr m_impl; + + uint32_t m_line_rnd = 0xdeadbeef; + uint32_t m_line_rnd_up = 0xdeadbeef; + uint32_t m_prev_frame_line_rnd_up = 0xdeadbeef; + + std::vector m_line_seeds; + fgs_sei fgs; + public: - FilmGrain( int depth, int chromaSubsampling ) - { - set_depth( depth ); - set_chroma_subsampling( chromaSubsampling, chromaSubsampling ); - } + FilmGrain(); + ~FilmGrain() = default; + void updateFGC( vvdecSEIFilmGrainCharacteristics* fgc ); + void setDepth( int depth ) { m_impl->set_depth( depth ); } + void setColorFormat( vvdecColorFormat fmt ); + void prepareBlockSeeds( int width, int height ); + + void add_grain_line( void* Y, void* U, void* V, int y, int width ); private: - void init_sei( fgs_sei* cfg ); + void set_seed( uint32_t seed ); + void init_sei(); }; } // namespace vvdec diff --git a/source/Lib/FilmGrain/FilmGrainImpl.cpp b/source/Lib/FilmGrain/FilmGrainImpl.cpp index 85858487..8cba3a1a 100644 --- a/source/Lib/FilmGrain/FilmGrainImpl.cpp +++ b/source/Lib/FilmGrain/FilmGrainImpl.cpp @@ -61,8 +61,6 @@ POSSIBILITY OF SUCH DAMAGE. #include -#define PATTERN_INTERPOLATION 0 - namespace vvdec { @@ -84,7 +82,7 @@ namespace vvdec * Note: to fully support cross-component correlation within patterns, we would * need to align luma/chroma offsets. */ -static void get_offset_y( uint32_t val, int* s, uint8_t* x, uint8_t* y ) +void FilmGrainImpl::get_offset_y( uint32_t val, int* s, uint8_t* x, uint8_t* y ) { uint32_t bf; // bit field @@ -99,7 +97,7 @@ static void get_offset_y( uint32_t val, int* s, uint8_t* x, uint8_t* y ) // pattern samples (when using overlap). 
} -void FilmGrainImpl::get_offset_u( uint32_t val, int* s, uint8_t* x, uint8_t* y ) +void FilmGrainImpl::get_offset_u( uint32_t val, int* s, uint8_t* x, uint8_t* y ) const { uint32_t bf; // bit field @@ -112,7 +110,7 @@ void FilmGrainImpl::get_offset_u( uint32_t val, int* s, uint8_t* x, uint8_t* y ) *y = ( ( bf * 12 ) >> 10 ) * ( 4 / csuby ); } -void FilmGrainImpl::get_offset_v( uint32_t val, int* s, uint8_t* x, uint8_t* y ) +void FilmGrainImpl::get_offset_v( uint32_t val, int* s, uint8_t* x, uint8_t* y ) const { uint32_t bf; // bit field @@ -125,29 +123,10 @@ void FilmGrainImpl::get_offset_v( uint32_t val, int* s, uint8_t* x, uint8_t* y ) *y = ( ( bf * 12 ) >> 10 ) * ( 4 / csuby ); } -void FilmGrainImpl::add_grain_block( void* I, int c, int x, int y, int width ) +void FilmGrainImpl::add_grain_block( void* I, int c, int x, int y, int width, uint32_t rnd, uint32_t rnd_up, int16_t grain[3][32], uint8_t scale[3][32] ) const { - uint8_t* I8 = (uint8_t*) I; - uint16_t* I16 = (uint16_t*) I; - - int s, s_up; // random sign flip (current + upper row) - uint8_t ox, oy; // random offset (current) - uint8_t ox_up, oy_up; // random offset (upper row) - uint8_t oc1, oc2; // overlapping coefficients - uint8_t pi; // pattern index integer part - int i, j; - int P; // Pattern sample (from current pattern index) -#if PATTERN_INTERPOLATION - int Pn; // Next-pattern sample (from pattern index+1) - uint8_t pf; // pattern index fractional part -#endif - - uint8_t intensity; - int flush = 0; - int subx = c ? csubx : 1; - int suby = c ? csuby : 1; - uint8_t I_min = c ? C_min : Y_min; - uint8_t I_max = c ? C_max : Y_max; + const int subx = c ? csubx : 1; + const int suby = c ? 
csuby : 1; if( ( y & 1 ) && suby > 1 ) { @@ -161,8 +140,9 @@ void FilmGrainImpl::add_grain_block( void* I, int c, int x, int y, int width ) // TODO: assert subx, suby, Y/C min/max, max pLUT values, etc - j = y & 0xf; + const int j = y & 0xf; + uint8_t oc1, oc2; // overlapping coefficients if( y > 15 && j == 0 ) // first line of overlap { oc1 = ( suby > 1 ) ? 20 : 12; // current @@ -179,6 +159,8 @@ void FilmGrainImpl::add_grain_block( void* I, int c, int x, int y, int width ) } // Derive block offsets + sign + int s; // random sign flip (current) + uint8_t ox, oy; // random offset (current) if( c == 0 ) { get_offset_y( rnd, &s, &ox, &oy ); @@ -194,6 +176,8 @@ void FilmGrainImpl::add_grain_block( void* I, int c, int x, int y, int width ) oy += j / suby; // Same for upper block (overlap) + int s_up; // random sign flip (upper row) + uint8_t ox_up, oy_up; // random offset (upper row) if( c == 0 ) { get_offset_y( rnd_up, &s_up, &ox_up, &oy_up ); @@ -209,78 +193,124 @@ void FilmGrainImpl::add_grain_block( void* I, int c, int x, int y, int width ) oy_up += ( 16 + j ) / suby; // Make grain pattern - for( i = 0; i < 16 / subx; i++ ) + make_grain_pattern( I, c, x, subx, oc1, oc2, ox, ox_up, oy, oy_up, s, s_up, grain, scale ); + + // Scale & output + scale_and_output( I, c, x, subx, width, grain, scale ); +} + +void FilmGrainImpl::make_grain_pattern( const void* I, + int c, + int x, + int subx, + uint8_t oc1, + uint8_t oc2, + uint8_t ox, + uint8_t ox_up, + uint8_t oy, + uint8_t oy_up, + int s, + int s_up, + int16_t grain[3][32], + uint8_t scale[3][32] ) const +{ + const uint8_t* I8 = (const uint8_t*) I; + const uint16_t* I16 = (const uint16_t*) I; { - intensity = bs ? I16[x / subx + i] >> bs : I8[x / subx + i]; - pi = pLUT[c][intensity] >> 4; // pattern index (integer part) + for( int i = 0; i < 16 / subx; i++ ) + { + uint8_t intensity = bs ? 
I16[x / subx + i] >> bs : I8[x / subx + i]; + uint8_t pi = pLUT[c][intensity] >> 4; // pattern index (integer part) + int P = pattern[c ? 1 : 0][pi][oy][ox + i] * s; // Pattern sample (from current pattern index) + // We could consider just XORing the sign bit #if PATTERN_INTERPOLATION - pf = pLUT[c][intensity] & 15; // fractional part (interpolate with next) -- could restrict to less bits (e.g. 2) + uint8_t pf = pLUT[c][intensity] & 15; // pattern index fractional part (interpolate with next) -- could restrict to less bits (e.g. 2) + int Pn = + pattern[c ? 1 : 0][pi + 1][oy][ox + i] * s; // Next-pattern sample (from pattern index+1) + // But there are equivalent hw tricks, e.g. storing values as sign + amplitude instead of two's complement #endif - // Pattern - P = pattern[c ? 1 : 0][pi][oy][ox + i] * s; // We could consider just XORing the sign bit + if( oc1 ) // overlap + { + P = round( P * oc1 + pattern[c ? 1 : 0][pi][oy_up][ox_up + i] * oc2 * s_up, 5 ); #if PATTERN_INTERPOLATION - Pn = - pattern[c ? 1 : 0][pi + 1][oy][ox + i] * s; // But there are equivalent hw tricks, e.g. storing values as sign + amplitude instead of two's complement + Pn = round( Pn * oc1 + pattern[c ? 1 : 0][pi + 1][oy_up][ox_up + i] * oc2 * s_up, 5 ); #endif - - if( oc1 ) // overlap - { - P = round( P * oc1 + pattern[c ? 1 : 0][pi][oy_up][ox_up + i] * oc2 * s_up, 5 ); + } #if PATTERN_INTERPOLATION - Pn = round( Pn * oc1 + pattern[c ? 
1 : 0][pi + 1][oy_up][ox_up + i] * oc2 * s_up, 5 ); + // Pattern interpolation: P is current, Pn is next, pf is interpolation coefficient + grain[c][16 / subx + i] = round( P * ( 16 - pf ) + Pn * pf, 4 ); +#else + grain[c][16 / subx + i] = P; #endif + // Scale sign already integrated above because of overlap + scale[c][16 / subx + i] = sLUT[c][intensity]; } + } +} -#if PATTERN_INTERPOLATION - // Pattern interpolation: P is current, Pn is next, pf is interpolation coefficient - grain[c][16 / subx + i] = round( P * ( 16 - pf ) + Pn * pf, 4 ); -#else - grain[c][16 / subx + i] = P; -#endif +void FilmGrainImpl::scale_and_output( void* I, int c, int x, int subx, int width, int16_t grain[3][32], uint8_t scale[3][32] ) const +{ + uint8_t* I8 = (uint8_t*) I; + uint16_t* I16 = (uint16_t*) I; - // Scale sign already integrated above because of overlap - scale[c][16 / subx + i] = sLUT[c][intensity]; - } + const uint8_t I_min = c ? C_min : Y_min; + const uint8_t I_max = c ? C_max : Y_max; - // Scale & output + int flush = 0; do { if( x > 0 ) { - int32_t g; - int16_t l1, l0, r0, r1; - if( !flush ) { // Horizontal deblock (across previous block) - l1 = grain[c][16 / subx - 2]; - l0 = grain[c][16 / subx - 1]; - r0 = grain[c][16 / subx + 0]; - r1 = grain[c][16 / subx + 1]; + int16_t l1, l0, r0, r1; + + l1 = grain[c][16 / subx - 2]; + l0 = grain[c][16 / subx - 1]; + r0 = grain[c][16 / subx + 0]; + r1 = grain[c][16 / subx + 1]; + grain[c][16 / subx - 1] = round( l1 + 3 * l0 + r0, 2 ); grain[c][16 / subx + 0] = round( l0 + 3 * r0 + r1, 2 ); } - for( i = 0; i < 16 / subx; i++ ) { - // Output previous block (or flush current) - g = round( scale[c][i] * (int16_t) grain[c][i], scale_shift ); - if( bs ) - { - I16[( x - 16 ) / subx + i] = std::max( I_min << bs, std::min( I_max << bs, I16[( x - 16 ) / subx + i] + g ) ); - } - else + for( int i = 0; i < 16 / subx; i++ ) { - I8[( x - 16 ) / subx + i] = std::max( I_min, std::min( I_max, I8[( x - 16 ) / subx + i] + g ) ); + // Output previous 
block (or flush current) + int32_t g = round( scale[c][i] * (int16_t) grain[c][i], scale_shift ); + if( bs ) + { + I16[( x - 16 ) / subx + i] = std::max( I_min << bs, std::min( I_max << bs, I16[( x - 16 ) / subx + i] + g ) ); + } + else + { + I8[( x - 16 ) / subx + i] = std::max( I_min, std::min( I_max, I8[( x - 16 ) / subx + i] + g ) ); + } } } } // Shift pipeline - for( i = 0; i < 16 / subx && !flush; i++ ) + if( !flush ) { - grain[c][i] = grain[c][i + 16 / subx]; - scale[c][i] = scale[c][i + 16 / subx]; + if( c == 0 ) + { + for( int i = 0; i < 16; i++ ) + { + grain[0][i] = grain[0][i + 16]; + scale[0][i] = scale[0][i + 16]; + } + } + else + { + for( int i = 0; i < 8; i++ ) + { + grain[c][i] = grain[c][i + 8]; + scale[c][i] = scale[c][i + 8]; + } + } } if( x + 16 >= width ) @@ -293,32 +323,6 @@ void FilmGrainImpl::add_grain_block( void* I, int c, int x, int y, int width ) /* Public interface ***********************************************************/ -void FilmGrainImpl::add_grain_line( void* Y, void* U, void* V, int y, int width ) -{ - // Generate / backup / restore per-line random seeds (needed to make multi-line blocks) - if( y && ( y & 0x0f ) == 0 ) - { - // new line of blocks --> backup + copy current to upper - line_rnd_up = line_rnd; - line_rnd = rnd; - } - rnd_up = line_rnd_up; - rnd = line_rnd; - - // Process line - for( int x = 0; x < width; x += 16 ) - { - // Process pixels for each color component - add_grain_block( Y, 0, x, y, width ); - add_grain_block( U, 1, x, y, width ); - add_grain_block( V, 2, x, y, width ); - - // Crank random generator - rnd = prng( rnd ); - rnd_up = prng( rnd_up ); // upper block (overlapping) - } -} - void FilmGrainImpl::set_luma_pattern( int index, int8_t* P ) { CHECK( index < 0 || index >= 8, "luma pattern index out of bounds" ); @@ -340,17 +344,13 @@ void FilmGrainImpl::set_scale_lut( int c, uint8_t lut[] ) memcpy( sLUT[c], lut, 256 ); } -void FilmGrainImpl::set_pattern_lut( int c, uint8_t lut[] ) +void 
FilmGrainImpl::set_pattern_lut( int c, uint8_t lut[], bool all0 ) { CHECK( c < 0 || c >= 3, "pattern lut idx out of bounds" ); + allZero[c] = all0; memcpy( pLUT[c], lut, 256 ); } -void FilmGrainImpl::set_seed( uint32_t seed ) -{ - rnd = rnd_up = line_rnd = line_rnd_up = seed; -} - void FilmGrainImpl::set_scale_shift( int shift ) { CHECK( shift < 2 || shift >= 8, "scale shift out of range" ); @@ -386,8 +386,6 @@ FilmGrainImpl::FilmGrainImpl() memset( pattern, 0, sizeof( pattern ) ); memset( sLUT, 0, sizeof( sLUT ) ); memset( pLUT, 0, sizeof( pLUT ) ); - memset( grain, 0, sizeof( grain ) ); - memset( scale, 0, sizeof( scale ) ); } } // namespace vvdec diff --git a/source/Lib/FilmGrain/FilmGrainImpl.h b/source/Lib/FilmGrain/FilmGrainImpl.h index 41150271..3a5727d6 100644 --- a/source/Lib/FilmGrain/FilmGrainImpl.h +++ b/source/Lib/FilmGrain/FilmGrainImpl.h @@ -59,6 +59,7 @@ POSSIBILITY OF SUCH DAMAGE. #include #define VFGS_MAX_PATTERNS 8 +#define PATTERN_INTERPOLATION 0 namespace vvdec { @@ -87,50 +88,64 @@ constexpr inline auto round( T a, uint8_t s ) class FilmGrainImpl { +protected: // Note: declarations optimized for code readability; e.g. 
pattern storage in // actual hardware implementation would differ significantly int8_t pattern[2][VFGS_MAX_PATTERNS + 1][64][64]; // +1 to simplify interpolation code uint8_t sLUT[3][256]; uint8_t pLUT[3][256]; - uint32_t rnd = 0xdeadbeef; - uint32_t rnd_up = 0xdeadbeef; - uint32_t line_rnd = 0xdeadbeef; - uint32_t line_rnd_up = 0xdeadbeef; - uint8_t scale_shift = 5 + 6; - uint8_t bs = 0; // bitshift = bitdepth - 8 - int csubx = 2; - int csuby = 2; + uint8_t scale_shift = 5 + 6; + uint8_t bs = 0; // bitshift = bitdepth - 8 + int csubx = 2; + int csuby = 2; + bool allZero[3] = { 0, 0, 0 }; constexpr static uint8_t Y_min = 0; constexpr static uint8_t Y_max = 255; constexpr static uint8_t C_min = 0; constexpr static uint8_t C_max = 255; - // Processing pipeline (needs only 2 registers for each color actually, for horizontal deblocking) - int16_t grain[3][32]; // 9 bit needed because of overlap (has norm > 1) - uint8_t scale[3][32]; - - void get_offset_u( uint32_t val, int* s, uint8_t* x, uint8_t* y ); - void get_offset_v( uint32_t val, int* s, uint8_t* x, uint8_t* y ); - void add_grain_block( void* I, int c, int x, int y, int width ); + static void get_offset_y( uint32_t val, int* s, uint8_t* x, uint8_t* y ); + void get_offset_u( uint32_t val, int* s, uint8_t* x, uint8_t* y ) const; + void get_offset_v( uint32_t val, int* s, uint8_t* x, uint8_t* y ) const; -protected: +public: FilmGrainImpl(); + virtual ~FilmGrainImpl() = default; + void add_grain_block( void* I, int c, int x, int y, int width, uint32_t rnd, uint32_t rnd_up, int16_t grain[3][32], uint8_t scale[3][32] ) const; void set_luma_pattern( int index, int8_t* P ); void set_chroma_pattern( int index, int8_t* P ); void set_scale_lut( int c, uint8_t lut[] ); - void set_pattern_lut( int c, uint8_t lut[] ); - - void set_seed( uint32_t seed ); + void set_pattern_lut( int c, uint8_t lut[], bool all0 ); void set_scale_shift( int shift ); -public: void set_depth( int depth ); void set_chroma_subsampling( int subx, int 
suby ); - void add_grain_line( void* Y, void* U, void* V, int y, int width ); +private: + virtual void make_grain_pattern( const void* I, + int c, + int x, + int subx, + uint8_t oc1, + uint8_t oc2, + uint8_t ox, + uint8_t ox_up, + uint8_t oy, + uint8_t oy_up, + int s, + int s_up, + int16_t grain[3][32], + uint8_t scale[3][32] ) const; + virtual void scale_and_output( void* I, // + int c, + int x, + int subx, + int width, + int16_t grain[3][32], + uint8_t scale[3][32] ) const; }; } // namespace vvdec diff --git a/source/Lib/FilmGrain/FilmGrainImplX86.h b/source/Lib/FilmGrain/FilmGrainImplX86.h new file mode 100644 index 00000000..a6643dc3 --- /dev/null +++ b/source/Lib/FilmGrain/FilmGrainImplX86.h @@ -0,0 +1,102 @@ +/* ----------------------------------------------------------------------------- +The copyright in this software is being made available under the Clear BSD +License, included below. No patent rights, trademark rights and/or +other Intellectual Property Rights other than the copyrights concerning +the Software are granted under this license. + +The Clear BSD License + +Copyright (c) 2018-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVdeC Authors. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted (subject to the limitations in the disclaimer below) provided that +the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from this +software without specific prior written permission. 
+ +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY +THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +------------------------------------------------------------------------------------------- */ + +#pragma once + +#include "FilmGrainImpl.h" + +#include +#include + +#include + +namespace vvdec +{ + +template +class FilmGrainImplX86 : public FilmGrainImpl +{ +public: + static std::unique_ptr makeFilmGrainImpl(); + +protected: + void make_grain_pattern( const void* I, + int c, + int x, + int subx, + uint8_t oc1, + uint8_t oc2, + uint8_t ox, + uint8_t ox_up, + uint8_t oy, + uint8_t oy_up, + int s, + int s_up, + int16_t grain[3][32], + uint8_t scale[3][32] ) const override; + void scale_and_output( void* I, // + int c, + int x, + int subx, + int width, + int16_t grain[3][32], + uint8_t scale[3][32] ) const override; +}; + +template<> +inline std::unique_ptr FilmGrainImplX86::makeFilmGrainImpl() +{ + switch( read_x86_extension_flags() ) + { + case AVX512: + case AVX2: +#if ENABLE_AVX2_IMPLEMENTATIONS + return std::make_unique>(); +#endif + case AVX: + case SSE42: + case SSE41: + return std::make_unique>(); + default: + return std::make_unique(); + } +} + +} // namespace vvdec diff --git 
a/source/Lib/FilmGrain/FilmGrainImpl_X86_SIMD.h b/source/Lib/FilmGrain/FilmGrainImpl_X86_SIMD.h new file mode 100755 index 00000000..e8eb7249 --- /dev/null +++ b/source/Lib/FilmGrain/FilmGrainImpl_X86_SIMD.h @@ -0,0 +1,609 @@ +/* ----------------------------------------------------------------------------- +The copyright in this software is being made available under the Clear BSD +License, included below. No patent rights, trademark rights and/or +other Intellectual Property Rights other than the copyrights concerning +the Software are granted under this license. + +The Clear BSD License + +Copyright (c) 2018-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVdeC Authors. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted (subject to the limitations in the disclaimer below) provided that +the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY +THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +------------------------------------------------------------------------------------------- */ + +#include "FilmGrainImplX86.h" + +#include + +#include + +#ifdef TARGET_SIMD_X86 +# include + +namespace vvdec +{ +using namespace x86_simd; + +template<> +void FilmGrainImplX86::make_grain_pattern( const void* I, + int c, + int x, + int subx, + uint8_t oc1, + uint8_t oc2, + uint8_t ox, + uint8_t ox_up, + uint8_t oy, + uint8_t oy_up, + int s, + int s_up, + int16_t grain[3][32], + uint8_t scale[3][32] ) const +{ /* Fills grain[c] and scale[c] for the block at x: 16 luma or 8 chroma samples in the SIMD paths, 16 / subx samples in the scalar fallback at the end. */ + const uint8_t* I8 = (const uint8_t*) I; + const uint16_t* I16 = (const uint16_t*) I; /* I is read through I16 when bs != 0 and through I8 otherwise */ + if( allZero[c] == 1 ) /* this path always reads pattern index 0 and never consults the pLUT member */ + { + if( c == 0 ) + { + __m128i vP = _mm_lddqu_si128( (__m128i*) &pattern[0][0][oy][ox] ); + if( s == -1 ) + { + vP = _mm_sub_epi8( _mm_set1_epi8( 0 ), vP ); /* negate the 16 pattern bytes */ + } +#ifdef USE_AVX2 + __m256i vmask = _mm256_set1_epi32(0xff); + __m128i tmp0; + __m128i tmp1; + __m256i vintensity; + if (bs) + { + vintensity = _mm256_lddqu_si256((__m256i*)&I16[x]); //load 16 16 bit values + vintensity = _mm256_sra_epi16 (vintensity, _mm_set_epi32 (0,0,0,bs)); + tmp0=_mm256_extracti128_si256 (vintensity,0); + tmp1=_mm256_extracti128_si256 (vintensity,1); + } + else + { + __m128i vintensity128 = _mm_lddqu_si128((__m128i*)&I8[x]); //load 16 8 bit value + tmp0=_mm_cvtepi8_epi16 (vintensity128); + tmp1=_mm_cvtepi8_epi16 (_mm_bsrli_si128(vintensity128,8)); + tmp0 = _mm_and_si128 (tmp0,_mm_set1_epi16(0xff)); // only 8 bit + tmp1 = _mm_and_si128
(tmp1,_mm_set1_epi16(0xff)); // only 8 bit + vintensity = _mm256_castsi128_si256 (vintensity128); /* NOTE(review): this value does not appear to be read before vintensity is reassigned from the packed scale values below - confirm */ + } + __m256i vindex0=_mm256_cvtepi16_epi32 (tmp0); + __m256i vindex1=_mm256_cvtepi16_epi32 (tmp1); + + __m256i avP = _mm256_cvtepi8_epi16( vP ); + if( oc1 ) /* overlap: ( P * oc1 + P_up * oc2 + 16 ) >> 5, P_up taken at [oy_up][ox_up] */ + { + __m256i avoc1 = _mm256_set1_epi16( oc1 ); + __m256i avoc2 = _mm256_set1_epi16( oc2 ); + // p*oc1 + avP = _mm256_mullo_epi16( avP, avoc1 ); // max 16 Bit + // pattern * s_up + __m128i vP2 = _mm_lddqu_si128( (__m128i*) &pattern[0][0][oy_up][ox_up] ); + if( s_up == -1 ) + { + vP2 = _mm_sub_epi8( _mm_set1_epi8( 0 ), vP2 ); + } + __m256i avP2 = _mm256_cvtepi8_epi16( vP2 ); + // * oc2 + avP2 = _mm256_mullo_epi16( avP2, avoc2 ); + // add + avP = _mm256_add_epi16( avP, avP2 ); + // round to 16 bit + __m256i avadd = _mm256_set1_epi16( 1 << ( 5 - 1 ) ); + __m128i avshift = _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, 5 ); + avP = _mm256_add_epi16( avP, avadd ); + avP = _mm256_sra_epi16( avP, avshift ); + } + _mm256_storeu_si256( (__m256i*) &grain[c][16], avP ); + + __m256i vscale0 = _mm256_i32gather_epi32 ((int *)&sLUT[0][0], vindex0, 1); // load 8 32 bit values + __m256i vscale1 = _mm256_i32gather_epi32 ((int *)&sLUT[0][0], vindex1, 1); // load 8 32 bit values + + vscale0 = _mm256_and_si256 (vscale0,vmask); /* each gather read 4 bytes at a byte offset; keep only the addressed sLUT byte */ + vscale1 = _mm256_and_si256 (vscale1,vmask); + + vintensity = _mm256_packus_epi32 (vscale0, vscale1); + vscale0 = _mm256_permute4x64_epi64 (vintensity, 0x8); + vscale1 = _mm256_permute4x64_epi64 (vintensity, 0xd); + vscale0 = _mm256_packus_epi16 (vscale0, vscale1); + _mm_storeu_si128(( __m128i * )&scale[0][16],_mm256_castsi256_si128(vscale0)); +# else + __m128i vPlo = _mm_cvtepi8_epi16( vP ); + __m128i vPhi = _mm_cvtepi8_epi16( _mm_bsrli_si128( vP, 8 ) ); + if( oc1 ) + { + __m128i voc1 = _mm_set1_epi16( oc1 ); + __m128i voc2 = _mm_set1_epi16( oc2 ); + // p*oc1 + vPlo = _mm_mullo_epi16( vPlo, voc1 ); // max 16 Bit + vPhi = _mm_mullo_epi16( vPhi, voc1 ); + // pattern * s_up + __m128i vP2 = _mm_lddqu_si128( (__m128i*)
&pattern[0][0][oy_up][ox_up] ); + if( s_up == -1 ) + { + vP2 = _mm_sub_epi8( _mm_set1_epi8( 0 ), vP2 ); + } + __m128i vP2lo = _mm_cvtepi8_epi16( vP2 ); + __m128i vP2hi = _mm_cvtepi8_epi16( _mm_bsrli_si128( vP2, 8 ) ); + // * oc2 + vP2lo = _mm_mullo_epi16( vP2lo, voc2 ); + vP2hi = _mm_mullo_epi16( vP2hi, voc2 ); + // add + vPlo = _mm_add_epi16( vPlo, vP2lo ); + vPhi = _mm_add_epi16( vPhi, vP2hi ); + // round to 16 bit + __m128i vadd = _mm_set1_epi16( 1 << ( 5 - 1 ) ); + __m128i vshift = _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, 5 ); + vPlo = _mm_add_epi16( vPlo, vadd ); + vPhi = _mm_add_epi16( vPhi, vadd ); + vPlo = _mm_sra_epi16( vPlo, vshift ); + vPhi = _mm_sra_epi16( vPhi, vshift ); + } + _mm_storeu_si128( (__m128i*) &grain[c][16], vPlo ); + _mm_storeu_si128( (__m128i*) &grain[c][16 + 8], vPhi ); + // Scale sign already integrated above because of overlap + //scale[0][16+i] = sLUT[0][intensity]; + uint8_t intensity; + uint8_t *pscale=&scale[0][16]; + const uint8_t *pLUT=sLUT[0]; /* local pointer into sLUT; it hides the pLUT member inside this scope */ + if (bs) + { + const uint16_t *pI16 = I16+x; + for (int i=0; i<16; i++) + { + intensity = *pI16++ >> bs ; + *pscale++ = pLUT[intensity]; + } + } + else + { + const uint8_t *pI8 = I8+x; + for (int i=0; i<16; i++) + { + intensity = *pI8++ ; + *pscale++ = pLUT[intensity]; + } + } +#endif + } // Y + else + { // U/V + __m128i vP; /* NOTE(review): this branch reads I at x>>1 and produces 8 samples, where the scalar fallback uses x / subx and 16 / subx - it appears to assume subx == 2; confirm for other chroma formats */ +#ifdef USE_AVX2 + __m256i vindex; + __m128i vintensity; + if (bs) + { + vintensity = _mm_lddqu_si128((__m128i*)&I16[x>>1]); //load 8 16 bit values + vintensity = _mm_sra_epi16 (vintensity, _mm_set_epi32 (0,0,0,bs)); + } + else + { + vintensity = _mm_loadu_si64(&I8[x>>1]); //load 8 8 bit values + vintensity=_mm_cvtepi8_epi16 (vintensity); + vintensity = _mm_and_si128 (vintensity,_mm_set1_epi16(0xff)); // only 8 bit + } + vindex=_mm256_cvtepi16_epi32 (vintensity); +#endif + vP = _mm_loadl_epi64( (__m128i*) &pattern[1][0][oy][ox] ); + + if( s == -1 ) + { + vP = _mm_sub_epi8( _mm_set1_epi8( 0 ), vP ); + } + __m128i vPlo = _mm_cvtepi8_epi16( vP ); + if( oc1 ) + { + __m128i
voc1 = _mm_set1_epi16( oc1 ); + __m128i voc2 = _mm_set1_epi16( oc2 ); + // p*oc1 + vPlo = _mm_mullo_epi16( vPlo, voc1 ); // max 16 Bit + // pattern * s_up + __m128i vP2 = _mm_loadl_epi64( (__m128i*) &pattern[c ? 1 : 0][0][oy_up][ox_up] ); + if( s_up == -1 ) + { + vP2 = _mm_sub_epi8( _mm_set1_epi8( 0 ), vP2 ); + } + __m128i vP2lo = _mm_cvtepi8_epi16( vP2 ); + // * oc2 + vP2lo = _mm_mullo_epi16( vP2lo, voc2 ); + // add + vPlo = _mm_add_epi16( vPlo, vP2lo ); + // round to 16 bit + __m128i vadd = _mm_set1_epi16( 1 << ( 5 - 1 ) ); + __m128i vshift = _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, 5 ); + vPlo = _mm_add_epi16( vPlo, vadd ); + vPlo = _mm_sra_epi16( vPlo, vshift ); + } + _mm_storeu_si128( (__m128i*) &grain[c][8], vPlo ); +#ifdef USE_AVX2 + __m256i vmask = _mm256_set1_epi32(0xff); + __m256i vscale = _mm256_i32gather_epi32 ((int *)&sLUT[c][0], vindex, 1); // load 8 32 bit values + vscale = _mm256_and_si256 (vscale,vmask); + + vmask = _mm256_packus_epi32 (vscale, vscale); /* vmask is reused here as a temporary for the packed values */ + vscale = _mm256_permute4x64_epi64 (vmask, 0x8); + vscale = _mm256_packus_epi16 (vscale, vscale); + _mm_storeu_si64(( __m128i * )&scale[c][8],_mm256_castsi256_si128(vscale)); +#else + uint8_t* pscale = &scale[c][8]; + const uint8_t* pLUT = sLUT[c]; + if (bs) + { + const uint16_t* pI16 = &I16[x >> 1]; + for( int i = 0; i < 8; i++ ) + { + uint8_t intensity = *pI16++ >> bs; + *pscale++ = pLUT[intensity]; + } + } + else + { + const uint8_t* pI8 = &I8[x >> 1]; + for( int i = 0; i < 8; i++ ) + { + uint8_t intensity = *pI8++; + *pscale++ = pLUT[intensity]; + } + } +#endif + } + } +#ifdef USE_AVX2 + else if( c>0 && allZero[c] == 0 ) /* AVX2-only chroma path: the pattern is selected per sample through the pLUT member */ + { + __m128i vP; + __m128i vintensity; + __m256i vindex; + __m256i vmask = _mm256_set1_epi32(0xff); + if (bs) + { + vintensity = _mm_lddqu_si128((__m128i*)&I16[x>>1]); //load 8 16 bit values + vintensity = _mm_sra_epi16 (vintensity, _mm_set_epi32 (0,0,0,bs)); + } + else + { + vintensity = _mm_loadu_si64(&I8[x>>1]); //load 8 8 bit values + vintensity=_mm_cvtepi8_epi16 (vintensity);
+ } + vindex=_mm256_cvtepi16_epi32 (vintensity); + vindex = _mm256_and_si256 (vindex,vmask); // only 8 bit + + __m256i vadd = _mm256_set_epi32(7,6,5,4,3,2,1,0); + __m256i vpi = _mm256_i32gather_epi32 ((int *)&pLUT[c][0], vindex, 1); // load 8 32 bit values + vpi = _mm256_and_si256 (vpi,vmask); // only 8 bit + vpi = _mm256_slli_epi32 (vpi, 8); // 12-4 + vpi = _mm256_add_epi32 (vpi, vadd); /* byte offset = ( pLUT value << 8 ) + lane index; NOTE(review): the scalar fallback uses pLUT >> 4 as the pattern index, so this matches only if the low 4 pLUT bits are zero and one pattern presumably spans 1 << 12 bytes - confirm */ + __m256i avP = _mm256_i32gather_epi32 ((int *)&pattern[1][0][oy][ox], vpi, 1); // load 8 32 bit values + avP = _mm256_and_si256 (avP,vmask); // only 8 bit + // convert to packed 8 bit + __m256i vtmp = _mm256_packus_epi32 (avP, avP); + avP = _mm256_permute4x64_epi64 (vtmp, 0x8); + avP = _mm256_packus_epi16 (avP, avP); + vP = _mm256_castsi256_si128(avP); + if( s == -1 ) + { + vP = _mm_sub_epi8( _mm_set1_epi8( 0 ), vP ); + } + __m128i vPlo = _mm_cvtepi8_epi16( vP ); + if( oc1 ) + { + __m128i voc1 = _mm_set1_epi16( oc1 ); + __m128i voc2 = _mm_set1_epi16( oc2 ); + // p*oc1 + vPlo = _mm_mullo_epi16( vPlo, voc1 ); // max 16 Bit + // pattern * s_up + __m256i avP2 = _mm256_i32gather_epi32 ((int *)&pattern[1][0][oy_up][ox_up], vpi, 1); // load 8 32 bit values + avP2 = _mm256_and_si256 (avP2,vmask); // only 8 bit + // convert to packed 8 bit + vtmp = _mm256_packus_epi32 (avP2, avP2); + avP2 = _mm256_permute4x64_epi64 (vtmp, 0x8); + avP2 = _mm256_packus_epi16 (avP2, avP2); + __m128i vP2= _mm256_castsi256_si128(avP2); + if( s_up == -1 ) + { + vP2 = _mm_sub_epi8( _mm_set1_epi8( 0 ), vP2 ); + } + __m128i vP2lo = _mm_cvtepi8_epi16( vP2 ); + vP2lo = _mm_mullo_epi16( vP2lo, voc2 ); + vPlo = _mm_add_epi16( vPlo, vP2lo ); + // round to 16 bit + __m128i vadd = _mm_set1_epi16( 1 << ( 5 - 1 ) ); + __m128i vshift = _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, 5 ); + vPlo = _mm_add_epi16( vPlo, vadd ); + vPlo = _mm_sra_epi16( vPlo, vshift ); + } + _mm_storeu_si128( (__m128i*) &grain[c][8], vPlo ); + __m256i vscale = _mm256_i32gather_epi32 ((int *)&sLUT[c][0], vindex, 1); // load 8 32 bit values + vscale =
_mm256_and_si256 (vscale,vmask); + vmask = _mm256_packus_epi32 (vscale, vscale); + vscale = _mm256_permute4x64_epi64 (vmask, 0x8); + vscale = _mm256_packus_epi16 (vscale, vscale); + _mm_storeu_si64(( __m128i * )&scale[c][8],_mm256_castsi256_si128(vscale)); + } +#endif + else + { + for( int i = 0; i < 16 / subx; i++ ) + { + uint8_t intensity = bs ? I16[x / subx + i] >> bs : I8[x / subx + i]; + uint8_t pi = pLUT[c][intensity] >> 4; // pattern index (integer part) + int P = pattern[c ? 1 : 0][pi][oy][ox + i] * s; // Pattern sample (from current pattern index) + // We could consider just XORing the sign bit +#if PATTERN_INTERPOLATION + uint8_t pf = pLUT[c][intensity] & 15; // pattern index fractional part (interpolate with next) -- could restrict to less bits (e.g. 2) + int Pn = + pattern[c ? 1 : 0][pi + 1][oy][ox + i] * s; // Next-pattern sample (from pattern index+1) + // But there are equivalent hw tricks, e.g. storing values as sign + amplitude instead of two's complement +#endif + + if( oc1 ) // overlap + { + P = round( P * oc1 + pattern[c ? 1 : 0][pi][oy_up][ox_up + i] * oc2 * s_up, 5 ); +#if PATTERN_INTERPOLATION + Pn = round( Pn * oc1 + pattern[c ? 1 : 0][pi + 1][oy_up][ox_up + i] * oc2 * s_up, 5 ); +#endif + } +#if PATTERN_INTERPOLATION + // Pattern interpolation: P is current, Pn is next, pf is interpolation coefficient + grain[c][16 / subx + i] = round( P * ( 16 - pf ) + Pn * pf, 4 ); +#else + grain[c][16 / subx + i] = P; +#endif + // Scale sign already integrated above because of overlap + scale[c][16 / subx + i] = sLUT[c][intensity]; + } + } +} + +template<> +void FilmGrainImplX86::scale_and_output( void* I, int c, int x, int subx, int width, int16_t grain[3][32], uint8_t scale[3][32] ) const +{ /* Adds round( scale * grain, scale_shift ) to the previously generated block (samples starting at x - 16), clamps the result, then moves the current block into the "previous" slot; on the last block of the line the loop runs one extra pass to flush it. */ + uint8_t* I8 = (uint8_t*) I; + uint16_t* I16 = (uint16_t*) I; + + const uint8_t I_min = c ? C_min : Y_min; + const uint8_t I_max = c ?
C_max : Y_max; + + int flush = 0; + do + { + if( x > 0 ) + { + if( !flush ) + { + // Horizontal deblock (across previous block) + __m128i vgrain; + __m128i vfac = _mm_set_epi16( 0, 0, 0, 1, 1, 3, 1, 1 ); + if( c == 0 ) + { + vgrain = _mm_loadl_epi64( (__m128i*) &grain[0][16 - 2] ); // r1 r0 l0 l1 + } + else + { + vgrain = _mm_loadl_epi64( (__m128i*) &grain[c][8 - 2] ); // r1 r0 l0 l1 + } + __m128i vgrainh = _mm_mullo_epi16( vgrain, vfac ); // r1 3*r0 l0 l1 + vgrainh = _mm_srli_si128( vgrainh, 2 ); // r1 3+r0 l0 + vfac = _mm_srli_si128( vfac, 2 ); + __m128i vgrainl = _mm_mullo_epi16( vgrain, vfac ); // r1 r0 3*lo l1 + vgrainl = _mm_slli_si128( vgrainl, 10 ); + vgrainl = _mm_srli_si128( vgrainl, 10 ); // r0 3*lo l1 + vgrainl = _mm_hadd_epi16( vgrainl, vgrainl ); // r0 3*lo+l1 + vgrainl = _mm_hadd_epi16( vgrainl, vgrainl ); // r0+3*lo+l1 + vgrainh = _mm_hadd_epi16( vgrainh, vgrainh ); + vgrainh = _mm_hadd_epi16( vgrainh, vgrainh ); + vgrainh = _mm_srli_si128( vgrainh, 2 ); + vgrain = _mm_or_si128( vgrainl, vgrainh ); + vgrain = _mm_add_epi16( vgrain, _mm_set_epi16( 0, 0, 0, 0, 0, 0, 2, 2 ) ); + vgrain = _mm_srai_epi16( vgrain, 2 ); + if( c == 0 ) + { + _mm_storeu_si32( (__m128i*) &grain[0][16 - 1], vgrain ); + } + else + { + _mm_storeu_si32( (__m128i*) &grain[c][8 - 1], vgrain ); + } + } + if( bs ) + { +# ifdef USE_AVX2 + __m128i vshift = _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, scale_shift ); + if( c == 0 ) + { + __m256i vadd = _mm256_set1_epi32( 1 << ( scale_shift - 1 ) ); + __m256i vgrain = _mm256_lddqu_si256( (__m256i*) &grain[0][0] ); // load 16 * 16 bit + __m256i vscale = _mm256_cvtepu8_epi16( _mm_lddqu_si128( (__m128i*) &scale[0][0] ) ); /* scale[] is uint8_t: zero-extend so values >= 128 stay positive, as in the scalar path */ + __m256i tmplo = _mm256_mullo_epi16( vscale, vgrain ); + __m256i tmphi = _mm256_mulhi_epi16( vscale, vgrain ); + __m256i tmpgvlo = _mm256_unpacklo_epi16( tmplo, tmphi ); // 32 bit + __m256i tmpgvhi = _mm256_unpackhi_epi16( tmplo, tmphi ); + // deinterleave + __m256i gvlo = _mm256_permute2x128_si256( tmpgvlo, tmpgvhi, 0x20 ); +
__m256i gvhi = _mm256_permute2x128_si256( tmpgvlo, tmpgvhi, 0x31 ); + // round + gvlo = _mm256_add_epi32( gvlo, vadd ); + gvhi = _mm256_add_epi32( gvhi, vadd ); + gvlo = _mm256_sra_epi32( gvlo, vshift ); + gvhi = _mm256_sra_epi32( gvhi, vshift ); + __m256i vI16lo = _mm256_cvtepi16_epi32( _mm_lddqu_si128( (__m128i*) &I16[( x - 16 )] ) ); + __m256i vI16hi = _mm256_cvtepi16_epi32( _mm_lddqu_si128( (__m128i*) &I16[( x - 16 ) + 8] ) ); + vI16lo = _mm256_add_epi32( gvlo, vI16lo ); + vI16hi = _mm256_add_epi32( gvhi, vI16hi ); + vI16lo = _mm256_max_epi32( _mm256_set1_epi32( I_min << bs ), vI16lo ); /* lower bound scaled by bs like the upper one, as in the scalar path */ + vI16hi = _mm256_max_epi32( _mm256_set1_epi32( I_min << bs ), vI16hi ); + vI16lo = _mm256_min_epi32( _mm256_set1_epi32( I_max << bs ), vI16lo ); + vI16hi = _mm256_min_epi32( _mm256_set1_epi32( I_max << bs ), vI16hi ); + vI16lo = _mm256_packs_epi32( vI16lo, vI16hi ); + vI16lo = _mm256_permute4x64_epi64( vI16lo, 0xd8 ); + _mm256_storeu_si256( (__m256i*) &I16[( x - 16 )], vI16lo ); + } + else + { + __m128i vadd = _mm_set1_epi32( 1 << ( scale_shift - 1 ) ); + __m128i vscale = _mm_lddqu_si128( (__m128i*) &scale[c] ); + __m128i vgrain = _mm_lddqu_si128( (__m128i*) &grain[c] ); + vscale = _mm_cvtepu8_epi16( vscale ); // 16 bit + __m128i tmplo = _mm_mullo_epi16( vscale, vgrain ); + __m128i tmphi = _mm_mulhi_epi16( vscale, vgrain ); + __m128i gvlo = _mm_unpacklo_epi16( tmplo, tmphi ); // 32 bit + __m128i gvhi = _mm_unpackhi_epi16( tmplo, tmphi ); + gvlo = _mm_add_epi32( gvlo, vadd ); + gvhi = _mm_add_epi32( gvhi, vadd ); + gvlo = _mm_sra_epi32( gvlo, vshift ); + gvhi = _mm_sra_epi32( gvhi, vshift ); + __m128i vI16lo = _mm_lddqu_si128( (__m128i*) &I16[( x - 16 ) / subx] ); + __m128i vI16hi = _mm_lddqu_si128( (__m128i*) &I16[( x - 16 ) / subx + 4] ); + vI16lo = _mm_cvtepi16_epi32( vI16lo ); // 32 bit + vI16hi = _mm_cvtepi16_epi32( vI16hi ); + vI16lo = _mm_add_epi32( gvlo, vI16lo ); + vI16hi = _mm_add_epi32( gvhi, vI16hi ); + vI16lo = _mm_max_epi32( _mm_set1_epi32( I_min << bs ), vI16lo ); + vI16hi =
_mm_max_epi32( _mm_set1_epi32( I_min << bs ), vI16hi ); + vI16lo = _mm_min_epi32( _mm_set1_epi32( I_max << bs ), vI16lo ); + vI16hi = _mm_min_epi32( _mm_set1_epi32( I_max << bs ), vI16hi ); + vI16lo = _mm_packs_epi32( vI16lo, vI16hi ); + _mm_storeu_si128( (__m128i*) &I16[( x - 16 ) / subx], vI16lo ); + } +# else // !USE_AVX2 + __m128i vadd = _mm_set1_epi32( 1 << ( scale_shift - 1 ) ); + __m128i vshift = _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, scale_shift ); + __m128i vscale = _mm_lddqu_si128( (__m128i*) &scale[c] ); + __m128i vgrain = _mm_lddqu_si128( (__m128i*) &grain[c] ); + vscale = _mm_cvtepu8_epi16( vscale ); // 16 bit + __m128i tmplo = _mm_mullo_epi16( vscale, vgrain ); + __m128i tmphi = _mm_mulhi_epi16( vscale, vgrain ); + __m128i gvlo = _mm_unpacklo_epi16( tmplo, tmphi ); // 32 bit + __m128i gvhi = _mm_unpackhi_epi16( tmplo, tmphi ); + gvlo = _mm_add_epi32( gvlo, vadd ); + gvhi = _mm_add_epi32( gvhi, vadd ); + gvlo = _mm_sra_epi32( gvlo, vshift ); + gvhi = _mm_sra_epi32( gvhi, vshift ); + __m128i vI16lo = _mm_lddqu_si128( (__m128i*) &I16[( x - 16 ) / subx] ); + __m128i vI16hi = _mm_lddqu_si128( (__m128i*) &I16[( x - 16 ) / subx + 4] ); + + vI16lo = _mm_cvtepi16_epi32( vI16lo ); // 32 bit + vI16hi = _mm_cvtepi16_epi32( vI16hi ); + vI16lo = _mm_add_epi32( gvlo, vI16lo ); + vI16hi = _mm_add_epi32( gvhi, vI16hi ); + vI16lo = _mm_max_epi32( _mm_set1_epi32( I_min << bs ), vI16lo ); /* lower bound scaled by bs like the upper one, as in the scalar path */ + vI16hi = _mm_max_epi32( _mm_set1_epi32( I_min << bs ), vI16hi ); + vI16lo = _mm_min_epi32( _mm_set1_epi32( I_max << bs ), vI16lo ); + vI16hi = _mm_min_epi32( _mm_set1_epi32( I_max << bs ), vI16hi ); + vI16lo = _mm_packs_epi32( vI16lo, vI16hi ); + _mm_storeu_si128( (__m128i*) &I16[( x - 16 ) / subx], vI16lo ); + if( c == 0 ) + { + __m128i vscale = _mm_lddqu_si128( (__m128i*) &scale[c][8] ); + __m128i vgrain = _mm_lddqu_si128( (__m128i*) &grain[c][8] ); + vscale = _mm_cvtepu8_epi16( vscale ); // 16 bit + __m128i tmplo = _mm_mullo_epi16( vscale, vgrain ); + __m128i tmphi = _mm_mulhi_epi16( vscale, vgrain );
+ __m128i gvlo = _mm_unpacklo_epi16( tmplo, tmphi ); // 32 bit + __m128i gvhi = _mm_unpackhi_epi16( tmplo, tmphi ); + // round + gvlo = _mm_add_epi32( gvlo, vadd ); + gvhi = _mm_add_epi32( gvhi, vadd ); + gvlo = _mm_sra_epi32( gvlo, vshift ); + gvhi = _mm_sra_epi32( gvhi, vshift ); + __m128i vI16lo = _mm_lddqu_si128( (__m128i*) &I16[( x - 16 ) / subx + 8] ); + __m128i vI16hi = _mm_lddqu_si128( (__m128i*) &I16[( x - 16 ) / subx + 12] ); + vI16lo = _mm_cvtepi16_epi32( vI16lo ); // 32 bit + vI16hi = _mm_cvtepi16_epi32( vI16hi ); + vI16lo = _mm_add_epi32( gvlo, vI16lo ); + vI16hi = _mm_add_epi32( gvhi, vI16hi ); + vI16lo = _mm_max_epi32( _mm_set1_epi32( I_min << bs ), vI16lo ); + vI16hi = _mm_max_epi32( _mm_set1_epi32( I_min << bs ), vI16hi ); + vI16lo = _mm_min_epi32( _mm_set1_epi32( I_max << bs ), vI16lo ); + vI16hi = _mm_min_epi32( _mm_set1_epi32( I_max << bs ), vI16hi ); + vI16lo = _mm_packs_epi32( vI16lo, vI16hi ); + _mm_storeu_si128( (__m128i*) &I16[( x - 16 ) / subx + 8], vI16lo ); + } +#endif // !USE_AVX2 + } // bs + else + { + for( int i = 0; i < 16 / subx; i++ ) + { + // Output previous block (or flush current) + int32_t g = round( scale[c][i] * (int16_t) grain[c][i], scale_shift ); + if( bs ) + { + I16[( x - 16 ) / subx + i] = std::max( I_min << bs, std::min( I_max << bs, I16[( x - 16 ) / subx + i] + g ) ); + } + else + { + I8[( x - 16 ) / subx + i] = std::max( I_min, std::min( I_max, I8[( x - 16 ) / subx + i] + g ) ); + } + } + } + } + // Shift pipeline + if( !flush ) + { + if( c == 0 ) + { +#ifdef USE_AVX2 + __m256i vgrain = _mm256_lddqu_si256( (__m256i*) &grain[0][16] ); + _mm256_storeu_si256( (__m256i*) &grain[0][0], vgrain ); +#else + __m128i vgrain0 = _mm_lddqu_si128( (__m128i*) &grain[0][16] ); + __m128i vgrain1 = _mm_lddqu_si128( (__m128i*) &grain[0][24] ); + _mm_storeu_si128( (__m128i*) &grain[0][0], vgrain0 ); + _mm_storeu_si128( (__m128i*) &grain[0][8], vgrain1 ); +#endif + __m128i vscale = _mm_lddqu_si128( (__m128i*) &scale[0][16] ); + _mm_storeu_si128(
(__m128i*) &scale[0][0], vscale ); + } + else + { + __m128i vgrain = _mm_lddqu_si128( (__m128i*) &grain[c][8] ); + __m128i vscale = _mm_loadl_epi64( (__m128i*) &scale[c][8] ); + _mm_storeu_si128( (__m128i*) &grain[c][0], vgrain ); + _mm_storel_epi64( (__m128i*) &scale[c][0], vscale ); + } + } + if( x + 16 >= width ) /* last block of the line: take one more pass with flush == 1, then leave the loop */ + { + flush++; + x += 16; + } + } while( flush == 1 ); +} + +} // namespace vvdec + +#endif // TARGET_SIMD_X86 diff --git a/source/Lib/FilmGrain/FilmGrainImpl_avx2.cpp b/source/Lib/FilmGrain/FilmGrainImpl_avx2.cpp new file mode 100644 index 00000000..38442b7f --- /dev/null +++ b/source/Lib/FilmGrain/FilmGrainImpl_avx2.cpp @@ -0,0 +1,42 @@ +/* ----------------------------------------------------------------------------- +The copyright in this software is being made available under the Clear BSD +License, included below. No patent rights, trademark rights and/or +other Intellectual Property Rights other than the copyrights concerning +the Software are granted under this license. + +The Clear BSD License + +Copyright (c) 2018-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVdeC Authors. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted (subject to the limitations in the disclaimer below) provided that +the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from this +software without specific prior written permission. + +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY +THIS LICENSE.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +------------------------------------------------------------------------------------------- */ + +#define CURR_X86_VEXT AVX2 +#include "FilmGrainImpl_X86_SIMD.h" diff --git a/source/Lib/FilmGrain/FilmGrainImpl_sse41.cpp b/source/Lib/FilmGrain/FilmGrainImpl_sse41.cpp new file mode 100644 index 00000000..855227d7 --- /dev/null +++ b/source/Lib/FilmGrain/FilmGrainImpl_sse41.cpp @@ -0,0 +1,42 @@ +/* ----------------------------------------------------------------------------- +The copyright in this software is being made available under the Clear BSD +License, included below. No patent rights, trademark rights and/or +other Intellectual Property Rights other than the copyrights concerning +the Software are granted under this license. + +The Clear BSD License + +Copyright (c) 2018-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVdeC Authors. +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without modification, +are permitted (subject to the limitations in the disclaimer below) provided that +the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from this +software without specific prior written permission. + +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY +THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+------------------------------------------------------------------------------------------- */ + +#define CURR_X86_VEXT SSE41 +#include "FilmGrainImpl_X86_SIMD.h" diff --git a/source/Lib/vvdec/CMakeLists.txt b/source/Lib/vvdec/CMakeLists.txt index d33af357..aa3e4c82 100644 --- a/source/Lib/vvdec/CMakeLists.txt +++ b/source/Lib/vvdec/CMakeLists.txt @@ -29,9 +29,7 @@ if( VVDEC_ENABLE_X86_SIMD ) file( GLOB X86_SSE41_SRC_FILES "../CommonLib/x86/sse41/*.cpp" ) #file( GLOB X86_SSE42_SRC_FILES "../CommonLib/x86/sse42/*.cpp" ) #file( GLOB X86_AVX_SRC_FILES "../CommonLib/x86/avx/*.cpp" ) - if( VVDEC_TARGET_ARCH STREQUAL "X86" ) - file( GLOB X86_AVX2_SRC_FILES "../CommonLib/x86/avx2/*.cpp" ) - endif() + file( GLOB X86_AVX2_SRC_FILES "../CommonLib/x86/avx2/*.cpp" ) endif() if( VVDEC_ENABLE_ARM_SIMD ) @@ -46,10 +44,21 @@ file( GLOB MD5_SRC_FILES "../libmd5/*.cpp" ) file( GLOB MD5_INC_FILES "../libmd5/*.h" ) if( VVDEC_ENABLE_FILM_GRAIN ) - file( GLOB FGS_SRC_FILES "../FilmGrain/*.cpp" ) - file( GLOB FGS_INC_FILES "../FilmGrain/*.h" ) + file( GLOB FGS_SRC_FILES "../FilmGrain/FilmGrain.cpp" "../FilmGrain/FilmGrainImpl.cpp" ) + file( GLOB FGS_INC_FILES "../FilmGrain/FilmGrain.h" "../FilmGrain/FilmGrainImpl.h" ) + + file( GLOB FGS_X86_SSE41_SRC_FILES "../FilmGrain/*_sse41.cpp" ) + file( GLOB FGS_X86_AVX2_SRC_FILES "../FilmGrain/*_avx2.cpp" ) + + list( APPEND X86_SSE41_SRC_FILES ${FGS_X86_SSE41_SRC_FILES} ) + list( APPEND X86_AVX2_SRC_FILES ${FGS_X86_AVX2_SRC_FILES} ) - set_property( SOURCE vvdec.cpp vvdecimpl.cpp APPEND PROPERTY COMPILE_DEFINITIONS ENABLE_FILM_GRAIN ) + set_property( SOURCE + vvdec.cpp vvdecimpl.cpp + ${FGS_SRC_FILES} + ${FGS_X86_SSE41_SRC_FILES} + ${FGS_X86_AVX2_SRC_FILES} + APPEND PROPERTY COMPILE_DEFINITIONS ENABLE_FILM_GRAIN ) endif() # get public/extern include files @@ -94,12 +103,10 @@ if( VVDEC_ENABLE_X86_SIMD ) #set_property( SOURCE ${X86_SSE42_SRC_FILES} APPEND PROPERTY COMPILE_DEFINITIONS USE_SSE42 ) #set_property( SOURCE ${X86_AVX_SRC_FILES} APPEND 
PROPERTY COMPILE_DEFINITIONS USE_AVX ) set_property( SOURCE ${X86_AVX2_SRC_FILES} APPEND PROPERTY COMPILE_DEFINITIONS USE_AVX2 ) - set_property( SOURCE ${FGS_SRC_FILES} APPEND PROPERTY COMPILE_DEFINITIONS USE_AVX2 ) # set needed compile flags if( MSVC ) #set_property( SOURCE ${X86_AVX_SRC_FILES} APPEND PROPERTY COMPILE_FLAGS "/arch:AVX" ) set_property( SOURCE ${X86_AVX2_SRC_FILES} APPEND PROPERTY COMPILE_FLAGS "/arch:AVX2" ) - set_property( SOURCE ${FGS_SRC_FILES} APPEND PROPERTY COMPILE_FLAGS "/arch:AVX2" ) elseif( UNIX OR MINGW ) include( vvdecCompilerSupport ) @@ -116,11 +123,14 @@ if( VVDEC_ENABLE_X86_SIMD ) #set_property( SOURCE ${X86_SSE42_SRC_FILES} APPEND PROPERTY COMPILE_FLAGS "${FLAG_msse42}" ) #set_property( SOURCE ${X86_AVX_SRC_FILES} APPEND PROPERTY COMPILE_FLAGS "${FLAG_mavx}" ) set_property( SOURCE ${X86_AVX2_SRC_FILES} APPEND PROPERTY COMPILE_FLAGS "${FLAG_mavx2}" ) - set_property( SOURCE ${FGS_SRC_FILES} APPEND PROPERTY COMPILE_FLAGS "${FLAG_mavx2}" ) + endif() + + if( NOT VVDEC_TARGET_ARCH STREQUAL "X86" ) # only build AVX2 files for X86 + set( X86_AVX2_SRC_FILES "" ) endif() #add_library( ${LIB_NAME}_x86_simd OBJECT ${X86_SSE41_SRC_FILES} ${X86_SSE42_SRC_FILES} ${X86_AVX_SRC_FILES} ${X86_AVX2_SRC_FILES} ) - add_library( ${LIB_NAME}_x86_simd OBJECT ${X86_SSE41_SRC_FILES} ${X86_AVX2_SRC_FILES} ${X86_AVX2_C_FILES} ) + add_library( ${LIB_NAME}_x86_simd OBJECT ${X86_SSE41_SRC_FILES} ${X86_AVX2_SRC_FILES} ) target_link_libraries( ${LIB_NAME}_x86_simd ${INTEL_ITT_LINK_TARGET} ) # disble LTO for the files compiled with special architecture flags diff --git a/source/Lib/vvdec/vvdec.cpp b/source/Lib/vvdec/vvdec.cpp index 288c0b8f..7665cf7f 100644 --- a/source/Lib/vvdec/vvdec.cpp +++ b/source/Lib/vvdec/vvdec.cpp @@ -58,16 +58,25 @@ VVDEC_DECL void vvdec_params_default(vvdecParams *params) return; } - params->threads = -1; // thread count ( default: -1 ) - params->parseDelay = -1; // number of frames to parse in parallel ( default: -1 ) - 
params->upscaleOutput = VVDEC_UPSCALING_OFF; // do internal upscaling of rpr pictures to dest. resolution ( default: off ) - params->logLevel = VVDEC_WARNING; // verbosity level - params->verifyPictureHash = false; // verify picture, if digest is available, true: check hash in SEI messages if available, false: ignore SEI message - params->removePadding = false; // copy output pictures to new buffer to remove padding (stride==width) - params->opaque = nullptr; // opaque pointer for private user data ( can be used to carry caller specific data or contexts ) - params->simd = VVDEC_SIMD_DEFAULT; // set specific simd optimization (default: max. availalbe) - params->errHandlingFlags = VVDEC_ERR_HANDLING_OFF; // no special error handling - params->parseThreads = -1; // DEPRECATED. Use `parseDelay` instead. Will be removed in the future. Until then, this value is copied to parseDelay if set. + // ensure the padding parameters are cleared also, so we don't read undefined values, + // when new parameters are introduced and the library is used with an old executable + memset( params, 0, sizeof( vvdecParams ) ); + + params->threads = -1; // thread count ( default: -1 ) + params->parseDelay = -1; // number of frames to parse in parallel ( default: -1 ) + params->upscaleOutput = VVDEC_UPSCALING_OFF; // do internal upscaling of rpr pictures to dest. resolution ( default: off ) + params->logLevel = VVDEC_WARNING; // verbosity level + params->verifyPictureHash = false; // verify picture, if digest is available, true: check hash in SEI messages if available, false: ignore SEI message + params->removePadding = false; // copy output pictures to new buffer to remove padding (stride==width) + params->opaque = nullptr; // opaque pointer for private user data ( can be used to carry caller specific data or contexts ) + params->simd = VVDEC_SIMD_DEFAULT; // set specific simd optimization (default: max. 
available) + params->errHandlingFlags = VVDEC_ERR_HANDLING_OFF; // no special error handling +#if ENABLE_FILM_GRAIN + params->filmGrainSynthesis = true; // enable film grain synthesis using Film Grain Characteristics SEI ( default: true ) +#else + params->filmGrainSynthesis = false; // built without film grain support +#endif + params->parseThreads = -1; // DEPRECATED. Use `parseDelay` instead. Will be removed in the future. Until then, this value is copied to parseDelay if set. } VVDEC_DECL vvdecParams* vvdec_params_alloc() @@ -191,6 +200,14 @@ static int paramCheck( vvdecParams *params ) } } +#if !ENABLE_FILM_GRAIN + if( params->filmGrainSynthesis ) + { + vvdec::msg( vvdec::ERROR, "VVdeC was built without ENABLE_FILM_GRAIN. filmGrainSynthesis parameter must be 0.\n" ); + ret = -1; + } +#endif // !ENABLE_FILM_GRAIN + return ret; } diff --git a/source/Lib/vvdec/vvdecimpl.cpp b/source/Lib/vvdec/vvdecimpl.cpp index 89423a5a..9a29fb1d 100644 --- a/source/Lib/vvdec/vvdecimpl.cpp +++ b/source/Lib/vvdec/vvdecimpl.cpp @@ -131,6 +131,9 @@ int VVDecImpl::init( const vvdecParams& params, vvdecCreateBufferCallback create m_sDecoderCapabilities = m_cDecLib->getDecoderCapabilities(); +#if ENABLE_FILM_GRAIN + m_enableFilmGrain = params.filmGrainSynthesis; +#endif // ENABLE_FILM_GRAIN m_bRemovePadding = params.removePadding; m_eErrHandlingFlags = static_cast(params.errHandlingFlags); m_uiSeqNumber = 0; @@ -774,7 +777,13 @@ bool VVDecImpl::isNalUnitSlice( vvdecNalType t ) || t == VVC_NAL_UNIT_CODED_SLICE_GDR; } -int VVDecImpl::copyComp( const unsigned char* pucSrc, unsigned char* pucDest, unsigned int uiWidth, unsigned int uiHeight, ptrdiff_t iStrideSrc, ptrdiff_t iStrideDest, int iBytesPerSample ) +int VVDecImpl::copyComp( const unsigned char* pucSrc, + unsigned char* pucDest, + unsigned int uiWidth, + unsigned int uiHeight, + ptrdiff_t iStrideSrc, + ptrdiff_t iStrideDest, + int iBytesPerSample ) { if( NULL != pucSrc && NULL != pucDest ) { @@ -842,7 +851,7 @@ void
VVDecImpl::xUpdateFGC( vvdecSEI* s ) if( !m_filmGrainSynth ) { - m_filmGrainSynth = std::make_unique( 10, 2 ); // TODO: (GH) set correct bit depth and color format, and apply changes + m_filmGrainSynth = std::make_unique(); } m_filmGrainSynth->updateFGC( sei ); @@ -856,22 +865,50 @@ void VVDecImpl::xAddGrain( vvdecFrame* frame ) return; } - uint8_t* Y = (uint8_t*) frame->planes[0].ptr; - uint8_t* U = (uint8_t*) frame->planes[1].ptr; - uint8_t* V = (uint8_t*) frame->planes[2].ptr; + m_filmGrainSynth->setDepth( frame->bitDepth ); + m_filmGrainSynth->setColorFormat( frame->colorFormat ); + m_filmGrainSynth->prepareBlockSeeds( frame->planes[0].width, frame->planes[0].height ); - CHECK( frame->bitDepth != 10, "Bitdepth is not 10" ); + struct GrainTaskData + { + vvdecFrame* frame; + uint32_t startLine; + FilmGrain* filmGrainSynth; + }; + constexpr static int LINES_PER_TASK = 16; + const int numTasks = ( frame->planes[0].height + ( LINES_PER_TASK - 1 ) ) / LINES_PER_TASK; + std::vector grainTaskData( numTasks ); - for( int y = 0; y < frame->planes[0].height; y++ ) + WaitCounter grainTaskCounter; + for( int i = 0; i < numTasks; ++i ) { - m_filmGrainSynth->add_grain_line( Y, U, V, y, frame->planes[0].width ); - Y += frame->planes[0].stride; - if( ( y & 1 ) || ( frame->planes[0].height == frame->planes[1].height ) ) + grainTaskData[i].frame = frame; + grainTaskData[i].startLine = i * LINES_PER_TASK; + grainTaskData[i].filmGrainSynth = m_filmGrainSynth.get(); + + static auto grainTask = []( int, GrainTaskData* data ) { - U += frame->planes[1].stride; - V += frame->planes[2].stride; - } + auto* frame = data->frame; + for( unsigned y = data->startLine; y < std::min( data->startLine + LINES_PER_TASK, frame->planes[0].height ); ++y ) + { + uint8_t* Y = (uint8_t*) frame->planes[0].ptr + frame->planes[0].stride * y; + uint8_t* U = nullptr; + uint8_t* V = nullptr; + if( frame->colorFormat != VVDEC_CF_YUV400_PLANAR ) + { + const int chromaSub = frame->colorFormat == 
VVDEC_CF_YUV420_PLANAR ? 2 : 1; + + U = (uint8_t*) frame->planes[1].ptr + frame->planes[1].stride * y / chromaSub; + V = (uint8_t*) frame->planes[2].ptr + frame->planes[2].stride * y / chromaSub; + } + + data->filmGrainSynth->add_grain_line( Y, U, V, y, frame->planes[0].width ); + } + return true; + }; + m_cDecLib->getThreadPool().addBarrierTask( grainTask, &( grainTaskData[i] ), &grainTaskCounter ); } + grainTaskCounter.wait(); if( m_filmGrainCharacteristicsState != FgcPersist ) // Not persistent { @@ -918,16 +955,23 @@ int VVDecImpl::xAddPicture( Picture* pcPic ) bCreateStorage = bCreateStorage || m_bRemovePadding; #if ENABLE_FILM_GRAIN - // find FGC SEI - for( auto& sei: pcPic->seiMessageList ) + if( m_enableFilmGrain ) { - if( sei->payloadType == VVDEC_FILM_GRAIN_CHARACTERISTICS ) + // find FGC SEI + for( auto& sei: pcPic->seiMessageList ) { - xUpdateFGC( sei ); - msg( DETAILS, "vvdecimpl [detail]: SEI FILM_GRAIN_CHARACTERISTICS\n"); + if( sei->payloadType == VVDEC_FILM_GRAIN_CHARACTERISTICS ) + { + xUpdateFGC( sei ); + msg( DETAILS, "vvdecimpl [detail]: SEI FILM_GRAIN_CHARACTERISTICS\n"); + } + } + const bool fgsReuseBuffer = bitDepths.recon == 10 && !m_bRemovePadding && !pcPic->stillReferenced; + if( !fgsReuseBuffer ) + { + bCreateStorage |= m_filmGrainCharacteristicsState != FgcNone; } } - bCreateStorage = bCreateStorage || m_filmGrainCharacteristicsState; #endif // ENABLE_FILM_GRAIN // create a brand new picture object @@ -935,22 +979,22 @@ int VVDecImpl::xAddPicture( Picture* pcPic ) vvdec_frame_default( &cFrame ); cFrame.sequenceNumber = m_uiSeqNumber; - cFrame.cts = pcPic->getCts(); - cFrame.ctsValid = true; + cFrame.cts = pcPic->getCts(); + cFrame.ctsValid = true; int ret; #if RPR_YUV_OUTPUT if( m_cDecLib->getUpscaledOutput() && ( uiWidth != orgWidth || uiHeight != orgHeight ) ) { bCreateStorage = true; - ret = xCreateFrame ( cFrame, cPicBuf, orgWidth, orgHeight, bitDepths, bCreateStorage ); + ret = xCreateFrame( cFrame, cPicBuf, orgWidth, orgHeight, 
bitDepths, bCreateStorage ); } else { - ret = xCreateFrame ( cFrame, cPicBuf, uiWidth, uiHeight, bitDepths, bCreateStorage ); + ret = xCreateFrame( cFrame, cPicBuf, uiWidth, uiHeight, bitDepths, bCreateStorage, m_filmGrainCharacteristicsState != FgcNone ); } #else - ret = xCreateFrame ( cFrame, cPicBuf, uiWidth, uiHeight, bitDepths, bCreateStorage ); + ret = xCreateFrame( cFrame, cPicBuf, uiWidth, uiHeight, bitDepths, bCreateStorage, m_filmGrainCharacteristicsState != FgcNone ); #endif if( ret != VVDEC_OK ) { @@ -994,9 +1038,12 @@ int VVDecImpl::xAddPicture( Picture* pcPic ) copyComp( (const unsigned char*) ( planeOrigin + planeOffset ), cFrame.planes[comp].ptr, - cFrame.planes[comp].width, cFrame.planes[comp].height, - area.stride<<1, cFrame.planes[comp].stride, uiBytesPerSample ); - cFrame.planes[comp].allocator = upscaledPic.getBufAllocator( (ComponentID)comp ); + area.width, // need to use source width & height here, for VVDEC_UPSCALING_COPY_ONLY to work + area.height, + area.stride * sizeof( *area.buf ), + cFrame.planes[comp].stride, + uiBytesPerSample ); + cFrame.planes[comp].allocator = upscaledPic.getBufAllocator( (ComponentID) comp ); } upscaledPic.destroy(); } @@ -1017,8 +1064,25 @@ int VVDecImpl::xAddPicture( Picture* pcPic ) copyComp( (const unsigned char*) ( planeOrigin + planeOffset ), cFrame.planes[comp].ptr, - cFrame.planes[comp].width, cFrame.planes[comp].height, - area.stride<<1, cFrame.planes[comp].stride, uiBytesPerSample ); + area.width, // need to use source width & height here, for VVDEC_UPSCALING_COPY_ONLY to work + area.height, + area.stride * sizeof( *area.buf ), + cFrame.planes[comp].stride, + uiBytesPerSample ); + + // zero the surrounding area for VVDEC_UPSCALING_COPY_ONLY + if( m_cDecLib->getUpscaledOutput() == (int) VVDEC_UPSCALING_COPY_ONLY + && ( area.width < cFrame.planes[comp].width || area.height < cFrame.planes[comp].height ) ) + { + unsigned char* linePtr = cFrame.planes[comp].ptr; + const auto bytesPerSample = 
cFrame.planes[comp].bytesPerSample; + for( unsigned y = 0; y < area.height; ++y ) + { + ::memset( linePtr + area.width * bytesPerSample, 0, ( cFrame.planes[comp].width - area.width ) * bytesPerSample ); + linePtr += cFrame.planes[comp].stride; + } + ::memset( linePtr, 0, ( cFrame.planes[comp].height - area.height ) * cFrame.planes[comp].stride ); + } } } } @@ -1143,7 +1207,10 @@ int VVDecImpl::xAddPicture( Picture* pcPic ) #if ENABLE_FILM_GRAIN // Grain synthesis - xAddGrain( &cFrame ); + if( m_enableFilmGrain && m_filmGrainCharacteristicsState != FgcNone ) + { + xAddGrain( &cFrame ); + } #endif // ENABLE_FILM_GRAIN m_rcFrameList.emplace_back( cFrame, bCreateStorage ? nullptr : pcPic ); @@ -1178,9 +1245,13 @@ int VVDecImpl::xAddPicture( Picture* pcPic ) return 0; } - - -int VVDecImpl::xCreateFrame( vvdecFrame& rcFrame, const CPelUnitBuf& rcPicBuf, uint32_t uiWidth, uint32_t uiHeight, const BitDepths& rcBitDepths, bool bCreateStorage ) +int VVDecImpl::xCreateFrame( vvdecFrame& rcFrame, + const CPelUnitBuf& rcPicBuf, + uint32_t uiWidth, + uint32_t uiHeight, + const BitDepths& rcBitDepths, + bool bCreateStorage, + bool origStride ) { rcFrame.width = uiWidth; rcFrame.height = uiHeight; @@ -1191,11 +1262,11 @@ int VVDecImpl::xCreateFrame( vvdecFrame& rcFrame, const CPelUnitBuf& rcPicBuf, u rcFrame.planes[VVDEC_CT_Y].width = uiWidth; rcFrame.planes[VVDEC_CT_Y].height = uiHeight; rcFrame.planes[VVDEC_CT_Y].bytesPerSample = rcBitDepths.recon > 8 ? 2 : 1; - rcFrame.planes[VVDEC_CT_Y].stride = bCreateStorage ? uiWidth * rcFrame.planes[VVDEC_CT_Y].bytesPerSample - : (uint32_t)rcPicBuf.get(COMPONENT_Y).stride * rcFrame.planes[VVDEC_CT_Y].bytesPerSample; + rcFrame.planes[VVDEC_CT_Y].stride = bCreateStorage && !origStride ? 
uiWidth * rcFrame.planes[VVDEC_CT_Y].bytesPerSample + : (uint32_t) rcPicBuf.get( COMPONENT_Y ).stride * rcFrame.planes[VVDEC_CT_Y].bytesPerSample; size_t nBufSize = 0; - size_t nLSize = rcFrame.planes[VVDEC_CT_Y].stride * uiHeight; + size_t nLSize = rcFrame.planes[VVDEC_CT_Y].stride * rcFrame.planes[VVDEC_CT_Y].height; size_t nCSize = 0; unsigned int uiCWidth = 0; @@ -1265,7 +1336,7 @@ int VVDecImpl::xCreateFrame( vvdecFrame& rcFrame, const CPelUnitBuf& rcPicBuf, u rcFrame.planes[VVDEC_CT_V].height = uiCHeight; rcFrame.planes[VVDEC_CT_V].bytesPerSample = rcBitDepths.recon > 8 ? 2 : 1; - if( bCreateStorage ) + if( bCreateStorage && !origStride ) { rcFrame.planes[VVDEC_CT_U].stride = uiCWidth * rcFrame.planes[CHANNEL_TYPE_CHROMA].bytesPerSample; rcFrame.planes[VVDEC_CT_V].stride = uiCWidth * rcFrame.planes[CHANNEL_TYPE_CHROMA].bytesPerSample; @@ -1276,8 +1347,8 @@ int VVDecImpl::xCreateFrame( vvdecFrame& rcFrame, const CPelUnitBuf& rcPicBuf, u rcFrame.planes[VVDEC_CT_V].stride = (uint32_t)rcPicBuf.get(COMPONENT_Cr).stride * rcFrame.planes[CHANNEL_TYPE_CHROMA].bytesPerSample; } - nCSize = rcFrame.planes[VVDEC_CT_U].stride * uiCHeight; - nBufSize = nLSize + ( nCSize << 1 ); + nCSize = rcFrame.planes[VVDEC_CT_U].stride * rcFrame.planes[VVDEC_CT_U].height; + nBufSize = nLSize + nCSize * 2; } diff --git a/source/Lib/vvdec/vvdecimpl.h b/source/Lib/vvdec/vvdecimpl.h index 5399ef89..9a6355f0 100644 --- a/source/Lib/vvdec/vvdecimpl.h +++ b/source/Lib/vvdec/vvdecimpl.h @@ -174,7 +174,13 @@ class VVDecImpl private: int xAddPicture ( Picture* pcPic ); - int xCreateFrame ( vvdecFrame& frame, const CPelUnitBuf& rcPicBuf, uint32_t uiWidth, uint32_t uiHeight, const BitDepths& rcBitDepths, bool bCreateStorage ); + int xCreateFrame ( vvdecFrame& frame, + const CPelUnitBuf& rcPicBuf, + uint32_t uiWidth, + uint32_t uiHeight, + const BitDepths& rcBitDepths, + bool bCreateStorage, + bool origStride = false ); void xUpdateFGC ( vvdecSEI *sei ); void xAddGrain ( vvdecFrame *frame ); @@ 
-223,6 +229,7 @@ class VVDecImpl FgcDontPersist = 1, FgcPersist = 2 } m_filmGrainCharacteristicsState = FgcNone; + bool m_enableFilmGrain = false; std::unique_ptr m_filmGrainSynth; #endif // ENABLE_FILM_GRAIN };