From ae9d70efaaeb7b2f8fe68093eeafeb5e1bcfca57 Mon Sep 17 00:00:00 2001
From: Adam Wieckowski <adam.wieckowski@hhi.fraunhofer.de>
Date: Mon, 24 Jun 2024 11:36:45 +0200
Subject: [PATCH] FGS improvements, clean SIMD separation

---
 CMakeLists.txt                                |   2 +-
 include/vvdec/vvdec.h                         |  25 +-
 source/App/vvdecapp/CmdLineParser.h           | 392 +++++------
 source/App/vvdecapp/vvdecapp.cpp              |  14 +-
 source/Lib/CommonLib/LoopFilter.cpp           | 143 ++--
 source/Lib/CommonLib/LoopFilter.h             |   2 +-
 source/Lib/CommonLib/UnitTools.cpp            |  67 +-
 source/Lib/CommonLib/x86/CommonDefX86.h       |   7 +
 source/Lib/DecoderLib/DecLib.h                |   2 +
 source/Lib/FilmGrain/FilmGrain.cpp            | 172 +++--
 source/Lib/FilmGrain/FilmGrain.h              |  32 +-
 source/Lib/FilmGrain/FilmGrainImpl.cpp        | 206 +++---
 source/Lib/FilmGrain/FilmGrainImpl.h          |  57 +-
 source/Lib/FilmGrain/FilmGrainImplX86.h       | 102 +++
 source/Lib/FilmGrain/FilmGrainImpl_X86_SIMD.h | 609 ++++++++++++++++++
 source/Lib/FilmGrain/FilmGrainImpl_avx2.cpp   |  42 ++
 source/Lib/FilmGrain/FilmGrainImpl_sse41.cpp  |  42 ++
 source/Lib/vvdec/CMakeLists.txt               |  30 +-
 source/Lib/vvdec/vvdec.cpp                    |  37 +-
 source/Lib/vvdec/vvdecimpl.cpp                | 149 +++--
 source/Lib/vvdec/vvdecimpl.h                  |   9 +-
 21 files changed, 1610 insertions(+), 531 deletions(-)
 create mode 100644 source/Lib/FilmGrain/FilmGrainImplX86.h
 create mode 100755 source/Lib/FilmGrain/FilmGrainImpl_X86_SIMD.h
 create mode 100644 source/Lib/FilmGrain/FilmGrainImpl_avx2.cpp
 create mode 100644 source/Lib/FilmGrain/FilmGrainImpl_sse41.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 869b3a6d..b391b73e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -77,7 +77,7 @@ set( VVDEC_ENABLE_X86_SIMD   TRUE                      CACHE BOOL "enable x86 in
 set( VVDEC_ENABLE_ARM_SIMD   ${VVDEC_ARM_SIMD_DEFAULT} CACHE BOOL "enable ARM intrinsics" )
 
 set( VVDEC_ENABLE_TRACING    FALSE                     CACHE BOOL "Compile in tracing functionality" )
-set( VVDEC_ENABLE_FILM_GRAIN FALSE                     CACHE BOOL "Build with film grain synthesis support" )
+set( VVDEC_ENABLE_FILM_GRAIN TRUE                      CACHE BOOL "Build with film grain synthesis support" )
 
 include( vvdecCompilerSupport )
 
diff --git a/include/vvdec/vvdec.h b/include/vvdec/vvdec.h
index f85b5e58..480cffdc 100644
--- a/include/vvdec/vvdec.h
+++ b/include/vvdec/vvdec.h
@@ -437,17 +437,20 @@ typedef struct vvdecFrame
 */
 typedef struct vvdecParams
 {
-  int                   threads;           // thread count                          ( default: -1 )
-  int                   parseDelay;        // number of frames to parse in parallel ( default: -1 )
-  vvdecRPRUpscaling     upscaleOutput;     // do internal upscaling of rpl pictures to dest. resolution ( default: 0 )
-  vvdecLogLevel         logLevel;          // verbosity level
-  bool                  verifyPictureHash; // verify picture, if digest is available, true: check hash in SEI messages if available, false: ignore SEI message
-  bool                  removePadding;     // copy output pictures to new buffer to remove padding (stride==width) ( default: false )
-  vvdecSIMD_Extension   simd;              // set specific simd optimization (default: max. availalbe)
-  void                 *opaque;            // opaque pointer for private user data ( can be used to carry application specific data or contexts )
-  vvdecErrHandlingFlags errHandlingFlags;  // set of flags defining how to handle bitstream errors
-  int                   parseThreads;      // DEPRECATED. Use `parseDelay` instead. This will be removed in the future. Until then, this value is copied to parseDelay if set.
-  int                   padding2;          // reserved space for future parameters
+  int                   threads;            // thread count                          ( default: -1 )
+  int                   parseDelay;         // number of frames to parse in parallel ( default: -1 )
+  vvdecRPRUpscaling     upscaleOutput;      // do internal upscaling of rpl pictures to dest. resolution ( default: 0 )
+  vvdecLogLevel         logLevel;           // verbosity level
+  bool                  verifyPictureHash;  // verify picture, if digest is available, true: check hash in SEI messages if available, false: ignore SEI message
+  bool                  removePadding;      // copy output pictures to new buffer to remove padding (stride==width) ( default: false )
+  vvdecSIMD_Extension   simd;               // set specific simd optimization (default: max. available)
+  void                 *opaque;             // opaque pointer for private user data ( can be used to carry application specific data or contexts )
+  vvdecErrHandlingFlags errHandlingFlags;   // set of flags defining how to handle bitstream errors
+  int                   parseThreads;       // DEPRECATED. Use `parseDelay` instead. This will be removed in the future. Until then, this value is copied to parseDelay if set.
+  bool                  filmGrainSynthesis; // set film grain synthesis using Film Grain Characteristics SEI ( default: true )
+  int8_t                padding2_1;         // reserved space for future parameters
+  int8_t                padding2_2;
+  int8_t                padding2_3;
   int                   padding3;
   int                   padding4;
 } vvdecParams;
diff --git a/source/App/vvdecapp/CmdLineParser.h b/source/App/vvdecapp/CmdLineParser.h
index 3f5616e8..fc4a1806 100644
--- a/source/App/vvdecapp/CmdLineParser.h
+++ b/source/App/vvdecapp/CmdLineParser.h
@@ -1,7 +1,7 @@
 /* -----------------------------------------------------------------------------
 The copyright in this software is being made available under the Clear BSD
-License, included below. No patent rights, trademark rights and/or 
-other Intellectual Property Rights other than the copyrights concerning 
+License, included below. No patent rights, trademark rights and/or
+other Intellectual Property Rights other than the copyrights concerning
 the Software are granted under this license.
 
 The Clear BSD License
@@ -42,28 +42,118 @@ POSSIBILITY OF SUCH DAMAGE.
 
 #pragma once
 
+#include <cctype>
+#include <cstdio>
+#include <cstring>
+#include <cassert>
 #include <string>
 #include <iostream>
-#include <stdio.h>
-#include <string.h>
 #include <algorithm>
-#include <cctype>
 
 #include "vvdec/vvdec.h"
 
-namespace vvdecoderapp {
+namespace vvdecoderapp
+{
 
 class CmdLineParser
 {
+  int32_t m_iArg = 0;
+  int     m_argc = 0;
+  char**  m_argv = nullptr;
+
+  // Consume the current argument (and its value) if it equals one of paramNames; returns false when it does not.
+  // If no usable value follows the name: optionalDefault is stored when argOptional is set, bool outputs become
+  // true, and in every other case MissingArgumentException is thrown after printing a message to stderr.
+  template<class TOut>
+  bool parse_param( std::initializer_list<const char*> paramNames, TOut& outputVar, bool argOptional = false, const TOut optionalDefault = {} )
+  {
+    const auto isCurrentArg = [this]( const char* candidate ) { return strcmp( m_argv[m_iArg], candidate ) == 0; };
+    if( m_iArg >= m_argc || std::none_of( paramNames.begin(), paramNames.end(), isCurrentArg ) )
+    {
+      return false;
+    }
+
+    ++m_iArg;   // step over the option name; m_iArg now indexes the (possible) value
+    if( m_iArg < m_argc && parse_param_arg( outputVar ) )
+    {
+      return true;
+    }
+
+    // no usable value followed the option name
+    if( !argOptional && !std::is_same<bool, TOut>::value )
+    {
+      fprintf( stderr, " - missing argument for: %s \n", m_argv[m_iArg - 1] );
+      throw MissingArgumentException();
+    }
+
+    if( argOptional )
+    {
+      outputVar = optionalDefault;
+    }
+    else
+    {
+      outputVar = true;   // a bare boolean switch means "on"
+    }
+    return true;
+  }
+
+  // Parse the optional numeric value of a boolean switch ("0" = off, any other number = on).
+  // A token that does not start with a digit is left unconsumed and the switch defaults to true.
+  bool parse_param_arg( bool& outputVar )
+  {
+    const char* const arg = m_argv[m_iArg];
+    // std::isdigit() is undefined for negative char values, hence the cast
+    const bool hasValue = std::isdigit( static_cast<unsigned char>( arg[0] ) ) != 0;
+    outputVar = hasValue ? atoi( arg ) != 0 : true;
+    if( hasValue ) ++m_iArg;
+
+    return true;   // never fails: the value is optional
+  }
+
+  // String values are taken verbatim: any token is accepted and consumed.
+  bool parse_param_arg( std::string& outputVar )
+  {
+    const char* const token = m_argv[m_iArg++];
+    outputVar.assign( token );
+    return true;
+  }
+
+  // Parse a signed integer value: the token must start with a digit, or with '-' followed by a digit.
+  bool parse_param_arg( int& outputVar )
+  {
+    const char* const arg        = m_argv[m_iArg];
+    const char* const firstDigit = arg[0] == '-' ? arg + 1 : arg;
+    // std::isdigit() is undefined for negative char values, hence the cast
+    if( !std::isdigit( static_cast<unsigned char>( *firstDigit ) ) )
+    {
+      return false;   // not a number: leave the token unconsumed
+    }
+    outputVar = atoi( arg );
+    ++m_iArg;
+    return true;
+  }
+
+  // Parse an unsigned integer value: the token must start with a digit, otherwise it is left unconsumed.
+  bool parse_param_arg( unsigned int& outputVar )
+  {
+    if( std::isdigit( static_cast<unsigned char>( m_argv[m_iArg][0] ) ) )   // cast: isdigit() is undefined for negative chars
+    {
+      outputVar = static_cast<unsigned int>( atoi( m_argv[m_iArg] ) );
+      ++m_iArg;
+      return true;
+    }
+    return false;
+  }
+
 public:
   /// Constructor
-  CmdLineParser(){}
-
+  CmdLineParser() = default;
   /// Destructor
-  virtual ~CmdLineParser() {}
+  ~CmdLineParser() = default;
 
   static void print_usage( std::string cApp, vvdecParams& rcParams, bool fullHelp )
   {
+    // clang-format off
     std::cout <<   std::endl;
     std::cout <<   " Usage:  " << cApp << "  [param1] [pararm2] [...]" << std::endl;
     std::cout <<   std::endl;
@@ -78,6 +168,7 @@ class CmdLineParser
     if( fullHelp )
     {
       std::cout << "\t\t [--upscale,-uo             ] : set upscaling mode for RPR pictures(default: 0: off, 1: copy without rescaling, 2: rescale to target resolution)" << std::endl;
+      std::cout << "\t\t [--filmGrain,-fg <int>     ] : set film grain synthesis using Film Grain Charactersitics SEI (default: 1, off: 0, on: 1)" << std::endl;
     }
     std::cout <<   "\t\t [--y4m                     ] : force y4m output (for pipe output; auto enable for .y4m output file extension)" << std::endl;
     std::cout <<   std::endl;
@@ -123,12 +214,22 @@ class CmdLineParser
     std::cout <<   "\t\t [--fullhelp                ] : show full help including expert options" << std::endl;
     std::cout <<   std::endl;
     std::cout <<   std::endl;
+    // clang-format on
   }
 
-
-  static int parse_command_line( int argc, char* argv[] , vvdecParams& rcParams, std::string& rcBitstreamFile, std::string& rcOutputFile,
-                                 int& riFrames, int& riLoops, std::string& rcExpectYuvMD5, bool& useY4mFormat, bool &useExternAllocator,
-                                 std::string& sTracingFile, std::string& sTracingRule, int& riPrintPicHash )
+  int parse_command_line( int          argc,
+                          char*        argv[],
+                          vvdecParams& rcParams,
+                          std::string& rcBitstreamFile,
+                          std::string& rcOutputFile,
+                          int&         riFrames,
+                          int&         riLoops,
+                          std::string& rcExpectYuvMD5,
+                          bool&        useY4mFormat,
+                          bool&        useExternAllocator,
+                          std::string& sTracingFile,
+                          std::string& sTracingRule,
+                          int&         riPrintPicHash )
   {
 #ifndef ENABLE_TRACING
     // ignore unused variables
@@ -136,221 +237,132 @@ class CmdLineParser
     (void) sTracingRule;
 #endif   // !ENABLE_TRACING
 
-    int iRet = 0;
     /* Check command line parameters */
-    int32_t  i_arg = 1;
+    m_iArg = 1;
+    m_argc = argc;
+    m_argv = argv;
 
     /* Check general options first */
-    while( i_arg < argc )
+    while( m_iArg < argc )
     {
-      if( (!strcmp( (const char*)argv[i_arg], "-v" )) || !strcmp( (const char*)argv[i_arg], "--verbosity" ) )
+      bool     _dummy   = false;
+      unsigned logLevel = 0;
+      if( parse_param( { "-v", "--verbosity" }, logLevel ) )
       {
-        if( i_arg == argc-1 ){ fprintf( stderr, " - missing argument for: %s \n", argv[i_arg] ); return -1; }
-        i_arg++;
-        int iLogLevel = atoi( argv[i_arg++] );
-        if( iLogLevel < 0 ) iLogLevel = 0;
-        if( iLogLevel > (int)vvdecLogLevel::VVDEC_DETAILS ) iLogLevel = (int)vvdecLogLevel::VVDEC_DETAILS ;
-        rcParams.logLevel = (vvdecLogLevel)iLogLevel;
+        rcParams.logLevel = std::min( (vvdecLogLevel) logLevel, VVDEC_DETAILS );
 
         if( rcParams.logLevel > VVDEC_VERBOSE )
         {
-          std::string cll;
-          switch (rcParams.logLevel)
+          const char* cll;
+          switch( rcParams.logLevel )
           {
-            case VVDEC_SILENT : cll = "SILENT"; break;
-            case VVDEC_ERROR  : cll = "ERROR"; break;
+            // clang-format off
+            case VVDEC_SILENT : cll = "SILENT";  break;
+            case VVDEC_ERROR  : cll = "ERROR";   break;
             case VVDEC_WARNING: cll = "WARNING"; break;
-            case VVDEC_INFO   : cll = "INFO"; break;
-            case VVDEC_NOTICE : cll = "NOTICE"; break;
+            case VVDEC_INFO   : cll = "INFO";    break;
+            case VVDEC_NOTICE : cll = "NOTICE";  break;
             case VVDEC_VERBOSE: cll = "VERBOSE"; break;
             case VVDEC_DETAILS: cll = "DETAILS"; break;
-            default: cll = "UNKNOWN"; break;
+            default:            cll = "UNKNOWN"; break;
+            // clang-format on
           };
-          fprintf( stdout, "[verbosity] : %d - %s\n", (int)rcParams.logLevel, cll.c_str() );
+          fprintf( stdout, "[verbosity] : %d - %s\n", (int) rcParams.logLevel, cll );
         }
       }
-      else if( (!strcmp( (const char*)argv[i_arg], "-h" )) || !strcmp( (const char*)argv[i_arg], "--help" ) )
+      else if( parse_param( { "-h", "--help" }, _dummy ) )
       {
-        i_arg++;
-        iRet = 2;
-        return iRet;
+        return 2;
       }
-      else if( !strcmp( ( const char* ) argv[i_arg], "--fullhelp" ) )
+      else if( parse_param( { "--fullhelp", "--full-help" }, _dummy ) )
       {
-        i_arg++;
-        iRet = 3;
-        return iRet;
+        return 3;
       }
-      else if( !strcmp( (const char*)argv[i_arg], "--version" ) )
+      else if( parse_param( { "--version" }, _dummy ) )
       {
-        i_arg++;
-        iRet = 4;
-        return iRet;
+        return 4;
       }
       else
       {
-        i_arg++;
+        m_iArg++;
       }
     }
 
-
-    i_arg = 1;
-    while( i_arg < argc )
+    // restart from the beginning to parse the remaining options
+    m_iArg = 1;
+    while( m_iArg < argc )
     {
-      if( (!strcmp( (const char*)argv[i_arg], "-b" )) || !strcmp( (const char*)argv[i_arg], "--bitstream" ) ) /* In: input-file */
+      int      simd_arg         = 0;
+      int      err_handle_flags = 0;
+      int      upscale_output   = 0;
+      unsigned logLevel         = 0;
+      if( parse_param( { "-b", "--bitstream" }, rcBitstreamFile ) ) /* In: input-file */
       {
-        if( i_arg == argc-1 ){ fprintf( stderr, " - missing argument for: %s \n", argv[i_arg] ); return -1; }
-        i_arg++;
         if( rcParams.logLevel > VVDEC_VERBOSE )
-          fprintf( stdout, "[bitstream] input-file:    %s\n", argv[i_arg] );
-        rcBitstreamFile = argv[i_arg++];
+          fprintf( stdout, "[bitstream] input-file:    %s\n", rcBitstreamFile.c_str() );   // m_iArg already points past the value
       }
-      else if( (!strcmp( (const char*)argv[i_arg], "-o" )) || !strcmp( (const char*)argv[i_arg], "--output" ) ) /* Out: bitstream-file */
+      else if( parse_param( { "-o", "--output" }, rcOutputFile ) ) /* Out: bitstream-file */
       {
-        if( i_arg == argc-1 ){ fprintf( stderr, " - missing argument for: %s \n", argv[i_arg] ); return -1; }
-        i_arg++;
-        if( i_arg < argc && strlen( argv[i_arg] ) > 0 )
-        {
-          if( rcParams.logLevel > VVDEC_VERBOSE )
-            fprintf( stdout, "[output] yuv-file:    %s\n", argv[i_arg] );
-          rcOutputFile = argv[i_arg++];
-        }
+        if( rcParams.logLevel > VVDEC_VERBOSE )
+          fprintf( stdout, "[output] yuv-file:    %s\n", rcOutputFile.c_str() );   // m_iArg already points past the value
       }
-      else if( (!strcmp( (const char*)argv[i_arg], "-uo" )) || !strcmp( (const char*)argv[i_arg], "--upscale" ) ) /* In: upscale */
+      else if( parse_param( { "-uo", "--upscale" }, upscale_output ) ) /* In: upscale */
       {
-        i_arg++;
-
-        rcParams.upscaleOutput = (vvdecRPRUpscaling) atoi( argv[i_arg++]);
-
+        rcParams.upscaleOutput = vvdecRPRUpscaling( upscale_output );
         if( rcParams.logLevel > VVDEC_VERBOSE )
         {
           std::string scale;
           switch( rcParams.upscaleOutput )
           {
-          case VVDEC_UPSCALING_OFF      : scale = "OFF"; break;
-          case VVDEC_UPSCALING_COPY_ONLY: scale = "COPY_ONLY"; break;
-          case VVDEC_UPSCALING_RESCALE  : scale = "RESCALE"; break;
-          default: scale = "UNKNOWN"; break;
+            // clang-format off
+            case VVDEC_UPSCALING_OFF      : scale = "OFF";       break;
+            case VVDEC_UPSCALING_COPY_ONLY: scale = "COPY_ONLY"; break;
+            case VVDEC_UPSCALING_RESCALE  : scale = "RESCALE";   break;
+            default                       : scale = "UNKNOWN";   break;
+            // clang-format on
           };
           fprintf( stdout, "[upscale] : %s\n", scale.c_str() );
         }
       }
-      else if( !strcmp( (const char*)argv[i_arg], "--y4m" ) )
-      {
-        i_arg++;
-        useY4mFormat = true;
-
-        if( i_arg < argc )
-        {
-          if( std::isdigit(argv[i_arg][0]))
-          {
-            i_arg++;
-          }
-        }
-      }
-      else if( !strcmp( (const char*)argv[i_arg], "--extern" ) )
+      else if( parse_param( { "-fg", "--filmGrain" }, rcParams.filmGrainSynthesis ) ) {}
+      else if( parse_param( { "--y4m" }, useY4mFormat ) ) {}
+      else if( parse_param( { "--extern" }, useExternAllocator ) ) {}
+      else if( parse_param( { "-f", "--frames" }, riFrames ) )
       {
-        i_arg++;
-        useExternAllocator = true;
-
-        if( i_arg < argc )
-        {
-          if( std::isdigit(argv[i_arg][0]))
-          {
-            i_arg++;
-          }
-        }
-      }
-      else if( (!strcmp( (const char*)argv[i_arg], "-f" )) || !strcmp( (const char*)argv[i_arg], "--frames" ) )
-      {
-        if( i_arg == argc-1 ){ fprintf( stderr, " - missing argument for: %s \n", argv[i_arg] ); return -1; }
-        i_arg++;
-        riFrames = atoi( argv[i_arg++] );
         if( rcParams.logLevel > VVDEC_VERBOSE )
           fprintf( stdout, "[frames] : %d\n", riFrames );
       }
-      else if( (!strcmp( (const char*)argv[i_arg], "-t" )) || !strcmp( (const char*)argv[i_arg], "--threads" ) )
+      else if( parse_param( { "-t", "--threads" }, rcParams.threads ) )
       {
-        if( i_arg == argc-1 ){ fprintf( stderr, " - missing argument for: %s \n", argv[i_arg] ); return -1; }
-        i_arg++;
-        int iThreads = atoi( argv[i_arg++] );
         if( rcParams.logLevel > VVDEC_VERBOSE )
-          fprintf( stdout, "[threads] : %d\n", iThreads );
-        rcParams.threads = iThreads;
+          fprintf( stdout, "[threads] : %d\n", rcParams.threads );
       }
-      else if( (!strcmp( (const char*)argv[i_arg], "-p" )) || !strcmp( (const char*)argv[i_arg], "--parsedelay" ) )
+      else if( parse_param( { "-p", "--parsedelay" }, rcParams.parseDelay ) )
       {
-        if( i_arg == argc-1 ){ fprintf( stderr, " - missing argument for: %s \n", argv[i_arg] ); return -1; }
-        i_arg++;
-        int iDelay = atoi( argv[i_arg++] );
         if( rcParams.logLevel > VVDEC_VERBOSE )
-          fprintf( stdout, "[parsedelay] : %d\n", iDelay );
-        rcParams.parseDelay = iDelay;
+          fprintf( stdout, "[parsedelay] : %d\n", rcParams.parseDelay );
       }
-      else if( (!strcmp( (const char*)argv[i_arg], "-dph" )) || !strcmp( (const char*)argv[i_arg], "--SEIDecodedPictureHash" ) )
+      else if( parse_param( { "-dph", "--SEIDecodedPictureHash" }, riPrintPicHash, true, 1 ) )
       {
-        i_arg++;
-        if( i_arg < argc && std::isdigit( argv[i_arg][0] ) )
-        {
-          riPrintPicHash = atoi( argv[i_arg] );
-          i_arg++;
-        }
-        else
-        {
-          riPrintPicHash = 1;
-        }
-
-        if( riPrintPicHash <= 1 )
+        if( riPrintPicHash == 1 )   // dph levels >= 11 print the DPH, but don't verify it (only 1 actually verifies)
         {
+          rcParams.verifyPictureHash = true;
           if( rcParams.logLevel > VVDEC_VERBOSE )
             fprintf( stdout, "[SEIDecodedPictureHash] : true\n" );
-          rcParams.verifyPictureHash = true;
         }
       }
-      else if( ( !strcmp( (const char*)argv[i_arg], "-md5" ) ) || !strcmp( (const char*)argv[i_arg], "--CheckYuvMD5" ) )
+      else if( parse_param( { "-md5", "--CheckYuvMD5" }, rcExpectYuvMD5 ) )
       {
-        if( i_arg >= argc - 1 ) { fprintf( stderr, " - missing argument for: %s \n", argv[i_arg] ); return -1; }
-        i_arg++;
-        if( strlen( argv[i_arg] ) != 32 )
-        {
-          fprintf( stderr, " - the provided md5 hash to %s should be exactly 32 characters long\n", argv[i_arg - 1] );
-          return -1;
-        }
-
-        rcExpectYuvMD5 = std::string( argv[i_arg++] );
-
         if( rcParams.logLevel > VVDEC_VERBOSE )
           fprintf( stdout, "[CheckYuvMD5] : %s\n", rcExpectYuvMD5.c_str() );
       }
-      else if( (!strcmp( (const char*)argv[i_arg], "-L" )) || !strcmp( (const char*)argv[i_arg], "--loops" ) )
+      else if( parse_param( { "-L", "--loops" }, riLoops ) )
       {
-        if( i_arg == argc-1 ){ fprintf( stderr, " - missing argument for: %s \n", argv[i_arg] ); return -1; }
-        i_arg++;
-        riLoops = atoi( argv[i_arg++] );
         if( rcParams.logLevel > VVDEC_VERBOSE )
           fprintf( stdout, "[loops] : %d\n", riLoops );
       }
-      else if( (!strcmp( (const char*)argv[i_arg], "-v" )) || !strcmp( (const char*)argv[i_arg], "--verbosity" ) )
-      {
-        // already processed
-        i_arg++;
-        i_arg++;
-      }
-      else if( (!strcmp( (const char*)argv[i_arg], "-h" )) || !strcmp( (const char*)argv[i_arg], "--help" ) )
-      {
-        // already processed
-        i_arg++;
-      }
-      else if( !strcmp( (const char*)argv[i_arg], "--version" ) )
+      else if( parse_param( { "--simd" }, simd_arg ) )
       {
-        // already processed
-        i_arg++;
-      }
-      else if( !strcmp( ( const char* ) argv[i_arg], "--simd" ) )
-      {
-        if( i_arg == argc-1 ){ fprintf( stderr, " - missing argument for: %s \n", argv[i_arg] ); return -1; }
-        i_arg++;
-        const int simd_arg = atoi( argv[i_arg++] );
         if( simd_arg < -1 || simd_arg > VVDEC_SIMD_MAX - 1 )
         {
           fprintf( stderr, " - unsupported simd mode. Should be between -1 and %i inclusive.\n", VVDEC_SIMD_MAX - 1 );
@@ -363,31 +375,29 @@ class CmdLineParser
           const char* cll;
           switch( rcParams.simd )
           {
-          case VVDEC_SIMD_DEFAULT: cll = "DEFAULT";   break;
-          case VVDEC_SIMD_SCALAR:  cll = "SCALAR";    break;
+            // clang-format off
+            case VVDEC_SIMD_DEFAULT:   cll = "DEFAULT";   break;
+            case VVDEC_SIMD_SCALAR:    cll = "SCALAR";    break;
 #if VVDEC_ARCH_X86
-          case VVDEC_SIMD_SSE41:   cll = "SSE41";     break;
-          case VVDEC_SIMD_SSE42:   cll = "SSE42";     break;
-          case VVDEC_SIMD_AVX:     cll = "AVX";       break;
-          case VVDEC_SIMD_AVX2:    cll = "AVX2";      break;
+            case VVDEC_SIMD_SSE41:     cll = "SSE41";     break;
+            case VVDEC_SIMD_SSE42:     cll = "SSE42";     break;
+            case VVDEC_SIMD_AVX:       cll = "AVX";       break;
+            case VVDEC_SIMD_AVX2:      cll = "AVX2";      break;
 #elif VVDEC_ARCH_ARM
-          case VVDEC_SIMD_NEON:    cll = "NEON";      break;
+            case VVDEC_SIMD_NEON:      cll = "NEON";      break;
 #elif VVDEC_ARCH_WASM
-          case VVDEC_SIMD_WASM:    cll = "WASM-SIMD"; break;
+            case VVDEC_SIMD_WASM:      cll = "WASM-SIMD"; break;
 #else
-          case VVDEC_SIMD_SIMDE_ANY:cll = "SIMDE-ANY"; break;
+            case VVDEC_SIMD_SIMDE_ANY: cll = "SIMDE-ANY"; break;
 #endif
-          default:                 return -1;
+            default:                   return -1;
+            // clang-format on
           };
           fprintf( stdout, "[simd] : %s\n", cll );
         }
       }
-      else if( (!strcmp( argv[i_arg], "-eh" )) || !strcmp( argv[i_arg], "--errHandling" ) )
+      else if( parse_param( { "-eh", "--errHandling" }, err_handle_flags ) )
       {
-        if( i_arg == argc-1 ){ fprintf( stderr, " - missing argument for: %s \n", argv[i_arg] ); return -1; }
-        i_arg++;
-
-        const int err_handle_flags = atoi( argv[i_arg++] );
         if( err_handle_flags < 0 || err_handle_flags > VVDEC_ERR_HANDLING_TRY_CONTINUE )
         {
           fprintf( stderr, " - unsupported error handling flags. Should be between 0 and %i.\n", VVDEC_ERR_HANDLING_TRY_CONTINUE );
@@ -397,32 +407,26 @@ class CmdLineParser
         rcParams.errHandlingFlags = vvdecErrHandlingFlags( err_handle_flags );
       }
 #ifdef ENABLE_TRACING
-      else if( !strcmp( (const char*)argv[i_arg], "--TraceFile" ) || !strcmp( (const char*)argv[i_arg], "-tf" ) )
-      {
-        sTracingFile = argv[++i_arg];
-        i_arg++;
-      }
-      else if( !strcmp( (const char*)argv[i_arg], "--TraceRule" ) || !strcmp( (const char*)argv[i_arg], "-tr" ) )
+      else if( parse_param( { "-tf", "--TraceFile" }, sTracingFile ) ) {}
+      else if( parse_param( { "-tr", "--TraceRule" }, sTracingRule ) ) {}
+#endif
+      else if( parse_param( { "-v", "--verbosity" }, logLevel ) )   // already processed. Parse again so we don't detect an unknown argument
       {
-        sTracingRule = argv[++i_arg];
-        i_arg++;
+        (void) logLevel;   // applied (clamped) in the first pass; asserting equality aborted on e.g. "-v 9" or a repeated -v
       }
-#endif   // ENABLE_TRACING
       else
       {
-        fprintf( stderr, " - unknown argument: %s \n", argv[i_arg++] );
-        iRet = -1;
+        fprintf( stderr, " - unknown argument: %s \n", argv[m_iArg++] );
+        return -1;
       }
     }
 
-    return iRet;
+    return 0;
   }
 
-private:
-  std::ofstream m_cOS;
+  struct MissingArgumentException : std::exception
+  {
+  };
 };
 
-
-
-} // namespace
-
+}   // namespace vvdecoderapp
diff --git a/source/App/vvdecapp/vvdecapp.cpp b/source/App/vvdecapp/vvdecapp.cpp
index 414772f8..16c9fce9 100644
--- a/source/App/vvdecapp/vvdecapp.cpp
+++ b/source/App/vvdecapp/vvdecapp.cpp
@@ -483,8 +483,16 @@ int main( int argc, char* argv[] )
     return 0;
   }
 
+  int iRet = -1;
+  try {
+    vvdecoderapp::CmdLineParser cmdLineParser;
+    iRet = cmdLineParser.parse_command_line( argc, argv, params, cBitstreamFile, cOutputFile, iMaxFrames, iLoopCount, cExpectedYuvMD5, y4mOutput, externAllocator, sTracingFile, sTracingRule, iPrintPicHash );
+  }
+  catch( std::exception& )
+  {
+    iRet = -1;
+  }
 
-  int iRet = vvdecoderapp::CmdLineParser::parse_command_line( argc, argv, params, cBitstreamFile, cOutputFile, iMaxFrames, iLoopCount, cExpectedYuvMD5, y4mOutput, externAllocator, sTracingFile, sTracingRule, iPrintPicHash );
   if( iRet != 0 )
   {
     if( iRet == 2 )
@@ -1055,9 +1063,9 @@ static bool handle_frame( vvdecFrame*   pcFrame,
 
     if( pcFrame->frameFormat == VVDEC_FF_PROGRESSIVE )
     {
-      if( iPrintPicHash > 1 )
+      if( iPrintPicHash >= 11 )
       {
-        printPicHash( pcFrame, logStream, uiFrames-1, iPrintPicHash-11 );
+        printPicHash( pcFrame, logStream, uiFrames - 1, iPrintPicHash - 11 );
       }
 
       if( md5Stream )
diff --git a/source/Lib/CommonLib/LoopFilter.cpp b/source/Lib/CommonLib/LoopFilter.cpp
index 011b2a05..2eb26aa7 100644
--- a/source/Lib/CommonLib/LoopFilter.cpp
+++ b/source/Lib/CommonLib/LoopFilter.cpp
@@ -610,6 +610,11 @@ void LoopFilter::calcFilterStrengths( const CodingUnit& cu ) const
     xSetMaxFilterLengthPQForCodingSubBlocks<EDGE_HOR>( cu, ctuData );
   }
 
+#if ENABLE_SIMD_DBLF && defined( TARGET_SIMD_X86 )
+  const bool           useSimd = read_x86_extension_flags() > x86_simd::SCALAR;
+#else
+  const bool           useSimd = false;
+#endif
   const unsigned uiPelsInPartX = pcv.minCUWidth >> channelScaleX;
   const unsigned uiPelsInPartY = pcv.minCUHeight >> channelScaleY;
   const ptrdiff_t       lfpPos = cu.cs->inCtuPos( area.pos(), cu.chType() );
@@ -630,7 +635,7 @@ void LoopFilter::calcFilterStrengths( const CodingUnit& cu ) const
 
       for( int x = 0; x < area.width; x += uiPelsInPartX )
       {
-        if( lineLfpPtrV->filterEdge( cu.chType() ) ) xGetBoundaryStrengthSingle<EDGE_VER>( *lineLfpPtrV, cu, Position{ area.x + x, area.y + y }, x ? cu : *cuP, ctuData, x ? true : pqCuSameCtuVer );
+        if( lineLfpPtrV->filterEdge( cu.chType() ) ) xGetBoundaryStrengthSingle<EDGE_VER>( *lineLfpPtrV, cu, Position{ area.x + x, area.y + y }, x ? cu : *cuP, ctuData, x ? true : pqCuSameCtuVer, useSimd );
 
         lineLfpPtrV->bs &= ~BsSet( 3, MAX_NUM_COMPONENT );
 
@@ -656,7 +661,7 @@ void LoopFilter::calcFilterStrengths( const CodingUnit& cu ) const
       {
         cuP = ( y || ( cuP && cuP->blocks[chType].x + cuP->blocks[chType].width > area.x + x ) ) ? cuP : cu.cs->getCU( Position{ area.x + x, area.y - 1 }, chType );
 
-        if( lineLfpPtrH->filterEdge( cu.chType() ) ) xGetBoundaryStrengthSingle<EDGE_HOR>( *lineLfpPtrH, cu, Position{ area.x + x, area.y + y }, y ? cu : *cuP, ctuData, y ? true : pqCuSameCtuHor );
+        if( lineLfpPtrH->filterEdge( cu.chType() ) ) xGetBoundaryStrengthSingle<EDGE_HOR>( *lineLfpPtrH, cu, Position{ area.x + x, area.y + y }, y ? cu : *cuP, ctuData, y ? true : pqCuSameCtuHor, useSimd );
 
         lineLfpPtrH->bs &= ~BsSet( 3, MAX_NUM_COMPONENT );
 
@@ -783,10 +788,15 @@ void LoopFilter::xSetMaxFilterLengthPQFromTransformSizes( const CodingUnit& cu,
 {
   const PreCalcValues &pcv = *cu.cs->pcv;
 
-  ChannelType start = CH_L;
-  ChannelType end   = CH_C;
+  ChannelType start  = CH_L;
+  ChannelType end    = CH_C;
 
-  const bool dt = CU::isSepTree( cu );
+  const bool      dt = CU::isSepTree( cu );
+#if ENABLE_SIMD_DBLF && defined( TARGET_SIMD_X86 )
+  const bool useSimd = read_x86_extension_flags() > x86_simd::SCALAR;
+#else
+  const bool useSimd = false;
+#endif
 
   if( dt )
   {
@@ -855,7 +865,7 @@ void LoopFilter::xSetMaxFilterLengthPQFromTransformSizes( const CodingUnit& cu,
 
             lfp.setFilterCMFL( ( sizeQSide >= 8 && sizePSide >= 8 ) ? 1 : 0 );
             if( bValue )
-              xGetBoundaryStrengthSingle<edgeDir>( lfp, cu, Position( ( area.x + edgeDir * d ) << csx, ( area.y + ( 1 - edgeDir ) * d ) << csy ), *cuPfstCh, ctuData, pqSameCtu );
+              xGetBoundaryStrengthSingle<edgeDir>( lfp, cu, Position( ( area.x + edgeDir * d ) << csx, ( area.y + ( 1 - edgeDir ) * d ) << csy ), *cuPfstCh, ctuData, pqSameCtu, useSimd );
             lfp.bs &= ~BsSet( 3, MAX_NUM_COMPONENT );
 
             if( !CU::isIntra( cu ) && !CU::isIntra( *cuP ) && cuP == cuPfstCh && cu.geoFlag() == false && cuP->geoFlag() == false )
@@ -928,7 +938,7 @@ void LoopFilter::xSetMaxFilterLengthPQFromTransformSizes( const CodingUnit& cu,
             }
             
             if( bValue )
-              xGetBoundaryStrengthSingle<edgeDir>( lfp, cu, Position( ( area.x + edgeDir * d ) << csx, ( area.y + ( 1 - edgeDir ) * d ) << csy ), *cuPfstCh, ctuData, pqSameCtu );
+              xGetBoundaryStrengthSingle<edgeDir>( lfp, cu, Position( ( area.x + edgeDir * d ) << csx, ( area.y + ( 1 - edgeDir ) * d ) << csy ), *cuPfstCh, ctuData, pqSameCtu, useSimd );
             lfp.bs &= ~BsSet( 3, MAX_NUM_COMPONENT );
             OFFSET( lfpPtr, lfpStride, edgeDir, ( 1 - edgeDir ) );
           }
@@ -1079,7 +1089,7 @@ LFCUParam LoopFilter::xGetLoopfilterParam( const CodingUnit& cu ) const
 }
 
 template<DeblockEdgeDir edgeDir>
-void LoopFilter::xGetBoundaryStrengthSingle( LoopFilterParam& lfp, const CodingUnit& cuQ, const Position &localPos, const CodingUnit& cuP, CtuData& ctuData, bool pqSameCtu ) const
+void LoopFilter::xGetBoundaryStrengthSingle( LoopFilterParam& lfp, const CodingUnit& cuQ, const Position &localPos, const CodingUnit& cuP, CtuData& ctuData, bool pqSameCtu, bool useSimd ) const
 {
   const Slice      &sliceQ = *cuQ.slice;
   const ChannelType chType = cuQ.chType();
@@ -1238,74 +1248,83 @@ void LoopFilter::xGetBoundaryStrengthSingle( LoopFilterParam& lfp, const CodingU
     if( ( piRefP0 == piRefQ0 && piRefP1 == piRefQ1 ) || ( piRefP0 == piRefQ1 && piRefP1 == piRefQ0 ) )
     {
 #if defined( TARGET_SIMD_X86 ) && ENABLE_SIMD_DBLF
-      const __m128i xmvP = _mm_unpacklo_epi64( refP0valid ? _mm_loadu_si64( ( const __m128i* ) &miP.mv[0] ) : _mm_setzero_si128(), refP1valid ? _mm_loadu_si64( ( const __m128i* ) &miP.mv[1] ) : _mm_setzero_si128() );
-      const __m128i xmvQ = _mm_unpacklo_epi64( refQ0valid ? _mm_loadu_si64( ( const __m128i* ) &miQ.mv[0] ) : _mm_setzero_si128(), refQ1valid ? _mm_loadu_si64( ( const __m128i* ) &miQ.mv[1] ) : _mm_setzero_si128() );
-      const __m128i xth  = _mm_set1_epi32( nThreshold - 1 );
-#else
-      Mv mvP[2] = { { 0, 0 }, { 0, 0 } }, mvQ[2] = { { 0, 0 }, { 0, 0 } };
-
-      if( refP0valid ) { mvP[0] = miP.mv[0]; }
-      if( refP1valid ) { mvP[1] = miP.mv[1]; }
-      if( refQ0valid ) { mvQ[0] = miQ.mv[0]; }
-      if( refQ1valid ) { mvQ[1] = miQ.mv[1]; }
-#endif
-      if( piRefP0 != piRefP1 )   // Different L0 & L1
+      if( useSimd )
       {
-        if( piRefP0 == piRefQ0 )
+        const __m128i xmvP = _mm_unpacklo_epi64( refP0valid ? _mm_loadu_si64( ( const __m128i* ) &miP.mv[0] ) : _mm_setzero_si128(), refP1valid ? _mm_loadu_si64( ( const __m128i* ) &miP.mv[1] ) : _mm_setzero_si128() );
+        const __m128i xmvQ = _mm_unpacklo_epi64( refQ0valid ? _mm_loadu_si64( ( const __m128i* ) &miQ.mv[0] ) : _mm_setzero_si128(), refQ1valid ? _mm_loadu_si64( ( const __m128i* ) &miQ.mv[1] ) : _mm_setzero_si128() );
+        const __m128i xth  = _mm_set1_epi32( nThreshold - 1 );
+
+        if( piRefP0 != piRefP1 )   // Different L0 & L1
         {
-#if defined( TARGET_SIMD_X86 ) && ENABLE_SIMD_DBLF
-          __m128i
-          xdiff = _mm_sub_epi32  ( xmvQ, xmvP );
-          xdiff = _mm_abs_epi32  ( xdiff );
-          xdiff = _mm_cmpgt_epi32( xdiff, xth );
-          uiBs  = _mm_testz_si128( xdiff, xdiff ) ? 0 : 1;
-#else
-          uiBs = ( ( abs( mvQ[0].getHor() - mvP[0].getHor() ) >= nThreshold ) || ( abs( mvQ[0].getVer() - mvP[0].getVer() ) >= nThreshold ) ||
-                   ( abs( mvQ[1].getHor() - mvP[1].getHor() ) >= nThreshold ) || ( abs( mvQ[1].getVer() - mvP[1].getVer() ) >= nThreshold ) )
-                 ? 1 : 0;
-#endif
+          if( piRefP0 == piRefQ0 )
+          {
+            __m128i
+            xdiff = _mm_sub_epi32  ( xmvQ, xmvP );
+            xdiff = _mm_abs_epi32  ( xdiff );
+            xdiff = _mm_cmpgt_epi32( xdiff, xth );
+            uiBs  = _mm_testz_si128( xdiff, xdiff ) ? 0 : 1;
+          }
+          else
+          {
+            __m128i
+            xmvQ1 = _mm_shuffle_epi32( xmvQ, ( 2 << 0 ) + ( 3 <<  2 ) + ( 0 << 4 ) + ( 1 << 6 ) );
+            __m128i
+            xdiff = _mm_sub_epi32  ( xmvQ1, xmvP );
+            xdiff = _mm_abs_epi32  ( xdiff );
+            xdiff = _mm_cmpgt_epi32( xdiff, xth );
+            uiBs  = _mm_testz_si128( xdiff, xdiff ) ? 0 : 1;
+          }
         }
         else
         {
-#if defined( TARGET_SIMD_X86 ) && ENABLE_SIMD_DBLF
           __m128i
-          xmvQ1 = _mm_shuffle_epi32( xmvQ, ( 2 << 0 ) + ( 3 <<  2 ) + ( 0 << 4 ) + ( 1 << 6 ) );
+          xmvQ1 = _mm_shuffle_epi32( xmvQ, ( 2 << 0 ) + ( 3 << 2 ) + ( 0 << 4 ) + ( 1 << 6 ) );
           __m128i
-          xdiff = _mm_sub_epi32  ( xmvQ1, xmvP );
-          xdiff = _mm_abs_epi32  ( xdiff );
+          xdiff = _mm_sub_epi32( xmvQ1, xmvP );
+          xdiff = _mm_abs_epi32( xdiff );
           xdiff = _mm_cmpgt_epi32( xdiff, xth );
           uiBs  = _mm_testz_si128( xdiff, xdiff ) ? 0 : 1;
-#else
-          uiBs = ( ( abs( mvQ[1].getHor() - mvP[0].getHor() ) >= nThreshold ) || ( abs( mvQ[1].getVer() - mvP[0].getVer() ) >= nThreshold ) ||
-                   ( abs( mvQ[0].getHor() - mvP[1].getHor() ) >= nThreshold ) || ( abs( mvQ[0].getVer() - mvP[1].getVer() ) >= nThreshold ) )
-                 ? 1 : 0;
-#endif
+
+          xdiff = _mm_sub_epi32( xmvQ, xmvP );
+          xdiff = _mm_abs_epi32( xdiff );
+          xdiff = _mm_cmpgt_epi32( xdiff, xth );
+          uiBs &= _mm_testz_si128( xdiff, xdiff ) ? 0 : 1;
         }
       }
-      else    // Same L0 & L1
+      else
+#endif
       {
+        Mv mvP[2] = { { 0, 0 }, { 0, 0 } }, mvQ[2] = { { 0, 0 }, { 0, 0 } };
 
-#if defined( TARGET_SIMD_X86 ) && ENABLE_SIMD_DBLF
-        __m128i
-        xmvQ1 = _mm_shuffle_epi32( xmvQ, ( 2 << 0 ) + ( 3 << 2 ) + ( 0 << 4 ) + ( 1 << 6 ) );
-        __m128i
-        xdiff = _mm_sub_epi32( xmvQ1, xmvP );
-        xdiff = _mm_abs_epi32( xdiff );
-        xdiff = _mm_cmpgt_epi32( xdiff, xth );
-        uiBs  = _mm_testz_si128( xdiff, xdiff ) ? 0 : 1;
-
-        xdiff = _mm_sub_epi32( xmvQ, xmvP );
-        xdiff = _mm_abs_epi32( xdiff );
-        xdiff = _mm_cmpgt_epi32( xdiff, xth );
-        uiBs &= _mm_testz_si128( xdiff, xdiff ) ? 0 : 1;
-#else
-        uiBs = ( ( abs( mvQ[0].getHor() - mvP[0].getHor() ) >= nThreshold ) || ( abs( mvQ[0].getVer() - mvP[0].getVer() ) >= nThreshold ) ||
-                 ( abs( mvQ[1].getHor() - mvP[1].getHor() ) >= nThreshold ) || ( abs( mvQ[1].getVer() - mvP[1].getVer() ) >= nThreshold ) )
-               &&
-               ( ( abs( mvQ[1].getHor() - mvP[0].getHor() ) >= nThreshold ) || ( abs( mvQ[1].getVer() - mvP[0].getVer() ) >= nThreshold ) ||
-                 ( abs( mvQ[0].getHor() - mvP[1].getHor() ) >= nThreshold ) || ( abs( mvQ[0].getVer() - mvP[1].getVer() ) >= nThreshold ) )
+        if( refP0valid ) { mvP[0] = miP.mv[0]; }
+        if( refP1valid ) { mvP[1] = miP.mv[1]; }
+        if( refQ0valid ) { mvQ[0] = miQ.mv[0]; }
+        if( refQ1valid ) { mvQ[1] = miQ.mv[1]; }
+
+        if( piRefP0 != piRefP1 )   // Different L0 & L1
+        {
+          if( piRefP0 == piRefQ0 )
+          {
+            uiBs = ( ( abs( mvQ[0].getHor() - mvP[0].getHor() ) >= nThreshold ) || ( abs( mvQ[0].getVer() - mvP[0].getVer() ) >= nThreshold ) ||
+                     ( abs( mvQ[1].getHor() - mvP[1].getHor() ) >= nThreshold ) || ( abs( mvQ[1].getVer() - mvP[1].getVer() ) >= nThreshold ) )
+                 ? 1 : 0;
+          }
+          else
+          {
+            uiBs = ( ( abs( mvQ[1].getHor() - mvP[0].getHor() ) >= nThreshold ) || ( abs( mvQ[1].getVer() - mvP[0].getVer() ) >= nThreshold ) ||
+                     ( abs( mvQ[0].getHor() - mvP[1].getHor() ) >= nThreshold ) || ( abs( mvQ[0].getVer() - mvP[1].getVer() ) >= nThreshold ) )
+                 ? 1 : 0;
+          }
+        }
+        else
+        {
+          uiBs = ( ( abs( mvQ[0].getHor() - mvP[0].getHor() ) >= nThreshold ) || ( abs( mvQ[0].getVer() - mvP[0].getVer() ) >= nThreshold ) ||
+                   ( abs( mvQ[1].getHor() - mvP[1].getHor() ) >= nThreshold ) || ( abs( mvQ[1].getVer() - mvP[1].getVer() ) >= nThreshold ) )
+                  &&
+                 ( ( abs( mvQ[1].getHor() - mvP[0].getHor() ) >= nThreshold ) || ( abs( mvQ[1].getVer() - mvP[0].getVer() ) >= nThreshold ) ||
+                   ( abs( mvQ[0].getHor() - mvP[1].getHor() ) >= nThreshold ) || ( abs( mvQ[0].getVer() - mvP[1].getVer() ) >= nThreshold ) )
                ? 1 : 0;
-#endif
+        }
       }
     }
     else // for all different Ref_Idx
diff --git a/source/Lib/CommonLib/LoopFilter.h b/source/Lib/CommonLib/LoopFilter.h
index 372d5464..aca6bb4c 100644
--- a/source/Lib/CommonLib/LoopFilter.h
+++ b/source/Lib/CommonLib/LoopFilter.h
@@ -76,7 +76,7 @@ class LoopFilter
 
   // filtering functions
   template<DeblockEdgeDir edgeDir>
-  void xGetBoundaryStrengthSingle ( LoopFilterParam& lfp, const CodingUnit& cu, const Position &localPos, const CodingUnit &cuP, CtuData& ctuData, bool pqSameCtu ) const;
+  void xGetBoundaryStrengthSingle ( LoopFilterParam& lfp, const CodingUnit& cu, const Position &localPos, const CodingUnit &cuP, CtuData& ctuData, bool pqSameCtu, bool useSimd ) const;
   template<DeblockEdgeDir edgeDir>
   void xSetEdgeFilterInsidePu     ( const CodingUnit &cu, const Area &area, const bool bValue, CtuData& ctuData ) const;
 
diff --git a/source/Lib/CommonLib/UnitTools.cpp b/source/Lib/CommonLib/UnitTools.cpp
index b1b387fe..b40df742 100644
--- a/source/Lib/CommonLib/UnitTools.cpp
+++ b/source/Lib/CommonLib/UnitTools.cpp
@@ -2737,42 +2737,30 @@ void PU::setAllAffineMv( CodingUnit& cu, Mv affLT, Mv affRT, Mv affLB, RefPicLis
   height >>= MIN_CU_LOG2;
 
 #if ENABLE_SIMD_OPT && defined( TARGET_SIMD_X86 )
-  __m128i xvbase = _mm_setr_epi32( mvScaleHor, mvScaleVer, mvScaleHor, mvScaleVer );
-  __m128i xvdvxy = _mm_setr_epi32( deltaMvVerX, deltaMvVerY, deltaMvVerX, deltaMvVerY );
-  __m128i xhdhxy = _mm_setr_epi32( deltaMvHorX, deltaMvHorY, deltaMvHorX, deltaMvHorY );
-
-#endif
-  for( int h = 0; h < height; h++ )
+  if( !subblkMVSpreadOverLimit && read_x86_extension_flags() > x86_simd::SCALAR )
   {
-#if ENABLE_SIMD_OPT && defined( TARGET_SIMD_X86 )
+    __m128i xvbase = _mm_setr_epi32( mvScaleHor, mvScaleVer, mvScaleHor, mvScaleVer );
+    __m128i xvdvxy = _mm_setr_epi32( deltaMvVerX, deltaMvVerY, deltaMvVerX, deltaMvVerY );
+    __m128i xhdhxy = _mm_setr_epi32( deltaMvHorX, deltaMvHorY, deltaMvHorX, deltaMvHorY );
+
+    for( int h = 0; h < height; h++ )
+    {
     __m128i
     xvoff = _mm_set1_epi32 ( halfBH + ( h << MIN_CU_LOG2 ) );
     xvoff = _mm_mullo_epi32( xvoff, xvdvxy );
     xvoff = _mm_add_epi32  ( xvoff, xvbase );
-#endif
-    if( subblkMVSpreadOverLimit )
-    {
-      for( int w = 0; w < width; w++ )
-      {
-        MotionInfo &mi = mb.at( w, h );
 
-        mi.mv[eRefList] = flbMv;
-      }
-    }
-    else
-    {
-#if ENABLE_SIMD_OPT && defined( TARGET_SIMD_X86 )
       for( int w = 0; w < width; w += 2 )
       {
         MotionInfo *mi = &mb.at( w, h );
 
         __m128i
-        xhoff = _mm_set1_epi32 ( 2 + ( w << MIN_CU_LOG2 ) );
+          xhoff = _mm_set1_epi32 ( 2 + ( w << MIN_CU_LOG2 ) );
         xhoff = _mm_add_epi32  ( xhoff, _mm_setr_epi32( 0, 0, 1 << MIN_CU_LOG2, 1 << MIN_CU_LOG2 ) );
         xhoff = _mm_mullo_epi32( xhoff, xhdhxy );
         xhoff = _mm_add_epi32  ( xhoff, xvoff );
         __m128i
-        xmv   = _mm_add_epi32  ( xhoff, _mm_set1_epi32( 1 << ( shift - 1 ) ) );
+          xmv   = _mm_add_epi32  ( xhoff, _mm_set1_epi32( 1 << ( shift - 1 ) ) );
         xmv   = _mm_add_epi32  ( xmv, _mm_cmpgt_epi32( xhoff, _mm_set1_epi32( -1 ) ) );
         xmv   = _mm_srai_epi32 ( xmv, shift );
         xmv   = _mm_max_epi32  ( _mm_set1_epi32( -( 1 << 17 ) ), _mm_min_epi32( _mm_set1_epi32( ( 1 << 17 ) - 1 ), xmv ) );
@@ -2780,22 +2768,39 @@ void PU::setAllAffineMv( CodingUnit& cu, Mv affLT, Mv affRT, Mv affLB, RefPicLis
         _mm_storeu_si64( ( __m128i* ) &mi[0].mv[eRefList], xmv );
         _mm_storeu_si64( ( __m128i* ) &mi[1].mv[eRefList], _mm_unpackhi_epi64( xmv, _mm_setzero_si128() ) );
       }
-#else
-      for( int w = 0; w < width; w++ )
+    }
+  }
+  else
+#endif
+  {
+    for( int h = 0; h < height; h++ )
+    {
+      if( subblkMVSpreadOverLimit )
+      {
+        for( int w = 0; w < width; w++ )
+        {
+          MotionInfo &mi = mb.at( w, h );
+
+          mi.mv[eRefList] = flbMv;
+        }
+      }
+      else
       {
-        MotionInfo &mi = mb.at( w, h );
+        for( int w = 0; w < width; w++ )
+        {
+          MotionInfo &mi = mb.at( w, h );
 
-        int mvHor = mvScaleHor + deltaMvHorX * ( 2 + ( w << MIN_CU_LOG2 ) ) + deltaMvVerX * ( halfBH + ( h << MIN_CU_LOG2 ) );
-        int mvVer = mvScaleVer + deltaMvHorY * ( 2 + ( w << MIN_CU_LOG2 ) ) + deltaMvVerY * ( halfBH + ( h << MIN_CU_LOG2 ) );
+          int mvHor = mvScaleHor + deltaMvHorX * ( 2 + ( w << MIN_CU_LOG2 ) ) + deltaMvVerX * ( halfBH + ( h << MIN_CU_LOG2 ) );
+          int mvVer = mvScaleVer + deltaMvHorY * ( 2 + ( w << MIN_CU_LOG2 ) ) + deltaMvVerY * ( halfBH + ( h << MIN_CU_LOG2 ) );
 
-        roundAffineMv( mvHor, mvVer, shift );
+          roundAffineMv( mvHor, mvVer, shift );
 
-        Mv rndMv( mvHor, mvVer );
-        rndMv.clipToStorageBitDepth();
+          Mv rndMv( mvHor, mvVer );
+          rndMv.clipToStorageBitDepth();
 
-        mi.mv[eRefList] = rndMv;
+          mi.mv[eRefList] = rndMv;
+        }
       }
-#endif
     }
   }
 
diff --git a/source/Lib/CommonLib/x86/CommonDefX86.h b/source/Lib/CommonLib/x86/CommonDefX86.h
index f2386485..f9a6da37 100644
--- a/source/Lib/CommonLib/x86/CommonDefX86.h
+++ b/source/Lib/CommonLib/x86/CommonDefX86.h
@@ -85,6 +85,13 @@ POSSIBILITY OF SUCH DAMAGE.
 #    include <simde/x86/sse4.1.h>
 #  endif
 
+#  if defined( REAL_TARGET_X86 ) \
+    || ( defined( SIMD_EVERYWHERE_EXTENSION_LEVEL_ID ) && SIMD_EVERYWHERE_EXTENSION_LEVEL_ID >= X86_SIMD_AVX2 )
+#    define ENABLE_AVX2_IMPLEMENTATIONS 1
+#  else   // neither REAL_TARGET_X86 nor a SIMD_EVERYWHERE_EXTENSION_LEVEL_ID of at least X86_SIMD_AVX2
+#    define ENABLE_AVX2_IMPLEMENTATIONS 0
+#  endif   // ENABLE_AVX2_IMPLEMENTATIONS is always defined (0 or 1) past this point, so test it with #if, not #ifdef
+
 namespace vvdec
 {
 using namespace x86_simd;
diff --git a/source/Lib/DecoderLib/DecLib.h b/source/Lib/DecoderLib/DecLib.h
index c38e271a..707ecd42 100644
--- a/source/Lib/DecoderLib/DecLib.h
+++ b/source/Lib/DecoderLib/DecLib.h
@@ -120,6 +120,8 @@ class DecLib
   unsigned int getUpscaledOutput() { return m_upscaledOutput; }
 #endif
 
+  ThreadPool& getThreadPool() { return *m_decodeThreadPool; }   // NOTE(review): dereferences m_decodeThreadPool without a check - confirm it is always set before this is called
+
 private:
   void     reconPicture( Picture* pcPic );
 #if JVET_R0270
diff --git a/source/Lib/FilmGrain/FilmGrain.cpp b/source/Lib/FilmGrain/FilmGrain.cpp
index dacc3ff4..66dd63d0 100644
--- a/source/Lib/FilmGrain/FilmGrain.cpp
+++ b/source/Lib/FilmGrain/FilmGrain.cpp
@@ -62,6 +62,10 @@ POSSIBILITY OF SUCH DAMAGE.
 
 #include "CommonDef.h"
 
+#if defined( TARGET_SIMD_X86 ) && defined( USE_SIMD )
+#  include "FilmGrainImplX86.h"
+#endif
+
 namespace vvdec
 {
 
@@ -547,19 +551,26 @@ static int same_pattern( fgs_sei* cfg, int32_t a, int32_t b )
   return 1;
 }
 
+void FilmGrain::set_seed( uint32_t seed )   // restarts the per-row seed state from a single value
+{
+  m_line_rnd = m_line_rnd_up = seed;   // both seeds start from `seed`; m_prev_frame_line_rnd_up is not touched here
+}
+
 /** Initialize "hardware" interface from FGS SEI parameters */
-void FilmGrain::init_sei( fgs_sei* cfg )
+void FilmGrain::init_sei()
 {
-  int8_t   P[64 * 64];
-  int8_t   Lbuf[73 * 82];
-  int8_t   Cbuf[38 * 44];
-  uint8_t  slut[256];
-  uint8_t  plut[256];
-  uint8_t  intensities[VFGS_MAX_PATTERNS];
-  uint32_t patterns[VFGS_MAX_PATTERNS];
-  uint8_t  np = 0;   // number of patterns
-  uint8_t  a, b, i;
-  int      c, k;
+  int8_t        P[64 * 64];
+  int8_t        Lbuf[73 * 82];
+  int8_t        Cbuf[38 * 44];
+  uint8_t       slut[256];
+  uint8_t       plut[256];
+  uint8_t       intensities[VFGS_MAX_PATTERNS];
+  uint32_t      patterns[VFGS_MAX_PATTERNS];
+  uint8_t       np = 0;   // number of patterns
+  uint8_t       a, b, i;
+  unsigned char all0 = 1;
+
+  int c, k;
 
   for( c = 0; c < 3; c++ )
   {
@@ -571,16 +582,16 @@ void FilmGrain::init_sei( fgs_sei* cfg )
       memset( patterns, ~0, sizeof( patterns ) );
     }
     // 1. Look for different patterns, up to max supported number
-    if( cfg->comp_model_present_flag[c] )
+    if( fgs.comp_model_present_flag[c] )
     {
-      for( k = 0; k < cfg->num_intensity_intervals[c]; k++ )
+      for( k = 0; k < fgs.num_intensity_intervals[c]; k++ )
       {
-        a           = cfg->intensity_interval_lower_bound[c][k];
+        a           = fgs.intensity_interval_lower_bound[c][k];
         uint32_t id = SEI_MAX_MODEL_VALUES * ( k + 256 * c );
 
         for( i = 0; i < VFGS_MAX_PATTERNS; i++ )
         {
-          if( same_pattern( cfg, patterns[i], id ) )
+          if( same_pattern( &fgs, patterns[i], id ) )
           {
             break;
           }
@@ -613,51 +624,50 @@ void FilmGrain::init_sei( fgs_sei* cfg )
       // 2. Register the patterns (with correct order)
       for( i = 0; i < np; i++ )
       {
-        int16_t* coef = &cfg->comp_model_value[0][0][0] + patterns[i];
+        int16_t* coef = &fgs.comp_model_value[0][0][0] + patterns[i];
 
         if( c == 0 )
         {
-          if( cfg->model_id )
+          if( fgs.model_id )
           {
-            make_ar_pattern( Lbuf, P, 64, coef, 6, 1, cfg->log2_scale_factor, Seed_LUT[0] );
+            make_ar_pattern( Lbuf, P, 64, coef, 6, 1, fgs.log2_scale_factor, Seed_LUT[0] );
           }
           else
           {
             make_sei_ff_pattern64( (int8_t( * )[64]) P, coef[1], coef[2] );
           }
 
-          set_luma_pattern( i, P );
+          m_impl->set_luma_pattern( i, P );
         }
         else if( c == 2 )
         {
-          if( cfg->model_id )
+          if( fgs.model_id )
           {
-            make_ar_pattern( Cbuf, P, 32, coef, 6, 1, cfg->log2_scale_factor, Seed_LUT[1] );
+            make_ar_pattern( Cbuf, P, 32, coef, 6, 1, fgs.log2_scale_factor, Seed_LUT[1] );
           }
           else
           {
             make_sei_ff_pattern32( (int8_t( * )[32]) P, coef[1], coef[2] );
           }
-
-          set_chroma_pattern( i, P );
+          m_impl->set_chroma_pattern( i, P );
         }
       }
       // 3. Fill up LUTs
       for( int cc = std::min( c, 1 ); cc <= c; cc++ )
       {
-        if( cfg->comp_model_present_flag[cc] )
+        if( fgs.comp_model_present_flag[cc] )
         {
           memset( plut, 255, sizeof( plut ) );
           // 3a. Fill valid patterns
-          for( k = 0; k < cfg->num_intensity_intervals[cc]; k++ )
+          for( k = 0; k < fgs.num_intensity_intervals[cc]; k++ )
           {
-            a           = cfg->intensity_interval_lower_bound[cc][k];
-            b           = cfg->intensity_interval_upper_bound[cc][k];
+            a           = fgs.intensity_interval_lower_bound[cc][k];
+            b           = fgs.intensity_interval_upper_bound[cc][k];
             uint32_t id = SEI_MAX_MODEL_VALUES * ( k + 256 * cc );
 
             for( i = 0; i < VFGS_MAX_PATTERNS; i++ )
             {
-              if( same_pattern( cfg, patterns[i], id ) )
+              if( same_pattern( &fgs, patterns[i], id ) )
               {
                 break;
               }
@@ -666,7 +676,7 @@ void FilmGrain::init_sei( fgs_sei* cfg )
 
             for( int l = a; l <= b; l++ )
             {
-              slut[l] = (uint8_t) cfg->comp_model_value[cc][k][0];
+              slut[l] = (uint8_t) fgs.comp_model_value[cc][k][0];
               if( i < VFGS_MAX_PATTERNS )
               {
                 plut[l] = i << 4;
@@ -674,7 +684,8 @@ void FilmGrain::init_sei( fgs_sei* cfg )
             }
           }
           // 3b. Fill holes (no interp. yet, just repeat last)
-          i = 0;
+          i       = 0;
+          int tmp = 0;
           for( k = 0; k < 256; k++ )
           {
             if( plut[k] == 255 )
@@ -685,25 +696,39 @@ void FilmGrain::init_sei( fgs_sei* cfg )
             {
               i = plut[k];
             }
+            tmp += plut[k];
+          }
+          if( tmp != 0 )
+          {
+            all0 = 0;
           }
         }
         else
         {
           memset( plut, 0, sizeof( plut ) );
+          all0 = 1;
         }
         // 3c. Register LUTs
-        set_scale_lut( cc, slut );
-        set_pattern_lut( cc, plut );
+        m_impl->set_scale_lut( cc, slut );
+        m_impl->set_pattern_lut( cc, plut, all0 );
       }
     }
   }
 
-  set_scale_shift( cfg->log2_scale_factor - ( cfg->model_id ? 1 : 0 ) );   // -1 for grain shift in pattern generation (see above)
+  m_impl->set_scale_shift( fgs.log2_scale_factor - ( fgs.model_id ? 1 : 0 ) );   // -1 for grain shift in pattern generation (see above)
+}
+
+FilmGrain::FilmGrain()   // chooses the grain implementation once, at construction
+#if defined( TARGET_SIMD_X86 ) && defined( USE_SIMD )
+  : m_impl( FilmGrainImplX86<UNDEFINED>::makeFilmGrainImpl() )   // x86 SIMD build: factory from FilmGrainImplX86.h
+#else
+  : m_impl( std::make_unique<FilmGrainImpl>() )                  // otherwise: the plain FilmGrainImpl
+#endif
+{
+}
 
 void FilmGrain::updateFGC( vvdecSEIFilmGrainCharacteristics* fgc )
 {
-  fgs_sei fgs;   // TODO: maybe make it a member ? (idea would be to re-seed patterns for each picture)
   // Copy SEI message in vfgs structure format
   // TODO: check some values and warn about unsupported stuff ?
   fgs.model_id          = fgc->filmGrainModelId;
@@ -758,12 +783,87 @@ void FilmGrain::updateFGC( vvdecSEIFilmGrainCharacteristics* fgc )
     }
   }
 
-  init_sei( &fgs );
+  init_sei();
 
   //  if (!m_bFgs)
   //    // TODO: get something random
   //      // TODO: make seed also impact the pattern gen
-  //    vfgs_set_seed(uint32_t seed);
+  //    set_seed(uint32_t seed);
+}
+
+void FilmGrain::prepareBlockSeeds( int width, int height )
+{
+  // One seed per row of 16 lines; add_grain_line() looks them up by y / 16.
+  m_line_seeds.resize( ( height + 15 ) / 16 );
+  // Save the upper-row seed as it was before this frame (add_grain_line() uses it for y < 16).
+  m_prev_frame_line_rnd_up = m_line_rnd_up;
+  const int blocksPerRow = ( width + 15 ) / 16;   // loop-invariant, hoisted out of the row loop
+  uint32_t  rnd          = 0;
+  for( size_t y = 0; y < m_line_seeds.size(); ++y )   // size_t matches size(): no signed/unsigned comparison
+  {
+    // Generate / backup / restore per-line random seeds (needed to make multi-line blocks)
+    if( y != 0 )
+    {
+      // new line of blocks
+      m_line_rnd_up = m_line_rnd;
+      m_line_rnd    = rnd;
+    }
+    m_line_seeds[y] = m_line_rnd;
+
+    // Crank random generator
+    rnd = m_line_rnd;
+    for( int x = 0; x < blocksPerRow; ++x )
+    {
+      rnd = prng( rnd );
+    }
+  }
+}
+
+void FilmGrain::setColorFormat( vvdecColorFormat fmt )
+{
+  switch( fmt )   // forwards the pair of chroma subsampling arguments used for each planar format
+  {
+    // clang-format off
+    case VVDEC_CF_YUV400_PLANAR: m_impl->set_chroma_subsampling( 0, 0 ); break;
+    case VVDEC_CF_YUV420_PLANAR: m_impl->set_chroma_subsampling( 2, 2 ); break;
+    case VVDEC_CF_YUV422_PLANAR: m_impl->set_chroma_subsampling( 2, 1 ); break;
+    case VVDEC_CF_YUV444_PLANAR: m_impl->set_chroma_subsampling( 1, 1 ); break;
+    default: THROW_FATAL( "invalid color format" );   // any other value is rejected; message no longer ends in a dangling ": "
+    // clang-format on
+  }
+}
+
+void FilmGrain::add_grain_line( void* Y, void* U, void* V, int y, int width )
+{
+  // Seeds of the block row that contains line y and of the row above it (overlap);
+  // for y < 16 the upper seed is the one saved by prepareBlockSeeds() from the previous frame.
+  const int blockRow = y / 16;
+  uint32_t  seedUp   = y < 16 ? m_prev_frame_line_rnd_up : m_line_seeds[blockRow - 1];
+  uint32_t  seedCur  = m_line_seeds[blockRow];
+
+  // Work buffers handed to every add_grain_block() call of this line
+  int16_t grainBuf[3][32];
+  uint8_t scaleBuf[3][32];
+
+  // A component is processed only if its model is present; chroma also needs both planes
+  const bool doLuma = fgs.comp_model_present_flag[0];
+  const bool doCb   = U && V && fgs.comp_model_present_flag[1];
+  const bool doCr   = U && V && fgs.comp_model_present_flag[2];
+
+  // Walk the line in steps of 16 samples
+  for( int x = 0; x < width; x += 16 )
+  {
+    if( doLuma ) { m_impl->add_grain_block( Y, 0, x, y, width, seedCur, seedUp, grainBuf, scaleBuf ); }
+    if( doCb )   { m_impl->add_grain_block( U, 1, x, y, width, seedCur, seedUp, grainBuf, scaleBuf ); }
+    if( doCr )   { m_impl->add_grain_block( V, 2, x, y, width, seedCur, seedUp, grainBuf, scaleBuf ); }
+
+    // Crank both random generators on to the next block
+    seedCur = prng( seedCur );
+    seedUp  = prng( seedUp );   // upper block (overlapping)
+  }
+  // Note: the seeds are local copies, so the members prepared by prepareBlockSeeds() are left
+  // unchanged by this call.
+  // Requires prepareBlockSeeds() to have sized m_line_seeds for this y (the lookup is unchecked).
+}
 
 }   // namespace vvdec
diff --git a/source/Lib/FilmGrain/FilmGrain.h b/source/Lib/FilmGrain/FilmGrain.h
index 73303d92..d03a71bd 100644
--- a/source/Lib/FilmGrain/FilmGrain.h
+++ b/source/Lib/FilmGrain/FilmGrain.h
@@ -59,8 +59,13 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "FilmGrainImpl.h"
 
 #include <cstring>
+#include <vector>
+#include <memory>
 
 #include "vvdec/sei.h"
+#include "vvdec/vvdec.h"
+
+#define USE_SIMD
 
 namespace vvdec
 {
@@ -81,18 +86,31 @@ struct fgs_sei
   int16_t  comp_model_value[3][256][SEI_MAX_MODEL_VALUES];
 };
 
-class FilmGrain : public FilmGrainImpl
+class FilmGrain
 {
+  std::unique_ptr<FilmGrainImpl> m_impl;   // NOTE(review): may own a derived (SIMD) implementation - confirm FilmGrainImpl has a virtual destructor
+
+  uint32_t m_line_rnd               = 0xdeadbeef;   // seed of the block row currently being generated
+  uint32_t m_line_rnd_up            = 0xdeadbeef;   // seed of the block row above it
+  uint32_t m_prev_frame_line_rnd_up = 0xdeadbeef;   // m_line_rnd_up as it was before the current frame's seeds were generated
+
+  std::vector<uint32_t> m_line_seeds;   // one seed per row of 16 lines, filled by prepareBlockSeeds()
+  fgs_sei               fgs{};          // value-initialised: add_grain_line() reads its flags, which must not be indeterminate before updateFGC()
+
 public:
-  FilmGrain( int depth, int chromaSubsampling )
-  {
-    set_depth( depth );
-    set_chroma_subsampling( chromaSubsampling, chromaSubsampling );
-  }
+  FilmGrain();
+  ~FilmGrain() = default;
+
   void updateFGC( vvdecSEIFilmGrainCharacteristics* fgc );
+  void setDepth( int depth ) { m_impl->set_depth( depth ); }
+  void setColorFormat( vvdecColorFormat fmt );
+  void prepareBlockSeeds( int width, int height );
+
+  void add_grain_line( void* Y, void* U, void* V, int y, int width );
 
 private:
-  void init_sei( fgs_sei* cfg );
+  void set_seed( uint32_t seed );
+  void init_sei();
 };
 
 }   // namespace vvdec
diff --git a/source/Lib/FilmGrain/FilmGrainImpl.cpp b/source/Lib/FilmGrain/FilmGrainImpl.cpp
index 85858487..8cba3a1a 100644
--- a/source/Lib/FilmGrain/FilmGrainImpl.cpp
+++ b/source/Lib/FilmGrain/FilmGrainImpl.cpp
@@ -61,8 +61,6 @@ POSSIBILITY OF SUCH DAMAGE.
 
 #include <CommonDef.h>
 
-#define PATTERN_INTERPOLATION 0
-
 namespace vvdec
 {
 
@@ -84,7 +82,7 @@ namespace vvdec
  * Note: to fully support cross-component correlation within patterns, we would
  * need to align luma/chroma offsets.
  */
-static void get_offset_y( uint32_t val, int* s, uint8_t* x, uint8_t* y )
+void FilmGrainImpl::get_offset_y( uint32_t val, int* s, uint8_t* x, uint8_t* y )
 {
   uint32_t bf;   // bit field
 
@@ -99,7 +97,7 @@ static void get_offset_y( uint32_t val, int* s, uint8_t* x, uint8_t* y )
                                     // pattern samples (when using overlap).
 }
 
-void FilmGrainImpl::get_offset_u( uint32_t val, int* s, uint8_t* x, uint8_t* y )
+void FilmGrainImpl::get_offset_u( uint32_t val, int* s, uint8_t* x, uint8_t* y ) const
 {
   uint32_t bf;   // bit field
 
@@ -112,7 +110,7 @@ void FilmGrainImpl::get_offset_u( uint32_t val, int* s, uint8_t* x, uint8_t* y )
   *y = ( ( bf * 12 ) >> 10 ) * ( 4 / csuby );
 }
 
-void FilmGrainImpl::get_offset_v( uint32_t val, int* s, uint8_t* x, uint8_t* y )
+void FilmGrainImpl::get_offset_v( uint32_t val, int* s, uint8_t* x, uint8_t* y ) const
 {
   uint32_t bf;   // bit field
 
@@ -125,29 +123,10 @@ void FilmGrainImpl::get_offset_v( uint32_t val, int* s, uint8_t* x, uint8_t* y )
   *y = ( ( bf * 12 ) >> 10 ) * ( 4 / csuby );
 }
 
-void FilmGrainImpl::add_grain_block( void* I, int c, int x, int y, int width )
+void FilmGrainImpl::add_grain_block( void* I, int c, int x, int y, int width, uint32_t rnd, uint32_t rnd_up, int16_t grain[3][32], uint8_t scale[3][32] ) const
 {
-  uint8_t*  I8  = (uint8_t*) I;
-  uint16_t* I16 = (uint16_t*) I;
-
-  int     s, s_up;        // random sign flip (current + upper row)
-  uint8_t ox, oy;         // random offset (current)
-  uint8_t ox_up, oy_up;   // random offset (upper row)
-  uint8_t oc1, oc2;       // overlapping coefficients
-  uint8_t pi;             // pattern index integer part
-  int     i, j;
-  int     P;              // Pattern sample (from current pattern index)
-#if PATTERN_INTERPOLATION
-  int     Pn;             // Next-pattern sample (from pattern index+1)
-  uint8_t pf;             // pattern index fractional part
-#endif
-
-  uint8_t intensity;
-  int     flush = 0;
-  int     subx  = c ? csubx : 1;
-  int     suby  = c ? csuby : 1;
-  uint8_t I_min = c ? C_min : Y_min;
-  uint8_t I_max = c ? C_max : Y_max;
+  const int subx = c ? csubx : 1;
+  const int suby = c ? csuby : 1;
 
   if( ( y & 1 ) && suby > 1 )
   {
@@ -161,8 +140,9 @@ void FilmGrainImpl::add_grain_block( void* I, int c, int x, int y, int width )
 
   // TODO: assert subx, suby, Y/C min/max, max pLUT values, etc
 
-  j = y & 0xf;
+  const int j = y & 0xf;
 
+  uint8_t oc1, oc2;                 // overlapping coefficients
   if( y > 15 && j == 0 )            // first line of overlap
   {
     oc1 = ( suby > 1 ) ? 20 : 12;   // current
@@ -179,6 +159,8 @@ void FilmGrainImpl::add_grain_block( void* I, int c, int x, int y, int width )
   }
 
   // Derive block offsets + sign
+  int     s;        // random sign flip (current)
+  uint8_t ox, oy;   // random offset (current)
   if( c == 0 )
   {
     get_offset_y( rnd, &s, &ox, &oy );
@@ -194,6 +176,8 @@ void FilmGrainImpl::add_grain_block( void* I, int c, int x, int y, int width )
   oy += j / suby;
 
   // Same for upper block (overlap)
+  int     s_up;           // random sign flip (upper row)
+  uint8_t ox_up, oy_up;   // random offset (upper row)
   if( c == 0 )
   {
     get_offset_y( rnd_up, &s_up, &ox_up, &oy_up );
@@ -209,78 +193,124 @@ void FilmGrainImpl::add_grain_block( void* I, int c, int x, int y, int width )
   oy_up += ( 16 + j ) / suby;
 
   // Make grain pattern
-  for( i = 0; i < 16 / subx; i++ )
+  make_grain_pattern( I, c, x, subx, oc1, oc2, ox, ox_up, oy, oy_up, s, s_up, grain, scale );
+
+  // Scale & output
+  scale_and_output( I, c, x, subx, width, grain, scale );
+}
+
+void FilmGrainImpl::make_grain_pattern( const void* I,                      // sample row; read as uint16_t when bs != 0, as uint8_t otherwise
+                                        int         c,                      // component: indexes pLUT/sLUT/grain/scale; c != 0 selects pattern set 1
+                                        int         x,                      // block position; first sample read is I[x / subx]
+                                        int         subx,                   // horizontal divisor: 16 / subx samples are produced
+                                        uint8_t     oc1,                    // overlap weight of the current sample; 0 disables the overlap blend
+                                        uint8_t     oc2,                    // overlap weight of the upper sample
+                                        uint8_t     ox,                     // pattern column offset (current)
+                                        uint8_t     ox_up,                  // pattern column offset (upper)
+                                        uint8_t     oy,                     // pattern row offset (current)
+                                        uint8_t     oy_up,                  // pattern row offset (upper)
+                                        int         s,                      // factor multiplied into the current pattern sample
+                                        int         s_up,                   // factor multiplied into the upper pattern sample
+                                        int16_t     grain[3][32],           // out: grain[c][16 / subx + i] receives the grain sample
+                                        uint8_t     scale[3][32] ) const    // out: scale[c][16 / subx + i] receives sLUT[c][intensity]
+{
+  const uint8_t*  I8  = (const uint8_t*) I;
+  const uint16_t* I16 = (const uint16_t*) I;
   {
-    intensity = bs ? I16[x / subx + i] >> bs : I8[x / subx + i];
-    pi        = pLUT[c][intensity] >> 4;   // pattern index (integer part)
+    for( int i = 0; i < 16 / subx; i++ )
+    {
+      uint8_t intensity = bs ? I16[x / subx + i] >> bs : I8[x / subx + i];   // sample reduced by bs bits, truncated to uint8_t
+      uint8_t pi        = pLUT[c][intensity] >> 4;                  // pattern index (integer part)
+      int     P         = pattern[c ? 1 : 0][pi][oy][ox + i] * s;   // Pattern sample (from current pattern index)
+                                                                    // We could consider just XORing the sign bit
 #if PATTERN_INTERPOLATION
-    pf = pLUT[c][intensity] & 15;          // fractional part (interpolate with next) -- could restrict to less bits (e.g. 2)
+      uint8_t pf = pLUT[c][intensity] & 15;           // pattern index fractional part (interpolate with next) -- could restrict to less bits (e.g. 2)
+      int     Pn =
+        pattern[c ? 1 : 0][pi + 1][oy][ox + i] * s;   // Next-pattern sample (from pattern index+1)
+                                                      // But there are equivalent hw tricks, e.g. storing values as sign + amplitude instead of two's complement
 #endif
 
-    // Pattern
-    P = pattern[c ? 1 : 0][pi][oy][ox + i] * s;   // We could consider just XORing the sign bit
+      if( oc1 )   // overlap: blend with the sample addressed by the upper offsets, weights oc1 / oc2
+      {
+        P = round( P * oc1 + pattern[c ? 1 : 0][pi][oy_up][ox_up + i] * oc2 * s_up, 5 );
 #if PATTERN_INTERPOLATION
-    Pn =
-      pattern[c ? 1 : 0][pi + 1][oy][ox + i] * s;   // But there are equivalent hw tricks, e.g. storing values as sign + amplitude instead of two's complement
+        Pn = round( Pn * oc1 + pattern[c ? 1 : 0][pi + 1][oy_up][ox_up + i] * oc2 * s_up, 5 );
 #endif
-
-    if( oc1 )   // overlap
-    {
-      P = round( P * oc1 + pattern[c ? 1 : 0][pi][oy_up][ox_up + i] * oc2 * s_up, 5 );
+      }
 #if PATTERN_INTERPOLATION
-      Pn = round( Pn * oc1 + pattern[c ? 1 : 0][pi + 1][oy_up][ox_up + i] * oc2 * s_up, 5 );
+      // Pattern interpolation: P is current, Pn is next, pf is interpolation coefficient
+      grain[c][16 / subx + i] = round( P * ( 16 - pf ) + Pn * pf, 4 );
+#else
+      grain[c][16 / subx + i] = P;
 #endif
+      // Scale sign already integrated above because of overlap
+      scale[c][16 / subx + i] = sLUT[c][intensity];
     }
+  }
+}
 
-#if PATTERN_INTERPOLATION
-    // Pattern interpolation: P is current, Pn is next, pf is interpolation coefficient
-    grain[c][16 / subx + i] = round( P * ( 16 - pf ) + Pn * pf, 4 );
-#else
-    grain[c][16 / subx + i] = P;
-#endif
+void FilmGrainImpl::scale_and_output( void* I, int c, int x, int subx, int width, int16_t grain[3][32], uint8_t scale[3][32] ) const
+{
+  uint8_t*  I8  = (uint8_t*) I;
+  uint16_t* I16 = (uint16_t*) I;
 
-    // Scale sign already integrated above because of overlap
-    scale[c][16 / subx + i] = sLUT[c][intensity];
-  }
+  const uint8_t I_min = c ? C_min : Y_min;
+  const uint8_t I_max = c ? C_max : Y_max;
 
-  // Scale & output
+  int flush = 0;
   do
   {
     if( x > 0 )
     {
-      int32_t g;
-      int16_t l1, l0, r0, r1;
-
       if( !flush )
       {
         // Horizontal deblock (across previous block)
-        l1                      = grain[c][16 / subx - 2];
-        l0                      = grain[c][16 / subx - 1];
-        r0                      = grain[c][16 / subx + 0];
-        r1                      = grain[c][16 / subx + 1];
+        int16_t l1, l0, r0, r1;
+
+        l1 = grain[c][16 / subx - 2];
+        l0 = grain[c][16 / subx - 1];
+        r0 = grain[c][16 / subx + 0];
+        r1 = grain[c][16 / subx + 1];
+
         grain[c][16 / subx - 1] = round( l1 + 3 * l0 + r0, 2 );
         grain[c][16 / subx + 0] = round( l0 + 3 * r0 + r1, 2 );
       }
-      for( i = 0; i < 16 / subx; i++ )
       {
-        // Output previous block (or flush current)
-        g = round( scale[c][i] * (int16_t) grain[c][i], scale_shift );
-        if( bs )
-        {
-          I16[( x - 16 ) / subx + i] = std::max( I_min << bs, std::min( I_max << bs, I16[( x - 16 ) / subx + i] + g ) );
-        }
-        else
+        for( int i = 0; i < 16 / subx; i++ )
         {
-          I8[( x - 16 ) / subx + i] = std::max<int32_t>( I_min, std::min<int32_t>( I_max, I8[( x - 16 ) / subx + i] + g ) );
+          // Output previous block (or flush current)
+          int32_t g = round( scale[c][i] * (int16_t) grain[c][i], scale_shift );
+          if( bs )
+          {
+            I16[( x - 16 ) / subx + i] = std::max<int32_t>( I_min << bs, std::min<int32_t>( I_max << bs, I16[( x - 16 ) / subx + i] + g ) );
+          }
+          else
+          {
+            I8[( x - 16 ) / subx + i] = std::max<int32_t>( I_min, std::min<int32_t>( I_max, I8[( x - 16 ) / subx + i] + g ) );
+          }
         }
       }
     }
 
     // Shift pipeline
-    for( i = 0; i < 16 / subx && !flush; i++ )
+    if( !flush )
     {
-      grain[c][i] = grain[c][i + 16 / subx];
-      scale[c][i] = scale[c][i + 16 / subx];
+      if( c == 0 )
+      {
+        for( int i = 0; i < 16; i++ )
+        {
+          grain[0][i] = grain[0][i + 16];
+          scale[0][i] = scale[0][i + 16];
+        }
+      }
+      else
+      {
+        for( int i = 0; i < 8; i++ )
+        {
+          grain[c][i] = grain[c][i + 8];
+          scale[c][i] = scale[c][i + 8];
+        }
+      }
     }
 
     if( x + 16 >= width )
@@ -293,32 +323,6 @@ void FilmGrainImpl::add_grain_block( void* I, int c, int x, int y, int width )
 
 /* Public interface ***********************************************************/
 
-void FilmGrainImpl::add_grain_line( void* Y, void* U, void* V, int y, int width )
-{
-  // Generate / backup / restore per-line random seeds (needed to make multi-line blocks)
-  if( y && ( y & 0x0f ) == 0 )
-  {
-    // new line of blocks --> backup + copy current to upper
-    line_rnd_up = line_rnd;
-    line_rnd    = rnd;
-  }
-  rnd_up = line_rnd_up;
-  rnd    = line_rnd;
-
-  // Process line
-  for( int x = 0; x < width; x += 16 )
-  {
-    // Process pixels for each color component
-    add_grain_block( Y, 0, x, y, width );
-    add_grain_block( U, 1, x, y, width );
-    add_grain_block( V, 2, x, y, width );
-
-    // Crank random generator
-    rnd    = prng( rnd );
-    rnd_up = prng( rnd_up );   // upper block (overlapping)
-  }
-}
-
 void FilmGrainImpl::set_luma_pattern( int index, int8_t* P )
 {
   CHECK( index < 0 || index >= 8, "luma pattern index out of bounds" );
@@ -340,17 +344,13 @@ void FilmGrainImpl::set_scale_lut( int c, uint8_t lut[] )
   memcpy( sLUT[c], lut, 256 );
 }
 
-void FilmGrainImpl::set_pattern_lut( int c, uint8_t lut[] )
+void FilmGrainImpl::set_pattern_lut( int c, uint8_t lut[], bool all0 )   // installs the 256-entry pattern LUT of component c (0..2)
 {
   CHECK( c < 0 || c >= 3, "pattern lut idx out of bounds" );
+  allZero[c] = all0;   // stored unchecked; when set, the x86 kernels skip the pLUT lookup and read pattern 0 -- presumably callers pass true only for an all-zero lut, TODO confirm
   memcpy( pLUT[c], lut, 256 );
 }
 
-void FilmGrainImpl::set_seed( uint32_t seed )
-{
-  rnd = rnd_up = line_rnd = line_rnd_up = seed;
-}
-
 void FilmGrainImpl::set_scale_shift( int shift )
 {
   CHECK( shift < 2 || shift >= 8, "scale shift out of range" );
@@ -386,8 +386,6 @@ FilmGrainImpl::FilmGrainImpl()
   memset( pattern, 0, sizeof( pattern ) );
   memset( sLUT,    0, sizeof( sLUT ) );
   memset( pLUT,    0, sizeof( pLUT ) );
-  memset( grain,   0, sizeof( grain ) );
-  memset( scale,   0, sizeof( scale ) );
 }
 
 }   // namespace vvdec
diff --git a/source/Lib/FilmGrain/FilmGrainImpl.h b/source/Lib/FilmGrain/FilmGrainImpl.h
index 41150271..3a5727d6 100644
--- a/source/Lib/FilmGrain/FilmGrainImpl.h
+++ b/source/Lib/FilmGrain/FilmGrainImpl.h
@@ -59,6 +59,7 @@ POSSIBILITY OF SUCH DAMAGE.
 #include <cstdint>
 
 #define VFGS_MAX_PATTERNS 8
+#define PATTERN_INTERPOLATION 0
 
 namespace vvdec
 {
@@ -87,50 +88,64 @@ constexpr inline auto round( T a, uint8_t s )
 
 class FilmGrainImpl
 {
+protected:   // state is read directly by the FilmGrainImplX86 kernels
   // Note: declarations optimized for code readability; e.g. pattern storage in
   //       actual hardware implementation would differ significantly
   int8_t  pattern[2][VFGS_MAX_PATTERNS + 1][64][64];   // +1 to simplify interpolation code
   uint8_t sLUT[3][256];
   uint8_t pLUT[3][256];
 
-  uint32_t rnd         = 0xdeadbeef;
-  uint32_t rnd_up      = 0xdeadbeef;
-  uint32_t line_rnd    = 0xdeadbeef;
-  uint32_t line_rnd_up = 0xdeadbeef;
-  uint8_t  scale_shift = 5 + 6;
-  uint8_t  bs          = 0;   // bitshift = bitdepth - 8
-  int      csubx       = 2;
-  int      csuby       = 2;
+  uint8_t scale_shift = 5 + 6;   // shift passed to round() when scale * grain is formed in scale_and_output
+  uint8_t bs          = 0;   // bitshift = bitdepth - 8
+  int     csubx       = 2;   // horizontal subsampling used for c != 0
+  int     csuby       = 2;   // vertical subsampling used for c != 0
+  bool    allZero[3]  = { 0, 0, 0 };   // per-component hint stored by set_pattern_lut(); selects the pattern-0 path in the x86 kernels
 
   constexpr static uint8_t Y_min = 0;
   constexpr static uint8_t Y_max = 255;
   constexpr static uint8_t C_min = 0;
   constexpr static uint8_t C_max = 255;
 
-  // Processing pipeline (needs only 2 registers for each color actually, for horizontal deblocking)
-  int16_t grain[3][32];   // 9 bit needed because of overlap (has norm > 1)
-  uint8_t scale[3][32];
-
-  void get_offset_u( uint32_t val, int* s, uint8_t* x, uint8_t* y );
-  void get_offset_v( uint32_t val, int* s, uint8_t* x, uint8_t* y );
-  void add_grain_block( void* I, int c, int x, int y, int width );
+  static void get_offset_y( uint32_t val, int* s, uint8_t* x, uint8_t* y );         // derives sign and pattern offsets from val (luma)
+         void get_offset_u( uint32_t val, int* s, uint8_t* x, uint8_t* y ) const;   // same for c == 1
+         void get_offset_v( uint32_t val, int* s, uint8_t* x, uint8_t* y ) const;   // same for c == 2
 
-protected:
+public:
   FilmGrainImpl();
+  virtual ~FilmGrainImpl() = default;   // instances are owned through std::unique_ptr<FilmGrainImpl> (see FilmGrainImplX86::makeFilmGrainImpl)
 
+  void add_grain_block( void* I, int c, int x, int y, int width, uint32_t rnd, uint32_t rnd_up, int16_t grain[3][32], uint8_t scale[3][32] ) const;   // const: random values and the grain/scale pipeline buffers are supplied by the caller
   void set_luma_pattern( int index, int8_t* P );
   void set_chroma_pattern( int index, int8_t* P );
   void set_scale_lut( int c, uint8_t lut[] );
-  void set_pattern_lut( int c, uint8_t lut[] );
-
-  void set_seed( uint32_t seed );
+  void set_pattern_lut( int c, uint8_t lut[], bool all0 );   // all0 is copied into allZero[c]
   void set_scale_shift( int shift );
 
-public:
   void set_depth( int depth );
   void set_chroma_subsampling( int subx, int suby );
 
-  void add_grain_line( void* Y, void* U, void* V, int y, int width );
+private:   // the two stages called by add_grain_block(); FilmGrainImplX86 overrides both
+  virtual void make_grain_pattern( const void* I,       // stage 1: fills grain[c] / scale[c] starting at index 16 / subx
+                                   int         c,
+                                   int         x,
+                                   int         subx,
+                                   uint8_t     oc1,
+                                   uint8_t     oc2,
+                                   uint8_t     ox,
+                                   uint8_t     ox_up,
+                                   uint8_t     oy,
+                                   uint8_t     oy_up,
+                                   int         s,
+                                   int         s_up,
+                                   int16_t     grain[3][32],
+                                   uint8_t     scale[3][32] ) const;
+  virtual void scale_and_output( void*   I,   // stage 2: sample row, modified in place
+                                 int     c,
+                                 int     x,
+                                 int     subx,
+                                 int     width,
+                                 int16_t grain[3][32],
+                                 uint8_t scale[3][32] ) const;
 };
 
 }   // namespace vvdec
diff --git a/source/Lib/FilmGrain/FilmGrainImplX86.h b/source/Lib/FilmGrain/FilmGrainImplX86.h
new file mode 100644
index 00000000..a6643dc3
--- /dev/null
+++ b/source/Lib/FilmGrain/FilmGrainImplX86.h
@@ -0,0 +1,102 @@
+/* -----------------------------------------------------------------------------
+The copyright in this software is being made available under the Clear BSD
+License, included below. No patent rights, trademark rights and/or
+other Intellectual Property Rights other than the copyrights concerning
+the Software are granted under this license.
+
+The Clear BSD License
+
+Copyright (c) 2018-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVdeC Authors.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted (subject to the limitations in the disclaimer below) provided that
+the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from this
+software without specific prior written permission.
+
+NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+------------------------------------------------------------------------------------------- */
+
+#pragma once
+
+#include "FilmGrainImpl.h"
+
+#include <CommonDef.h>
+#include <CommonDefX86.h>
+
+#include <memory>
+
+namespace vvdec
+{
+
+template<X86_VEXT VEXT>   // VEXT: x86 SIMD level the two overrides below are specialized for
+class FilmGrainImplX86 : public FilmGrainImpl
+{
+public:
+  static std::unique_ptr<FilmGrainImpl> makeFilmGrainImpl();   // factory; defined in this header for VEXT == UNDEFINED only
+
+protected:   // bodies are explicit specializations for CURR_X86_VEXT in FilmGrainImpl_X86_SIMD.h
+  void make_grain_pattern( const void* I,
+                           int         c,
+                           int         x,
+                           int         subx,
+                           uint8_t     oc1,
+                           uint8_t     oc2,
+                           uint8_t     ox,
+                           uint8_t     ox_up,
+                           uint8_t     oy,
+                           uint8_t     oy_up,
+                           int         s,
+                           int         s_up,
+                           int16_t     grain[3][32],
+                           uint8_t     scale[3][32] ) const override;
+  void scale_and_output( void*   I,   //
+                         int     c,
+                         int     x,
+                         int     subx,
+                         int     width,
+                         int16_t grain[3][32],
+                         uint8_t scale[3][32] ) const override;
+};
+
+template<>   // factory: returns the implementation matching the value reported by read_x86_extension_flags()
+inline std::unique_ptr<FilmGrainImpl> FilmGrainImplX86<UNDEFINED>::makeFilmGrainImpl()
+{
+  switch( read_x86_extension_flags() )   // presumably the SIMD level detected at run time -- helper is defined outside this file
+  {
+  case AVX512:   // no dedicated AVX-512 kernels here: shares the AVX2 case
+  case AVX2:
+#if ENABLE_AVX2_IMPLEMENTATIONS
+    return std::make_unique<FilmGrainImplX86<AVX2>>();
+#endif   // with AVX2 kernels compiled out, control intentionally falls through to the SSE4.1 return below
+  case AVX:
+  case SSE42:
+  case SSE41:
+    return std::make_unique<FilmGrainImplX86<SSE41>>();
+  default:   // every other value: plain C++ base implementation
+    return std::make_unique<FilmGrainImpl>();
+  }
+}
+
+}   // namespace vvdec
diff --git a/source/Lib/FilmGrain/FilmGrainImpl_X86_SIMD.h b/source/Lib/FilmGrain/FilmGrainImpl_X86_SIMD.h
new file mode 100755
index 00000000..e8eb7249
--- /dev/null
+++ b/source/Lib/FilmGrain/FilmGrainImpl_X86_SIMD.h
@@ -0,0 +1,609 @@
+/* -----------------------------------------------------------------------------
+The copyright in this software is being made available under the Clear BSD
+License, included below. No patent rights, trademark rights and/or
+other Intellectual Property Rights other than the copyrights concerning
+the Software are granted under this license.
+
+The Clear BSD License
+
+Copyright (c) 2018-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVdeC Authors.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted (subject to the limitations in the disclaimer below) provided that
+the following conditions are met:
+
+  * Redistributions of source code must retain the above copyright notice,
+  this list of conditions and the following disclaimer.
+
+  * Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+  * Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from this
+  software without specific prior written permission.
+
+NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+------------------------------------------------------------------------------------------- */
+
+#include "FilmGrainImplX86.h"
+
+#include <algorithm>
+
+#include <CommonDef.h>
+
+#ifdef TARGET_SIMD_X86
+#  include <CommonDefX86.h>
+
+namespace vvdec
+{
+using namespace x86_simd;
+
+template<>   // SIMD version of FilmGrainImpl::make_grain_pattern: fills grain[c] / scale[c] for the block at x and must produce the same values as the scalar code
+void FilmGrainImplX86<CURR_X86_VEXT>::make_grain_pattern( const void* I,                      // sample row; uint16_t when bs != 0, uint8_t otherwise
+                                                          int         c,                      // component index (0 = luma kernel, otherwise chroma kernels)
+                                                          int         x,                      // block position in luma samples
+                                                          int         subx,                   // horizontal divisor; the chroma SIMD kernels only handle 2
+                                                          uint8_t     oc1,                    // overlap weight (current); 0 disables the blend
+                                                          uint8_t     oc2,                    // overlap weight (upper)
+                                                          uint8_t     ox,                     // pattern column offset (current)
+                                                          uint8_t     ox_up,                  // pattern column offset (upper)
+                                                          uint8_t     oy,                     // pattern row offset (current)
+                                                          uint8_t     oy_up,                  // pattern row offset (upper)
+                                                          int         s,                      // -1 negates the current pattern samples
+                                                          int         s_up,                   // -1 negates the upper pattern samples
+                                                          int16_t     grain[3][32],           // out: grain samples
+                                                          uint8_t     scale[3][32] ) const    // out: scale factors from sLUT
+{
+  const uint8_t*  I8  = (const uint8_t*) I;
+  const uint16_t* I16 = (const uint16_t*) I;
+  if( allZero[c] == 1 && ( c == 0 || subx == 2 ) )   // pattern index is always 0; chroma kernels below are hard-wired to subx == 2, anything else uses the scalar loop
+  {
+    if( c == 0 )
+    {
+      __m128i vP = _mm_lddqu_si128( (__m128i*) &pattern[0][0][oy][ox] );
+      if( s == -1 )
+      {
+        vP = _mm_sub_epi8( _mm_set1_epi8( 0 ), vP );
+      }
+#ifdef USE_AVX2
+      __m256i vmask = _mm256_set1_epi32(0xff);
+      __m128i tmp0;
+      __m128i tmp1;
+      __m256i vintensity;
+      if (bs)
+      {
+        vintensity = _mm256_lddqu_si256((__m256i*)&I16[x]);  //load 16 16 bit values
+        vintensity = _mm256_and_si256 (_mm256_sra_epi16 (vintensity, _mm_set_epi32 (0,0,0,bs)), _mm256_set1_epi16 (0xff));  // truncate to 8 bit like the scalar path, keeps the gather index inside the LUT
+        tmp0=_mm256_extracti128_si256 (vintensity,0);
+        tmp1=_mm256_extracti128_si256 (vintensity,1);
+      }
+      else
+      {
+        __m128i vintensity128 = _mm_lddqu_si128((__m128i*)&I8[x]);  //load 16 8 bit value
+        tmp0=_mm_cvtepi8_epi16 (vintensity128);
+        tmp1=_mm_cvtepi8_epi16 (_mm_bsrli_si128(vintensity128,8));
+        tmp0 = _mm_and_si128 (tmp0,_mm_set1_epi16(0xff));  // only 8 bit
+        tmp1 = _mm_and_si128 (tmp1,_mm_set1_epi16(0xff));  // only 8 bit
+        vintensity = _mm256_castsi128_si256 (vintensity128);
+      }
+      __m256i vindex0=_mm256_cvtepi16_epi32 (tmp0);
+      __m256i vindex1=_mm256_cvtepi16_epi32 (tmp1);
+
+      __m256i avP = _mm256_cvtepi8_epi16( vP );
+      if( oc1 )
+      {
+        __m256i avoc1 = _mm256_set1_epi16( oc1 );
+        __m256i avoc2 = _mm256_set1_epi16( oc2 );
+        // p*oc1
+        avP = _mm256_mullo_epi16( avP, avoc1 );   // max 16 Bit
+        // pattern * s_up
+        __m128i vP2 = _mm_lddqu_si128( (__m128i*) &pattern[0][0][oy_up][ox_up] );
+        if( s_up == -1 )
+        {
+          vP2 = _mm_sub_epi8( _mm_set1_epi8( 0 ), vP2 );
+        }
+        __m256i avP2 = _mm256_cvtepi8_epi16( vP2 );
+        // * oc2
+        avP2 = _mm256_mullo_epi16( avP2, avoc2 );
+        // add
+        avP = _mm256_add_epi16( avP, avP2 );
+        // round to 16 bit
+        __m256i avadd   = _mm256_set1_epi16( 1 << ( 5 - 1 ) );
+        __m128i avshift = _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, 5 );
+        avP             = _mm256_add_epi16( avP, avadd );
+        avP             = _mm256_sra_epi16( avP, avshift );
+      }
+      _mm256_storeu_si256( (__m256i*) &grain[c][16], avP );
+
+      __m256i vscale0 = _mm256_i32gather_epi32 ((int *)&sLUT[0][0], vindex0, 1);  // load 8 32 bit values
+      __m256i vscale1 = _mm256_i32gather_epi32 ((int *)&sLUT[0][0], vindex1, 1);  // load 8 32 bit values
+
+      vscale0 = _mm256_and_si256 (vscale0,vmask);
+      vscale1 = _mm256_and_si256 (vscale1,vmask);
+
+      vintensity = _mm256_packus_epi32 (vscale0, vscale1);
+      vscale0 = _mm256_permute4x64_epi64 (vintensity, 0x8);
+      vscale1 = _mm256_permute4x64_epi64 (vintensity, 0xd);
+      vscale0 = _mm256_packus_epi16 (vscale0, vscale1);
+      _mm_storeu_si128(( __m128i * )&scale[0][16],_mm256_castsi256_si128(vscale0));
+# else
+      __m128i vPlo = _mm_cvtepi8_epi16( vP );
+      __m128i vPhi = _mm_cvtepi8_epi16( _mm_bsrli_si128( vP, 8 ) );
+      if( oc1 )
+      {
+        __m128i voc1 = _mm_set1_epi16( oc1 );
+        __m128i voc2 = _mm_set1_epi16( oc2 );
+        // p*oc1
+        vPlo = _mm_mullo_epi16( vPlo, voc1 );   // max 16 Bit
+        vPhi = _mm_mullo_epi16( vPhi, voc1 );
+        // pattern * s_up
+        __m128i vP2 = _mm_lddqu_si128( (__m128i*) &pattern[0][0][oy_up][ox_up] );
+        if( s_up == -1 )
+        {
+          vP2 = _mm_sub_epi8( _mm_set1_epi8( 0 ), vP2 );
+        }
+        __m128i vP2lo = _mm_cvtepi8_epi16( vP2 );
+        __m128i vP2hi = _mm_cvtepi8_epi16( _mm_bsrli_si128( vP2, 8 ) );
+        // * oc2
+        vP2lo = _mm_mullo_epi16( vP2lo, voc2 );
+        vP2hi = _mm_mullo_epi16( vP2hi, voc2 );
+        // add
+        vPlo = _mm_add_epi16( vPlo, vP2lo );
+        vPhi = _mm_add_epi16( vPhi, vP2hi );
+        // round to 16 bit
+        __m128i vadd   = _mm_set1_epi16( 1 << ( 5 - 1 ) );
+        __m128i vshift = _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, 5 );
+        vPlo           = _mm_add_epi16( vPlo, vadd );
+        vPhi           = _mm_add_epi16( vPhi, vadd );
+        vPlo           = _mm_sra_epi16( vPlo, vshift );
+        vPhi           = _mm_sra_epi16( vPhi, vshift );
+      }
+      _mm_storeu_si128( (__m128i*) &grain[c][16], vPlo );
+      _mm_storeu_si128( (__m128i*) &grain[c][16 + 8], vPhi );
+      // Scale sign already integrated above because of overlap
+      //scale[0][16+i] = sLUT[0][intensity];
+      uint8_t intensity;
+      uint8_t *pscale=&scale[0][16];
+      const uint8_t *psLUT=sLUT[0];   // scale LUT; named so that it no longer shadows the pLUT member
+      if (bs)
+      {
+        const uint16_t *pI16 = I16+x;
+        for (int i=0; i<16; i++)
+        {
+          intensity =  *pI16++ >> bs ;
+          *pscale++ = psLUT[intensity];
+        }
+      }
+      else
+      {
+        const uint8_t *pI8 = I8+x;
+        for (int i=0; i<16; i++)
+        {
+          intensity =  *pI8++ ;
+          *pscale++ = psLUT[intensity];
+        }
+      }
+#endif
+    }   // Y
+    else
+    {   // U/V
+      __m128i vP;
+#ifdef USE_AVX2
+      __m256i vindex;
+      __m128i vintensity;
+      if (bs)
+      {
+        vintensity = _mm_lddqu_si128((__m128i*)&I16[x>>1]);  //load 8 16 bit values
+        vintensity = _mm_and_si128 (_mm_sra_epi16 (vintensity, _mm_set_epi32 (0,0,0,bs)), _mm_set1_epi16 (0xff));  // truncate to 8 bit like the scalar path, keeps the gather index inside the LUT
+      }
+      else
+      {
+        vintensity = _mm_loadu_si64(&I8[x>>1]);  //load 8 8 bit values
+        vintensity=_mm_cvtepi8_epi16 (vintensity);
+        vintensity = _mm_and_si128 (vintensity,_mm_set1_epi16(0xff));  // only 8 bit
+      }
+      vindex=_mm256_cvtepi16_epi32 (vintensity);
+#endif
+      vP = _mm_loadl_epi64( (__m128i*) &pattern[1][0][oy][ox] );
+
+      if( s == -1 )
+      {
+        vP = _mm_sub_epi8( _mm_set1_epi8( 0 ), vP );
+      }
+      __m128i vPlo = _mm_cvtepi8_epi16( vP );
+      if( oc1 )
+      {
+        __m128i voc1 = _mm_set1_epi16( oc1 );
+        __m128i voc2 = _mm_set1_epi16( oc2 );
+        // p*oc1
+        vPlo = _mm_mullo_epi16( vPlo, voc1 );   // max 16 Bit
+        // pattern * s_up
+        __m128i vP2 = _mm_loadl_epi64( (__m128i*) &pattern[c ? 1 : 0][0][oy_up][ox_up] );
+        if( s_up == -1 )
+        {
+          vP2 = _mm_sub_epi8( _mm_set1_epi8( 0 ), vP2 );
+        }
+        __m128i vP2lo = _mm_cvtepi8_epi16( vP2 );
+        // * oc2
+        vP2lo = _mm_mullo_epi16( vP2lo, voc2 );
+        // add
+        vPlo = _mm_add_epi16( vPlo, vP2lo );
+        // round to 16 bit
+        __m128i vadd   = _mm_set1_epi16( 1 << ( 5 - 1 ) );
+        __m128i vshift = _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, 5 );
+        vPlo           = _mm_add_epi16( vPlo, vadd );
+        vPlo           = _mm_sra_epi16( vPlo, vshift );
+      }
+      _mm_storeu_si128( (__m128i*) &grain[c][8], vPlo );
+#ifdef USE_AVX2
+      __m256i vmask = _mm256_set1_epi32(0xff);
+      __m256i vscale = _mm256_i32gather_epi32 ((int *)&sLUT[c][0], vindex, 1);  // load 8 32 bit values
+      vscale = _mm256_and_si256 (vscale,vmask);
+
+      vmask = _mm256_packus_epi32 (vscale, vscale);
+      vscale = _mm256_permute4x64_epi64 (vmask, 0x8);
+      vscale = _mm256_packus_epi16 (vscale, vscale);
+      _mm_storeu_si64(( __m128i * )&scale[c][8],_mm256_castsi256_si128(vscale));
+#else
+      uint8_t*       pscale = &scale[c][8];
+      const uint8_t* psLUT  = sLUT[c];   // scale LUT; named so that it no longer shadows the pLUT member
+      if (bs)
+      {
+        const uint16_t* pI16 = &I16[x >> 1];
+        for( int i = 0; i < 8; i++ )
+        {
+          uint8_t intensity = *pI16++ >> bs;
+          *pscale++         = psLUT[intensity];
+        }
+      }
+      else
+      {
+        const uint8_t* pI8 = &I8[x >> 1];
+        for( int i = 0; i < 8; i++ )
+        {
+          uint8_t intensity = *pI8++;
+          *pscale++         = psLUT[intensity];
+        }
+      }
+#endif
+    }
+  }
+#ifdef USE_AVX2
+  else if( c>0 && subx == 2 && allZero[c] == 0 )   // chroma with a general pattern LUT; kernel is hard-wired to 2:1 horizontal subsampling
+  {
+    __m128i vP;
+    __m128i vintensity;
+    __m256i vindex;
+    __m256i vmask = _mm256_set1_epi32(0xff);
+    if (bs)
+    {
+      vintensity = _mm_lddqu_si128((__m128i*)&I16[x>>1]);  //load 8 16 bit values
+      vintensity = _mm_sra_epi16 (vintensity, _mm_set_epi32 (0,0,0,bs));
+    }
+    else
+    {
+      vintensity = _mm_loadu_si64(&I8[x>>1]);  //load 8 8 bit values
+      vintensity=_mm_cvtepi8_epi16 (vintensity);
+    }
+    vindex=_mm256_cvtepi16_epi32 (vintensity);
+    vindex = _mm256_and_si256 (vindex,vmask);  // only 8 bit
+
+    __m256i vadd = _mm256_set_epi32(7,6,5,4,3,2,1,0);
+    __m256i vpi = _mm256_i32gather_epi32 ((int *)&pLUT[c][0], vindex, 1);  // load 8 32 bit values
+    vpi = _mm256_and_si256 (vpi,vmask);  // only 8 bit
+    vpi = _mm256_slli_epi32 (_mm256_srli_epi32 (vpi, 4), 12);  // (pLUT >> 4) * 64 * 64: drop the fractional nibble exactly like the scalar path
+    vpi = _mm256_add_epi32 (vpi, vadd);
+    __m256i avP = _mm256_i32gather_epi32 ((int *)&pattern[1][0][oy][ox], vpi, 1);  // load 8 32 bit values
+    avP = _mm256_and_si256 (avP,vmask);  // only 8 bit
+    // convert to packed 8 bit
+    __m256i vtmp = _mm256_packus_epi32 (avP, avP);
+    avP = _mm256_permute4x64_epi64 (vtmp, 0x8);
+    avP = _mm256_packus_epi16 (avP, avP);
+    vP = _mm256_castsi256_si128(avP);
+    if( s == -1 )
+    {
+      vP = _mm_sub_epi8( _mm_set1_epi8( 0 ), vP );
+    }
+    __m128i vPlo = _mm_cvtepi8_epi16( vP );
+    if( oc1 )
+    {
+      __m128i voc1 = _mm_set1_epi16( oc1 );
+      __m128i voc2 = _mm_set1_epi16( oc2 );
+      // p*oc1
+      vPlo = _mm_mullo_epi16( vPlo, voc1 );   // max 16 Bit
+      // pattern * s_up
+      __m256i avP2 = _mm256_i32gather_epi32 ((int *)&pattern[1][0][oy_up][ox_up], vpi, 1);  // load 8 32 bit values
+      avP2 = _mm256_and_si256 (avP2,vmask);  // only 8 bit
+      // convert to packed 8 bit
+      vtmp = _mm256_packus_epi32 (avP2, avP2);
+      avP2 = _mm256_permute4x64_epi64 (vtmp, 0x8);
+      avP2 = _mm256_packus_epi16 (avP2, avP2);
+      __m128i vP2= _mm256_castsi256_si128(avP2);
+      if( s_up == -1 )
+      {
+        vP2 = _mm_sub_epi8( _mm_set1_epi8( 0 ), vP2 );
+      }
+      __m128i vP2lo = _mm_cvtepi8_epi16( vP2 );
+      vP2lo = _mm_mullo_epi16( vP2lo, voc2 );
+      vPlo = _mm_add_epi16( vPlo, vP2lo );
+      // round to 16 bit
+      __m128i vadd   = _mm_set1_epi16( 1 << ( 5 - 1 ) );
+      __m128i vshift = _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, 5 );
+      vPlo           = _mm_add_epi16( vPlo, vadd );
+      vPlo           = _mm_sra_epi16( vPlo, vshift );
+    }
+    _mm_storeu_si128( (__m128i*) &grain[c][8], vPlo );
+    __m256i vscale = _mm256_i32gather_epi32 ((int *)&sLUT[c][0], vindex, 1);  // load 8 32 bit values
+    vscale = _mm256_and_si256 (vscale,vmask);
+    vmask = _mm256_packus_epi32 (vscale, vscale);
+    vscale = _mm256_permute4x64_epi64 (vmask, 0x8);
+    vscale = _mm256_packus_epi16 (vscale, vscale);
+    _mm_storeu_si64(( __m128i * )&scale[c][8],_mm256_castsi256_si128(vscale));
+  }
+#endif
+  else
+  {
+    for( int i = 0; i < 16 / subx; i++ )
+    {
+      uint8_t intensity = bs ? I16[x / subx + i] >> bs : I8[x / subx + i];
+      uint8_t pi        = pLUT[c][intensity] >> 4;                  // pattern index (integer part)
+      int     P         = pattern[c ? 1 : 0][pi][oy][ox + i] * s;   // Pattern sample (from current pattern index)
+                                                                    // We could consider just XORing the sign bit
+#if PATTERN_INTERPOLATION
+      uint8_t pf = pLUT[c][intensity] & 15;           // pattern index fractional part (interpolate with next) -- could restrict to less bits (e.g. 2)
+      int     Pn =
+        pattern[c ? 1 : 0][pi + 1][oy][ox + i] * s;   // Next-pattern sample (from pattern index+1)
+                                                      // But there are equivalent hw tricks, e.g. storing values as sign + amplitude instead of two's complement
+#endif
+
+      if( oc1 )   // overlap
+      {
+        P = round( P * oc1 + pattern[c ? 1 : 0][pi][oy_up][ox_up + i] * oc2 * s_up, 5 );
+#if PATTERN_INTERPOLATION
+        Pn = round( Pn * oc1 + pattern[c ? 1 : 0][pi + 1][oy_up][ox_up + i] * oc2 * s_up, 5 );
+#endif
+      }
+#if PATTERN_INTERPOLATION
+      // Pattern interpolation: P is current, Pn is next, pf is interpolation coefficient
+      grain[c][16 / subx + i] = round( P * ( 16 - pf ) + Pn * pf, 4 );
+#else
+      grain[c][16 / subx + i] = P;
+#endif
+      // Scale sign already integrated above because of overlap
+      scale[c][16 / subx + i] = sLUT[c][intensity];
+    }
+  }
+}
+
+template<>   // SIMD variant: (deblock,) scale the buffered grain of the previous block, add it to the sample line I and clamp, then shift the grain/scale pipeline
+void FilmGrainImplX86<CURR_X86_VEXT>::scale_and_output( void* I, int c, int x, int subx, int width, int16_t grain[3][32], uint8_t scale[3][32] ) const
+{
+  uint8_t*  I8  = (uint8_t*) I;    // 8-bit view of the sample line (used when bs == 0)
+  uint16_t* I16 = (uint16_t*) I;   // 16-bit view of the sample line (used when bs != 0)
+
+  const uint8_t I_min = c ? C_min : Y_min;   // clamping range of component c, scaled by << bs on the 16-bit path
+  const uint8_t I_max = c ? C_max : Y_max;
+
+  int flush = 0;   // becomes 1 at the end of the line to run one extra pass that outputs the last block
+  do
+  {
+    if( x > 0 )
+    {
+      if( !flush )
+      {
+        // Horizontal deblock (across previous block)
+        __m128i vgrain;
+        __m128i vfac = _mm_set_epi16( 0, 0, 0, 1, 1, 3, 1, 1 );
+        if( c == 0 )
+        {
+          vgrain = _mm_loadl_epi64( (__m128i*) &grain[0][16 - 2] );   // r1 r0 l0 l1
+        }
+        else
+        {
+          vgrain = _mm_loadl_epi64( (__m128i*) &grain[c][8 - 2] );   // r1 r0 l0 l1
+        }
+        __m128i vgrainh = _mm_mullo_epi16( vgrain, vfac );           // r1 3*r0  l0 l1
+        vgrainh         = _mm_srli_si128( vgrainh, 2 );              //     r1 3*r0 l0
+        vfac            = _mm_srli_si128( vfac, 2 );
+        __m128i vgrainl = _mm_mullo_epi16( vgrain, vfac );           // r1 r0 3*l0 l1
+        vgrainl         = _mm_slli_si128( vgrainl, 10 );
+        vgrainl         = _mm_srli_si128( vgrainl, 10 );             //    r0 3*l0 l1
+        vgrainl         = _mm_hadd_epi16( vgrainl, vgrainl );        // r0 3*l0+l1
+        vgrainl         = _mm_hadd_epi16( vgrainl, vgrainl );        // r0+3*l0+l1
+        vgrainh         = _mm_hadd_epi16( vgrainh, vgrainh );
+        vgrainh         = _mm_hadd_epi16( vgrainh, vgrainh );        // r1+3*r0+l0
+        vgrainh         = _mm_srli_si128( vgrainh, 2 );
+        vgrain          = _mm_or_si128( vgrainl, vgrainh );          // word 0: filtered l0 sum, word 1: filtered r0 sum
+        vgrain          = _mm_add_epi16( vgrain, _mm_set_epi16( 0, 0, 0, 0, 0, 0, 2, 2 ) );   // rounding offset for the >> 2
+        vgrain          = _mm_srai_epi16( vgrain, 2 );
+        if( c == 0 )
+        {
+          _mm_storeu_si32( (__m128i*) &grain[0][16 - 1], vgrain );   // write back l0 and r0
+        }
+        else
+        {
+          _mm_storeu_si32( (__m128i*) &grain[c][8 - 1], vgrain );
+        }
+      }
+      if( bs )
+      {
+#  ifdef USE_AVX2
+        __m128i vshift = _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, scale_shift );   // shift count lives in the low 64 bits
+        if( c == 0 )
+        {
+          __m256i vadd    = _mm256_set1_epi32( 1 << ( scale_shift - 1 ) );
+          __m256i vgrain  = _mm256_lddqu_si256( (__m256i*) &grain[0][0] );   // load 16 * 16 bit
+          __m256i vscale  = _mm256_cvtepu8_epi16( _mm_lddqu_si128( (__m128i*) &scale[0][0] ) );   // zero-extend: scale is uint8_t
+          __m256i tmplo   = _mm256_mullo_epi16( vscale, vgrain );
+          __m256i tmphi   = _mm256_mulhi_epi16( vscale, vgrain );
+          __m256i tmpgvlo = _mm256_unpacklo_epi16( tmplo, tmphi );   // 32 bit
+          __m256i tmpgvhi = _mm256_unpackhi_epi16( tmplo, tmphi );
+          // deinterleave
+          __m256i gvlo = _mm256_permute2x128_si256( tmpgvlo, tmpgvhi, 0x20 );
+          __m256i gvhi = _mm256_permute2x128_si256( tmpgvlo, tmpgvhi, 0x31 );
+          // round
+          gvlo           = _mm256_add_epi32( gvlo, vadd );
+          gvhi           = _mm256_add_epi32( gvhi, vadd );
+          gvlo           = _mm256_sra_epi32( gvlo, vshift );
+          gvhi           = _mm256_sra_epi32( gvhi, vshift );
+          __m256i vI16lo = _mm256_cvtepi16_epi32( _mm_lddqu_si128( (__m128i*) &I16[( x - 16 )] ) );
+          __m256i vI16hi = _mm256_cvtepi16_epi32( _mm_lddqu_si128( (__m128i*) &I16[( x - 16 ) + 8] ) );
+          vI16lo         = _mm256_add_epi32( gvlo, vI16lo );
+          vI16hi         = _mm256_add_epi32( gvhi, vI16hi );
+          vI16lo         = _mm256_max_epi32( _mm256_set1_epi32( I_min << bs ), vI16lo );   // lower bound scaled like the scalar path
+          vI16hi         = _mm256_max_epi32( _mm256_set1_epi32( I_min << bs ), vI16hi );
+          vI16lo         = _mm256_min_epi32( _mm256_set1_epi32( I_max << bs ), vI16lo );
+          vI16hi         = _mm256_min_epi32( _mm256_set1_epi32( I_max << bs ), vI16hi );
+          vI16lo         = _mm256_packs_epi32( vI16lo, vI16hi );
+          vI16lo         = _mm256_permute4x64_epi64( vI16lo, 0xd8 );   // undo the per-lane interleave of packs
+          _mm256_storeu_si256( (__m256i*) &I16[( x - 16 )], vI16lo );
+        }
+        else   // chroma: 8 samples per block -- NOTE(review): looks like this assumes subx == 2, confirm for non-subsampled chroma
+        {
+          __m128i vadd   = _mm_set1_epi32( 1 << ( scale_shift - 1 ) );
+          __m128i vscale = _mm_lddqu_si128( (__m128i*) &scale[c] );
+          __m128i vgrain = _mm_lddqu_si128( (__m128i*) &grain[c] );
+          vscale         = _mm_cvtepu8_epi16( vscale );   // 16 bit, zero-extended (scale is unsigned)
+          __m128i tmplo  = _mm_mullo_epi16( vscale, vgrain );
+          __m128i tmphi  = _mm_mulhi_epi16( vscale, vgrain );
+          __m128i gvlo   = _mm_unpacklo_epi16( tmplo, tmphi );   // 32 bit
+          __m128i gvhi   = _mm_unpackhi_epi16( tmplo, tmphi );
+          gvlo           = _mm_add_epi32( gvlo, vadd );
+          gvhi           = _mm_add_epi32( gvhi, vadd );
+          gvlo           = _mm_sra_epi32( gvlo, vshift );
+          gvhi           = _mm_sra_epi32( gvhi, vshift );
+          __m128i vI16lo = _mm_lddqu_si128( (__m128i*) &I16[( x - 16 ) / subx] );
+          __m128i vI16hi = _mm_lddqu_si128( (__m128i*) &I16[( x - 16 ) / subx + 4] );
+          vI16lo         = _mm_cvtepi16_epi32( vI16lo );   // 32 bit
+          vI16hi         = _mm_cvtepi16_epi32( vI16hi );
+          vI16lo         = _mm_add_epi32( gvlo, vI16lo );
+          vI16hi         = _mm_add_epi32( gvhi, vI16hi );
+          vI16lo         = _mm_max_epi32( _mm_set1_epi32( I_min << bs ), vI16lo );   // lower bound scaled like the scalar path
+          vI16hi         = _mm_max_epi32( _mm_set1_epi32( I_min << bs ), vI16hi );
+          vI16lo         = _mm_min_epi32( _mm_set1_epi32( I_max << bs ), vI16lo );
+          vI16hi         = _mm_min_epi32( _mm_set1_epi32( I_max << bs ), vI16hi );
+          vI16lo         = _mm_packs_epi32( vI16lo, vI16hi );
+          _mm_storeu_si128( (__m128i*) &I16[( x - 16 ) / subx], vI16lo );
+        }
+#  else    // !USE_AVX2
+        __m128i vadd   = _mm_set1_epi32( 1 << ( scale_shift - 1 ) );
+        __m128i vshift = _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, scale_shift );   // shift count lives in the low 64 bits
+        __m128i vscale = _mm_lddqu_si128( (__m128i*) &scale[c] );
+        __m128i vgrain = _mm_lddqu_si128( (__m128i*) &grain[c] );
+        vscale         = _mm_cvtepu8_epi16( vscale );   // 16 bit, zero-extended (scale is unsigned)
+        __m128i tmplo  = _mm_mullo_epi16( vscale, vgrain );
+        __m128i tmphi  = _mm_mulhi_epi16( vscale, vgrain );
+        __m128i gvlo   = _mm_unpacklo_epi16( tmplo, tmphi );   // 32 bit
+        __m128i gvhi   = _mm_unpackhi_epi16( tmplo, tmphi );
+        gvlo           = _mm_add_epi32( gvlo, vadd );
+        gvhi           = _mm_add_epi32( gvhi, vadd );
+        gvlo           = _mm_sra_epi32( gvlo, vshift );
+        gvhi           = _mm_sra_epi32( gvhi, vshift );
+        __m128i vI16lo = _mm_lddqu_si128( (__m128i*) &I16[( x - 16 ) / subx] );
+        __m128i vI16hi = _mm_lddqu_si128( (__m128i*) &I16[( x - 16 ) / subx + 4] );
+
+        vI16lo = _mm_cvtepi16_epi32( vI16lo );   // 32 bit
+        vI16hi = _mm_cvtepi16_epi32( vI16hi );
+        vI16lo = _mm_add_epi32( gvlo, vI16lo );
+        vI16hi = _mm_add_epi32( gvhi, vI16hi );
+        vI16lo = _mm_max_epi32( _mm_set1_epi32( I_min << bs ), vI16lo );   // lower bound scaled like the scalar path
+        vI16hi = _mm_max_epi32( _mm_set1_epi32( I_min << bs ), vI16hi );
+        vI16lo = _mm_min_epi32( _mm_set1_epi32( I_max << bs ), vI16lo );
+        vI16hi = _mm_min_epi32( _mm_set1_epi32( I_max << bs ), vI16hi );
+        vI16lo = _mm_packs_epi32( vI16lo, vI16hi );
+        _mm_storeu_si128( (__m128i*) &I16[( x - 16 ) / subx], vI16lo );
+        if( c == 0 )   // luma blocks carry 16 samples: process the second half
+        {
+          __m128i vscale = _mm_lddqu_si128( (__m128i*) &scale[c][8] );
+          __m128i vgrain = _mm_lddqu_si128( (__m128i*) &grain[c][8] );
+          vscale         = _mm_cvtepu8_epi16( vscale );   // 16 bit, zero-extended (scale is unsigned)
+          __m128i tmplo  = _mm_mullo_epi16( vscale, vgrain );
+          __m128i tmphi  = _mm_mulhi_epi16( vscale, vgrain );
+          __m128i gvlo   = _mm_unpacklo_epi16( tmplo, tmphi );   // 32 bit
+          __m128i gvhi   = _mm_unpackhi_epi16( tmplo, tmphi );
+          // round
+          gvlo           = _mm_add_epi32( gvlo, vadd );
+          gvhi           = _mm_add_epi32( gvhi, vadd );
+          gvlo           = _mm_sra_epi32( gvlo, vshift );
+          gvhi           = _mm_sra_epi32( gvhi, vshift );
+          __m128i vI16lo = _mm_lddqu_si128( (__m128i*) &I16[( x - 16 ) / subx + 8] );
+          __m128i vI16hi = _mm_lddqu_si128( (__m128i*) &I16[( x - 16 ) / subx + 12] );
+          vI16lo         = _mm_cvtepi16_epi32( vI16lo );   // 32 bit
+          vI16hi         = _mm_cvtepi16_epi32( vI16hi );
+          vI16lo         = _mm_add_epi32( gvlo, vI16lo );
+          vI16hi         = _mm_add_epi32( gvhi, vI16hi );
+          vI16lo         = _mm_max_epi32( _mm_set1_epi32( I_min << bs ), vI16lo );   // lower bound scaled like the scalar path
+          vI16hi         = _mm_max_epi32( _mm_set1_epi32( I_min << bs ), vI16hi );
+          vI16lo         = _mm_min_epi32( _mm_set1_epi32( I_max << bs ), vI16lo );
+          vI16hi         = _mm_min_epi32( _mm_set1_epi32( I_max << bs ), vI16hi );
+          vI16lo         = _mm_packs_epi32( vI16lo, vI16hi );
+          _mm_storeu_si128( (__m128i*) &I16[( x - 16 ) / subx + 8], vI16lo );
+        }
+#endif   // !USE_AVX2
+      }    // bs
+      else
+      {
+        for( int i = 0; i < 16 / subx; i++ )
+        {
+          // Output previous block (or flush current)
+          int32_t g = round( scale[c][i] * (int16_t) grain[c][i], scale_shift );
+          if( bs )   // always false here: this is the !bs branch
+          {
+            I16[( x - 16 ) / subx + i] = std::max<int32_t>( I_min << bs, std::min<int32_t>( I_max << bs, I16[( x - 16 ) / subx + i] + g ) );
+          }
+          else
+          {
+            I8[( x - 16 ) / subx + i] = std::max<int32_t>( I_min, std::min<int32_t>( I_max, I8[( x - 16 ) / subx + i] + g ) );
+          }
+        }
+      }
+    }
+    // Shift pipeline
+    if( !flush )
+    {
+      if( c == 0 )
+      {
+#ifdef USE_AVX2
+        __m256i vgrain = _mm256_lddqu_si256( (__m256i*) &grain[0][16] );   // current block [16..31] becomes previous block [0..15]
+        _mm256_storeu_si256( (__m256i*) &grain[0][0], vgrain );
+#else
+        __m128i vgrain0 = _mm_lddqu_si128( (__m128i*) &grain[0][16] );
+        __m128i vgrain1 = _mm_lddqu_si128( (__m128i*) &grain[0][24] );
+        _mm_storeu_si128( (__m128i*) &grain[0][0], vgrain0 );
+        _mm_storeu_si128( (__m128i*) &grain[0][8], vgrain1 );
+#endif
+        __m128i vscale = _mm_lddqu_si128( (__m128i*) &scale[0][16] );
+        _mm_storeu_si128( (__m128i*) &scale[0][0], vscale );
+      }
+      else
+      {
+        __m128i vgrain = _mm_lddqu_si128( (__m128i*) &grain[c][8] );   // chroma: current block [8..15] becomes previous block [0..7]
+        __m128i vscale = _mm_loadl_epi64( (__m128i*) &scale[c][8] );
+        _mm_storeu_si128( (__m128i*) &grain[c][0], vgrain );
+        _mm_storel_epi64( (__m128i*) &scale[c][0], vscale );
+      }
+    }
+    if( x + 16 >= width )   // last block of the line: advance x and loop once more to output it
+    {
+      flush++;
+      x += 16;
+    }
+  } while( flush == 1 );
+}
+
+}   // namespace vvdec
+
+#endif   // TARGET_SIMD_X86
diff --git a/source/Lib/FilmGrain/FilmGrainImpl_avx2.cpp b/source/Lib/FilmGrain/FilmGrainImpl_avx2.cpp
new file mode 100644
index 00000000..38442b7f
--- /dev/null
+++ b/source/Lib/FilmGrain/FilmGrainImpl_avx2.cpp
@@ -0,0 +1,42 @@
+/* -----------------------------------------------------------------------------
+The copyright in this software is being made available under the Clear BSD
+License, included below. No patent rights, trademark rights and/or
+other Intellectual Property Rights other than the copyrights concerning
+the Software are granted under this license.
+
+The Clear BSD License
+
+Copyright (c) 2018-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVdeC Authors.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted (subject to the limitations in the disclaimer below) provided that
+the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from this
+software without specific prior written permission.
+
+NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+------------------------------------------------------------------------------------------- */
+
+#define CURR_X86_VEXT AVX2   // template argument used by the FilmGrainImplX86<CURR_X86_VEXT> specializations in the header below
+#include "FilmGrainImpl_X86_SIMD.h"   // shared SIMD implementation, compiled here for the AVX2 variant
diff --git a/source/Lib/FilmGrain/FilmGrainImpl_sse41.cpp b/source/Lib/FilmGrain/FilmGrainImpl_sse41.cpp
new file mode 100644
index 00000000..855227d7
--- /dev/null
+++ b/source/Lib/FilmGrain/FilmGrainImpl_sse41.cpp
@@ -0,0 +1,42 @@
+/* -----------------------------------------------------------------------------
+The copyright in this software is being made available under the Clear BSD
+License, included below. No patent rights, trademark rights and/or
+other Intellectual Property Rights other than the copyrights concerning
+the Software are granted under this license.
+
+The Clear BSD License
+
+Copyright (c) 2018-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVdeC Authors.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted (subject to the limitations in the disclaimer below) provided that
+the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from this
+software without specific prior written permission.
+
+NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+------------------------------------------------------------------------------------------- */
+
+#define CURR_X86_VEXT SSE41   // template argument used by the FilmGrainImplX86<CURR_X86_VEXT> specializations in the header below
+#include "FilmGrainImpl_X86_SIMD.h"   // shared SIMD implementation, compiled here for the SSE4.1 variant
diff --git a/source/Lib/vvdec/CMakeLists.txt b/source/Lib/vvdec/CMakeLists.txt
index d33af357..aa3e4c82 100644
--- a/source/Lib/vvdec/CMakeLists.txt
+++ b/source/Lib/vvdec/CMakeLists.txt
@@ -29,9 +29,7 @@ if( VVDEC_ENABLE_X86_SIMD )
   file( GLOB X86_SSE41_SRC_FILES "../CommonLib/x86/sse41/*.cpp" )
   #file( GLOB X86_SSE42_SRC_FILES "../CommonLib/x86/sse42/*.cpp" )
   #file( GLOB X86_AVX_SRC_FILES   "../CommonLib/x86/avx/*.cpp"   )
-  if( VVDEC_TARGET_ARCH STREQUAL "X86" )
-    file( GLOB X86_AVX2_SRC_FILES "../CommonLib/x86/avx2/*.cpp" )
-  endif()
+  file( GLOB X86_AVX2_SRC_FILES "../CommonLib/x86/avx2/*.cpp" )
 endif()
 
 if( VVDEC_ENABLE_ARM_SIMD )
@@ -46,10 +44,21 @@ file( GLOB MD5_SRC_FILES "../libmd5/*.cpp" )
 file( GLOB MD5_INC_FILES "../libmd5/*.h" )
 
 if( VVDEC_ENABLE_FILM_GRAIN )
-  file( GLOB FGS_SRC_FILES "../FilmGrain/*.cpp" )
-  file( GLOB FGS_INC_FILES "../FilmGrain/*.h"   )
+  file( GLOB FGS_SRC_FILES "../FilmGrain/FilmGrain.cpp" "../FilmGrain/FilmGrainImpl.cpp" )   # generic sources only; the *_sse41/*_avx2 files are collected separately below
+  file( GLOB FGS_INC_FILES "../FilmGrain/FilmGrain.h"   "../FilmGrain/FilmGrainImpl.h"   )
+
+  file( GLOB FGS_X86_SSE41_SRC_FILES "../FilmGrain/*_sse41.cpp" )   # per-ISA film grain translation units
+  file( GLOB FGS_X86_AVX2_SRC_FILES  "../FilmGrain/*_avx2.cpp" )
+
+  list( APPEND X86_SSE41_SRC_FILES ${FGS_X86_SSE41_SRC_FILES} )   # add them to the x86 SIMD source lists
+  list( APPEND X86_AVX2_SRC_FILES  ${FGS_X86_AVX2_SRC_FILES}  )   # X86_AVX2_SRC_FILES receives USE_AVX2 and the AVX2 compile flags further down
 
-  set_property( SOURCE vvdec.cpp vvdecimpl.cpp APPEND PROPERTY COMPILE_DEFINITIONS ENABLE_FILM_GRAIN )
+  set_property( SOURCE
+                vvdec.cpp vvdecimpl.cpp
+                ${FGS_SRC_FILES}
+                ${FGS_X86_SSE41_SRC_FILES}
+                ${FGS_X86_AVX2_SRC_FILES}
+                APPEND PROPERTY COMPILE_DEFINITIONS ENABLE_FILM_GRAIN )
 endif()
 
 # get public/extern include files
@@ -94,12 +103,10 @@ if( VVDEC_ENABLE_X86_SIMD )
   #set_property( SOURCE ${X86_SSE42_SRC_FILES} APPEND PROPERTY COMPILE_DEFINITIONS USE_SSE42 )
   #set_property( SOURCE ${X86_AVX_SRC_FILES}   APPEND PROPERTY COMPILE_DEFINITIONS USE_AVX )
   set_property( SOURCE ${X86_AVX2_SRC_FILES}  APPEND PROPERTY COMPILE_DEFINITIONS USE_AVX2 )
-  set_property( SOURCE ${FGS_SRC_FILES}       APPEND PROPERTY COMPILE_DEFINITIONS USE_AVX2 )
   # set needed compile flags
   if( MSVC )
     #set_property( SOURCE ${X86_AVX_SRC_FILES}   APPEND PROPERTY COMPILE_FLAGS "/arch:AVX" )
     set_property( SOURCE ${X86_AVX2_SRC_FILES}  APPEND PROPERTY COMPILE_FLAGS "/arch:AVX2" )
-    set_property( SOURCE ${FGS_SRC_FILES}       APPEND PROPERTY COMPILE_FLAGS "/arch:AVX2" )
   elseif( UNIX OR MINGW )
     include( vvdecCompilerSupport )
 
@@ -116,11 +123,14 @@ if( VVDEC_ENABLE_X86_SIMD )
     #set_property( SOURCE ${X86_SSE42_SRC_FILES} APPEND PROPERTY COMPILE_FLAGS "${FLAG_msse42}" )
     #set_property( SOURCE ${X86_AVX_SRC_FILES}   APPEND PROPERTY COMPILE_FLAGS "${FLAG_mavx}"   )
     set_property( SOURCE ${X86_AVX2_SRC_FILES}  APPEND PROPERTY COMPILE_FLAGS "${FLAG_mavx2}"  )
-    set_property( SOURCE ${FGS_SRC_FILES}       APPEND PROPERTY COMPILE_FLAGS "${FLAG_mavx2}"  )
+  endif()
+
+  if( NOT VVDEC_TARGET_ARCH STREQUAL "X86" )  # only build AVX2 files for X86
+    set( X86_AVX2_SRC_FILES "" )
   endif()
 
   #add_library( ${LIB_NAME}_x86_simd OBJECT ${X86_SSE41_SRC_FILES} ${X86_SSE42_SRC_FILES} ${X86_AVX_SRC_FILES} ${X86_AVX2_SRC_FILES} )
-  add_library( ${LIB_NAME}_x86_simd OBJECT ${X86_SSE41_SRC_FILES} ${X86_AVX2_SRC_FILES} ${X86_AVX2_C_FILES} )
+  add_library( ${LIB_NAME}_x86_simd OBJECT ${X86_SSE41_SRC_FILES} ${X86_AVX2_SRC_FILES} )
   target_link_libraries( ${LIB_NAME}_x86_simd ${INTEL_ITT_LINK_TARGET} )
 
   # disble LTO for the files compiled with special architecture flags
diff --git a/source/Lib/vvdec/vvdec.cpp b/source/Lib/vvdec/vvdec.cpp
index 288c0b8f..7665cf7f 100644
--- a/source/Lib/vvdec/vvdec.cpp
+++ b/source/Lib/vvdec/vvdec.cpp
@@ -58,16 +58,25 @@ VVDEC_DECL void vvdec_params_default(vvdecParams *params)
     return;
   }
 
-  params->threads           = -1;                       // thread count                          ( default: -1 )
-  params->parseDelay        = -1;                       // number of frames to parse in parallel ( default: -1 )
-  params->upscaleOutput     = VVDEC_UPSCALING_OFF;      // do internal upscaling of rpr pictures to dest. resolution ( default: off )
-  params->logLevel          = VVDEC_WARNING;            // verbosity level
-  params->verifyPictureHash = false;                    // verify picture, if digest is available, true: check hash in SEI messages if available, false: ignore SEI message
-  params->removePadding     = false;                    // copy output pictures to new buffer to remove padding (stride==width)
-  params->opaque            = nullptr;                  // opaque pointer for private user data ( can be used to carry caller specific data or contexts )
-  params->simd              = VVDEC_SIMD_DEFAULT;       // set specific simd optimization (default: max. availalbe)
-  params->errHandlingFlags  = VVDEC_ERR_HANDLING_OFF;   // no special error handling
-  params->parseThreads      = -1;                       // DEPRECATED. Use `parseDelay` instead. Will be removed in the future. Until then, this value is copied to parseDelay if set.
+  // ensure the padding parameters are cleared also, so we don't read undefined values,
+  // when new parameters are introduced and the library is used with an old executable
+  memset( params, 0, sizeof( vvdecParams ) );
+
+  params->threads            = -1;                      // thread count                          ( default: -1 )
+  params->parseDelay         = -1;                      // number of frames to parse in parallel ( default: -1 )
+  params->upscaleOutput      = VVDEC_UPSCALING_OFF;     // do internal upscaling of rpr pictures to dest. resolution ( default: off )
+  params->logLevel           = VVDEC_WARNING;           // verbosity level
+  params->verifyPictureHash  = false;                   // verify picture, if digest is available, true: check hash in SEI messages if available, false: ignore SEI message
+  params->removePadding      = false;                   // copy output pictures to new buffer to remove padding (stride==width)
+  params->opaque             = nullptr;                 // opaque pointer for private user data ( can be used to carry caller specific data or contexts )
+  params->simd               = VVDEC_SIMD_DEFAULT;      // set specific simd optimization (default: max. available)
+  params->errHandlingFlags   = VVDEC_ERR_HANDLING_OFF;  // no special error handling
+#if ENABLE_FILM_GRAIN
+  params->filmGrainSynthesis = true;                    // enable film grain synthesis using Film Grain Characteristics SEI ( default: true )
+#else
+  params->filmGrainSynthesis = false;                   // built without film grain support
+#endif
+  params->parseThreads       = -1;                      // DEPRECATED. Use `parseDelay` instead. Will be removed in the future. Until then, this value is copied to parseDelay if set.
 }
 
 VVDEC_DECL vvdecParams* vvdec_params_alloc()
@@ -191,6 +200,14 @@ static int paramCheck( vvdecParams *params )
     }
   }
 
+#if !ENABLE_FILM_GRAIN
+  if( params->filmGrainSynthesis )
+  {
+    vvdec::msg( vvdec::ERROR, "VVdeC was built without ENABLE_FILM_GRAIN. filmGrainSynthesis parameter must be 0.\n" );
+    ret = -1;
+  }
+#endif   // !ENABLE_FILM_GRAIN
+
   return ret;
 }
 
diff --git a/source/Lib/vvdec/vvdecimpl.cpp b/source/Lib/vvdec/vvdecimpl.cpp
index 89423a5a..9a29fb1d 100644
--- a/source/Lib/vvdec/vvdecimpl.cpp
+++ b/source/Lib/vvdec/vvdecimpl.cpp
@@ -131,6 +131,9 @@ int VVDecImpl::init( const vvdecParams& params, vvdecCreateBufferCallback create
 
     m_sDecoderCapabilities = m_cDecLib->getDecoderCapabilities();
 
+#if ENABLE_FILM_GRAIN
+    m_enableFilmGrain   = params.filmGrainSynthesis;
+#endif   // ENABLE_FILM_GRAIN
     m_bRemovePadding    = params.removePadding;
     m_eErrHandlingFlags = static_cast<ErrHandlingFlags>(params.errHandlingFlags);
     m_uiSeqNumber       = 0;
@@ -774,7 +777,13 @@ bool VVDecImpl::isNalUnitSlice( vvdecNalType t )
       || t == VVC_NAL_UNIT_CODED_SLICE_GDR;
 }
 
-int VVDecImpl::copyComp( const unsigned char* pucSrc, unsigned char* pucDest, unsigned int uiWidth, unsigned int uiHeight, ptrdiff_t iStrideSrc, ptrdiff_t iStrideDest, int iBytesPerSample  )
+int VVDecImpl::copyComp( const unsigned char* pucSrc,
+                         unsigned char*       pucDest,
+                         unsigned int         uiWidth,
+                         unsigned int         uiHeight,
+                         ptrdiff_t            iStrideSrc,
+                         ptrdiff_t            iStrideDest,
+                         int                  iBytesPerSample )
 {
   if( NULL != pucSrc && NULL != pucDest )
   {
@@ -842,7 +851,7 @@ void VVDecImpl::xUpdateFGC( vvdecSEI* s )
 
   if( !m_filmGrainSynth )
   {
-    m_filmGrainSynth = std::make_unique<FilmGrain>( 10, 2 );   // TODO: (GH) set correct bit depth and color format, and apply changes
+    m_filmGrainSynth = std::make_unique<FilmGrain>();
   }
 
   m_filmGrainSynth->updateFGC( sei );
@@ -856,22 +865,50 @@ void VVDecImpl::xAddGrain( vvdecFrame* frame )
     return;
   }
 
-  uint8_t* Y = (uint8_t*) frame->planes[0].ptr;
-  uint8_t* U = (uint8_t*) frame->planes[1].ptr;
-  uint8_t* V = (uint8_t*) frame->planes[2].ptr;
+  m_filmGrainSynth->setDepth( frame->bitDepth );
+  m_filmGrainSynth->setColorFormat( frame->colorFormat );
+  m_filmGrainSynth->prepareBlockSeeds( frame->planes[0].width, frame->planes[0].height );
 
-  CHECK( frame->bitDepth != 10, "Bitdepth is not 10" );
+  struct GrainTaskData
+  {
+    vvdecFrame* frame;
+    uint32_t    startLine;
+    FilmGrain*  filmGrainSynth;
+  };
+  constexpr static int       LINES_PER_TASK = 16;
+  const int                  numTasks       = ( frame->planes[0].height + ( LINES_PER_TASK - 1 ) ) / LINES_PER_TASK;
+  std::vector<GrainTaskData> grainTaskData( numTasks );
 
-  for( int y = 0; y < frame->planes[0].height; y++ )
+  WaitCounter grainTaskCounter;
+  for( int i = 0; i < numTasks; ++i )
   {
-    m_filmGrainSynth->add_grain_line( Y, U, V, y, frame->planes[0].width );
-    Y += frame->planes[0].stride;
-    if( ( y & 1 ) || ( frame->planes[0].height == frame->planes[1].height ) )
+    grainTaskData[i].frame          = frame;
+    grainTaskData[i].startLine      = i * LINES_PER_TASK;
+    grainTaskData[i].filmGrainSynth = m_filmGrainSynth.get();
+
+    static auto grainTask = []( int, GrainTaskData* data )   // adds grain to up to LINES_PER_TASK luma lines starting at data->startLine
     {
-      U += frame->planes[1].stride;
-      V += frame->planes[2].stride;
-    }
+      auto* frame = data->frame;
+      for( unsigned y = data->startLine; y < std::min( data->startLine + LINES_PER_TASK, frame->planes[0].height ); ++y )
+      {
+        uint8_t* Y = (uint8_t*) frame->planes[0].ptr + frame->planes[0].stride * y;
+        uint8_t* U = nullptr;   // chroma pointers stay null for 4:0:0 content
+        uint8_t* V = nullptr;
+        if( frame->colorFormat != VVDEC_CF_YUV400_PLANAR )
+        {
+          const int chromaSub = frame->colorFormat == VVDEC_CF_YUV420_PLANAR ? 2 : 1;   // vertical chroma subsampling factor
+
+          U = (uint8_t*) frame->planes[1].ptr + frame->planes[1].stride * ( y / chromaSub );   // divide the row index first, so odd y still yields a row start
+          V = (uint8_t*) frame->planes[2].ptr + frame->planes[2].stride * ( y / chromaSub );
+        }
+
+        data->filmGrainSynth->add_grain_line( Y, U, V, y, frame->planes[0].width );
+      }
+      return true;
+    };
+    m_cDecLib->getThreadPool().addBarrierTask<GrainTaskData>( grainTask, &( grainTaskData[i] ), &grainTaskCounter );
   }
+  grainTaskCounter.wait();
 
   if( m_filmGrainCharacteristicsState != FgcPersist )   // Not persistent
   {
@@ -918,16 +955,23 @@ int VVDecImpl::xAddPicture( Picture* pcPic )
   bCreateStorage = bCreateStorage || m_bRemovePadding;
 
 #if ENABLE_FILM_GRAIN
-  // find FGC SEI
-  for( auto& sei: pcPic->seiMessageList )
+  if( m_enableFilmGrain )
   {
-    if( sei->payloadType == VVDEC_FILM_GRAIN_CHARACTERISTICS )
+    // find FGC SEI
+    for( auto& sei: pcPic->seiMessageList )
     {
-      xUpdateFGC( sei );
-      msg( DETAILS, "vvdecimpl [detail]: SEI FILM_GRAIN_CHARACTERISTICS\n");
+      if( sei->payloadType == VVDEC_FILM_GRAIN_CHARACTERISTICS )
+      {
+        xUpdateFGC( sei );
+        msg( DETAILS, "vvdecimpl [detail]: SEI FILM_GRAIN_CHARACTERISTICS\n");
+      }
+    }
+    const bool fgsReuseBuffer = bitDepths.recon == 10 && !m_bRemovePadding && !pcPic->stillReferenced;
+    if( !fgsReuseBuffer )
+    {
+      bCreateStorage |= m_filmGrainCharacteristicsState != FgcNone;
     }
   }
-  bCreateStorage = bCreateStorage || m_filmGrainCharacteristicsState;
 #endif   // ENABLE_FILM_GRAIN
 
   // create a brand new picture object
@@ -935,22 +979,22 @@ int VVDecImpl::xAddPicture( Picture* pcPic )
   vvdec_frame_default( &cFrame );
 
   cFrame.sequenceNumber = m_uiSeqNumber;
-  cFrame.cts      = pcPic->getCts();
-  cFrame.ctsValid = true;
+  cFrame.cts            = pcPic->getCts();
+  cFrame.ctsValid       = true;
 
   int ret;
 #if RPR_YUV_OUTPUT
   if( m_cDecLib->getUpscaledOutput() && ( uiWidth != orgWidth || uiHeight != orgHeight ) )
   {
     bCreateStorage = true;
-    ret = xCreateFrame ( cFrame, cPicBuf, orgWidth, orgHeight, bitDepths, bCreateStorage );
+    ret = xCreateFrame( cFrame, cPicBuf, orgWidth, orgHeight, bitDepths, bCreateStorage );
   }
   else
   {
-    ret = xCreateFrame ( cFrame, cPicBuf, uiWidth, uiHeight, bitDepths, bCreateStorage );
+    ret = xCreateFrame( cFrame, cPicBuf, uiWidth, uiHeight, bitDepths, bCreateStorage, m_filmGrainCharacteristicsState != FgcNone );
   }
 #else
-  ret = xCreateFrame ( cFrame, cPicBuf, uiWidth, uiHeight, bitDepths, bCreateStorage );
+  ret = xCreateFrame( cFrame, cPicBuf, uiWidth, uiHeight, bitDepths, bCreateStorage, m_filmGrainCharacteristicsState != FgcNone );
 #endif
   if( ret != VVDEC_OK )
   {
@@ -994,9 +1038,12 @@ int VVDecImpl::xAddPicture( Picture* pcPic )
 
         copyComp( (const unsigned char*) ( planeOrigin + planeOffset ),
                   cFrame.planes[comp].ptr,
-                  cFrame.planes[comp].width, cFrame.planes[comp].height,
-                  area.stride<<1, cFrame.planes[comp].stride, uiBytesPerSample );
-        cFrame.planes[comp].allocator = upscaledPic.getBufAllocator( (ComponentID)comp );
+                  area.width,   // need to use source width & height here, for VVDEC_UPSCALING_COPY_ONLY to work
+                  area.height,
+                  area.stride * sizeof( *area.buf ),
+                  cFrame.planes[comp].stride,
+                  uiBytesPerSample );
+        cFrame.planes[comp].allocator = upscaledPic.getBufAllocator( (ComponentID) comp );
       }
       upscaledPic.destroy();
     }
@@ -1017,8 +1064,25 @@ int VVDecImpl::xAddPicture( Picture* pcPic )
 
         copyComp( (const unsigned char*) ( planeOrigin + planeOffset ),
                   cFrame.planes[comp].ptr,
-                  cFrame.planes[comp].width, cFrame.planes[comp].height,
-                  area.stride<<1, cFrame.planes[comp].stride, uiBytesPerSample );
+                  area.width,   // need to use source width & height here, for VVDEC_UPSCALING_COPY_ONLY to work
+                  area.height,
+                  area.stride * sizeof( *area.buf ),
+                  cFrame.planes[comp].stride,
+                  uiBytesPerSample );
+
+        // VVDEC_UPSCALING_COPY_ONLY: zero the part of the output plane to the right of and below the copied source area
+        if( m_cDecLib->getUpscaledOutput() == (int) VVDEC_UPSCALING_COPY_ONLY
+            && ( area.width < cFrame.planes[comp].width || area.height < cFrame.planes[comp].height ) )
+        {
+          unsigned char* linePtr        = cFrame.planes[comp].ptr;
+          const auto     bytesPerSample = cFrame.planes[comp].bytesPerSample;
+          for( unsigned y = 0; y < area.height; ++y )
+          {
+            ::memset( linePtr + area.width * bytesPerSample, 0, ( cFrame.planes[comp].width - area.width ) * bytesPerSample );
+            linePtr += cFrame.planes[comp].stride;
+          }
+          ::memset( linePtr, 0, ( cFrame.planes[comp].height - area.height ) * cFrame.planes[comp].stride );
+        }
       }
     }
   }
@@ -1143,7 +1207,10 @@ int VVDecImpl::xAddPicture( Picture* pcPic )
 
 #if ENABLE_FILM_GRAIN
   // Grain synthesis
-  xAddGrain( &cFrame );
+  if( m_enableFilmGrain && m_filmGrainCharacteristicsState != FgcNone )
+  {
+    xAddGrain( &cFrame );
+  }
 #endif   // ENABLE_FILM_GRAIN
 
   m_rcFrameList.emplace_back( cFrame, bCreateStorage ? nullptr : pcPic );
@@ -1178,9 +1245,13 @@ int VVDecImpl::xAddPicture( Picture* pcPic )
   return 0;
 }
 
-
-
-int VVDecImpl::xCreateFrame( vvdecFrame& rcFrame, const CPelUnitBuf& rcPicBuf, uint32_t uiWidth, uint32_t uiHeight, const BitDepths& rcBitDepths, bool bCreateStorage )
+int VVDecImpl::xCreateFrame( vvdecFrame&        rcFrame,
+                             const CPelUnitBuf& rcPicBuf,
+                             uint32_t           uiWidth,
+                             uint32_t           uiHeight,
+                             const BitDepths&   rcBitDepths,
+                             bool               bCreateStorage,
+                             bool               origStride )
 {
   rcFrame.width       = uiWidth;
   rcFrame.height      = uiHeight;
@@ -1191,11 +1262,11 @@ int VVDecImpl::xCreateFrame( vvdecFrame& rcFrame, const CPelUnitBuf& rcPicBuf, u
   rcFrame.planes[VVDEC_CT_Y].width          = uiWidth;
   rcFrame.planes[VVDEC_CT_Y].height         = uiHeight;
   rcFrame.planes[VVDEC_CT_Y].bytesPerSample = rcBitDepths.recon > 8 ? 2 : 1;
-  rcFrame.planes[VVDEC_CT_Y].stride         = bCreateStorage  ? uiWidth                                    * rcFrame.planes[VVDEC_CT_Y].bytesPerSample
-                                                              : (uint32_t)rcPicBuf.get(COMPONENT_Y).stride * rcFrame.planes[VVDEC_CT_Y].bytesPerSample;
+  rcFrame.planes[VVDEC_CT_Y].stride         = bCreateStorage && !origStride ? uiWidth                                       * rcFrame.planes[VVDEC_CT_Y].bytesPerSample
+                                                                            : (uint32_t) rcPicBuf.get( COMPONENT_Y ).stride * rcFrame.planes[VVDEC_CT_Y].bytesPerSample;
 
   size_t nBufSize = 0;
-  size_t nLSize   = rcFrame.planes[VVDEC_CT_Y].stride * uiHeight;
+  size_t nLSize   = rcFrame.planes[VVDEC_CT_Y].stride * rcFrame.planes[VVDEC_CT_Y].height;
   size_t nCSize   = 0;
 
   unsigned int uiCWidth  = 0;
@@ -1265,7 +1336,7 @@ int VVDecImpl::xCreateFrame( vvdecFrame& rcFrame, const CPelUnitBuf& rcPicBuf, u
     rcFrame.planes[VVDEC_CT_V].height         = uiCHeight;
     rcFrame.planes[VVDEC_CT_V].bytesPerSample = rcBitDepths.recon > 8 ? 2 : 1;
 
-    if( bCreateStorage )
+    if( bCreateStorage && !origStride )
     {
       rcFrame.planes[VVDEC_CT_U].stride       = uiCWidth * rcFrame.planes[CHANNEL_TYPE_CHROMA].bytesPerSample;
       rcFrame.planes[VVDEC_CT_V].stride       = uiCWidth * rcFrame.planes[CHANNEL_TYPE_CHROMA].bytesPerSample;
@@ -1276,8 +1347,8 @@ int VVDecImpl::xCreateFrame( vvdecFrame& rcFrame, const CPelUnitBuf& rcPicBuf, u
       rcFrame.planes[VVDEC_CT_V].stride       = (uint32_t)rcPicBuf.get(COMPONENT_Cr).stride * rcFrame.planes[CHANNEL_TYPE_CHROMA].bytesPerSample;
     }
 
-    nCSize   = rcFrame.planes[VVDEC_CT_U].stride * uiCHeight;
-    nBufSize = nLSize + ( nCSize << 1 );
+    nCSize   = rcFrame.planes[VVDEC_CT_U].stride * rcFrame.planes[VVDEC_CT_U].height;
+    nBufSize = nLSize + nCSize * 2;
   }
 
 
diff --git a/source/Lib/vvdec/vvdecimpl.h b/source/Lib/vvdec/vvdecimpl.h
index 5399ef89..9a6355f0 100644
--- a/source/Lib/vvdec/vvdecimpl.h
+++ b/source/Lib/vvdec/vvdecimpl.h
@@ -174,7 +174,13 @@ class VVDecImpl
 
 private:
   int xAddPicture                  ( Picture* pcPic );
-  int xCreateFrame                 ( vvdecFrame& frame, const CPelUnitBuf& rcPicBuf, uint32_t uiWidth, uint32_t uiHeight, const BitDepths& rcBitDepths, bool bCreateStorage );
+  int xCreateFrame                 ( vvdecFrame&        frame,
+                                     const CPelUnitBuf& rcPicBuf,
+                                     uint32_t           uiWidth,
+                                     uint32_t           uiHeight,
+                                     const BitDepths&   rcBitDepths,
+                                     bool               bCreateStorage,
+                                     bool               origStride = false );
 
   void xUpdateFGC                  ( vvdecSEI *sei );
   void xAddGrain                   ( vvdecFrame *frame );
@@ -223,6 +229,7 @@ class VVDecImpl
     FgcDontPersist = 1,
     FgcPersist     = 2
   }                                        m_filmGrainCharacteristicsState = FgcNone;
+  bool                                     m_enableFilmGrain               = false;
   std::unique_ptr<FilmGrain>               m_filmGrainSynth;
 #endif   // ENABLE_FILM_GRAIN
 };