diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/.clang-format b/epochX/cudacpp/smeft_gg_tttt.sa/.clang-format new file mode 100644 index 0000000000..12afd69b12 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/.clang-format @@ -0,0 +1,226 @@ +# AV's draft .clang-format +# --- +# February 2022: latest draft for clang 13.0.0 (BasedOnStyle: Google) +# See https://releases.llvm.org/13.0.0/tools/clang/docs/ClangFormatStyleOptions.html +--- +Language: Cpp +BasedOnStyle: Google + +AccessModifierOffset: -2 # AV was -1 +AlignAfterOpenBracket: Align # AV ok +AlignArrayOfStructures: None # AV ok (alternative: Right, but code-generating it would be too complex) +AlignConsecutiveAssignments: None # AV ok +AlignConsecutiveBitFields: None # AV ok +AlignConsecutiveDeclarations: None # AV ok +AlignConsecutiveMacros: None # AV ok +AlignEscapedNewlines: DontAlign # AV was Left +AlignOperands: DontAlign # AV was Align +AlignTrailingComments: true # AV ok +AllowAllArgumentsOnNextLine: true # AV ok(?) +AllowAllConstructorInitializersOnNextLine: true # AV ok (NB: relevant only if ConstructorInitializerAllOnOneLineOrOnePerLine=true) +AllowAllParametersOfDeclarationOnNextLine: true # AV ok(?) +AllowShortBlocksOnASingleLine: Always # AV was Never +AllowShortEnumsOnASingleLine: true # AV ok +AllowShortCaseLabelsOnASingleLine: true # AV was false +AllowShortFunctionsOnASingleLine: All # AV ok +AllowShortLambdasOnASingleLine: All # AV ok +AllowShortIfStatementsOnASingleLine: WithoutElse # AV ok +AllowShortLoopsOnASingleLine: true # AV ok +###AlwaysBreakAfterDefinitionReturnType: None # AV keep defaults (deprecated) +#AlwaysBreakAfterReturnType: All # AV use this initially, then switch to TopLevelDefinitions! +AlwaysBreakAfterReturnType: TopLevelDefinitions # AV was None (altearnative: All?) +AlwaysBreakBeforeMultilineStrings: false # AV was true +AlwaysBreakTemplateDeclarations: Yes # AV ok +###AttributeMacros: # AV keep defaults (NB this is not about '__host__' attributes, see llvm/llvm-project/issues/45968) +### - __capability +BinPackArguments: false # AV was true +BinPackParameters: false # AV was true +BitFieldColonSpacing: Both # AV ok +BraceWrapping: # (NB: this is only relevant for "BreakBeforeBraces: Custom") + AfterCaseLabel: true # AV was false + AfterClass: true # AV was false + AfterControlStatement: Always # AV was Never + AfterEnum: true # AV was false + AfterFunction: true # AV was false + AfterNamespace: true # AV was false + AfterObjCDeclaration: true # AV was false + AfterStruct: true # AV was false + AfterUnion: true # AV was false + AfterExternBlock: true # AV was false (NB: does not work unless IndentExternBlock is AfterExternBlock?!) + BeforeCatch: true # AV was false + BeforeElse: true # AV was false + BeforeLambdaBody: true # AV was false + BeforeWhile: true # AV was false + IndentBraces: false # AV ok + SplitEmptyFunction: true # AV ok + SplitEmptyRecord: true # AV ok + SplitEmptyNamespace: true # AV ok +BreakAfterJavaFieldAnnotations: false +BreakBeforeBinaryOperators: None # AV ok +BreakBeforeBraces: Custom # AV was Attach (alternative: Allman) +BreakBeforeConceptDeclarations: true # AV ok +###BreakBeforeInheritanceComma: false # (obsolete???) +BreakBeforeTernaryOperators: true # AV ok +###BreakConstructorInitializersBeforeComma: true # AV was false (obsolete???) +BreakConstructorInitializers: BeforeComma # AV was BeforeColon +BreakInheritanceList: BeforeColon # AV ok (alternative: BeforeComma?) 
+BreakStringLiterals: false # AV was true +ColumnLimit: 0 # AV was 80 +###CommentPragmas: '^[^ ]*' # AV use SpacesInLineCommentPrefix Min=0 Max=1 to allow both "//comment" and "// comment" +CompactNamespaces: false # AV ok +ConstructorInitializerAllOnOneLineOrOnePerLine: true +ConstructorInitializerIndentWidth: 2 # AV was 4 +ContinuationIndentWidth: 2 # AV was 4 +Cpp11BracedListStyle: true # AV ok +DeriveLineEnding: false # AV was true +DerivePointerAlignment: false # AV was true +DisableFormat: false # AV ok +EmptyLineAfterAccessModifier: Leave # AV was Never +EmptyLineBeforeAccessModifier: Leave # AV was LogicalBlock +ExperimentalAutoDetectBinPacking: false # AV ok ("use at your own risk") +FixNamespaceComments: false # AV was true +###ForEachMacros: # AV keep defaults +### - foreach +### - Q_FOREACH +### - BOOST_FOREACH +###IfMacros: # AV keep defaults +### - KJ_IF_MAYBE +IncludeBlocks: Regroup # AV ok +IncludeCategories: + - Regex: '^' + Priority: 4 # AV was 2 + SortPriority: 0 + CaseSensitive: false + - Regex: '^<.*\.h>' + Priority: 5 # AV was 1 + SortPriority: 0 + CaseSensitive: false + - Regex: '^<.*' + Priority: 6 # AV was 2 + SortPriority: 0 + CaseSensitive: false + - Regex: 'mgOnGpuConfig.h' + Priority: 1 # AV new + SortPriority: 0 + CaseSensitive: false + - Regex: 'mgOnGpu*.*' + Priority: 2 # AV new + SortPriority: 0 + CaseSensitive: false + - Regex: '.*' + Priority: 3 # AV was 3 + SortPriority: 0 + CaseSensitive: false +###IncludeIsMainRegex: '([-_](test|unittest))?$' # AV keep defaults +###IncludeIsMainSourceRegex: '' # AV keep defaults +IndentAccessModifiers: false # AV ok +IndentCaseLabels: true # AV ok +IndentCaseBlocks: false # AV ok +IndentGotoLabels: false # AV was true +IndentPPDirectives: None # AV ok (NB: AfterHash and BeforeHash do not seem to work as intended) +###IndentExternBlock: Indent # AV was AfterExternBlock +IndentExternBlock: AfterExternBlock # AV ok (only with Custom BraceWrapping.AfterExternBlock = true) +IndentRequires: false # AV ok(?) +IndentWidth: 2 # AV ok +IndentWrappedFunctionNames: false # AV ok +###InsertTrailingCommas: None # AV keep defaults (Java only?) 
+###JavaScriptQuotes: Leave # AV irrelevant +###JavaScriptWrapImports: true # AV irrelevant +KeepEmptyLinesAtTheStartOfBlocks: false # AV ok +LambdaBodyIndentation: Signature # AV ok +###MacroBlockBegin: '' # AV keep defaults +###MacroBlockEnd: '' # AV keep defaults +MaxEmptyLinesToKeep: 1 # AV ok +NamespaceIndentation: All # AV was None +###ObjCBinPackProtocolList: Never # AV irrelevant +###ObjCBlockIndentWidth: 2 # AV irrelevant +###ObjCBreakBeforeNestedBlockParam: true # AV irrelevant +###ObjCSpaceAfterProperty: false # AV irrelevant +###ObjCSpaceBeforeProtocolList: true # AV irrelevant +###PenaltyBreakAssignment: 2 # AV keep defaults +###PenaltyBreakBeforeFirstCallParameter: 1 # AV keep defaults +###PenaltyBreakComment: 300 # AV keep defaults +###PenaltyBreakFirstLessLess: 120 # AV keep defaults +###PenaltyBreakString: 1000 # AV keep defaults +###PenaltyBreakTemplateDeclaration: 10 # AV keep defaults +###PenaltyExcessCharacter: 1000000 # AV keep defaults +###PenaltyReturnTypeOnItsOwnLine: 200 # AV keep defaults +###PenaltyIndentedWhitespace: 0 # AV keep defaults +PointerAlignment: Left # AV ok +PPIndentWidth: 0 # AV was -1 +###RawStringFormats: # AV keep defaults +### - Language: Cpp +### Delimiters: +### - cc +### - CC +### - cpp +### - Cpp +### - CPP +### - 'c++' +### - 'C++' +### CanonicalDelimiter: '' +### BasedOnStyle: google +### - Language: TextProto +### Delimiters: +### - pb +### - PB +### - proto +### - PROTO +### EnclosingFunctions: +### - EqualsProto +### - EquivToProto +### - PARSE_PARTIAL_TEXT_PROTO +### - PARSE_TEST_PROTO +### - PARSE_TEXT_PROTO +### - ParseTextOrDie +### - ParseTextProtoOrDie +### - ParseTestProto +### - ParsePartialTestProto +### CanonicalDelimiter: pb +### BasedOnStyle: google +ReferenceAlignment: Pointer # AV ok +ReflowComments: false # AV was true +ShortNamespaceLines: 1 # AV ok +SortIncludes: CaseSensitive # AV ok +###SortJavaStaticImport: Before # irrelevant +SortUsingDeclarations: false # AV was true +SpaceAfterCStyleCast: false # AV ok +SpaceAfterLogicalNot: false # AV ok +SpaceAfterTemplateKeyword: false # AV was true +SpaceAroundPointerQualifiers: Default # AV ok (alternative: Before?) +SpaceBeforeAssignmentOperators: true # AV ok +SpaceBeforeCaseColon: false # AV ok +SpaceBeforeCpp11BracedList: false # AV ok +SpaceBeforeCtorInitializerColon: true # AV ok +SpaceBeforeInheritanceColon: true # AV ok +SpaceBeforeParens: Never # AV was ControlStatements +SpaceBeforeRangeBasedForLoopColon: false # AV was true +SpaceBeforeSquareBrackets: false # AV ok +SpaceInEmptyBlock: false # AV ok +SpaceInEmptyParentheses: false # AV ok +SpacesBeforeTrailingComments: 1 # AV was 2 +SpacesInAngles: Never # AV ok +SpacesInConditionalStatement: false # AV ok (does this work?) +SpacesInContainerLiterals: false # AV was true +SpacesInCStyleCastParentheses: false # AV ok +SpacesInLineCommentPrefix: + Minimum: 0 # AV was 1 + Maximum: 1 # AV was -1 +SpacesInParentheses: true # AV was false +SpacesInSquareBrackets: false # AV ok +Standard: c++17 # AV was Auto +###StatementAttributeLikeMacros: # AV keep defaults +### - Q_EMIT +###StatementMacros: # AV keep defaults +### - Q_UNUSED +### - QT_REQUIRE_VERSION +###TabWidth: 8 # AV irrelevant if UseTab=Never? +UseCRLF: false # AV ok (but set DeriveLineEnding=false) +UseTab: Never # AV ok +###WhitespaceSensitiveMacros: # AV keep defaults +### - STRINGIZE +### - PP_STRINGIZE +### - BOOST_PP_STRINGIZE +### - NS_SWIFT_NAME +### - CF_SWIFT_NAME +... 
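For orientation (this snippet is illustrative only, not part of the generated output, and the ExampleCounter class is made up for the purpose), a minimal sketch of the layout this draft style aims for: Allman-style braces via the Custom BraceWrapping block, 2-space indents including namespace bodies, no column limit, spaces inside parentheses but none before them, and short functions allowed on a single line. The same conventions are visible in the generated C++ sources further down in this patch (e.g. Bridge.h).

namespace mg5amcCpu
{
  // a trivial class laid out according to the options above
  class ExampleCounter final
  {
  public:
    ExampleCounter( const int start )
      : m_count( start ) {}
    int count() const { return m_count; } // short function kept on a single line
    void add( const int n )
    {
      if( n > 0 ) m_count += n; // no space before '(', spaces inside '( )'
    }
  private:
    int m_count;
  };
}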
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/CMake/Compilers.txt b/epochX/cudacpp/smeft_gg_tttt.sa/CMake/Compilers.txt new file mode 100644 index 0000000000..eec4baed28 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/CMake/Compilers.txt @@ -0,0 +1,2 @@ +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED True) diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/CMake/Macros.txt b/epochX/cudacpp/smeft_gg_tttt.sa/CMake/Macros.txt new file mode 100644 index 0000000000..9a0e141b81 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/CMake/Macros.txt @@ -0,0 +1,10 @@ +MACRO(SUBDIRLIST result) + FILE(GLOB children RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/*) + SET(dirlist "") + FOREACH(child ${children}) + IF(IS_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/${child}) + LIST(APPEND dirlist ${child}) + ENDIF() + ENDFOREACH() + SET(${result} ${dirlist}) +ENDMACRO() diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/CMake/Platforms.txt b/epochX/cudacpp/smeft_gg_tttt.sa/CMake/Platforms.txt new file mode 100644 index 0000000000..ab73e53db8 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/CMake/Platforms.txt @@ -0,0 +1,3 @@ +if (CMAKE_HOST_APPLE) + add_definitions(-DMGONGPU_HAS_NO_CURAND) +endif(CMAKE_HOST_APPLE) diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/CMakeLists.txt b/epochX/cudacpp/smeft_gg_tttt.sa/CMakeLists.txt new file mode 100644 index 0000000000..d3010411fc --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/CMakeLists.txt @@ -0,0 +1,14 @@ +# Minimal CMake configuration to build a functional CPU version + +cmake_minimum_required(VERSION 3.22) + +project(Madgraph4GPU) + +include(${PROJECT_SOURCE_DIR}/CMake/Platforms.txt) +include(${PROJECT_SOURCE_DIR}/CMake/Compilers.txt) +include(${PROJECT_SOURCE_DIR}/CMake/Macros.txt) + +set(PROJECT_GITROOT_DIR ${PROJECT_SOURCE_DIR}/../../..) + +add_subdirectory(src) +add_subdirectory(SubProcesses) diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt new file mode 100644 index 0000000000..1ef8242c54 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt @@ -0,0 +1,186 @@ +Note that this is a development version. +This version is intended for development/beta testing and NOT for production. +This version has not been fully tested (if at all) and might have limited user support (if at all) +Running MG5 in debug mode +************************************************************ +* * +* W E L C O M E to * +* M A D G R A P H 5 _ a M C @ N L O * +* * +* * +* * * * +* * * * * * +* * * * * 5 * * * * * +* * * * * * +* * * * +* * +* VERSION 3.5.0_lo_vect 2023-01-26 * +* * +* WARNING: UNKNOWN DEVELOPMENT VERSION. * +* WARNING: DO NOT USE FOR PRODUCTION * +* * +* * +* The MadGraph5_aMC@NLO Development Team - Find us at * +* https://server06.fynu.ucl.ac.be/projects/madgraph * +* and * +* http://amcatnlo.web.cern.ch/amcatnlo/ * +* * +* Type 'help' for in-line help. * +* Type 'tutorial' to learn how MG5 works * +* Type 'tutorial aMCatNLO' to learn how aMC@NLO works * +* Type 'tutorial MadLoop' to learn how MadLoop works * +* * +************************************************************ +load MG5 configuration from input/mg5_configuration.txt +fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. + Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). 
+ MG5_aMC> set fastjet /PATH/TO/fastjet-config + +eMELA-config does not seem to correspond to a valid eMELA-config executable. + Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). + MG5_aMC> set eMELA /PATH/TO/eMELA-config + +lhapdf-config does not seem to correspond to a valid lhapdf-config executable. +Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). +Note that you can still compile and run aMC@NLO with the built-in PDFs + MG5_aMC> set lhapdf /PATH/TO/lhapdf-config + +None does not seem to correspond to a valid lhapdf-config executable. +Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). +Note that you can still compile and run aMC@NLO with the built-in PDFs + MG5_aMC> set lhapdf /PATH/TO/lhapdf-config + +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +No valid eps viewer found. Please set in ./input/mg5_configuration.txt +No valid web browser found. Please set in ./input/mg5_configuration.txt +import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_smeft_gg_tttt.mg +The import format was not given, so we guess it as command +set stdout_level DEBUG +set output information to level: 10 +set zerowidth_tchannel F +set auto_convert_model T; import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t; generate g g > t t~ t t~ +save options auto_convert_model +save configuration file to /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/input/mg5_configuration.txt +INFO: load particles +INFO: load vertices +CRITICAL: Model with non QCD emission of gluon (found 14 of those). + This type of model is not fully supported within MG5aMC. + Restriction on LO dynamical scale and MLM matching/merging can occur for some processes. + Use such features with care.  
+DEBUG: MG5 converter defines FFFF110 to Gamma(-1,2,-2)*Gamma(-1,4,-3)*ProjM(-2,3)*ProjP(-3,1) + Gamma(-1,2,-3)*Gamma(-1,4,-2)*ProjM(-2,1)*ProjP(-3,3)  +DEBUG: MG5 converter defines FFFF111 to Gamma(-1,2,-3)*Gamma(-1,4,-2)*ProjM(-2,3)*ProjP(-3,1) + Gamma(-1,2,-2)*Gamma(-1,4,-3)*ProjM(-2,1)*ProjP(-3,3)  +DEBUG: MG5 converter defines FFFF22 to ProjM(2,1)*ProjP(4,3) + ProjM(4,3)*ProjP(2,1)  +DEBUG: MG5 converter defines FFFF23 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  +DEBUG: MG5 converter defines FFFF24 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1)  +DEBUG: MG5 converter defines FFFF25 to Gamma(-2,-6,-5)*Gamma(-2,-4,-3)*Gamma(-1,2,-4)*Gamma(-1,4,-6)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-6,-5)*Gamma(-2,-4,-3)*Gamma(-1,2,-4)*Gamma(-1,4,-6)*ProjM(-5,3)*ProjM(-3,1)  +DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  +DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  +DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  +DEBUG: model prefixing takes 0.15017271041870117  +INFO: Change particles name to pass to MG5 convention +Defined multiparticle p = g u c d s u~ c~ d~ s~ +Defined multiparticle j = g u c d s u~ c~ d~ s~ +Defined multiparticle l+ = e+ mu+ +Defined multiparticle l- = e- mu- +Defined multiparticle vl = ve vm vt +Defined multiparticle vl~ = ve~ vm~ vt~ +Defined multiparticle all = g a ve vm vt ve~ vm~ vt~ u c t d s b t1 u~ c~ t~ d~ s~ b~ t1~ z w+ z1 w1+ h h1 w- w1- e- mu- ta- e+ mu+ ta+ +INFO: Checking for minimal orders which gives processes. +INFO: Please specify coupling orders to bypass this step. 
+INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 +INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 +INFO: Process has 72 diagrams +1 processes with 72 diagrams generated in 4.222 s +Total: 1 processes with 72 diagrams +output standalone_cudacpp CODEGEN_cudacpp_smeft_gg_tttt +Load PLUGIN.CUDACPP_SA_OUTPUT +Output will be done with PLUGIN: CUDACPP_SA_OUTPUT +DEBUG: cformat =  plugin [export_cpp.py at line 3071]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 143]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 148]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_smeft_gg_tttt +INFO: Organizing processes into subprocess groups +INFO: Generating Helas calls for process: g g > t t~ t t~ WEIGHTED<=4 @1 +INFO: Processing color information for process: g g > t t~ t t~ @1 +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 173]  +DEBUG: type(subproc_group)= [output.py at line 174]  +DEBUG: type(fortran_model)= [output.py at line 175]  +DEBUG: type(me)= me=0 [output.py at line 176]  +DEBUG: Entering PLUGIN_OneProcessExporter.__init__ [model_handling.py at line 1027]  +DEBUG: proc_id =  0 [model_handling.py at line 1033]  +INFO: Creating files in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx +DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1268]  +DEBUG: self.include_multi_channel is not yet defined: this is standalone_cudacpp mode [model_handling.py at line 1272]  +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.h +DEBUG: Entering PLUGIN_OneProcessExporter.write_process_h_file [model_handling.py at line 1411]  +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.cc +DEBUG: Entering PLUGIN_OneProcessExporter.write_process_cc_file [model_handling.py at line 1433]  +DEBUG: Entering PLUGIN_OneProcessExporter.get_sigmaKin_lines [model_handling.py at line 1122]  +DEBUG: self.include_multi_channel =  False [model_handling.py at line 1123]  +DEBUG: self.support_multichannel =  True 
[model_handling.py at line 1124]  +DEBUG: type(self.helas_call_writer) =  [model_handling.py at line 1225]  +DEBUG: self.support_multichannel, self.include_multi_channel =  True False [model_handling.py at line 1226]  +DEBUG: multi_channel_map =  None [model_handling.py at line 1612]  +DEBUG: diag_to_config =  {} [model_handling.py at line 1667]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1779]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1780]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1779]  +DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1780]  +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/. +DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1301]  +DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1310]  +DEBUG: Entering PLUGIN_OneProcessExporter.edit_mgonGPU [model_handling.py at line 1327]  +DEBUG: Entering PLUGIN_OneProcessExporter.edit_processidfile [model_handling.py at line 1347]  +DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1377]  +DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1388]  +DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1399]  +Generated helas calls for 1 subprocesses (72 diagrams) in 0.212 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 181]  +ALOHA: aloha starts to compute helicity amplitudes +ALOHA: aloha creates VVV5 routines +ALOHA: aloha creates FFV1 routines +ALOHA: aloha creates VVVV1 routines +ALOHA: aloha creates VVVV9 routines +ALOHA: aloha creates VVVV10 routines +ALOHA: aloha creates 5 routines in 0.340 s + VVV5 + VVV5 + FFV1 + FFV1 + FFV1 + FFV1 + VVVV1 + VVVV9 + VVVV10 +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_smeft_gg_tttt/src/. 
+super_write_set_parameters_onlyfixMajorana (hardcoded=False) +DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 344 , keys size = 344 [model_handling.py at line 722]  +DEBUG: 'parset_pars size =', len(parset_pars) =  parset_pars size = 344 [model_handling.py at line 738]  +DEBUG: 'parset_lines size =', len(parset_lines), ', keys size =', len(parset_lines.keys()) =  parset_lines size = 344 , keys size = 344 [model_handling.py at line 739]  +super_write_set_parameters_onlyfixMajorana (hardcoded=True) +DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 10 , keys size = 10 [model_handling.py at line 722]  +DEBUG: 'parset_pars size =', len(parset_pars) =  parset_pars size = 10 [model_handling.py at line 738]  +DEBUG: 'parset_lines size =', len(parset_lines), ', keys size =', len(parset_lines.keys()) =  parset_lines size = 10 , keys size = 10 [model_handling.py at line 739]  +DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 722]  +DEBUG: 'parset_pars size =', len(parset_pars) =  parset_pars size = 3 [model_handling.py at line 738]  +DEBUG: 'parset_lines size =', len(parset_lines), ', keys size =', len(parset_lines.keys()) =  parset_lines size = 3 , keys size = 3 [model_handling.py at line 739]  +DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 10 , keys size = 10 [model_handling.py at line 722]  +DEBUG: 'parset_pars size =', len(parset_pars) =  parset_pars size = 10 [model_handling.py at line 738]  +DEBUG: 'parset_lines size =', len(parset_lines), ', keys size =', len(parset_lines.keys()) =  parset_lines size = 10 , keys size = 10 [model_handling.py at line 739]  +DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 722]  +DEBUG: 'parset_pars size =', len(parset_pars) =  parset_pars size = 3 [model_handling.py at line 738]  +DEBUG: 'parset_lines size =', len(parset_lines), ', keys size =', len(parset_lines.keys()) =  parset_lines size = 3 , keys size = 3 [model_handling.py at line 739]  +DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 722]  +DEBUG: 'parset_pars size =', len(parset_pars) =  parset_pars size = 3 [model_handling.py at line 738]  +DEBUG: 'parset_lines size =', len(parset_lines), ', keys size =', len(parset_lines.keys()) =  parset_lines size = 3 , keys size = 3 [model_handling.py at line 739]  +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +INFO: Created files Parameters_SMEFTsim_topU3l_MwScheme_UFO.h and Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc in directory +INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_smeft_gg_tttt/src/. and /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_smeft_gg_tttt/src/. 
+DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 190]  +quit + +real 0m5.957s +user 0m5.786s +sys 0m0.085s diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/Cards/param_card.dat b/epochX/cudacpp/smeft_gg_tttt.sa/Cards/param_card.dat new file mode 100644 index 0000000000..4a29fbe719 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/Cards/param_card.dat @@ -0,0 +1,320 @@ +###################################################################### +## PARAM_CARD AUTOMATICALY GENERATED BY MG5 FOLLOWING UFO MODEL #### +###################################################################### +## ## +## Width set on Auto will be computed following the information ## +## present in the decay.py files of the model. ## +## See arXiv:1402.1178 for more details. ## +## ## +###################################################################### + +################################### +## INFORMATION FOR MASS +################################### +Block mass + 1 4.670000e-03 # MD + 2 2.160000e-03 # MU + 3 9.300000e-02 # MS + 4 1.270000e+00 # MC + 5 4.180000e+00 # MB + 6 1.727600e+02 # MT + 11 5.110000e-04 # Me + 13 1.056600e-01 # MMU + 15 1.777000e+00 # MTA + 23 9.118760e+01 # MZ + 25 1.250900e+02 # MH +## Dependent parameters, given by model restrictions. +## Those values should be edited following the +## analytical expression. MG5 ignores those values +## but they are important for interfacing the output of MG5 +## to external program such as Pythia. + 12 0.000000e+00 # ve : 0.0 + 14 0.000000e+00 # vm : 0.0 + 16 0.000000e+00 # vt : 0.0 + 21 0.000000e+00 # g : 0.0 + 22 0.000000e+00 # a : 0.0 + 9000005 9.118760e+01 # z1 : MZ + 9000006 8.038700e+01 # w1+ : MWsm + 9000007 1.727600e+02 # t1 : MT + 9000008 1.250900e+02 # h1 : MH + 24 8.038700e+01 # w+ : MW + +################################### +## INFORMATION FOR SMEFT +################################### +Block smeft + 1 0.000000e+00 # cG + 2 0.000000e+00 # cW + 3 0.000000e+00 # cH + 4 0.000000e+00 # cHbox + 5 0.000000e+00 # cHDD + 6 0.000000e+00 # cHG + 7 0.000000e+00 # cHW + 8 0.000000e+00 # cHB + 9 0.000000e+00 # cHWB + 10 0.000000e+00 # cuHRe + 11 0.000000e+00 # ctHRe + 12 0.000000e+00 # cdHRe + 13 0.000000e+00 # cbHRe + 14 0.000000e+00 # cuGRe + 15 0.000000e+00 # ctGRe + 16 0.000000e+00 # cuWRe + 17 0.000000e+00 # ctWRe + 18 0.000000e+00 # cuBRe + 19 0.000000e+00 # ctBRe + 20 0.000000e+00 # cdGRe + 21 0.000000e+00 # cbGRe + 22 0.000000e+00 # cdWRe + 23 0.000000e+00 # cbWRe + 24 0.000000e+00 # cdBRe + 25 0.000000e+00 # cbBRe + 26 0.000000e+00 # cHj1 + 27 0.000000e+00 # cHQ1 + 28 0.000000e+00 # cHj3 + 29 0.000000e+00 # cHQ3 + 30 0.000000e+00 # cHu + 31 0.000000e+00 # cHt + 32 0.000000e+00 # cHd + 33 0.000000e+00 # cHbq + 34 0.000000e+00 # cHudRe + 35 0.000000e+00 # cHtbRe + 36 0.000000e+00 # cjj11 + 37 0.000000e+00 # cjj18 + 38 0.000000e+00 # cjj31 + 39 0.000000e+00 # cjj38 + 40 0.000000e+00 # cQj11 + 41 0.000000e+00 # cQj18 + 42 0.000000e+00 # cQj31 + 43 0.000000e+00 # cQj38 + 44 0.000000e+00 # cQQ1 + 45 0.000000e+00 # cQQ8 + 46 0.000000e+00 # cuu1 + 47 0.000000e+00 # cuu8 + 48 0.000000e+00 # ctt + 49 0.000000e+00 # ctu1 + 50 0.000000e+00 # ctu8 + 51 0.000000e+00 # cdd1 + 52 0.000000e+00 # cdd8 + 53 0.000000e+00 # cbb + 54 0.000000e+00 # cbd1 + 55 0.000000e+00 # cbd8 + 56 0.000000e+00 # cud1 + 57 0.000000e+00 # ctb1 + 58 0.000000e+00 # ctd1 + 59 0.000000e+00 # cbu1 + 60 0.000000e+00 # cud8 + 61 0.000000e+00 # ctb8 + 62 0.000000e+00 # ctd8 + 63 0.000000e+00 # cbu8 + 64 0.000000e+00 # cutbd1Re + 65 0.000000e+00 # cutbd8Re + 66 0.000000e+00 # cju1 
+ 67 0.000000e+00 # cQu1 + 68 0.000000e+00 # cju8 + 69 0.000000e+00 # cQu8 + 70 0.000000e+00 # ctj1 + 71 0.000000e+00 # ctj8 + 72 0.000000e+00 # cQt1 + 73 0.000000e+00 # cQt8 + 74 0.000000e+00 # cjd1 + 75 0.000000e+00 # cjd8 + 76 0.000000e+00 # cQd1 + 77 0.000000e+00 # cQd8 + 78 0.000000e+00 # cbj1 + 79 0.000000e+00 # cbj8 + 80 0.000000e+00 # cQb1 + 81 0.000000e+00 # cQb8 + 82 0.000000e+00 # cjQtu1Re + 83 0.000000e+00 # cjQtu8Re + 84 0.000000e+00 # cjQbd1Re + 85 0.000000e+00 # cjQbd8Re + 86 0.000000e+00 # cjujd1Re + 87 0.000000e+00 # cjujd8Re + 88 0.000000e+00 # cjujd11Re + 89 0.000000e+00 # cjujd81Re + 90 0.000000e+00 # cQtjd1Re + 91 0.000000e+00 # cQtjd8Re + 92 0.000000e+00 # cjuQb1Re + 93 0.000000e+00 # cjuQb8Re + 94 0.000000e+00 # cQujb1Re + 95 0.000000e+00 # cQujb8Re + 96 0.000000e+00 # cjtQd1Re + 97 0.000000e+00 # cjtQd8Re + 98 0.000000e+00 # cQtQb1Re + 99 0.000000e+00 # cQtQb8Re + 100 0.000000e+00 # ceHRe + 101 0.000000e+00 # ceWRe + 102 0.000000e+00 # ceBRe + 103 0.000000e+00 # cHl1 + 104 0.000000e+00 # cHl3 + 105 0.000000e+00 # cHe + 106 0.000000e+00 # cll + 107 0.000000e+00 # cll1 + 108 0.000000e+00 # clj1 + 109 0.000000e+00 # clj3 + 110 0.000000e+00 # cQl1 + 111 0.000000e+00 # cQl3 + 112 0.000000e+00 # cee + 113 0.000000e+00 # ceu + 114 0.000000e+00 # cte + 115 0.000000e+00 # ced + 116 0.000000e+00 # cbe + 117 0.000000e+00 # cje + 118 0.000000e+00 # cQe + 119 0.000000e+00 # clu + 120 0.000000e+00 # ctl + 121 0.000000e+00 # cld + 122 0.000000e+00 # cbl + 123 0.000000e+00 # cle + 124 0.000000e+00 # cledjRe + 125 0.000000e+00 # clebQRe + 126 0.000000e+00 # cleju1Re + 127 0.000000e+00 # cleQt1Re + 128 0.000000e+00 # cleju3Re + 129 0.000000e+00 # cleQt3Re + +################################### +## INFORMATION FOR SMEFTCPV +################################### +Block smeftcpv + 1 0.000000e+00 # cGtil + 2 0.000000e+00 # cWtil + 3 0.000000e+00 # cHGtil + 4 0.000000e+00 # cHWtil + 5 0.000000e+00 # cHBtil + 6 0.000000e+00 # cHWBtil + 7 0.000000e+00 # cuGIm + 8 0.000000e+00 # ctGIm + 9 0.000000e+00 # cuWIm + 10 0.000000e+00 # ctWIm + 11 0.000000e+00 # cuBIm + 12 0.000000e+00 # ctBIm + 13 0.000000e+00 # cdGIm + 14 0.000000e+00 # cbGIm + 15 0.000000e+00 # cdWIm + 16 0.000000e+00 # cbWIm + 17 0.000000e+00 # cdBIm + 18 0.000000e+00 # cbBIm + 19 0.000000e+00 # cuHIm + 20 0.000000e+00 # ctHIm + 21 0.000000e+00 # cdHIm + 22 0.000000e+00 # cbHIm + 23 0.000000e+00 # cHudIm + 24 0.000000e+00 # cHtbIm + 25 0.000000e+00 # cutbd1Im + 26 0.000000e+00 # cutbd8Im + 27 0.000000e+00 # cjQtu1Im + 28 0.000000e+00 # cjQtu8Im + 29 0.000000e+00 # cjQbd1Im + 30 0.000000e+00 # cjQbd8Im + 31 0.000000e+00 # cjujd1Im + 32 0.000000e+00 # cjujd8Im + 33 0.000000e+00 # cjujd11Im + 34 0.000000e+00 # cjujd81Im + 35 0.000000e+00 # cQtjd1Im + 36 0.000000e+00 # cQtjd8Im + 37 0.000000e+00 # cjuQb1Im + 38 0.000000e+00 # cjuQb8Im + 39 0.000000e+00 # cQujb1Im + 40 0.000000e+00 # cQujb8Im + 41 0.000000e+00 # cjtQd1Im + 42 0.000000e+00 # cjtQd8Im + 43 0.000000e+00 # cQtQb1Im + 44 0.000000e+00 # cQtQb8Im + 45 0.000000e+00 # ceHIm + 46 0.000000e+00 # ceWIm + 47 0.000000e+00 # ceBIm + 48 0.000000e+00 # cledjIm + 49 0.000000e+00 # clebQIm + 50 0.000000e+00 # cleju1Im + 51 0.000000e+00 # cleju3Im + 52 0.000000e+00 # cleQt1Im + 53 0.000000e+00 # cleQt3Im + +################################### +## INFORMATION FOR SMEFTCUTOFF +################################### +Block smeftcutoff + 1 1.000000e+03 # LambdaSMEFT + +################################### +## INFORMATION FOR SMINPUTS +################################### +Block sminputs + 1 
8.038700e+01 # MW + 2 1.166379e-05 # Gf + 3 1.179000e-01 # aS (Note that Parameter not used if you use a PDF set) + +################################### +## INFORMATION FOR SWITCHES +################################### +Block switches + 1 0.000000e+00 # linearPropCorrections + +################################### +## INFORMATION FOR YUKAWA +################################### +Block yukawa + 1 4.670000e-03 # ymdo + 2 2.160000e-03 # ymup + 3 9.300000e-02 # yms + 4 1.270000e+00 # ymc + 5 4.180000e+00 # ymb + 6 1.727600e+02 # ymt + 11 5.110000e-04 # yme + 13 1.056600e-01 # ymm + 15 1.777000e+00 # ymtau + +################################### +## INFORMATION FOR DECAY +################################### +DECAY 6 1.330000e+00 # WT +DECAY 23 2.495200e+00 # WZ +DECAY 24 2.085000e+00 # WW +DECAY 25 4.070000e-03 # WH +## Dependent parameters, given by model restrictions. +## Those values should be edited following the +## analytical expression. MG5 ignores those values +## but they are important for interfacing the output of MG5 +## to external program such as Pythia. +DECAY 1 0.000000e+00 # d : 0.0 +DECAY 2 0.000000e+00 # u : 0.0 +DECAY 3 0.000000e+00 # s : 0.0 +DECAY 4 0.000000e+00 # c : 0.0 +DECAY 5 0.000000e+00 # b : 0.0 +DECAY 11 0.000000e+00 # e- : 0.0 +DECAY 12 0.000000e+00 # ve : 0.0 +DECAY 13 0.000000e+00 # mu- : 0.0 +DECAY 14 0.000000e+00 # vm : 0.0 +DECAY 15 0.000000e+00 # ta- : 0.0 +DECAY 16 0.000000e+00 # vt : 0.0 +DECAY 21 0.000000e+00 # g : 0.0 +DECAY 22 0.000000e+00 # a : 0.0 +DECAY 9000005 2.495200e+00 # z1 : WZ +DECAY 9000006 2.085000e+00 # w1+ : WW +DECAY 9000007 1.330000e+00 # t1 : WT +DECAY 9000008 4.070000e-03 # h1 : WH +#=========================================================== +# QUANTUM NUMBERS OF NEW STATE(S) (NON SM PDG CODE) +#=========================================================== + +Block QNUMBERS 9000005 # z1 + 1 0 # 3 times electric charge + 2 3 # number of spin states (2S+1) + 3 1 # colour rep (1: singlet, 3: triplet, 8: octet) + 4 0 # Particle/Antiparticle distinction (0=own anti) +Block QNUMBERS 9000006 # w1+ + 1 3 # 3 times electric charge + 2 3 # number of spin states (2S+1) + 3 1 # colour rep (1: singlet, 3: triplet, 8: octet) + 4 1 # Particle/Antiparticle distinction (0=own anti) +Block QNUMBERS 9000007 # t1 + 1 2 # 3 times electric charge + 2 2 # number of spin states (2S+1) + 3 3 # colour rep (1: singlet, 3: triplet, 8: octet) + 4 1 # Particle/Antiparticle distinction (0=own anti) +Block QNUMBERS 9000008 # h1 + 1 0 # 3 times electric charge + 2 1 # number of spin states (2S+1) + 3 1 # colour rep (1: singlet, 3: triplet, 8: octet) + 4 0 # Particle/Antiparticle distinction (0=own anti) diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h new file mode 100644 index 0000000000..faa8f95d1d --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h @@ -0,0 +1,519 @@ +#ifndef BRIDGE_H +#define BRIDGE_H 1 + +// Includes from Cuda/C++ matrix element calculations +#include "mgOnGpuConfig.h" // for mgOnGpu::npar, mgOnGpu::np4 + +#include "CPPProcess.h" // for CPPProcess +#include "CrossSectionKernels.h" // for flagAbnormalMEs +#include "MatrixElementKernels.h" // for MatrixElementKernelHost, MatrixElementKernelDevice +#include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM +#include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc + +#include +#include +#include +#include +#include +#include +#include + +#ifdef __CUDACC__ +namespace mg5amcGpu +#else 
+namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + /** + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such pointers. + */ + struct CppObjectInFortran + { + CppObjectInFortran() {} + virtual ~CppObjectInFortran() {} + }; + + //-------------------------------------------------------------------------- + /** + * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. + * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). + * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , . + * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. + * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. + * The Bridge is configured to store nevt==nevtF events in CUDA/C++. + * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. + * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. + * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). + */ + template + class Bridge final : public CppObjectInFortran + { + public: + /** + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + */ + Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); + + /** + * Destructor + */ + virtual ~Bridge() {} + + // Delete copy/move constructors and assignment operators + Bridge( const Bridge& ) = delete; + Bridge( Bridge&& ) = delete; + Bridge& operator=( const Bridge& ) = delete; + Bridge& operator=( Bridge&& ) = delete; + +#ifdef __CUDACC__ + /** + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads + * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ + void set_gpugrid( const int gpublocks, const int gputhreads ); + + /** + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant alphas) + * @param rndhel the pointer to the input random numbers for helicity selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelId the Feynman diagram to enhance in multi-channel mode if 1 to n (disable multi-channel if 0) + * @param mes the pointer to the output matrix elements + * @param goodHelOnly quit after computing good helicities? 
+ * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + */ + void gpu_sequence( const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int channelId, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool goodHelOnly = false ); +#else + /** + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant alphas) + * @param rndhel the pointer to the input random numbers for helicity selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelId the Feynman diagram to enhance in multi-channel mode if 1 to n (disable multi-channel if 0) + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? + */ + void cpu_sequence( const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int channelId, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool goodHelOnly = false ); +#endif + + // Return the number of good helicities (-1 initially when they have not yet been calculated) + int nGoodHel() const { return m_nGoodHel; } + + // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + constexpr int nTotHel() const { return mgOnGpu::ncomb; } + + private: + unsigned int m_nevt; // number of events + int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + +#ifdef __CUDACC__ + int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + mg5amcGpu::DeviceBuffer m_devMomentaF; + mg5amcGpu::DeviceBufferMomenta m_devMomentaC; + mg5amcGpu::DeviceBufferGs m_devGs; + mg5amcGpu::DeviceBufferRndNumHelicity m_devRndHel; + mg5amcGpu::DeviceBufferRndNumColor m_devRndCol; + mg5amcGpu::DeviceBufferMatrixElements m_devMEs; + mg5amcGpu::DeviceBufferSelectedHelicity m_devSelHel; + mg5amcGpu::DeviceBufferSelectedColor m_devSelCol; + mg5amcGpu::PinnedHostBufferGs m_hstGs; + mg5amcGpu::PinnedHostBufferRndNumHelicity m_hstRndHel; + mg5amcGpu::PinnedHostBufferRndNumColor m_hstRndCol; + mg5amcGpu::PinnedHostBufferMatrixElements m_hstMEs; + mg5amcGpu::PinnedHostBufferSelectedHelicity m_hstSelHel; + mg5amcGpu::PinnedHostBufferSelectedColor m_hstSelCol; + std::unique_ptr m_pmek; + //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) +#else + mg5amcCpu::HostBufferMomenta m_hstMomentaC; + mg5amcCpu::HostBufferGs m_hstGs; + mg5amcCpu::HostBufferRndNumHelicity m_hstRndHel; + mg5amcCpu::HostBufferRndNumColor m_hstRndCol; + mg5amcCpu::HostBufferMatrixElements m_hstMEs; + mg5amcCpu::HostBufferSelectedHelicity m_hstSelHel; + mg5amcCpu::HostBufferSelectedColor m_hstSelCol; + std::unique_ptr m_pmek; +#endif + }; + + //-------------------------------------------------------------------------- + // + // Forward declare transposition methods + // + +#ifdef __CUDACC__ + + 
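  // Illustration (not part of the generated file): the transposition methods declared
  // below convert between the Fortran AOS momenta layout and the cudacpp AOSOA layout.
  // For event ievt (with ievt = ipagM * neppM + ieppM), particle ipar and component ip4:
  //   Fortran/AOS index: fpos = ievt * npar * np4 + ipar * np4 + ip4
  //   C++/AOSOA index:   cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM
  // Worked example with assumed values npar=6, np4=4, neppM=4 (neppM depends on the build):
  // ievt=5, ipar=2, ip4=1 gives ipagM=1, ieppM=1, hence fpos=129 and cpos=133.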
template + __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); + +#endif // __CUDACC__ + + template + void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); + + template + void hst_transposeMomentaC2F( const Tin* in, Tout* out, const unsigned int nevt ); + + //-------------------------------------------------------------------------- + // + // Implementations of member functions of class Bridge + // + + template + Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) + : m_nevt( nevtF ) + , m_nGoodHel( -1 ) +#ifdef __CUDACC__ + , m_gputhreads( 256 ) // default number of gpu threads + , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads + , m_devMomentaF( m_nevt ) + , m_devMomentaC( m_nevt ) + , m_devGs( m_nevt ) + , m_devRndHel( m_nevt ) + , m_devRndCol( m_nevt ) + , m_devMEs( m_nevt ) + , m_devSelHel( m_nevt ) + , m_devSelCol( m_nevt ) +#else + , m_hstMomentaC( m_nevt ) +#endif + , m_hstGs( m_nevt ) + , m_hstRndHel( m_nevt ) + , m_hstRndCol( m_nevt ) + , m_hstMEs( m_nevt ) + , m_hstSelHel( m_nevt ) + , m_hstSelCol( m_nevt ) + , m_pmek( nullptr ) + { + if( nparF != mgOnGpu::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != mgOnGpu::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); +#ifdef __CUDACC__ + if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) + throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); + while( m_nevt != m_gpublocks * m_gputhreads ) + { + m_gputhreads /= 2; + if( m_gputhreads < s_gputhreadsmin ) + throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! + m_gpublocks = m_nevt / m_gputhreads; + } + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; + mg5amcGpu::CPPProcess process( /*verbose=*/false ); + m_pmek.reset( new mg5amcGpu::MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#else + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; + mg5amcCpu::CPPProcess process( /*verbose=*/false ); + m_pmek.reset( new mg5amcCpu::MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#endif // __CUDACC__ + process.initProc( "../../Cards/param_card.dat" ); + } + +#ifdef __CUDACC__ + template + void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + { + if( m_nevt != gpublocks * gputhreads ) + throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + m_gpublocks = gpublocks; + m_gputhreads = gputhreads; + std::cout << "WARNING! 
Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; + m_pmek->setGrid( m_gpublocks, m_gputhreads ); + } +#endif + +#ifdef __CUDACC__ + template + void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int channelId, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool goodHelOnly ) + { + constexpr int neppM = MemoryAccessMomenta::neppM; + if constexpr( neppM == 1 && std::is_same_v ) + { + checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + } + else + { + checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + const int thrPerEvt = mgOnGpu::npar * mgOnGpu::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) + //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower + dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + } + if constexpr( std::is_same_v ) + { + memcpy( m_hstGs.data(), gs, m_nevt * sizeof( FORTRANFPTYPE ) ); + memcpy( m_hstRndHel.data(), rndhel, m_nevt * sizeof( FORTRANFPTYPE ) ); + memcpy( m_hstRndCol.data(), rndcol, m_nevt * sizeof( FORTRANFPTYPE ) ); + } + else + { + std::copy( gs, gs + m_nevt, m_hstGs.data() ); + std::copy( rndhel, rndhel + m_nevt, m_hstRndHel.data() ); + std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); + } + copyDeviceFromHost( m_devGs, m_hstGs ); + copyDeviceFromHost( m_devRndHel, m_hstRndHel ); + copyDeviceFromHost( m_devRndCol, m_hstRndCol ); + if( m_nGoodHel < 0 ) + { + m_nGoodHel = m_pmek->computeGoodHelicities(); + if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + } + if( goodHelOnly ) return; + m_pmek->computeMatrixElements( channelId ); + copyHostFromDevice( m_hstMEs, m_devMEs ); + flagAbnormalMEs( m_hstMEs.data(), m_nevt ); + copyHostFromDevice( m_hstSelHel, m_devSelHel ); + copyHostFromDevice( m_hstSelCol, m_devSelCol ); + if constexpr( std::is_same_v ) + { + memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); + memcpy( selhel, m_hstSelHel.data(), m_hstSelHel.bytes() ); + memcpy( selcol, m_hstSelCol.data(), m_hstSelCol.bytes() ); + } + else + { + std::copy( m_hstMEs.data(), m_hstMEs.data() + m_nevt, mes ); + std::copy( m_hstSelHel.data(), m_hstSelHel.data() + m_nevt, selhel ); + std::copy( m_hstSelCol.data(), m_hstSelCol.data() + m_nevt, selcol ); + } + } +#endif + +#ifndef __CUDACC__ + template + void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int channelId, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool goodHelOnly ) + { + hst_transposeMomentaF2C( momenta, m_hstMomentaC.data(), m_nevt ); + if constexpr( std::is_same_v ) + { + memcpy( m_hstGs.data(), gs, m_nevt * sizeof( FORTRANFPTYPE ) ); + memcpy( m_hstRndHel.data(), rndhel, m_nevt * sizeof( FORTRANFPTYPE ) ); + memcpy( m_hstRndCol.data(), rndcol, m_nevt * sizeof( FORTRANFPTYPE ) ); + } + else + { + std::copy( gs, gs + m_nevt, m_hstGs.data() ); + std::copy( rndhel, rndhel + m_nevt, m_hstRndHel.data() ); + std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); + } + if( m_nGoodHel < 0 ) + { + m_nGoodHel = m_pmek->computeGoodHelicities(); + if( 
m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + } + if( goodHelOnly ) return; + m_pmek->computeMatrixElements( channelId ); + flagAbnormalMEs( m_hstMEs.data(), m_nevt ); + if constexpr( std::is_same_v ) + { + memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); + memcpy( selhel, m_hstSelHel.data(), m_hstSelHel.bytes() ); + memcpy( selcol, m_hstSelCol.data(), m_hstSelCol.bytes() ); + } + else + { + std::copy( m_hstMEs.data(), m_hstMEs.data() + m_nevt, mes ); + std::copy( m_hstSelHel.data(), m_hstSelHel.data() + m_nevt, selhel ); + std::copy( m_hstSelCol.data(), m_hstSelCol.data() + m_nevt, selcol ); + } + } +#endif + + //-------------------------------------------------------------------------- + // + // Implementations of transposition methods + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) + // + +#ifdef __CUDACC__ + template + __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) + { + constexpr bool oldImplementation = true; // default: use old implementation + if constexpr( oldImplementation ) + { + // SR initial implementation + constexpr int part = mgOnGpu::npar; + constexpr int mome = mgOnGpu::np4; + constexpr int strd = MemoryAccessMomenta::neppM; + int pos = blockDim.x * blockIdx.x + threadIdx.x; + int arrlen = nevt * part * mome; + if( pos < arrlen ) + { + int page_i = pos / ( strd * mome * part ); + int rest_1 = pos % ( strd * mome * part ); + int part_i = rest_1 / ( strd * mome ); + int rest_2 = rest_1 % ( strd * mome ); + int mome_i = rest_2 / strd; + int strd_i = rest_2 % strd; + int inpos = + ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) + } + } + else + { + // AV attempt another implementation with 1 event per thread: this seems slower... + // F-style: AOS[nevtF][nparF][np4F] + // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + constexpr int npar = mgOnGpu::npar; + constexpr int np4 = mgOnGpu::np4; + constexpr int neppM = MemoryAccessMomenta::neppM; + assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? 
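      // In this alternative implementation each GPU thread handles one full event:
      // it derives its event index ievt from the block and thread indices, splits it
      // into the AOSOA page index ipagM and the intra-page index ieppM, and then loops
      // over all np4 components of all npar particles, copying each value from the
      // Fortran AOS position fpos to the corresponding AOSOA position cpos.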
+ int ievt = blockDim.x * blockIdx.x + threadIdx.x; + int ipagM = ievt / neppM; + int ieppM = ievt % neppM; + for( int ip4 = 0; ip4 < np4; ip4++ ) + for( int ipar = 0; ipar < npar; ipar++ ) + { + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int fpos = ievt * npar * np4 + ipar * np4 + ip4; + out[cpos] = in[fpos]; // F2C (Fortran to C) + } + } + } +#endif + + template + void hst_transposeMomenta( const Tin* in, Tout* out, const unsigned int nevt ) + { + constexpr bool oldImplementation = false; // default: use new implementation + if constexpr( oldImplementation ) + { + // SR initial implementation + constexpr unsigned int part = mgOnGpu::npar; + constexpr unsigned int mome = mgOnGpu::np4; + constexpr unsigned int strd = MemoryAccessMomenta::neppM; + unsigned int arrlen = nevt * part * mome; + for( unsigned int pos = 0; pos < arrlen; ++pos ) + { + unsigned int page_i = pos / ( strd * mome * part ); + unsigned int rest_1 = pos % ( strd * mome * part ); + unsigned int part_i = rest_1 / ( strd * mome ); + unsigned int rest_2 = rest_1 % ( strd * mome ); + unsigned int mome_i = rest_2 / strd; + unsigned int strd_i = rest_2 % strd; + unsigned int inpos = + ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) + else + out[inpos] = in[pos]; // C2F (C to Fortran) + } + } + else + { + // AV attempt another implementation: this is slightly faster (better c++ pipelining?) + // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] + // F-style: AOS[nevtF][nparF][np4F] + // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + constexpr unsigned int npar = mgOnGpu::npar; + constexpr unsigned int np4 = mgOnGpu::np4; + constexpr unsigned int neppM = MemoryAccessMomenta::neppM; + if constexpr( neppM == 1 && std::is_same_v ) + { + memcpy( out, in, nevt * npar * np4 * sizeof( Tin ) ); + } + else + { + const unsigned int npagM = nevt / neppM; + assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? 
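        // The quadruple loop below performs the AOS to AOSOA conversion page by page:
        // for each page ipagM, component ip4 and particle ipar, the innermost loop over
        // ieppM touches neppM consecutive AOSOA elements, so the AOSOA side of the copy
        // (writes for F2C, reads for C2F) is contiguous in memory.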
+ for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) + for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) + for( unsigned int ipar = 0; ipar < npar; ipar++ ) + for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) + { + unsigned int ievt = ipagM * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; + if constexpr( F2C ) + out[cpos] = in[fpos]; // F2C (Fortran to C) + else + out[fpos] = in[cpos]; // C2F (C to Fortran) + } + } + } + } + + template + void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) + { + constexpr bool F2C = true; + hst_transposeMomenta( in, out, nevt ); + } + + template + void hst_transposeMomentaC2F( const Tin* in, Tout* out, const unsigned int nevt ) + { + constexpr bool F2C = false; + hst_transposeMomenta( in, out, nevt ); + } + + //-------------------------------------------------------------------------- +} +#endif // BRIDGE_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/BridgeKernels.cc new file mode 100644 index 0000000000..c2c16ff038 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/BridgeKernels.cc @@ -0,0 +1,149 @@ +#include "BridgeKernels.h" + +#include "MemoryAccessMomenta.h" + +#include + +using mgOnGpu::npar; // the number of particles (external = initial + final) +using mgOnGpu::np4; // the number of dimensions of 4-momenta (E,px,py,pz) + +//============================================================================ + +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + + BridgeKernelBase::BridgeKernelBase( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt ) + : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, matrixElements, selhel, selcol ) + , NumberOfEvents( nevt ) + , m_bridge( nevt, npar, np4 ) + { + if( m_momenta.isOnDevice() ) throw std::runtime_error( "BridgeKernelBase: momenta must be a host array" ); + if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "BridgeKernelBase: matrixElements must be a host array" ); + if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "BridgeKernelBase: nevt mismatch with momenta" ); + if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "BridgeKernelBase: nevt mismatch with matrixElements" ); + } + + //-------------------------------------------------------------------------- +} + +//============================================================================ + +#ifndef __CUDACC__ +namespace mg5amcCpu +{ + + //-------------------------------------------------------------------------- + + BridgeKernelHost::BridgeKernelHost( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: Gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + BufferMatrixElements& matrixElements, // 
output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt ) + : BridgeKernelBase( momenta, gs, rndhel, rndcol, matrixElements, selhel, selcol, nevt ) + , m_fortranMomenta( nevt ) + { + } + + //-------------------------------------------------------------------------- + + void BridgeKernelHost::transposeInputMomentaC2F() + { + hst_transposeMomentaC2F( m_momenta.data(), m_fortranMomenta.data(), nevt() ); + } + + //-------------------------------------------------------------------------- + + int BridgeKernelHost::computeGoodHelicities() + { + constexpr bool goodHelOnly = true; + constexpr unsigned int channelId = 0; // disable multi-channel for helicity filtering + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), channelId, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + return m_bridge.nGoodHel(); + } + + //-------------------------------------------------------------------------- + + void BridgeKernelHost::computeMatrixElements( const unsigned int channelId ) + { + constexpr bool goodHelOnly = false; + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), channelId, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + } + + //-------------------------------------------------------------------------- + +} +#endif + +//============================================================================ + +#ifdef __CUDACC__ +namespace mg5amcGpu +{ + + //-------------------------------------------------------------------------- + + BridgeKernelDevice::BridgeKernelDevice( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: Gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t gpublocks, + const size_t gputhreads ) + : BridgeKernelBase( momenta, gs, rndhel, rndcol, matrixElements, selhel, selcol, gpublocks * gputhreads ) + , m_fortranMomenta( nevt() ) + , m_gpublocks( gpublocks ) + , m_gputhreads( gputhreads ) + { + if( m_gpublocks == 0 ) throw std::runtime_error( "BridgeKernelDevice: gpublocks must be > 0" ); + if( m_gputhreads == 0 ) throw std::runtime_error( "BridgeKernelDevice: gputhreads must be > 0" ); + m_bridge.set_gpugrid( gpublocks, gputhreads ); + } + + //-------------------------------------------------------------------------- + + void BridgeKernelDevice::transposeInputMomentaC2F() + { + hst_transposeMomentaC2F( m_momenta.data(), m_fortranMomenta.data(), nevt() ); + } + + //-------------------------------------------------------------------------- + + int BridgeKernelDevice::computeGoodHelicities() + { + constexpr bool goodHelOnly = true; + constexpr unsigned int channelId = 0; // disable multi-channel for helicity filtering + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), channelId, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + return m_bridge.nGoodHel(); + } + + //-------------------------------------------------------------------------- + + void BridgeKernelDevice::computeMatrixElements( const unsigned int channelId ) 
+ { + constexpr bool goodHelOnly = false; + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), channelId, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + } + + //-------------------------------------------------------------------------- + +} +#endif + +//============================================================================ diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/BridgeKernels.h new file mode 100644 index 0000000000..10e664a4c4 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/BridgeKernels.h @@ -0,0 +1,134 @@ +#ifndef BRIDGEKERNELS_H +#define BRIDGEKERNELS_H 1 + +#include "mgOnGpuConfig.h" + +#include "Bridge.h" +#include "MatrixElementKernels.h" +#include "MemoryBuffers.h" + +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + + // A Bridge wrapper base class encapsulating matrix element calculations on a CPU host + class BridgeKernelBase : public MatrixElementKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + BridgeKernelBase( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt ); + + // Destructor + virtual ~BridgeKernelBase() {} + + // Transpose input momenta from C to Fortran before the matrix element calculation in the Bridge + virtual void transposeInputMomentaC2F() = 0; + + protected: + + // The wrapped bridge + Bridge m_bridge; + }; + + //-------------------------------------------------------------------------- + +#ifndef __CUDACC__ + // A Bridge wrapper class encapsulating matrix element calculations on a CPU host + class BridgeKernelHost final : public BridgeKernelBase + { + public: + + // Constructor from existing input and output buffers + BridgeKernelHost( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt ); + + // Destructor + virtual ~BridgeKernelHost() {} + + // Transpose input momenta from C to Fortran before the matrix element calculation in the Bridge + void transposeInputMomentaC2F() override final; + + // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) + int computeGoodHelicities() override final; + + // Compute matrix elements + void computeMatrixElements( const unsigned int channelId ) override final; + + // Is this a host or device kernel? 
+ bool isOnDevice() const override final { return false; } + + private: + + // The buffer for the input momenta, transposed to Fortran array indexing + HostBufferMomenta m_fortranMomenta; + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + // A Bridge wrapper class encapsulating matrix element calculations on a GPU device + class BridgeKernelDevice : public BridgeKernelBase + { + public: + + // Constructor from existing input and output buffers + BridgeKernelDevice( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t gpublocks, + const size_t gputhreads ); + + // Destructor + virtual ~BridgeKernelDevice() {} + + // Transpose input momenta from C to Fortran before the matrix element calculation in the Bridge + void transposeInputMomentaC2F() override final; + + // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) + int computeGoodHelicities() override final; + + // Compute matrix elements + void computeMatrixElements( const unsigned int channelId ) override final; + + // Is this a host or device kernel? + bool isOnDevice() const override final { return true; } + + private: + + // The buffer for the input momenta, transposed to Fortran array indexing + PinnedHostBufferMomenta m_fortranMomenta; + + // The number of blocks in the GPU grid + size_t m_gpublocks; + + // The number of threads in the GPU grid + size_t m_gputhreads; + }; +#endif + + //-------------------------------------------------------------------------- +} +#endif // BRIDGEKERNELS_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/CMakeLists.txt b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/CMakeLists.txt new file mode 100644 index 0000000000..1e15f3e9ed --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/CMakeLists.txt @@ -0,0 +1,4 @@ +SUBDIRLIST(SUBDIRS) +FOREACH(subdir ${SUBDIRS}) + ADD_SUBDIRECTORY(${subdir}) +ENDFOREACH() diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/CrossSectionKernels.cc new file mode 100644 index 0000000000..398f8a87bd --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/CrossSectionKernels.cc @@ -0,0 +1,231 @@ +#include "CrossSectionKernels.h" + +#include "MemoryAccessMatrixElements.h" +#include "MemoryAccessWeights.h" +#include "MemoryBuffers.h" + +#include + +// ****************************************************************************************** +// *** NB: Disabling fast math is essential here, otherwise results are undefined *** +// *** NB: This file CrossSectionKernels.cc IS BUILT WITH -fno-fast-math in the Makefile! 
*** +// *** NB: Attempts with __attribute__((optimize("-fno-fast-math"))) were unsatisfactory *** +// ****************************************************************************************** + +inline bool +fp_is_nan( const fptype& fp ) +{ + //#pragma clang diagnostic push + //#pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) + return std::isnan( fp ); // always false for clang in fast math mode (tautological compare)? + //#pragma clang diagnostic pop +} + +inline bool +fp_is_abnormal( const fptype& fp ) +{ + if( fp_is_nan( fp ) ) return true; + if( fp != fp ) return true; + return false; +} + +inline bool +fp_is_zero( const fptype& fp ) +{ + if( fp == 0 ) return true; + return false; +} + +// See https://en.cppreference.com/w/cpp/numeric/math/FP_categories +inline const char* +fp_show_class( const fptype& fp ) +{ + switch( std::fpclassify( fp ) ) + { + case FP_INFINITE: return "Inf"; + case FP_NAN: return "NaN"; + case FP_NORMAL: return "normal"; + case FP_SUBNORMAL: return "subnormal"; + case FP_ZERO: return "zero"; + default: return "unknown"; + } +} + +inline void +debug_me_is_abnormal( const fptype& me, size_t ievtALL ) +{ + std::cout << "DEBUG[" << ievtALL << "]" + << " ME=" << me + << " fpisabnormal=" << fp_is_abnormal( me ) + << " fpclass=" << fp_show_class( me ) + << " (me==me)=" << ( me == me ) + << " (me==me+1)=" << ( me == me + 1 ) + << " isnan=" << fp_is_nan( me ) + << " isfinite=" << std::isfinite( me ) + << " isnormal=" << std::isnormal( me ) + << " is0=" << ( me == 0 ) + << " is1=" << ( me == 1 ) + << " abs(ME)=" << std::abs( me ) + << " isnan=" << fp_is_nan( std::abs( me ) ) + << std::endl; +} + +//============================================================================ + +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + + void flagAbnormalMEs( fptype* hstMEs, unsigned int nevt ) + { + for( unsigned int ievt = 0; ievt < nevt; ievt++ ) + { + if( fp_is_abnormal( hstMEs[ievt] ) ) + { + std::cout << "WARNING! flagging abnormal ME for ievt=" << ievt << std::endl; + hstMEs[ievt] = std::sqrt( -1. 
); + } + } + } + + //-------------------------------------------------------------------------- + + CrossSectionKernelHost::CrossSectionKernelHost( const BufferWeights& samplingWeights, // input: sampling weights + const BufferMatrixElements& matrixElements, // input: matrix elements + EventStatistics& stats, // output: event statistics + const size_t nevt ) + : CrossSectionKernelBase( samplingWeights, matrixElements, stats ) + , NumberOfEvents( nevt ) + { + if( m_samplingWeights.isOnDevice() ) throw std::runtime_error( "CrossSectionKernelHost: samplingWeights must be a host array" ); + if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "CrossSectionKernelHost: matrixElements must be a host array" ); + if( this->nevt() != m_samplingWeights.nevt() ) throw std::runtime_error( "CrossSectionKernelHost: nevt mismatch with samplingWeights" ); + if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "CrossSectionKernelHost: nevt mismatch with matrixElements" ); + } + + //-------------------------------------------------------------------------- + + void CrossSectionKernelHost::updateEventStatistics( const bool debug ) + { + EventStatistics stats; // new statistics for the new nevt events + // FIRST PASS: COUNT ALL/ABN/ZERO EVENTS, COMPUTE MIN/MAX, COMPUTE REFS AS MEANS OF SIMPLE SUMS + for( size_t ievt = 0; ievt < nevt(); ++ievt ) // Loop over all events in this iteration + { + const fptype& me = MemoryAccessMatrixElements::ieventAccessConst( m_matrixElements.data(), ievt ); + const fptype& wg = MemoryAccessWeights::ieventAccessConst( m_samplingWeights.data(), ievt ); + const size_t ievtALL = m_iter * nevt() + ievt; + // The following events are abnormal in a run with "-p 2048 256 12 -d" + // - check.exe/commonrand: ME[310744,451171,3007871,3163868,4471038,5473927] with fast math + // - check.exe/curand: ME[578162,1725762,2163579,5407629,5435532,6014690] with fast math + // - gcheck.exe/curand: ME[596016,1446938] with fast math + // Debug NaN/abnormal issues + //if ( ievtALL == 310744 ) // this ME is abnormal both with and without fast math + // debug_me_is_abnormal( me, ievtALL ); + //if ( ievtALL == 5473927 ) // this ME is abnormal only with fast math + // debug_me_is_abnormal( me, ievtALL ); + stats.nevtALL++; + if( fp_is_abnormal( me ) ) + { + if( debug ) // only printed out with "-p -d" (matrixelementALL is not filled without -p) + std::cout << "WARNING! 
ME[" << ievtALL << "] is NaN/abnormal" << std::endl; + stats.nevtABN++; + continue; + } + if( fp_is_zero( me ) ) stats.nevtZERO++; + stats.minME = std::min( stats.minME, (double)me ); + stats.maxME = std::max( stats.maxME, (double)me ); + stats.minWG = std::min( stats.minWG, (double)wg ); + stats.maxWG = std::max( stats.maxWG, (double)wg ); + stats.sumMEdiff += me; // NB stats.refME is 0 here + stats.sumWGdiff += wg; // NB stats.refWG is 0 here + } + stats.refME = stats.meanME(); // draft ref + stats.refWG = stats.meanWG(); // draft ref + stats.sumMEdiff = 0; + stats.sumWGdiff = 0; + // SECOND PASS: IMPROVE MEANS FROM SUMS OF DIFFS TO PREVIOUS REF, UPDATE REF + for( size_t ievt = 0; ievt < nevt(); ++ievt ) // Loop over all events in this iteration + { + const fptype& me = MemoryAccessMatrixElements::ieventAccessConst( m_matrixElements.data(), ievt ); + const fptype& wg = MemoryAccessWeights::ieventAccessConst( m_samplingWeights.data(), ievt ); + if( fp_is_abnormal( me ) ) continue; + stats.sumMEdiff += ( me - stats.refME ); + stats.sumWGdiff += ( wg - stats.refWG ); + } + stats.refME = stats.meanME(); // final ref + stats.refWG = stats.meanWG(); // final ref + stats.sumMEdiff = 0; + stats.sumWGdiff = 0; + // THIRD PASS: COMPUTE STDDEV FROM SQUARED SUMS OF DIFFS TO REF + for( size_t ievt = 0; ievt < nevt(); ++ievt ) // Loop over all events in this iteration + { + const fptype& me = MemoryAccessMatrixElements::ieventAccessConst( m_matrixElements.data(), ievt ); + const fptype& wg = MemoryAccessWeights::ieventAccessConst( m_samplingWeights.data(), ievt ); + if( fp_is_abnormal( me ) ) continue; + stats.sqsMEdiff += std::pow( me - stats.refME, 2 ); + stats.sqsWGdiff += std::pow( wg - stats.refWG, 2 ); + } + // FOURTH PASS: UPDATE THE OVERALL STATS BY ADDING THE NEW STATS + m_stats += stats; + // Increment the iterations counter + m_iter++; + } + + //-------------------------------------------------------------------------- +} + +//============================================================================ + +#ifdef __CUDACC__ +namespace mg5amcGpu +{ + + /* + //-------------------------------------------------------------------------- + + CrossSectionKernelDevice::CrossSectionKernelDevice( const BufferWeights& samplingWeights, // input: sampling weights + const BufferMatrixElements& matrixElements, // input: matrix elements + EventStatistics& stats, // output: event statistics + const size_t gpublocks, + const size_t gputhreads ) + : CrossSectionKernelBase( samplingWeights, matrixElements, stats ) + , NumberOfEvents( gpublocks*gputhreads ) + , m_gpublocks( gpublocks ) + , m_gputhreads( gputhreads ) + { + if ( ! m_samplingWeights.isOnDevice() ) throw std::runtime_error( "CrossSectionKernelDevice: samplingWeights must be a device array" ); + if ( ! 
m_matrixElements.isOnDevice() ) throw std::runtime_error( "CrossSectionKernelDevice: matrixElements must be a device array" ); + if ( m_gpublocks == 0 ) throw std::runtime_error( "CrossSectionKernelDevice: gpublocks must be > 0" ); + if ( m_gputhreads == 0 ) throw std::runtime_error( "CrossSectionKernelDevice: gputhreads must be > 0" ); + if ( this->nevt() != m_samplingWeights.nevt() ) throw std::runtime_error( "CrossSectionKernelDevice: nevt mismatch with samplingWeights" ); + if ( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "CrossSectionKernelDevice: nevt mismatch with matrixElements" ); + } + + //-------------------------------------------------------------------------- + + void CrossSectionKernelDevice::setGrid( const size_t gpublocks, const size_t gputhreads ) + { + if ( m_gpublocks == 0 ) throw std::runtime_error( "CrossSectionKernelDevice: gpublocks must be > 0 in setGrid" ); + if ( m_gputhreads == 0 ) throw std::runtime_error( "CrossSectionKernelDevice: gputhreads must be > 0 in setGrid" ); + if ( this->nevt() != m_gpublocks * m_gputhreads ) throw std::runtime_error( "CrossSectionKernelDevice: nevt mismatch in setGrid" ); + } + + //-------------------------------------------------------------------------- + + void CrossSectionKernelDevice::updateEventStatistics( const bool debug ) + { + // Increment the iterations counter + m_iter++; + } + + //-------------------------------------------------------------------------- + */ + +} +#endif + +//============================================================================ diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/CrossSectionKernels.h new file mode 100644 index 0000000000..6098157b4e --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/CrossSectionKernels.h @@ -0,0 +1,133 @@ +#ifndef CROSSSECTIONKERNELS_H +#define CROSSSECTIONKERNELS_H 1 + +#include "mgOnGpuConfig.h" + +#include "EventStatistics.h" +#include "MemoryBuffers.h" + +//============================================================================ + +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + + // Helper function for Bridge.h: must be compiled without fast math + // Iterate through all output MEs and replace any NaN/abnormal ones by sqrt(-1) + void flagAbnormalMEs( fptype* hstMEs, unsigned int nevt ); + + //-------------------------------------------------------------------------- + + // A base class encapsulating the calculation of event statistics on a CPU host or on a GPU device + class CrossSectionKernelBase //: virtual public ICrossSectionKernel + { + protected: + + // Constructor from existing input and output buffers + CrossSectionKernelBase( const BufferWeights& samplingWeights, // input: sampling weights + const BufferMatrixElements& matrixElements, // input: matrix elements + EventStatistics& stats ) // output: event statistics + : m_samplingWeights( samplingWeights ) + , m_matrixElements( matrixElements ) + , m_stats( stats ) + , m_iter( 0 ) + { + // NB: do not initialise EventStatistics (you may be asked to update an existing result) + } + + public: + + // Destructor + virtual ~CrossSectionKernelBase() {} + + // Update event statistics + virtual void updateEventStatistics( const bool debug = false ) = 0; + + // Is this a host or device kernel? 
+ virtual bool isOnDevice() const = 0; + + protected: + + // The buffer for the sampling weights + const BufferWeights& m_samplingWeights; + + // The buffer for the output matrix elements + const BufferMatrixElements& m_matrixElements; + + // The event statistics + EventStatistics& m_stats; + + // The number of iterations processed so far + size_t m_iter; + }; + + //-------------------------------------------------------------------------- + + // A class encapsulating the calculation of event statistics on a CPU host + class CrossSectionKernelHost final : public CrossSectionKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + CrossSectionKernelHost( const BufferWeights& samplingWeights, // input: sampling weights + const BufferMatrixElements& matrixElements, // input: matrix elements + EventStatistics& stats, // output: event statistics + const size_t nevt ); + + // Destructor + virtual ~CrossSectionKernelHost() {} + + // Update event statistics + void updateEventStatistics( const bool debug = false ) override final; + + // Is this a host or device kernel? + bool isOnDevice() const override final { return false; } + }; + + //-------------------------------------------------------------------------- + + /* +#ifdef __CUDACC__ + // A class encapsulating the calculation of event statistics on a GPU device + class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + CrossSectionKernelDevice( const BufferWeights& samplingWeights, // input: sampling weights + const BufferMatrixElements& matrixElements, // input: matrix elements + EventStatistics& stats, // output: event statistics + const size_t gpublocks, + const size_t gputhreads ); + + // Destructor + virtual ~CrossSectionKernelDevice(){} + + // Reset gpublocks and gputhreads + void setGrid( const size_t gpublocks, const size_t gputhreads ); + + // Update event statistics + void updateEventStatistics( const bool debug=false ) override final; + + // Is this a host or device kernel? + bool isOnDevice() const override final { return true; } + + private: + + // The number of blocks in the GPU grid + size_t m_gpublocks; + + // The number of threads in the GPU grid + size_t m_gputhreads; + + }; +#endif + */ + + //-------------------------------------------------------------------------- +} +#endif // CROSSSECTIONKERNELS_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/CudaRuntime.h new file mode 100644 index 0000000000..e16ed2c703 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/CudaRuntime.h @@ -0,0 +1,80 @@ +#ifndef MG5AMC_CUDARUNTIME_H +#define MG5AMC_CUDARUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef __CUDACC__ /* clang-format off */ +#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } +inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) +{ + if( code != cudaSuccess ) + { + printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); + if( abort ) assert( code == cudaSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ +namespace mg5amcGpu +{ + // Instantiate a CudaRuntime at the beginning of the application's main to + // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct CudaRuntime final + { + CudaRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~CudaRuntime() { tearDown( m_debug ); } + CudaRuntime( const CudaRuntime& ) = delete; + CudaRuntime( CudaRuntime&& ) = delete; + CudaRuntime& operator=( const CudaRuntime& ) = delete; + CudaRuntime& operator=( CudaRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; + checkCuda( cudaSetDevice( 0 ) ); // SLOW!
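      // A sketch of the intended usage pattern (hypothetical driver code, not part of this header):
      // constructing a single CudaRuntime at the top of main() makes this slow cudaSetDevice(0) call
      // appear as a distinct initialization step in profiles, and books the cudaDeviceReset() for shutdown:
      //   int main( int argc, char** argv )
      //   {
      //     mg5amcGpu::CudaRuntime cudaRuntime( /*debug=*/false ); // ctor calls setUp(): cudaSetDevice(0)
      //     // ... allocate device buffers, compute matrix elements, copy results back ...
      //     return 0; // dtor calls tearDown(): cudaDeviceReset()
      //   }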
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; + checkCuda( cudaDeviceReset() ); + } + }; + +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/EventStatistics.h new file mode 100644 index 0000000000..19c5199bcc --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/EventStatistics.h @@ -0,0 +1,160 @@ +#ifndef EventStatistics_H +#define EventStatistics_H 1 + +#include "mgOnGpuConfig.h" // for npar (meGeVexponent) + +#include +#include +#include +#include +#include + +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + + // The EventStatistics struct is used to accumulate running aggregates of event statistics. + // This will eventually include the process cross section and the process maximum weight: + // one important case of EventStatistics will then be the "gridpack" result set, which is + // the output of the "integration" step and the input to "unweighted event generation" step. + // The current implementation only includes statistics for matrix elements (ME) and sampling weights (WG); + // in first approximation, the process cross section and maximum weight are just the mean ME and maximum ME, + // but eventually the sampling weights WG (e.g. from Rambo) must also be taken into account in the calculation. + // The implementation uses differences to reference values to improve numerical precision. + struct EventStatistics + { + public: + size_t nevtALL; // total number of events used + size_t nevtABN; // number of events used, where ME is abnormal (nevtABN <= nevtALL) + size_t nevtZERO; // number of not-abnormal events used, where ME is zero (nevtZERO <= nevtOK) + double minME; // minimum matrix element + double maxME; // maximum matrix element + double minWG; // minimum sampling weight + double maxWG; // maximum sampling weight + double refME; // "reference" matrix element (normally the current mean) + double refWG; // "reference" sampling weight (normally the current mean) + double sumMEdiff; // sum of diff to ref for matrix element + double sumWGdiff; // sum of diff to ref for sampling weight + double sqsMEdiff; // squared sum of diff to ref for matrix element + double sqsWGdiff; // squared sum of diff to ref for sampling weight + std::string tag; // a text tag for printouts + // Number of events used, where ME is not abnormal + size_t nevtOK() const { return nevtALL - nevtABN; } + // Mean matrix element + // [x = ref+d => mean(x) = sum(x)/n = ref+sum(d)/n] + double meanME() const + { + return refME + ( nevtOK() > 0 ? sumMEdiff / nevtOK() : 0 ); + } + // Mean sampling weight + // [x = ref+d => mean(x) = sum(x)/n = ref+sum(d)/n] + double meanWG() const + { + return refWG + ( nevtOK() > 0 ? 
sumWGdiff / nevtOK() : 0 ); + } + // Variance matrix element + // [x = ref+d => n*var(x) = sum((x-mean(x))^2) = sum((ref+d-ref-sum(d)/n)^2) = sum((d-sum(d)/n)^2)/n = sum(d^2)-(sum(d))^2/n] + double varME() const { return ( sqsMEdiff - std::pow( sumMEdiff, 2 ) / nevtOK() ) / nevtOK(); } + // Variance sampling weight + // [x = ref+d => n*var(x) = sum((x-mean(x))^2) = sum((ref+d-ref-sum(d)/n)^2) = sum((d-sum(d)/n)^2)/n = sum(d^2)-(sum(d))^2/n] + double varWG() const { return ( sqsWGdiff - std::pow( sumWGdiff, 2 ) / nevtOK() ) / nevtOK(); } + // Standard deviation matrix element + double stdME() const { return std::sqrt( varME() ); } + // Standard deviation sampling weight + double stdWG() const { return std::sqrt( varWG() ); } + // Update reference matrix element + void updateRefME( const double newRef ) + { + const double deltaRef = refME - newRef; + sqsMEdiff += deltaRef * ( 2 * sumMEdiff + nevtOK() * deltaRef ); + sumMEdiff += deltaRef * nevtOK(); + refME = newRef; + } + // Update reference sampling weight + void updateRefWG( const double newRef ) + { + const double deltaRef = refWG - newRef; + sqsWGdiff += deltaRef * ( 2 * sumWGdiff + nevtOK() * deltaRef ); + sumWGdiff += deltaRef * nevtOK(); + refWG = newRef; + } + // Constructor + EventStatistics() + : nevtALL( 0 ) + , nevtABN( 0 ) + , nevtZERO( 0 ) + , minME( std::numeric_limits::max() ) + , maxME( std::numeric_limits::lowest() ) + , minWG( std::numeric_limits::max() ) + , maxWG( std::numeric_limits::lowest() ) + , refME( 0 ) + , refWG( 0 ) + , sumMEdiff( 0 ) + , sumWGdiff( 0 ) + , sqsMEdiff( 0 ) + , sqsWGdiff( 0 ) + , tag( "" ) {} + // Combine two EventStatistics + EventStatistics& operator+=( const EventStatistics& stats ) + { + EventStatistics s1 = *this; // temporary copy + EventStatistics s2 = stats; // temporary copy + EventStatistics& sum = *this; + sum.nevtALL = s1.nevtALL + s2.nevtALL; + sum.nevtABN = s1.nevtABN + s2.nevtABN; + sum.nevtZERO = s1.nevtZERO + s2.nevtZERO; + sum.minME = std::min( s1.minME, s2.minME ); + sum.maxME = std::max( s1.maxME, s2.maxME ); + sum.minWG = std::min( s1.minWG, s2.minWG ); + sum.maxWG = std::max( s1.maxWG, s2.maxWG ); + sum.refME = ( s1.meanME() * s1.nevtOK() + s2.meanME() * s2.nevtOK() ) / sum.nevtOK(); // new mean ME + s1.updateRefME( sum.refME ); + s2.updateRefME( sum.refME ); + sum.sumMEdiff = s1.sumMEdiff + s2.sumMEdiff; + sum.sqsMEdiff = s1.sqsMEdiff + s2.sqsMEdiff; + sum.refWG = ( s1.meanWG() * s1.nevtOK() + s2.meanWG() * s2.nevtOK() ) / sum.nevtOK(); // new mean WG + s1.updateRefWG( sum.refWG ); + s2.updateRefWG( sum.refWG ); + sum.sumWGdiff = s1.sumWGdiff + s2.sumWGdiff; + sum.sqsWGdiff = s1.sqsWGdiff + s2.sqsWGdiff; + return sum; + } + // Printout + void printout( std::ostream& out ) const + { + const EventStatistics& s = *this; + constexpr int meGeVexponent = -( 2 * mgOnGpu::npar - 8 ); + out << s.tag << "NumMatrixElems(notAbnormal) = " << s.nevtOK() << std::endl + << std::scientific // fixed format: affects all floats (default precision: 6) + << s.tag << "MeanMatrixElemValue = ( " << s.meanME() + << " +- " << s.stdME() / std::sqrt( s.nevtOK() ) << " ) GeV^" << meGeVexponent << std::endl // standard error + << s.tag << "[Min,Max]MatrixElemValue = [ " << s.minME + << " , " << s.maxME << " ] GeV^" << meGeVexponent << std::endl + << s.tag << "StdDevMatrixElemValue = ( " << s.stdME() + << std::string( 16, ' ' ) << " ) GeV^" << meGeVexponent << std::endl + << s.tag << "MeanWeight = ( " << s.meanWG() + << " +- " << s.stdWG() / std::sqrt( s.nevtOK() ) << std::endl // standard error + << 
s.tag << "[Min,Max]Weight = [ " << s.minWG + << " , " << s.maxWG << " ]" << std::endl + << s.tag << "StdDevWeight = ( " << s.stdWG() + << std::string( 16, ' ' ) << " )" << std::endl + << std::defaultfloat; // default format: affects all floats + } + }; + + //-------------------------------------------------------------------------- + + inline std::ostream& operator<<( std::ostream& out, const EventStatistics& s ) + { + s.printout( out ); + return out; + } + + //-------------------------------------------------------------------------- +} + +#endif // EventStatistics_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MadgraphTest.h new file mode 100644 index 0000000000..2a0be47978 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MadgraphTest.h @@ -0,0 +1,300 @@ +// Stephan Hageboeck, CERN, 12/2020 +#ifndef MADGRAPHTEST_H_ +#define MADGRAPHTEST_H_ 1 + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace +{ + + struct ReferenceData + { + std::vector>> momenta; + std::vector MEs; + }; + + /// Read batches of reference data from a file and store them in a map. + std::map readReferenceData( const std::string& refFileName ) + { + std::ifstream referenceFile( refFileName.c_str() ); + EXPECT_TRUE( referenceFile.is_open() ) << refFileName; + std::map referenceData; + unsigned int evtNo; + unsigned int batchNo; + for( std::string line; std::getline( referenceFile, line ); ) + { + std::stringstream lineStr( line ); + if( line.empty() || line[0] == '#' ) + { + continue; + } + else if( line.find( "Event" ) != std::string::npos ) + { + std::string dummy; + lineStr >> dummy >> evtNo >> dummy >> batchNo; + } + else if( line.find( "ME" ) != std::string::npos ) + { + if( evtNo <= referenceData[batchNo].MEs.size() ) + referenceData[batchNo].MEs.resize( evtNo + 1 ); + + std::string dummy; + lineStr >> dummy >> referenceData[batchNo].MEs[evtNo]; + } + else + { + unsigned int particleIndex; + lineStr >> particleIndex; + if( evtNo <= referenceData[batchNo].momenta.size() ) + referenceData[batchNo].momenta.resize( evtNo + 1 ); + if( particleIndex <= referenceData[batchNo].momenta[evtNo].size() ) + referenceData[batchNo].momenta[evtNo].resize( particleIndex + 1 ); + auto& fourVec = referenceData[batchNo].momenta[evtNo][particleIndex]; + for( unsigned int i = 0; i < fourVec.size(); ++i ) + { + EXPECT_TRUE( lineStr.good() ); + lineStr >> fourVec[i]; + } + EXPECT_TRUE( lineStr.eof() ); + } + } + return referenceData; + } + +} + +/** + * Test driver providing a common interface for testing different implementations. + * Users need to implement: + * - Functions to retrieve matrix element and 4-momenta. These are used in the tests. + * - Driver functions that run the madgraph workflow. + * + * Usage: + * ``` + * class TestImplementation : public TestDriverBase { + * + * } + * + * class TestImplementation2 : public TestDriverBase { + * + * } + * + * INSTANTIATE_TEST_SUITE_P( TestName, + * MadgraphTest, + * testing::Values( new TestImplementation, new TestImplementation2, ... 
) ); + *``` + * + * For adapting the test workflow, see the .cc and adapt + * TEST_P(MadgraphTest, CompareMomentaAndME) + * + * To add a test that should be runnable with all test implementations that derive from TestDriverBase, add a new + * TEST_P(MadgraphTest, ) { + * + * } + */ +class TestDriverBase +{ + std::string m_refFileName; +public: + const unsigned int nparticle; + static constexpr unsigned int niter = 2; + static constexpr unsigned int gpublocks = 2; + static constexpr unsigned int gputhreads = 128; + static constexpr unsigned int nevt = gpublocks * gputhreads; + + TestDriverBase( unsigned int npart, const std::string& refFileName ) + : m_refFileName( refFileName ) + , nparticle( npart ) + { + } + TestDriverBase() = delete; + virtual ~TestDriverBase() {} + const std::string& getRefFileName() { return m_refFileName; } + + // ------------------------------------------------ + // Interface for retrieving info from madgraph + // ------------------------------------------------ + virtual fptype getMomentum( std::size_t evtNo, unsigned int particleNo, unsigned int component ) const = 0; + virtual fptype getMatrixElement( std::size_t evtNo ) const = 0; + + // ------------------------------------------------ + // Interface for steering madgraph run + // ------------------------------------------------ + virtual void prepareRandomNumbers( unsigned int iiter ) = 0; + virtual void prepareMomenta( fptype energy ) = 0; + virtual void runSigmaKin( std::size_t iiter ) = 0; + + /// Print the requested event into the stream. If the reference data has enough events, it will be printed as well. + void dumpParticles( std::ostream& stream, std::size_t ievt, unsigned int numParticles, unsigned int nDigit, const ReferenceData& referenceData ) const + { + const auto width = nDigit + 8; + for( unsigned int ipar = 0; ipar < numParticles; ipar++ ) + { + // NB: 'setw' affects only the next field (of any type) + stream << std::scientific // fixed format: affects all floats (default nDigit: 6) + << std::setprecision( nDigit ) + << std::setw( 4 ) << ipar + << std::setw( width ) << getMomentum( ievt, ipar, 0 ) + << std::setw( width ) << getMomentum( ievt, ipar, 1 ) + << std::setw( width ) << getMomentum( ievt, ipar, 2 ) + << std::setw( width ) << getMomentum( ievt, ipar, 3 ) + << "\n"; + if( ievt < referenceData.momenta.size() ) + { + stream << "ref" << ipar; + stream << std::setw( width ) << referenceData.momenta[ievt][ipar][0] + << std::setw( width ) << referenceData.momenta[ievt][ipar][1] + << std::setw( width ) << referenceData.momenta[ievt][ipar][2] + << std::setw( width ) << referenceData.momenta[ievt][ipar][3] + << "\n\n"; + } + stream << std::flush << std::defaultfloat; // default format: affects all floats + } + } +}; + +/** + * Test class that's defining all tests to run with a Madgraph workflow. + * The tests are defined below using TEST_P. + * Instantiate them using: + * ``` + * INSTANTIATE_TEST_SUITE_P( TestName, + * MadgraphTest, + * testing::Values( new TestImplementation, new TestImplementation2, ... ) ); + * ``` + */ +class MadgraphTest : public testing::TestWithParam +{ +protected: + std::unique_ptr testDriver; + + MadgraphTest() + : TestWithParam(), testDriver( GetParam() ) + { + } +}; + +// Since we link both the CPU-only and GPU tests into the same executable, we prevent +// a multiply defined symbol by only compiling this in the non-CUDA phase: +#ifndef __CUDACC__ + +/// Compare momenta and matrix elements. 
+/// This uses an implementation of TestDriverBase to run a madgraph workflow, +/// and compares momenta and matrix elements with a reference file. +TEST_P( MadgraphTest, CompareMomentaAndME ) +{ + // Set to true to dump events: + constexpr bool dumpEvents = false; + constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) + const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 3.E-2; +#ifdef __APPLE__ + const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 3.E-2; // see #583 +#else + const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 2.E-3; +#endif + std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt"; + while( dumpFileName.find( '/' ) != std::string::npos ) + { + dumpFileName.replace( dumpFileName.find( '/' ), 1, "_" ); + } + std::ofstream dumpFile; + if( dumpEvents ) + { + dumpFile.open( dumpFileName, std::ios::trunc ); + } + // Read reference data + const std::string refFileName = testDriver->getRefFileName(); + std::map referenceData; + if( !dumpEvents ) + { + referenceData = readReferenceData( refFileName ); + } + ASSERT_FALSE( HasFailure() ); // It doesn't make any sense to continue if we couldn't read the reference file. + // ************************************** + // *** START MAIN LOOP ON #ITERATIONS *** + // ************************************** + for( unsigned int iiter = 0; iiter < testDriver->niter; ++iiter ) + { + testDriver->prepareRandomNumbers( iiter ); + testDriver->prepareMomenta( energy ); + testDriver->runSigmaKin( iiter ); + // --- Run checks on all events produced in this iteration + for( std::size_t ievt = 0; ievt < testDriver->nevt && !HasFailure(); ++ievt ) + { + if( dumpEvents ) + { + ASSERT_TRUE( dumpFile.is_open() ) << dumpFileName; + dumpFile << "Event " << std::setw( 8 ) << ievt << " " + << "Batch " << std::setw( 4 ) << iiter << "\n"; + testDriver->dumpParticles( dumpFile, ievt, testDriver->nparticle, 15, ReferenceData() ); + // Dump matrix element + dumpFile << std::setw( 4 ) << "ME" << std::scientific << std::setw( 15 + 8 ) + << testDriver->getMatrixElement( ievt ) << "\n" + << std::endl + << std::defaultfloat; + continue; + } + // Check that we have the required reference data + ASSERT_GT( referenceData.size(), iiter ) + << "Don't have enough reference data for iteration " << iiter << ". Ref file:" << refFileName; + ASSERT_GT( referenceData[iiter].MEs.size(), ievt ) + << "Don't have enough reference MEs for iteration " << iiter << " event " << ievt << ".\nRef file: " << refFileName; + ASSERT_GT( referenceData[iiter].momenta.size(), ievt ) + << "Don't have enough reference momenta for iteration " << iiter << " event " << ievt << ".\nRef file: " << refFileName; + ASSERT_GE( referenceData[iiter].momenta[ievt].size(), testDriver->nparticle ) + << "Don't have enough reference particles for iteration " << iiter << " event " << ievt << ".\nRef file: " << refFileName; + // This trace will help to understand the event that is being checked. 
+ // It will only be printed in case of failures: + std::stringstream eventTrace; + eventTrace << "In comparing event " << ievt << " from iteration " << iiter << "\n"; + testDriver->dumpParticles( eventTrace, ievt, testDriver->nparticle, 15, referenceData[iiter] ); + eventTrace << std::setw( 4 ) << "ME" << std::scientific << std::setw( 15 + 8 ) + << testDriver->getMatrixElement( ievt ) << "\n" + << std::setw( 4 ) << "r.ME" << std::scientific << std::setw( 15 + 8 ) + << referenceData[iiter].MEs[ievt] << std::endl + << std::defaultfloat; + SCOPED_TRACE( eventTrace.str() ); + // Compare Momenta + for( unsigned int ipar = 0; ipar < testDriver->nparticle; ++ipar ) + { + std::stringstream momentumErrors; + for( unsigned int icomp = 0; icomp < mgOnGpu::np4; ++icomp ) + { + const fptype pMadg = testDriver->getMomentum( ievt, ipar, icomp ); + const fptype pOrig = referenceData[iiter].momenta[ievt][ipar][icomp]; + const fptype relDelta = fabs( ( pMadg - pOrig ) / pOrig ); + if( relDelta > toleranceMomenta ) + { + momentumErrors << std::setprecision( 15 ) << std::scientific << "\nparticle " << ipar << "\tcomponent " << icomp + << "\n\t madGraph: " << std::setw( 22 ) << pMadg + << "\n\t reference: " << std::setw( 22 ) << pOrig + << "\n\t rel delta: " << std::setw( 22 ) << relDelta << " exceeds tolerance of " << toleranceMomenta; + } + } + ASSERT_TRUE( momentumErrors.str().empty() ) << momentumErrors.str(); + } + // Compare ME: + EXPECT_NEAR( testDriver->getMatrixElement( ievt ), + referenceData[iiter].MEs[ievt], + toleranceMEs * referenceData[iiter].MEs[ievt] ); + } + } + if( dumpEvents ) + { + std::cout << "Event dump written to " << dumpFileName << std::endl; + } +} + +#endif // __CUDACC__ + +#endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc new file mode 100644 index 0000000000..da81c99218 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc @@ -0,0 +1,237 @@ +#include "MatrixElementKernels.h" + +#include "CPPProcess.h" +#include "CudaRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +//============================================================================ + +#ifndef __CUDACC__ +namespace mg5amcCpu +{ + + //-------------------------------------------------------------------------- + + MatrixElementKernelHost::MatrixElementKernelHost( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt ) + : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, matrixElements, selhel, selcol ) + , NumberOfEvents( nevt ) + , m_couplings( nevt ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + , m_numerators( nevt ) + , m_denominators( nevt ) +#endif + { + if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); + if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); + if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHost: nevt mismatch with 
momenta" ); + if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHost: nevt mismatch with matrixElements" ); + // Sanity checks for memory access (momenta buffer) + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); + if( nevt % neppM != 0 ) + { + std::ostringstream sstr; + sstr << "MatrixElementKernelHost: nevt should be a multiple of neppM=" << neppM; + throw std::runtime_error( sstr.str() ); + } + // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation + // Note: this prevents a crash on pmpe04 but not on some github CI nodes? + // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] + if( !MatrixElementKernelHost::hostSupportsSIMD() ) + throw std::runtime_error( "Host does not support the SIMD implementation of MatrixElementKernelsHost" ); + } + + //-------------------------------------------------------------------------- + + int MatrixElementKernelHost::computeGoodHelicities() + { + using mgOnGpu::ncomb; // the number of helicity combinations + HostBufferHelicityMask hstIsGoodHel( ncomb ); + // ... 0d1. Compute good helicity mask on the host + computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ); +#else + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); +#endif + // ... 0d2. Copy back good helicity list to static memory on the host + // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] + return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + } + + //-------------------------------------------------------------------------- + + void MatrixElementKernelHost::computeMatrixElements( const unsigned int channelId ) + { + computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); +#else + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); +#endif + } + + //-------------------------------------------------------------------------- + + // Does this host system support the SIMD used in the matrix element calculation? 
+ bool MatrixElementKernelHost::hostSupportsSIMD( const bool verbose ) + { +#if defined __AVX512VL__ + bool known = true; + bool ok = __builtin_cpu_supports( "avx512vl" ); + const std::string tag = "skylake-avx512 (AVX512VL)"; +#elif defined __AVX2__ + bool known = true; + bool ok = __builtin_cpu_supports( "avx2" ); + const std::string tag = "haswell (AVX2)"; +#elif defined __SSE4_2__ +#ifdef __PPC__ + // See https://gcc.gnu.org/onlinedocs/gcc/Basic-PowerPC-Built-in-Functions-Available-on-all-Configurations.html + bool known = true; + bool ok = __builtin_cpu_supports( "vsx" ); + const std::string tag = "powerpc vsx (128bit as in SSE4.2)"; +#elif defined __ARM_NEON__ // consider using __BUILTIN_CPU_SUPPORTS__ + bool known = false; // __builtin_cpu_supports is not supported + // See https://gcc.gnu.org/onlinedocs/gcc/Basic-PowerPC-Built-in-Functions-Available-on-all-Configurations.html + // See https://stackoverflow.com/q/62783908 + // See https://community.arm.com/arm-community-blogs/b/operating-systems-blog/posts/runtime-detection-of-cpu-features-on-an-armv8-a-cpu + bool ok = true; // this is just an assumption! + const std::string tag = "arm neon (128bit as in SSE4.2)"; +#else + bool known = true; + bool ok = __builtin_cpu_supports( "sse4.2" ); + const std::string tag = "nehalem (SSE4.2)"; +#endif +#else + bool known = true; + bool ok = true; + const std::string tag = "none"; +#endif + if( verbose ) + { + if( tag == "none" ) + std::cout << "INFO: The application does not require the host to support any AVX feature" << std::endl; + else if( ok && known ) + std::cout << "INFO: The application is built for " << tag << " and the host supports it" << std::endl; + else if( ok ) + std::cout << "WARNING: The application is built for " << tag << " but it is unknown if the host supports it" << std::endl; + else + std::cout << "ERROR! 
The application is built for " << tag << " but the host does not support it" << std::endl; + } + return ok; + } + + //-------------------------------------------------------------------------- + +} +#endif + +//============================================================================ + +#ifdef __CUDACC__ +namespace mg5amcGpu +{ + + //-------------------------------------------------------------------------- + + MatrixElementKernelDevice::MatrixElementKernelDevice( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t gpublocks, + const size_t gputhreads ) + : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, matrixElements, selhel, selcol ) + , NumberOfEvents( gpublocks * gputhreads ) + , m_couplings( this->nevt() ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + , m_numerators( this->nevt() ) + , m_denominators( this->nevt() ) +#endif + , m_gpublocks( gpublocks ) + , m_gputhreads( gputhreads ) + { + if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); + if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); + if( m_gpublocks == 0 ) throw std::runtime_error( "MatrixElementKernelDevice: gpublocks must be > 0" ); + if( m_gputhreads == 0 ) throw std::runtime_error( "MatrixElementKernelDevice: gputhreads must be > 0" ); + if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelDevice: nevt mismatch with momenta" ); + if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelDevice: nevt mismatch with matrixElements" ); + // Sanity checks for memory access (momenta buffer) + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); + if( m_gputhreads % neppM != 0 ) + { + std::ostringstream sstr; + sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; + throw std::runtime_error( sstr.str() ); + } + } + + //-------------------------------------------------------------------------- + + void MatrixElementKernelDevice::setGrid( const int gpublocks, const int gputhreads ) + { + if( m_gpublocks == 0 ) throw std::runtime_error( "MatrixElementKernelDevice: gpublocks must be > 0 in setGrid" ); + if( m_gputhreads == 0 ) throw std::runtime_error( "MatrixElementKernelDevice: gputhreads must be > 0 in setGrid" ); + if( this->nevt() != m_gpublocks * m_gputhreads ) throw std::runtime_error( "MatrixElementKernelDevice: nevt mismatch in setGrid" ); + } + + //-------------------------------------------------------------------------- + + int MatrixElementKernelDevice::computeGoodHelicities() + { + using mgOnGpu::ncomb; // the number of helicity combinations + PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); + DeviceBufferHelicityMask devIsGoodHel( ncomb ); + // ... 0d1. 
Compute good helicity mask on the device + computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); +#else + sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); +#endif + checkCuda( cudaPeekAtLastError() ); + // ... 0d2. Copy back good helicity mask to the host + copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); + // ... 0d3. Copy back good helicity list to constant memory on the device + return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + } + + //-------------------------------------------------------------------------- + + void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) + { + computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); +#ifndef MGONGPU_NSIGHT_DEBUG + constexpr unsigned int sharedMemSize = 0; +#else + constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); +#endif +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); +#else + sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); +#endif + checkCuda( cudaPeekAtLastError() ); + checkCuda( cudaDeviceSynchronize() ); + } + + //-------------------------------------------------------------------------- + +} +#endif + +//============================================================================ diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h new file mode 100644 index 0000000000..ec0fc9b18c --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h @@ -0,0 +1,183 @@ +#ifndef MATRIXELEMENTKERNELS_H +#define MATRIXELEMENTKERNELS_H 1 + +#include "mgOnGpuConfig.h" + +#include "MemoryBuffers.h" + +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + + // A base class encapsulating matrix element calculations on a CPU host or on a GPU device + class MatrixElementKernelBase //: virtual public IMatrixElementKernel + { + protected: + + // Constructor from existing input and output buffers + MatrixElementKernelBase( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol ) // output: color selection + : m_momenta( momenta ) + , m_gs( gs ) + , m_rndhel( rndhel ) + , m_rndcol( rndcol ) + , m_matrixElements( matrixElements ) + , m_selhel( selhel ) + , m_selcol( selcol ) + { + } + + public: + + // Destructor + virtual ~MatrixElementKernelBase() {} + + // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) + virtual int computeGoodHelicities() = 0; + + // Compute matrix elements + virtual void computeMatrixElements(
const unsigned int channelId ) = 0; + + // Is this a host or device kernel? + virtual bool isOnDevice() const = 0; + + protected: + + // The buffer for the input momenta + const BufferMomenta& m_momenta; + + // The buffer for the gs to calculate the alphaS values + const BufferGs& m_gs; + + // The buffer for the random numbers for helicity selection + const BufferRndNumHelicity& m_rndhel; + + // The buffer for the random numbers for color selection + const BufferRndNumColor& m_rndcol; + + // The buffer for the output matrix elements + BufferMatrixElements& m_matrixElements; + + // The buffer for the output helicity selection + BufferSelectedHelicity& m_selhel; + + // The buffer for the output color selection + BufferSelectedColor& m_selcol; + }; + + //-------------------------------------------------------------------------- + +#ifndef __CUDACC__ + // A class encapsulating matrix element calculations on a CPU host + class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + MatrixElementKernelHost( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt ); + + // Destructor + virtual ~MatrixElementKernelHost() {} + + // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) + int computeGoodHelicities() override final; + + // Compute matrix elements + void computeMatrixElements( const unsigned int channelId ) override final; + + // Is this a host or device kernel? + bool isOnDevice() const override final { return false; } + + // Does this host system support the SIMD used in the matrix element calculation? + // [NB: SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] 
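(As an aside on the hostSupportsSIMD declaration that follows: such a runtime probe can be built on the gcc/clang CPU-feature builtins. The sketch below is illustrative only; the feature strings checked and the return policy are assumptions, not necessarily those of the actual implementation in MatrixElementKernels.cc.)

  #include <iostream>
  // Minimal sketch of a host SIMD capability check using gcc/clang builtins (illustrative, not the mg5amc implementation)
  bool exampleHostSupportsSIMD( const bool verbose = true )
  {
    __builtin_cpu_init(); // initialise CPU feature detection
    const bool avx512f = __builtin_cpu_supports( "avx512f" );
    const bool avx2 = __builtin_cpu_supports( "avx2" );
    const bool sse42 = __builtin_cpu_supports( "sse4.2" );
    if( verbose ) std::cout << "avx512f=" << avx512f << " avx2=" << avx2 << " sse4.2=" << sse42 << std::endl;
    return avx2; // e.g. require AVX2 when the build was vectorised with 256-bit SIMD
  }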
+ static bool hostSupportsSIMD( const bool verbose = true ); + + private: + + // The buffer for the event-by-event couplings that depends on alphas QCD + HostBufferCouplings m_couplings; + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // The buffer for the event-by-event numerators of multichannel factors + HostBufferNumerators m_numerators; + + // The buffer for the event-by-event denominators of multichannel factors + HostBufferDenominators m_denominators; +#endif + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + // A class encapsulating matrix element calculations on a GPU device + class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + MatrixElementKernelDevice( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t gpublocks, + const size_t gputhreads ); + + // Destructor + virtual ~MatrixElementKernelDevice() {} + + // Reset gpublocks and gputhreads + void setGrid( const int gpublocks, const int gputhreads ); + + // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) + int computeGoodHelicities() override final; + + // Compute matrix elements + void computeMatrixElements( const unsigned int channelId ) override final; + + // Is this a host or device kernel? 
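(For reference, the grid-size constraints enforced in the MatrixElementKernelDevice constructor above - gpublocks and gputhreads strictly positive, nevt equal to gpublocks * gputhreads, and gputhreads a multiple of neppM - can be validated on the host side before constructing the kernel. The sketch below is a self-contained illustration with hypothetical numbers and an assumed neppM value; it is not part of the class.)

  #include <cassert>
  // Illustrative sketch: choose a (gpublocks, gputhreads) pair satisfying the constructor checks
  int main()
  {
    constexpr int neppM = 4;        // assumed AOSOA page size (e.g. double precision with a 32-byte GPU cache line)
    constexpr int nevt = 16384;     // desired number of events per GPU iteration
    constexpr int gputhreads = 256; // must be > 0 and a multiple of neppM
    constexpr int gpublocks = nevt / gputhreads; // must be > 0, with nevt == gpublocks * gputhreads
    static_assert( gputhreads % neppM == 0, "gputhreads should be a multiple of neppM" );
    static_assert( gpublocks * gputhreads == nevt, "nevt mismatch" );
    assert( gpublocks > 0 && gputhreads > 0 );
    return 0;
  }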
+ bool isOnDevice() const override final { return true; } + + private: + + // The buffer for the event-by-event couplings that depends on alphas QCD + DeviceBufferCouplings m_couplings; + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // The buffer for the event-by-event numerators of multichannel factors + DeviceBufferNumerators m_numerators; + + // The buffer for the event-by-event denominators of multichannel factors + DeviceBufferDenominators m_denominators; +#endif + + // The number of blocks in the GPU grid + size_t m_gpublocks; + + // The number of threads in the GPU grid + size_t m_gputhreads; + }; +#endif + + //-------------------------------------------------------------------------- +} +#endif // MATRIXELEMENTKERNELS_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessAmplitudes.h new file mode 100644 index 0000000000..f3ab497b7a --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -0,0 +1,150 @@ +#ifndef MemoryAccessAmplitudes_H +#define MemoryAccessAmplitudes_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuCxtypes.h" + +#include "MemoryAccessHelpers.h" + +#define MGONGPU_TRIVIAL_AMPLITUDES 1 + +//---------------------------------------------------------------------------- + +#ifndef MGONGPU_TRIVIAL_AMPLITUDES + +// A class describing the internal layout of memory buffers for amplitudes +// This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA +// [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] +class MemoryAccessAmplitudesBase //_AOSOAv1 +{ +public: + + // Number of Events Per Page in the amplitude AOSOA memory buffer layout + static constexpr int neppA = 1; // AOS (just a test...) + +private: + + friend class MemoryAccessHelper; + friend class KernelAccessHelper; + friend class KernelAccessHelper; + + // The number of floating point components of a complex number + static constexpr int nx2 = mgOnGpu::nx2; + + //-------------------------------------------------------------------------- + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static __host__ __device__ inline fptype* + ieventAccessRecord( fptype* buffer, + const int ievt ) + { + const int ipagA = ievt / neppA; // #event "A-page" + const int ieppA = ievt % neppA; // #event in the current event A-page + constexpr int ix2 = 0; + return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] + // [NB: expand variadic template "Ts... 
args" to "const int ix2" and rename "Field" as "Ix2"] + static __host__ __device__ inline fptype& + decodeRecord( fptype* buffer, + const int ix2 ) + { + constexpr int ipagA = 0; + constexpr int ieppA = 0; + return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] + } +}; + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on explicit event numbers +// Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations +class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase +{ +public: + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] + static constexpr auto decodeRecordIx2 = MemoryAccessHelper::decodeRecord; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] + static constexpr auto decodeRecordIx2Const = + MemoryAccessHelper::template decodeRecordConst; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, const int ix2 ) <===] + static constexpr auto ieventAccessIx2 = + MemoryAccessHelper::template ieventAccessField; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] + static constexpr auto ieventAccessIx2Const = + MemoryAccessHelper::template ieventAccessFieldConst; +}; + +#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on implicit kernel rules +// Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations +template +class KernelAccessAmplitudes +{ +public: + +#ifndef MGONGPU_TRIVIAL_AMPLITUDES + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] + static constexpr auto kernelAccessIx2 = + KernelAccessHelper::template kernelAccessField; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int 
ix2 ) <===] + static constexpr auto kernelAccessIx2Const = + KernelAccessHelper::template kernelAccessFieldConst; + +#else + + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) + { + return reinterpret_cast( buffer ); + } + + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) + { + return reinterpret_cast( buffer ); + } + +#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES +}; + +//---------------------------------------------------------------------------- + +typedef KernelAccessAmplitudes HostAccessAmplitudes; +typedef KernelAccessAmplitudes DeviceAccessAmplitudes; + +//---------------------------------------------------------------------------- + +#endif // MemoryAccessAmplitudes_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessCouplings.h new file mode 100644 index 0000000000..11e48b2165 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessCouplings.h @@ -0,0 +1,256 @@ +#ifndef MemoryAccessCouplings_H +#define MemoryAccessCouplings_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuCxtypes.h" + +#include "MemoryAccessHelpers.h" +#include "MemoryAccessMomenta.h" // for MemoryAccessMomentaBase::neppM +#include "MemoryBuffers.h" // for HostBufferCouplings::isaligned + +//---------------------------------------------------------------------------- + +// A class describing the internal layout of memory buffers for couplings +// This implementation uses an AOSOA[npagC][ndcoup][nx2][neppC] "super-buffer" where nevt=npagC*neppC +// From the "super-buffer" for ndcoup different couplings, use idcoupAccessBuffer to access the buffer for one specific coupling +// [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] +class MemoryAccessCouplingsBase //_AOSOAv1 +{ +public: + + // Number of Events Per Page in the coupling AOSOA memory buffer layout + static constexpr int neppC = MemoryAccessMomentaBase::neppM; // use the same AOSOA striding as for momenta + + // SANITY CHECK: check that neppC is a power of two + static_assert( ispoweroftwo( neppC ), "neppC is not a power of 2" ); + + //-------------------------------------------------------------------------- + // ** NB! A single super-buffer AOSOA[npagC][ndcoup][nx2][neppC] includes data for ndcoup different couplings ** + // ** NB! The ieventAccessRecord and kernelAccess functions refer to the buffer for one individual coupling ** + // ** NB! Use idcoupAccessBuffer to add a fixed offset and locate the buffer for one given individual coupling ** + //-------------------------------------------------------------------------- + + // Locate the buffer for a single coupling (output) in a memory super-buffer (input) from the given coupling index (input) + // [Signature (non-const) ===> fptype* idcoupAccessBuffer( fptype* buffer, const int idcoup ) <===] + // NB: keep this in public even if exposed through KernelAccessCouplings: nvcc says it is inaccesible otherwise? + static __host__ __device__ inline fptype* + idcoupAccessBuffer( fptype* buffer, // input "super-buffer" + const int idcoup ) + { + constexpr int ipagC = 0; + constexpr int ieppC = 0; + constexpr int ix2 = 0; + // NB! 
this effectively adds an offset "idcoup * nx2 * neppC" + return &( buffer[ipagC * ndcoup * nx2 * neppC + idcoup * nx2 * neppC + ix2 * neppC + ieppC] ); // AOSOA[ipagC][idcoup][ix2][ieppC] + } + + // Locate the buffer for a single coupling (output) in a memory super-buffer (input) from the given coupling index (input) + // [Signature (const) ===> const fptype* idcoupAccessBufferConst( const fptype* buffer, const int idcoup ) <===] + // NB: keep this in public even if exposed through KernelAccessCouplings: nvcc says it is inaccesible otherwise? + static __host__ __device__ inline const fptype* + idcoupAccessBufferConst( const fptype* buffer, // input "super-buffer" + const int idcoup ) + { + return idcoupAccessBuffer( const_cast( buffer ), idcoup ); + } + +private: + + friend class MemoryAccessHelper; + friend class KernelAccessHelper; + friend class KernelAccessHelper; + + // The number of couplings that dependent on the running alphas QCD in this specific process + static constexpr size_t ndcoup = Parameters_SMEFTsim_topU3l_MwScheme_UFO_dependentCouplings::ndcoup; + + // The number of floating point components of a complex number + static constexpr int nx2 = mgOnGpu::nx2; + + //-------------------------------------------------------------------------- + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static __host__ __device__ inline fptype* + ieventAccessRecord( fptype* buffer, + const int ievt ) + { + const int ipagC = ievt / neppC; // #event "C-page" + const int ieppC = ievt % neppC; // #event in the current event C-page + constexpr int idcoup = 0; + constexpr int ix2 = 0; + return &( buffer[ipagC * ndcoup * nx2 * neppC + idcoup * nx2 * neppC + ix2 * neppC + ieppC] ); // AOSOA[ipagC][idcoup][ix2][ieppC] + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] + // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] + static __host__ __device__ inline fptype& + decodeRecord( fptype* buffer, + const int ix2 ) + { + constexpr int ipagC = 0; + constexpr int ieppC = 0; + // NB! 
the offset "idcoup * nx2 * neppC" has been added in idcoupAccessBuffer + constexpr int idcoup = 0; + return buffer[ipagC * ndcoup * nx2 * neppC + idcoup * nx2 * neppC + ix2 * neppC + ieppC]; // AOSOA[ipagC][idcoup][ix2][ieppC] + } +}; + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on explicit event numbers +// Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations +class MemoryAccessCouplings : public MemoryAccessCouplingsBase +{ +public: + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] + static constexpr auto decodeRecordIx2 = MemoryAccessHelper::decodeRecord; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] + static constexpr auto decodeRecordIx2Const = + MemoryAccessHelper::template decodeRecordConst; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, const int ix2 ) <===] + static constexpr auto ieventAccessIx2 = + MemoryAccessHelper::template ieventAccessField; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] + static constexpr auto ieventAccessIx2Const = + MemoryAccessHelper::template ieventAccessFieldConst; +}; + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on implicit kernel rules +// Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations +template +class KernelAccessCouplings +{ +public: + + // Expose selected functions from MemoryAccessCouplingsBase + static constexpr auto idcoupAccessBuffer = MemoryAccessCouplingsBase::idcoupAccessBuffer; + static constexpr auto idcoupAccessBufferConst = MemoryAccessCouplingsBase::idcoupAccessBufferConst; + + // Expose selected functions from MemoryAccessCouplings + static constexpr auto ieventAccessRecordConst = MemoryAccessCouplings::ieventAccessRecordConst; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const, SCALAR) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] + static constexpr auto kernelAccessIx2_s = + 
KernelAccessHelper::template kernelAccessField; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] + static constexpr auto kernelAccessIx2Const_s = + KernelAccessHelper::template kernelAccessFieldConst; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non const, SCALAR OR VECTOR) ===> fptype_sv& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] + static __host__ __device__ inline fptype_sv& + kernelAccessIx2( fptype* buffer, + const int ix2 ) + { + fptype& out = kernelAccessIx2_s( buffer, ix2 ); +#ifndef MGONGPU_CPPSIMD + return out; +#else + // NB: derived from MemoryAccessMomenta, restricting the implementation to contiguous aligned arrays + constexpr int neppC = MemoryAccessCouplingsBase::neppC; + static_assert( neppC >= neppV ); // ASSUME CONTIGUOUS ARRAYS + static_assert( neppC % neppV == 0 ); // ASSUME CONTIGUOUS ARRAYS + static_assert( mg5amcCpu::HostBufferCouplings::isaligned() ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) + //assert( (size_t)( buffer ) % mgOnGpu::cppAlign == 0 ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) + return mg5amcCpu::fptypevFromAlignedArray( out ); // SIMD bulk load of neppV, use reinterpret_cast +#endif + } + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR OR VECTOR) ===> const fptype_sv& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] + static __host__ __device__ inline const fptype_sv& + kernelAccessIx2Const( const fptype* buffer, + const int ix2 ) + { + return kernelAccessIx2( const_cast( buffer ), ix2 ); + } + + /* + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR OR VECTOR) ===> const fptype_sv& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] + static __host__ __device__ inline const fptype_sv& + kernelAccessIx2Const( const fptype* buffer, + const int ix2 ) + { + const fptype& out = kernelAccessIx2Const_s( buffer, ix2 ); +#ifndef MGONGPU_CPPSIMD + return out; +#else + // NB: derived from MemoryAccessMomenta, restricting the implementation to contiguous aligned arrays + constexpr int neppC = MemoryAccessCouplingsBase::neppC; + static_assert( neppC >= neppV ); // ASSUME CONTIGUOUS ARRAYS + static_assert( neppC % neppV == 0 ); // ASSUME CONTIGUOUS ARRAYS + static_assert( mg5amcCpu::HostBufferCouplings::isaligned() ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) + //assert( (size_t)( buffer ) % mgOnGpu::cppAlign == 0 ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) 
+ return mg5amcCpu::fptypevFromAlignedArray( out ); // SIMD bulk load of neppV, use reinterpret_cast +#endif + } + */ + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non const, SCALAR OR VECTOR) ===> cxtype_sv_ref kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline cxtype_sv_ref + kernelAccess( fptype* buffer ) + { + /* + fptype_sv& real = kernelAccessIx2( buffer, 0 ); + fptype_sv& imag = kernelAccessIx2( buffer, 1 ); + printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + return cxtype_sv_ref( real, imag ); + */ + return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), + kernelAccessIx2( buffer, 1 ) ); + } + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR OR VECTOR) ===> cxtype_sv kernelAccessConst( const fptype* buffer ) <===] + static __host__ __device__ inline cxtype_sv + kernelAccessConst( const fptype* buffer ) + { + /* + const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); + const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); + printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + return cxtype_sv( real, imag ); + */ + return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), + kernelAccessIx2Const( buffer, 1 ) ); + } +}; + +//---------------------------------------------------------------------------- + +typedef KernelAccessCouplings HostAccessCouplings; +typedef KernelAccessCouplings DeviceAccessCouplings; + +//---------------------------------------------------------------------------- + +#endif // MemoryAccessCouplings_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessCouplingsFixed.h new file mode 100644 index 0000000000..0f9850baf2 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -0,0 +1,70 @@ +#ifndef MemoryAccessCouplingsFixed_H +#define MemoryAccessCouplingsFixed_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuCxtypes.h" +#include "mgOnGpuVectors.h" + +//#include "MemoryAccessHelpers.h" + +//---------------------------------------------------------------------------- + +// A class describing the internal layout of memory buffers for fixed couplings +// This implementation uses a STRUCT[ndcoup][nx2] "super-buffer" layout: in practice, the cIPC global array +// From the "super-buffer" for ndcoup different couplings, use idcoupAccessBuffer to access the buffer for one specific coupling +// [If many implementations are used, a suffix _Sv1 should be appended to the class name] +class MemoryAccessCouplingsFixedBase //_Sv1 +{ +public: + + // Locate the buffer for a single coupling (output) in a memory super-buffer (input) from the given coupling index (input) + // [Signature (const) ===> const fptype* iicoupAccessBufferConst( const fptype* buffer, const int iicoup ) <===] + static __host__ __device__ inline const fptype* + iicoupAccessBufferConst( const fptype* buffer, // input "super-buffer": in practice, the cIPC global array + const int iicoup ) + { + constexpr int ix2 = 0; + // NB! 
this effectively adds an offset "iicoup * nx2" + return &( buffer[iicoup * nx2 + ix2] ); // STRUCT[idcoup][ix2] + } + +private: + + // The number of floating point components of a complex number + static constexpr int nx2 = mgOnGpu::nx2; +}; + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on implicit kernel rules +// Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations +template +class KernelAccessCouplingsFixed +{ +public: + + // Expose selected functions from MemoryAccessCouplingsFixedBase + static constexpr auto iicoupAccessBufferConst = MemoryAccessCouplingsFixedBase::iicoupAccessBufferConst; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR OR VECTOR) ===> cxtype_sv kernelAccessConst( const fptype* buffer ) <===] + static __host__ __device__ inline const cxtype_sv + kernelAccessConst( const fptype* buffer ) + { + // TRIVIAL ACCESS to fixed-couplings buffers! + //return cxmake( fptype_sv{ buffer[0] }, fptype_sv{ buffer[1] } ); // NO! BUG #339! + const fptype_sv r_sv = fptype_sv{ 0 } + buffer[0]; + const fptype_sv i_sv = fptype_sv{ 0 } + buffer[1]; + return cxmake( r_sv, i_sv ); // ugly but effective + } +}; + +//---------------------------------------------------------------------------- + +typedef KernelAccessCouplingsFixed HostAccessCouplingsFixed; +typedef KernelAccessCouplingsFixed DeviceAccessCouplingsFixed; + +//---------------------------------------------------------------------------- + +#endif // MemoryAccessCouplingsFixed_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessDenominators.h new file mode 100644 index 0000000000..7a4a80ebd9 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessDenominators.h @@ -0,0 +1,18 @@ +#ifndef MemoryAccessDenominators_H +#define MemoryAccessDenominators_H 1 +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + +#include "MemoryAccessGs.h" + +//---------------------------------------------------------------------------- + +// A class describing the internal layout of memory buffers for denominators +// This implementation reuses the plain ARRAY[nevt] implementation of MemoryAccessGs + +typedef KernelAccessGs HostAccessDenominators; +typedef KernelAccessGs DeviceAccessDenominators; + +//---------------------------------------------------------------------------- + +#endif +#endif // MemoryAccessDenominators_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessGs.h new file mode 100644 index 0000000000..f233d64b9c --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessGs.h @@ -0,0 +1,148 @@ +#ifndef MemoryAccessGs_H +#define MemoryAccessGs_H 1 + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessHelpers.h" +#include "MemoryAccessVectors.h" +#include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned + +//---------------------------------------------------------------------------- + +// A class describing the internal layout of memory buffers for Gs +// This implementation uses a plain ARRAY[nevt] +// [If many implementations are used, a suffix _ARRAYv1 should be appended to the class name] +class MemoryAccessGsBase //_ARRAYv1 +{ 
+private: + + friend class MemoryAccessHelper; + friend class KernelAccessHelper; + friend class KernelAccessHelper; + + //-------------------------------------------------------------------------- + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static __host__ __device__ inline fptype* + ieventAccessRecord( fptype* buffer, + const int ievt ) + { + return &( buffer[ievt] ); // ARRAY[nevt] + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] + // [NB: expand variadic template "Ts... args" to empty and rename "Field" as empty] + static __host__ __device__ inline fptype& + decodeRecord( fptype* buffer ) + { + constexpr int ievt = 0; + return buffer[ievt]; // ARRAY[nevt] + } +}; + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on explicit event numbers +// Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations +class MemoryAccessGs : public MemoryAccessGsBase +{ +public: + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer ) <===] + static constexpr auto decodeRecord = MemoryAccessHelper::decodeRecord; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer ) <===] + static constexpr auto decodeRecordConst = + MemoryAccessHelper::template decodeRecordConst<>; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (non-const) ===> fptype& ieventAccess( fptype* buffer, const ievt ) <===] + static constexpr auto ieventAccess = + MemoryAccessHelper::template ieventAccessField<>; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessConst( const fptype* buffer, const ievt ) <===] + static constexpr auto ieventAccessConst = + MemoryAccessHelper::template ieventAccessFieldConst<>; +}; + 
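(Before moving on to the kernel-based accessors, the two-step decomposition used throughout these classes - first locate the event record, then decode a field inside it - can be illustrated on the trivial ARRAY[nevt] layout. The sketch below is plain standalone C++ with hypothetical values and does not use the actual mgOnGpu types.)

  #include <cassert>
  #include <vector>
  typedef double fptype; // assume double precision for this sketch
  // ieventAccessRecord for ARRAY[nevt]: the record of event ievt starts at &buffer[ievt]
  fptype* ieventAccessRecord( fptype* buffer, const int ievt ) { return &buffer[ievt]; }
  // decodeRecord for ARRAY[nevt]: the record holds a single scalar, so no field index is needed
  fptype& decodeRecord( fptype* record ) { return record[0]; }
  int main()
  {
    const int nevt = 8;
    std::vector<fptype> gs( nevt, 0. );
    // accessField = decodeRecord( accessRecord ): write the Gs value of event 5
    decodeRecord( ieventAccessRecord( gs.data(), 5 ) ) = 1.218;
    assert( gs[5] == 1.218 );
    return 0;
  }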
+//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on implicit kernel rules +// Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations +template +class KernelAccessGs +{ +public: + + // Expose selected functions from MemoryAccessGs + static constexpr auto ieventAccessRecord = MemoryAccessGs::ieventAccessRecord; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const, SCALAR) ===> fptype& kernelAccess( fptype* buffer ) <===] + static constexpr auto kernelAccess_s = + KernelAccessHelper::template kernelAccessField<>; // requires cuda 11.4 + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (non-const, SCALAR OR VECTOR) ===> fptype_sv& kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv& + kernelAccess( fptype* buffer ) + { + fptype& out = kernelAccess_s( buffer ); +#ifndef MGONGPU_CPPSIMD + return out; +#else + // NB: derived from MemoryAccessMomenta, restricting the implementation to contiguous aligned arrays (#435) + static_assert( mg5amcCpu::HostBufferGs::isaligned() ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) + //assert( (size_t)( buffer ) % mgOnGpu::cppAlign == 0 ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) + return mg5amcCpu::fptypevFromAlignedArray( out ); // SIMD bulk load of neppV, use reinterpret_cast +#endif + } + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] + static constexpr auto kernelAccessConst_s = + KernelAccessHelper::template kernelAccessFieldConst<>; // requires cuda 11.4 + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (const, SCALAR OR VECTOR) ===> const fptype_sv& kernelAccess( const fptype* buffer ) <===] + static __host__ __device__ inline const fptype_sv& + kernelAccessConst( const fptype* buffer ) + { + const fptype& out = kernelAccessConst_s( buffer ); +#ifndef MGONGPU_CPPSIMD + return out; +#else + // NB: derived from MemoryAccessMomenta, restricting the implementation to contiguous aligned arrays (#435) + static_assert( mg5amcCpu::HostBufferGs::isaligned() ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) + //assert( (size_t)( buffer ) % mgOnGpu::cppAlign == 0 ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) 
+ return mg5amcCpu::fptypevFromAlignedArray( out ); // SIMD bulk load of neppV, use reinterpret_cast +#endif + } +}; + +//---------------------------------------------------------------------------- + +typedef KernelAccessGs HostAccessGs; +typedef KernelAccessGs DeviceAccessGs; + +//---------------------------------------------------------------------------- + +#endif // MemoryAccessGs_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessHelpers.h new file mode 100644 index 0000000000..aa3016c9a1 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessHelpers.h @@ -0,0 +1,152 @@ +#ifndef MemoryAccessHelpers_H +#define MemoryAccessHelpers_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuFptypes.h" + +//---------------------------------------------------------------------------- + +// A templated helper class that includes the boilerplate code for MemoryAccess classes +template +class MemoryAccessHelper +{ +public: + + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecord = T::ieventAccessRecord; + + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] + static __host__ __device__ inline const fptype* + ieventAccessRecordConst( const fptype* buffer, + const int ievt ) + { + return ieventAccessRecord( const_cast( buffer ), ievt ); + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] + static constexpr auto decodeRecord = T::decodeRecord; + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, Ts... args ) <===] + template + static __host__ __device__ inline const fptype& + decodeRecordConst( const fptype* buffer, + Ts... args ) // variadic template + { + return T::decodeRecord( const_cast( buffer ), args... ); + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (non-const) ===> fptype& ieventAccessField( fptype* buffer, const ievt, Ts... args ) <===] + template + static __host__ __device__ inline fptype& + ieventAccessField( fptype* buffer, + const int ievt, + Ts... args ) // variadic template + { + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + return T::decodeRecord( T::ieventAccessRecord( buffer, ievt ), args... 
); + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessFieldConst( const fptype* buffer, const ievt, Ts... args ) <===] + template + static __host__ __device__ inline const fptype& + ieventAccessFieldConst( const fptype* buffer, + const int ievt, + Ts... args ) // variadic template + { + return ieventAccessField( const_cast( buffer ), ievt, args... ); + } +}; + +//---------------------------------------------------------------------------- + +// A templated helper class that includes the boilerplate code for KernelAccess classes +template +class KernelAccessHelper : public MemoryAccessHelper +{ +public: + + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (non-const) ===> fptype* kernelAccessRecord( fptype* buffer ) <===] + static __host__ __device__ inline fptype* + kernelAccessRecord( fptype* buffer ) + { + if constexpr( !onDevice ) // requires c++17 also in CUDA (#333) + { + // FIXME #436: clarify that buffer includes all events on device, and only the record for an event subset on host! + // FIXME #436: am I not assuming that the following line is always identical to buffer for all access classes T? + return T::ieventAccessRecord( buffer, 0 ); + } + else + { +#ifdef __CUDACC__ + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); + return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA +#else + throw std::runtime_error( "kernelAccessRecord on device is only implemented in CUDA" ); +#endif + } + } + + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (const) ===> const fptype* kernelAccessRecordConst( const fptype* buffer ) <===] + static __host__ __device__ inline const fptype* + kernelAccessRecordConst( const fptype* buffer ) + { + return kernelAccessRecord( const_cast( buffer ) ); + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const) ===> fptype& kernelAccessField( fptype* buffer, Ts... args ) <===] + template + static __host__ __device__ inline fptype& + kernelAccessField( fptype* buffer, + Ts... args ) // variadic template + { + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + return T::decodeRecord( kernelAccessRecord( buffer ), args... ); + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const) ===> const fptype& kernelAccessFieldConst( const fptype* buffer, Ts... 
args ) <===] + template + static __host__ __device__ inline const fptype& + kernelAccessFieldConst( const fptype* buffer, + Ts... args ) // variadic template + { + return kernelAccessField( const_cast( buffer ), args... ); + } + + //-------------------------------------------------------------------------- +}; + +#endif // MemoryAccessHelpers_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessMatrixElements.h new file mode 100644 index 0000000000..05f0810807 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -0,0 +1,132 @@ +#ifndef MemoryAccessMatrixElements_H +#define MemoryAccessMatrixElements_H 1 + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessHelpers.h" +#include "MemoryAccessVectors.h" +#include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned + +//---------------------------------------------------------------------------- + +// A class describing the internal layout of memory buffers for matrix elements +// This implementation uses a plain ARRAY[nevt] +// [If many implementations are used, a suffix _ARRAYv1 should be appended to the class name] +class MemoryAccessMatrixElementsBase //_ARRAYv1 +{ +private: + + friend class MemoryAccessHelper; + friend class KernelAccessHelper; + friend class KernelAccessHelper; + + //-------------------------------------------------------------------------- + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static __host__ __device__ inline fptype* + ieventAccessRecord( fptype* buffer, + const int ievt ) + { + return &( buffer[ievt] ); // ARRAY[nevt] + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] + // [NB: expand variadic template "Ts... 
args" to empty and rename "Field" as empty] + static __host__ __device__ inline fptype& + decodeRecord( fptype* buffer ) + { + constexpr int ievt = 0; + return buffer[ievt]; // ARRAY[nevt] + } +}; + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on explicit event numbers +// Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations +class MemoryAccessMatrixElements : public MemoryAccessMatrixElementsBase +{ +public: + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer ) <===] + static constexpr auto decodeRecord = MemoryAccessHelper::decodeRecord; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer ) <===] + static constexpr auto decodeRecordConst = + MemoryAccessHelper::template decodeRecordConst<>; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (non-const) ===> fptype& ieventAccess( fptype* buffer, const ievt ) <===] + static constexpr auto ieventAccess = + MemoryAccessHelper::template ieventAccessField<>; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessConst( const fptype* buffer, const ievt ) <===] + static constexpr auto ieventAccessConst = + MemoryAccessHelper::template ieventAccessFieldConst<>; +}; + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on implicit kernel rules +// Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations +template +class KernelAccessMatrixElements +{ +public: + + // Expose selected functions from MemoryAccessMatrixElements + static constexpr auto ieventAccessRecord = MemoryAccessMatrixElements::ieventAccessRecord; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const, SCALAR) ===> fptype& kernelAccess_s( fptype* buffer ) <===] + static constexpr auto kernelAccess_s = + KernelAccessHelper::template kernelAccessField<>; // requires cuda 11.4 + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (non const, SCALAR OR VECTOR) ===> fptype_sv& kernelAccess( const fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv& + kernelAccess( fptype* buffer ) + { + fptype& out = 
kernelAccess_s( buffer ); +#ifndef MGONGPU_CPPSIMD + return out; +#else + // NB: derived from MemoryAccessMomenta, restricting the implementation to contiguous aligned arrays (#435) + static_assert( mg5amcCpu::HostBufferMatrixElements::isaligned() ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) + //assert( (size_t)( buffer ) % mgOnGpu::cppAlign == 0 ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) + return mg5amcCpu::fptypevFromAlignedArray( out ); // SIMD bulk load of neppV, use reinterpret_cast +#endif + } + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] + static constexpr auto kernelAccessConst = + KernelAccessHelper::template kernelAccessFieldConst<>; // requires cuda 11.4 +}; + +//---------------------------------------------------------------------------- + +typedef KernelAccessMatrixElements HostAccessMatrixElements; +typedef KernelAccessMatrixElements DeviceAccessMatrixElements; + +//---------------------------------------------------------------------------- + +#endif // MemoryAccessMatrixElements_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessMomenta.h new file mode 100644 index 0000000000..ace50b40e8 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessMomenta.h @@ -0,0 +1,260 @@ +#ifndef MemoryAccessMomenta_H +#define MemoryAccessMomenta_H 1 + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessHelpers.h" +#include "MemoryAccessVectors.h" + +//---------------------------------------------------------------------------- + +// A class describing the internal layout of memory buffers for momenta +// This implementation uses an AOSOA[npagM][npar][np4][neppM] where nevt=npagM*neppM +// [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] +class MemoryAccessMomentaBase //_AOSOAv1 +{ +public: + + // Number of Events Per Page in the momenta AOSOA memory buffer layout + // (these are all best kept as a compile-time constants: see issue #23) +#ifdef __CUDACC__ /* clang-format off */ + // ----------------------------------------------------------------------------------------------- + // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline + // --- This is relevant to ensure coalesced access to momenta in global memory + // --- Note that neppR is hardcoded and may differ from neppM and neppV on some platforms + // ----------------------------------------------------------------------------------------------- + //static constexpr int neppM = 64/sizeof(fptype); // 2x 32-byte GPU cache lines (512 bits): 8 (DOUBLE) or 16 (FLOAT) + static constexpr int neppM = 32/sizeof(fptype); // (DEFAULT) 32-byte GPU cache line (256 bits): 4 (DOUBLE) or 8 (FLOAT) + //static constexpr int neppM = 1; // *** NB: this is equivalent to AOS *** (slower: 1.03E9 instead of 1.11E9 in eemumu) +#else + // ----------------------------------------------------------------------------------------------- + // --- CPUs: neppM is best set equal to the number of fptype's (neppV) in a vector register + // --- This is relevant to ensure faster access to momenta from C++ memory cache lines + // --- However, neppM is now decoupled from neppV (issue #176) and can be separately hardcoded + // --- In 
practice, neppR, neppM and neppV could now (in principle) all be different + // ----------------------------------------------------------------------------------------------- +#ifdef MGONGPU_CPPSIMD + static constexpr int neppM = MGONGPU_CPPSIMD; // (DEFAULT) neppM=neppV for optimal performance + //static constexpr int neppM = 64/sizeof(fptype); // maximum CPU vector width (512 bits): 8 (DOUBLE) or 16 (FLOAT) + //static constexpr int neppM = 32/sizeof(fptype); // lower CPU vector width (256 bits): 4 (DOUBLE) or 8 (FLOAT) + //static constexpr int neppM = 1; // *** NB: this is equivalent to AOS *** (slower: 4.66E6 instead of 5.09E9 in eemumu) + //static constexpr int neppM = MGONGPU_CPPSIMD*2; // FOR TESTS +#else + static constexpr int neppM = 1; // (DEFAULT) neppM=neppV for optimal performance (NB: this is equivalent to AOS) +#endif +#endif /* clang-format on */ + + // SANITY CHECK: check that neppM is a power of two + static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); + +private: + + friend class MemoryAccessHelper; + friend class KernelAccessHelper; + friend class KernelAccessHelper; + + // The number of components of a 4-momentum + static constexpr int np4 = mgOnGpu::np4; + + // The number of particles in this physics process + static constexpr int npar = mgOnGpu::npar; + + //-------------------------------------------------------------------------- + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static __host__ __device__ inline fptype* + ieventAccessRecord( fptype* buffer, + const int ievt ) + { + const int ipagM = ievt / neppM; // #event "M-page" + const int ieppM = ievt % neppM; // #event in the current event M-page + constexpr int ip4 = 0; + constexpr int ipar = 0; + return &( buffer[ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM] ); // AOSOA[ipagM][ipar][ip4][ieppM] + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] + // [NB: expand variadic template "Ts... 
args" to "const int ip4, const int ipar" and rename "Field" as "Ip4Ipar"] + static __host__ __device__ inline fptype& + decodeRecord( fptype* buffer, + const int ip4, + const int ipar ) + { + constexpr int ipagM = 0; + constexpr int ieppM = 0; + return buffer[ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM]; // AOSOA[ipagM][ipar][ip4][ieppM] + } +}; + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on explicit event numbers +// Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations +class MemoryAccessMomenta : public MemoryAccessMomentaBase +{ +public: + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ipar, const int ipar ) <===] + static constexpr auto decodeRecordIp4Ipar = MemoryAccessHelper::decodeRecord; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ipar, const int ipar ) <===] + static constexpr auto decodeRecordIp4IparConst = + MemoryAccessHelper::template decodeRecordConst; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (non-const) ===> fptype& ieventAccessIp4Ipar( fptype* buffer, const ievt, const int ipar, const int ipar ) <===] + static constexpr auto ieventAccessIp4Ipar = + MemoryAccessHelper::template ieventAccessField; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessIp4IparConst( const fptype* buffer, const ievt, const int ipar, const int ipar ) <===] + // DEFAULT VERSION + static constexpr auto ieventAccessIp4IparConst = + MemoryAccessHelper::template ieventAccessFieldConst; + + /* + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessIp4IparConst( const fptype* buffer, const ievt, const int ipar, const int ipar ) <===] + // DEBUG VERSION WITH PRINTOUTS + static __host__ __device__ inline const fptype& + ieventAccessIp4IparConst( const fptype* buffer, + const int ievt, + const int ip4, + const int ipar ) + { + const fptype& out = MemoryAccessHelper::template ieventAccessFieldConst( buffer, ievt, ip4, ipar ); + printf( "ipar=%2d ip4=%2d ievt=%8d out=%8.3f\n", ipar, ip4, ievt, out ); + return out; + } + */ +}; + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on implicit 
kernel rules +// Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations +template +class KernelAccessMomenta +{ +public: + + // Expose selected functions from MemoryAccessMomenta + static constexpr auto ieventAccessRecordConst = MemoryAccessMomenta::ieventAccessRecordConst; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const, SCALAR) ===> fptype& kernelAccessIp4Ipar( fptype* buffer, const int ipar, const int ipar ) <===] + static constexpr auto kernelAccessIp4Ipar = + KernelAccessHelper::template kernelAccessField; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR) ===> const fptype& kernelAccessIp4IparConst( const fptype* buffer, const int ipar, const int ipar ) <===] + // DEFAULT VERSION + static constexpr auto kernelAccessIp4IparConst_s = + KernelAccessHelper::template kernelAccessFieldConst; + + /* + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR) ===> const fptype& kernelAccessIp4IparConst( const fptype* buffer, const int ipar, const int ipar ) <===] + // DEBUG VERSION WITH PRINTOUTS + static __host__ __device__ inline const fptype& + kernelAccessIp4IparConst_s( const fptype* buffer, + const int ip4, + const int ipar ) + { + const fptype& out = KernelAccessHelper::template kernelAccessFieldConst( buffer, ip4, ipar ); + printf( "ipar=%2d ip4=%2d ievt='kernel' out=%8.3f\n", ipar, ip4, out ); + return out; + } + */ + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR OR VECTOR) ===> fptype_sv kernelAccessIp4IparConst( const fptype* buffer, const int ipar, const int ipar ) <===] + // FIXME? Eventually return by const reference and support aligned arrays only? + // FIXME? Currently return by value to support also unaligned and arbitrary arrays + static __host__ __device__ inline fptype_sv + kernelAccessIp4IparConst( const fptype* buffer, + const int ip4, + const int ipar ) + { + const fptype& out = kernelAccessIp4IparConst_s( buffer, ip4, ipar ); +#ifndef MGONGPU_CPPSIMD + return out; +#else + constexpr int neppM = MemoryAccessMomentaBase::neppM; + constexpr bool useContiguousEventsIfPossible = true; // DEFAULT + //constexpr bool useContiguousEventsIfPossible = false; // FOR PERFORMANCE TESTS (treat as arbitrary array even if it is an AOSOA) + // Use c++17 "if constexpr": compile-time branching + if constexpr( useContiguousEventsIfPossible && ( neppM >= neppV ) && ( neppM % neppV == 0 ) ) + { + //constexpr bool skipAlignmentCheck = true; // FASTEST (SEGFAULTS IF MISALIGNED ACCESS, NEEDS A SANITY CHECK ELSEWHERE!) + constexpr bool skipAlignmentCheck = false; // DEFAULT: A BIT SLOWER BUT SAFER [ALLOWS MISALIGNED ACCESS] + if constexpr( skipAlignmentCheck ) + { + //static bool first=true; if( first ){ std::cout << "WARNING! assume aligned AOSOA, skip check" << std::endl; first=false; } // SLOWER (5.06E6) + // FASTEST? (5.09E6 in eemumu 512y) + // This assumes alignment for momenta1d without checking - causes segmentation fault in reinterpret_cast if not aligned! 
+ return mg5amcCpu::fptypevFromAlignedArray( out ); // use reinterpret_cast + } + else if( (size_t)( buffer ) % mgOnGpu::cppAlign == 0 ) + { + //static bool first=true; if( first ){ std::cout << "WARNING! aligned AOSOA, reinterpret cast" << std::endl; first=false; } // SLOWER (5.00E6) + // DEFAULT! A tiny bit (<1%) slower because of the alignment check (5.07E6 in eemumu 512y) + // This explicitly checks buffer alignment to avoid segmentation faults in reinterpret_cast + return mg5amcCpu::fptypevFromAlignedArray( out ); // SIMD bulk load of neppV, use reinterpret_cast + } + else + { + //static bool first=true; if( first ){ std::cout << "WARNING! AOSOA but no reinterpret cast" << std::endl; first=false; } // SLOWER (4.93E6) + // A bit (1%) slower (5.05E6 in eemumu 512y) + // This does not require buffer alignment, but it requires AOSOA with neppM>=neppV and neppM%neppV==0 + return mg5amcCpu::fptypevFromUnalignedArray( out ); // SIMD bulk load of neppV, do not use reinterpret_cast (fewer SIMD operations) + } + } + else + { + //static bool first=true; if( first ){ std::cout << "WARNING! arbitrary array" << std::endl; first=false; } // SLOWER (5.08E6) + // ?!Used to be much slower, now a tiny bit faster for AOSOA?! (5.11E6 for AOSOA, 4.64E6 for AOS in eemumu 512y) + // This does not even require AOSOA with neppM>=neppV and neppM%neppV==0 (e.g. can be used with AOS neppM==1) + constexpr int ievt0 = 0; // just make it explicit in the code that buffer refers to a given ievt0 and decoderIeppV fetches event ievt0+ieppV + auto decoderIeppv = [buffer, ip4, ipar]( int ieppV ) + -> const fptype& + { return MemoryAccessMomenta::ieventAccessIp4IparConst( buffer, ievt0 + ieppV, ip4, ipar ); }; + return mg5amcCpu::fptypevFromArbitraryArray( decoderIeppv ); // iterate over ieppV in neppV (no SIMD) + } +#endif + } + + // Is this a HostAccess or DeviceAccess class? 
+ // [this is only needed for a warning printout in rambo.h for nparf==1 #358]
+ static __host__ __device__ inline constexpr bool
+ isOnDevice()
+ {
+ return onDevice;
+ }
+};
+
+//----------------------------------------------------------------------------
+
+typedef KernelAccessMomenta<false> HostAccessMomenta;
+typedef KernelAccessMomenta<true> DeviceAccessMomenta;
+
+//----------------------------------------------------------------------------
+
+#endif // MemoryAccessMomenta_H
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessNumerators.h
new file mode 100644
index 0000000000..e5f81381a9
--- /dev/null
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessNumerators.h
@@ -0,0 +1,18 @@
+#ifndef MemoryAccessNumerators_H
+#define MemoryAccessNumerators_H 1
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+
+#include "MemoryAccessGs.h"
+
+//----------------------------------------------------------------------------
+
+// A class describing the internal layout of memory buffers for numerators
+// This implementation reuses the plain ARRAY[nevt] implementation of MemoryAccessGs
+
+typedef KernelAccessGs<false> HostAccessNumerators;
+typedef KernelAccessGs<true> DeviceAccessNumerators;
+
+//----------------------------------------------------------------------------
+
+#endif
+#endif // MemoryAccessNumerators_H
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessRandomNumbers.h
new file mode 100644
index 0000000000..a7ff24243f
--- /dev/null
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessRandomNumbers.h
@@ -0,0 +1,132 @@
+#ifndef MemoryAccessRandomNumbers_H
+#define MemoryAccessRandomNumbers_H 1
+
+#include "mgOnGpuConfig.h"
+
+#include "MemoryAccessHelpers.h"
+
+//----------------------------------------------------------------------------
+
+// A class describing the internal layout of memory buffers for random numbers
+// This implementation uses an AOSOA[npagR][nparf][np4][neppR] where nevt=npagR*neppR
+// [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name]
+class MemoryAccessRandomNumbersBase //_AOSOAv1
+{
+public: /* clang-format off */
+
+ // Number of Events Per Page in the random number AOSOA memory buffer layout
+ // *** NB Different values of neppR lead to different physics results: the ***
+ // *** same 1d array is generated, but it is interpreted in different ways ***
+ static constexpr int neppR = 8; // HARDCODED TO GIVE ALWAYS THE SAME PHYSICS RESULTS!
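As a concrete illustration of the AOSOA[npagR][nparf][np4][neppR] layout described above, and of why neppR is hardcoded (the same flat array of random numbers must be decoded identically on every platform to reproduce the same physics), the flat index computed by ieventAccessRecord and decodeRecord further down in this header can be sketched in isolation. This is a minimal standalone sketch, not part of the generated patch: aosoaIndexR is a hypothetical name, and the constants are local stand-ins for mgOnGpu::np4, mgOnGpu::nparf and the hardcoded neppR; the commented-out AOS alternative (neppR = 1) follows just below in the generated header.

#include <cstddef>

// Sketch only: mirror of the AOSOA[npagR][nparf][np4][neppR] index used for random numbers
constexpr int np4 = 4;   // stand-in for mgOnGpu::np4 (components of a 4-momentum)
constexpr int nparf = 4; // stand-in for mgOnGpu::nparf (final-state particles in g g > t t~ t t~)
constexpr int neppR = 8; // events per "R-page", hardcoded as above

constexpr std::size_t aosoaIndexR( int ievt, int iparf, int ip4 )
{
  const int ipagR = ievt / neppR; // which R-page the event lives on
  const int ieppR = ievt % neppR; // position of the event within that R-page
  return ipagR * nparf * np4 * neppR + iparf * np4 * neppR + ip4 * neppR + ieppR;
}

// Event 9 sits on page 1, slot 1: its first random number comes after the 128 numbers of page 0
static_assert( aosoaIndexR( 9, 0, 0 ) == nparf * np4 * neppR + 1, "unexpected AOSOA index" );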
+ //static constexpr int neppR = 1; // AOS (tests of sectors/requests) + +private: /* clang-format on */ + + friend class MemoryAccessHelper; + friend class KernelAccessHelper; + friend class KernelAccessHelper; + + // The number of components of a 4-momentum + static constexpr int np4 = mgOnGpu::np4; + + // The number of final state particles in this physics process + static constexpr int nparf = mgOnGpu::nparf; + + //-------------------------------------------------------------------------- + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static __host__ __device__ inline fptype* + ieventAccessRecord( fptype* buffer, + const int ievt ) + { + const int ipagR = ievt / neppR; // #event "R-page" + const int ieppR = ievt % neppR; // #event in the current event R-page + constexpr int ip4 = 0; + constexpr int iparf = 0; + return &( buffer[ipagR * nparf * np4 * neppR + iparf * np4 * neppR + ip4 * neppR + ieppR] ); // AOSOA[ipagR][iparf][ip4][ieppR] + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] + // [NB: expand variadic template "Ts... args" to "const int ip4, const int iparf" and rename "Field" as "Ip4Iparf"] + static __host__ __device__ inline fptype& + decodeRecord( fptype* buffer, + const int ip4, + const int iparf ) + { + constexpr int ipagR = 0; + constexpr int ieppR = 0; + return buffer[ipagR * nparf * np4 * neppR + iparf * np4 * neppR + ip4 * neppR + ieppR]; // AOSOA[ipagR][iparf][ip4][ieppR] + } +}; + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on explicit event numbers +// Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations +class MemoryAccessRandomNumbers : public MemoryAccessRandomNumbersBase +{ +public: + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ipar, const int iparf ) <===] + static constexpr auto decodeRecordIp4Iparf = MemoryAccessHelper::decodeRecord; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) ===> const fptype& decodeRecordConst( const 
fptype* buffer, const int ipar, const int iparf ) <===] + static constexpr auto decodeRecordIp4IparfConst = + MemoryAccessHelper::template decodeRecordConst; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (non-const) ===> fptype& ieventAccessIp4Iparf( fptype* buffer, const ievt, const int ipar, const int iparf ) <===] + static constexpr auto ieventAccessIp4Iparf = + MemoryAccessHelper::template ieventAccessField; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessIp4IparfConst( const fptype* buffer, const ievt, const int ipar, const int iparf ) <===] + static constexpr auto ieventAccessIp4IparfConst = + MemoryAccessHelper::template ieventAccessFieldConst; +}; + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on implicit kernel rules +// Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations +template +class KernelAccessRandomNumbers +{ +public: + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const) ===> fptype& kernelAccessIp4Iparf( fptype* buffer, const int ipar, const int iparf ) <===] + static constexpr auto kernelAccessIp4Iparf = + KernelAccessHelper::template kernelAccessField; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const) ===> const fptype& kernelAccessIp4IparfConst( const fptype* buffer, const int ipar, const int iparf ) <===] + static constexpr auto kernelAccessIp4IparfConst = + KernelAccessHelper::template kernelAccessFieldConst; +}; + +//---------------------------------------------------------------------------- + +typedef KernelAccessRandomNumbers HostAccessRandomNumbers; +typedef KernelAccessRandomNumbers DeviceAccessRandomNumbers; + +//---------------------------------------------------------------------------- + +#endif // MemoryAccessRandomNumbers_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessVectors.h new file mode 100644 index 0000000000..2697cdad52 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessVectors.h @@ -0,0 +1,122 @@ +#ifndef MemoryAccessVectors_H +#define MemoryAccessVectors_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#ifndef __CUDACC__ +namespace mg5amcCpu // this is only needed for CPU SIMD vectorization +{ + +#ifdef MGONGPU_CPPSIMD + //-------------------------------------------------------------------------- + + // Cast one non-const fptype_v reference (one vector of neppV fptype values) from one non-const fptype reference (#435), + // assuming that "pointer(evt#0)+1" indicates "pointer(evt#1)", and that the arrays are aligned + inline fptype_v& fptypevFromAlignedArray( fptype& ref ) + { + return *reinterpret_cast( &ref ); + } + + // Cast one const fptype_v reference (one vector of neppV fptype values) from one const fptype reference, + // assuming that "pointer(evt#0)+1" indicates "pointer(evt#1)", and that the arrays are aligned + inline const fptype_v& fptypevFromAlignedArray( 
const fptype& ref ) + { + return *reinterpret_cast( &ref ); + } + + // Build one fptype_v (one vector of neppV fptype values) from one fptype reference, + // assuming that "pointer(evt#0)+1" indicates "pointer(evt#1)", but that the arrays are not aligned + inline fptype_v fptypevFromUnalignedArray( const fptype& ref ) + { +#if MGONGPU_CPPSIMD == 2 + return fptype_v{ *( &ref ), // explicit initialization of all array elements (2) + *( &ref + 1 ) }; +#elif MGONGPU_CPPSIMD == 4 + return fptype_v{ *( &ref ), // explicit initialization of all array elements (4) + *( &ref + 1 ), + *( &ref + 2 ), + *( &ref + 3 ) }; +#elif MGONGPU_CPPSIMD == 8 + return fptype_v{ *( &ref ), // explicit initialization of all array elements (8) + *( &ref + 1 ), + *( &ref + 2 ), + *( &ref + 3 ), + *( &ref + 4 ), + *( &ref + 5 ), + *( &ref + 6 ), + *( &ref + 7 ) }; +#elif MGONGPU_CPPSIMD == 16 + return fptype_v{ *( &ref ), // explicit initialization of all array elements (16) + *( &ref + 1 ), + *( &ref + 2 ), + *( &ref + 3 ), + *( &ref + 4 ), + *( &ref + 5 ), + *( &ref + 6 ), + *( &ref + 7 ), + *( &ref + 8 ), + *( &ref + 9 ), + *( &ref + 10 ), + *( &ref + 11 ), + *( &ref + 12 ), + *( &ref + 13 ), + *( &ref + 14 ), + *( &ref + 15 ) }; +#else +#error Internal error! Unknown MGONGPU_CPPSIMD value +#endif + } + + // Build one fptype_v (one vector of neppV fptype values) from one fptype reference, + // with no a priori assumption on how the input fptype array should be decoded + template + inline fptype_v fptypevFromArbitraryArray( Functor decoderIeppv ) + { +#if MGONGPU_CPPSIMD == 2 + return fptype_v{ decoderIeppv( 0 ), // explicit initialization of all array elements (2) + decoderIeppv( 1 ) }; +#elif MGONGPU_CPPSIMD == 4 + return fptype_v{ decoderIeppv( 0 ), // explicit initialization of all array elements (4) + decoderIeppv( 1 ), + decoderIeppv( 2 ), + decoderIeppv( 3 ) }; +#elif MGONGPU_CPPSIMD == 8 + return fptype_v{ decoderIeppv( 0 ), // explicit initialization of all array elements (8) + decoderIeppv( 1 ), + decoderIeppv( 2 ), + decoderIeppv( 3 ), + decoderIeppv( 4 ), + decoderIeppv( 5 ), + decoderIeppv( 6 ), + decoderIeppv( 7 ) }; +#elif MGONGPU_CPPSIMD == 16 + return fptype_v{ decoderIeppv( 0 ), // explicit initialization of all array elements (16) + decoderIeppv( 1 ), + decoderIeppv( 2 ), + decoderIeppv( 3 ), + decoderIeppv( 4 ), + decoderIeppv( 5 ), + decoderIeppv( 6 ), + decoderIeppv( 7 ), + decoderIeppv( 8 ), + decoderIeppv( 9 ), + decoderIeppv( 10 ), + decoderIeppv( 11 ), + decoderIeppv( 12 ), + decoderIeppv( 13 ), + decoderIeppv( 14 ), + decoderIeppv( 15 ) }; +#else +#error Internal error! 
Unknown MGONGPU_CPPSIMD value +#endif + } + + //-------------------------------------------------------------------------- +#endif + +} // end namespace +#endif + +#endif // MemoryAccessVectors_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessWavefunctions.h new file mode 100644 index 0000000000..738eef9a02 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -0,0 +1,155 @@ +#ifndef MemoryAccessWavefunctions_H +#define MemoryAccessWavefunctions_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuCxtypes.h" + +#include "MemoryAccessHelpers.h" + +#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 + +//---------------------------------------------------------------------------- + +#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS + +// A class describing the internal layout of memory buffers for wavefunctions +// This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW +// [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] +class MemoryAccessWavefunctionsBase //_AOSOAv1 +{ +public: + + // Number of Events Per Page in the wavefunction AOSOA memory buffer layout + static constexpr int neppW = 1; // AOS (just a test...) + +private: + + friend class MemoryAccessHelper; + friend class KernelAccessHelper; + friend class KernelAccessHelper; + + // The number of components of a (fermion or vector) wavefunction + static constexpr int nw6 = mgOnGpu::nw6; + + // The number of floating point components of a complex number + static constexpr int nx2 = mgOnGpu::nx2; + + //-------------------------------------------------------------------------- + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static __host__ __device__ inline fptype* + ieventAccessRecord( fptype* buffer, + const int ievt ) + { + const int ipagW = ievt / neppW; // #event "W-page" + const int ieppW = ievt % neppW; // #event in the current event W-page + constexpr int iw6 = 0; + constexpr int ix2 = 0; + return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] + // [NB: expand variadic template "Ts... 
args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] + static __host__ __device__ inline fptype& + decodeRecord( fptype* buffer, + const int iw6, + const int ix2 ) + { + constexpr int ipagW = 0; + constexpr int ieppW = 0; + return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + } +}; + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on explicit event numbers +// Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations +class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase +{ +public: + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] + static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper::decodeRecord; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] + static constexpr auto decodeRecordIw6Ix2Const = + MemoryAccessHelper::template decodeRecordConst; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] + static constexpr auto ieventAccessIw6Ix2 = + MemoryAccessHelper::template ieventAccessField; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] + static constexpr auto ieventAccessIw6Ix2Const = + MemoryAccessHelper::template ieventAccessFieldConst; +}; + +#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on implicit kernel rules +// Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations +template +class KernelAccessWavefunctions +{ +public: + +#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] + static constexpr auto kernelAccessIw6Ix2 = + KernelAccessHelper::template kernelAccessField; + + // Locate a field (output) in a memory buffer (input) 
from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] + static constexpr auto kernelAccessIw6Ix2Const = + KernelAccessHelper::template kernelAccessFieldConst; + +#else + + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) + { + return reinterpret_cast( buffer ); + } + + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) + { + return reinterpret_cast( buffer ); + } + +#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS +}; + +//---------------------------------------------------------------------------- + +typedef KernelAccessWavefunctions HostAccessWavefunctions; +typedef KernelAccessWavefunctions DeviceAccessWavefunctions; + +//---------------------------------------------------------------------------- + +#endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessWeights.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessWeights.h new file mode 100644 index 0000000000..3915657657 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessWeights.h @@ -0,0 +1,135 @@ +#ifndef MemoryAccessWeights_H +#define MemoryAccessWeights_H 1 + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessHelpers.h" + +//---------------------------------------------------------------------------- + +// A class describing the internal layout of memory buffers for weights +// This implementation uses a plain ARRAY[nevt] +// [If many implementations are used, a suffix _ARRAYv1 should be appended to the class name] +class MemoryAccessWeightsBase //_ARRAYv1 +{ +private: + + friend class MemoryAccessHelper; + friend class KernelAccessHelper; + friend class KernelAccessHelper; + + //-------------------------------------------------------------------------- + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static __host__ __device__ inline fptype* + ieventAccessRecord( fptype* buffer, + const int ievt ) + { + return &( buffer[ievt] ); // ARRAY[nevt] + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] + // [NB: expand variadic template "Ts... 
args" to empty and rename "Field" as empty] + static __host__ __device__ inline fptype& + decodeRecord( fptype* buffer ) + { + constexpr int ievt = 0; + return buffer[ievt]; // ARRAY[nevt] + } +}; + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on explicit event numbers +// Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations +class MemoryAccessWeights : public MemoryAccessWeightsBase +{ +public: + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer ) <===] + static constexpr auto decodeRecord = MemoryAccessHelper::decodeRecord; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer ) <===] + static constexpr auto decodeRecordConst = + MemoryAccessHelper::template decodeRecordConst<>; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (non-const) ===> fptype& ieventAccess( fptype* buffer, const ievt ) <===] + static constexpr auto ieventAccess = + MemoryAccessHelper::template ieventAccessField<>; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessConst( const fptype* buffer, const ievt ) <===] + static constexpr auto ieventAccessConst = + MemoryAccessHelper::template ieventAccessFieldConst<>; +}; + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on implicit kernel rules +// Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations +template +class KernelAccessWeights +{ +public: + + /* + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const) ===> fptype& kernelAccess( fptype* buffer ) <===] + // FINAL IMPLEMENTATION FOR CUDA 11.4 + static constexpr auto kernelAccess = + KernelAccessHelper::template kernelAccessField<>; + */ + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const) ===> fptype& kernelAccess( fptype* buffer ) <===] + // TEMPORARY HACK FOR CUDA 11.1 + static __host__ __device__ inline fptype& + kernelAccess( fptype* buffer ) + { + return KernelAccessHelper::template kernelAccessField<>( buffer ); + } + + /* + // Locate a field (output) in a memory buffer (input) from a 
kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] + // FINAL IMPLEMENTATION FOR CUDA 11.4 + static constexpr auto kernelAccessConst = + KernelAccessHelper::template kernelAccessFieldConst<>; + */ + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] + // TEMPORARY HACK FOR CUDA 11.1 + static __host__ __device__ inline const fptype& + kernelAccessConst( const fptype* buffer ) + { + return KernelAccessHelper::template kernelAccessFieldConst<>( buffer ); + } +}; + +//---------------------------------------------------------------------------- + +typedef KernelAccessWeights HostAccessWeights; +typedef KernelAccessWeights DeviceAccessWeights; + +//---------------------------------------------------------------------------- + +#endif // MemoryAccessWeights_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryBuffers.h new file mode 100644 index 0000000000..1d8f404c6d --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryBuffers.h @@ -0,0 +1,530 @@ +#ifndef MemoryBuffers_H +#define MemoryBuffers_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuCxtypes.h" + +#include "CudaRuntime.h" +#include "Parameters_SMEFTsim_topU3l_MwScheme_UFO.h" + +#include + +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + + // TEMPORARY? Take this from a PhysicsProcess class? Define them here directly in codegen? 
+ namespace MemoryBuffers + { + static constexpr size_t np4 = mgOnGpu::np4; + static constexpr size_t nparf = mgOnGpu::nparf; + static constexpr size_t npar = mgOnGpu::npar; + static constexpr size_t nw6 = mgOnGpu::nw6; + static constexpr size_t nx2 = mgOnGpu::nx2; + static constexpr size_t ndcoup = Parameters_SMEFTsim_topU3l_MwScheme_UFO_dependentCouplings::ndcoup; + } + + //-------------------------------------------------------------------------- + + // An abstract interface encapsulating a given number of events + class INumberOfEvents + { + public: + virtual ~INumberOfEvents() {} + virtual size_t nevt() const = 0; + }; + + //-------------------------------------------------------------------------- + + // A class encapsulating a given number of events + class NumberOfEvents : virtual public INumberOfEvents + { + public: + NumberOfEvents( const size_t nevt ) + : m_nevt( nevt ) {} + virtual ~NumberOfEvents() {} + virtual size_t nevt() const override { return m_nevt; } + private: + const size_t m_nevt; + }; + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer (not necessarily an event buffer) + template + class BufferBase : virtual public INumberOfEvents + { + protected: + BufferBase( const size_t size, const bool onDevice ) + : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} + virtual ~BufferBase() {} + public: + T* data() { return m_data; } + const T* data() const { return m_data; } + T& operator[]( const size_t index ) { return m_data[index]; } + const T& operator[]( const size_t index ) const { return m_data[index]; } + size_t size() const { return m_size; } + size_t bytes() const { return m_size * sizeof( T ); } + bool isOnDevice() const { return m_isOnDevice; } + virtual size_t nevt() const override { throw std::runtime_error( "This BufferBase is not an event buffer" ); } + protected: + const size_t m_size; + T* m_data; + const bool m_isOnDevice; + }; + + //-------------------------------------------------------------------------- + +#ifndef __CUDACC__ + constexpr bool HostBufferALIGNED = false; // ismisaligned=false + constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true + + // A class encapsulating a C++ host buffer + template + class HostBufferBase : public BufferBase + { + public: + HostBufferBase( const size_t size ) + : BufferBase( size, false ) + { + if constexpr( !ismisaligned ) + this->m_data = new( std::align_val_t( cppAlign ) ) T[size](); + else + this->m_data = new( std::align_val_t( cppAlign ) ) T[size + 1]() + 1; // TEST MISALIGNMENT! + } + virtual ~HostBufferBase() + { + if constexpr( !ismisaligned ) + ::operator delete[]( this->m_data, std::align_val_t( cppAlign ) ); + else + ::operator delete[]( ( this->m_data ) - 1, std::align_val_t( cppAlign ) ); // TEST MISALIGNMENT! 
+ } + static constexpr bool isaligned() { return !ismisaligned; } + public: + static constexpr size_t cppAlign = mgOnGpu::cppAlign; + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + // A class encapsulating a CUDA pinned host buffer + template + class PinnedHostBufferBase : public BufferBase + { + public: + PinnedHostBufferBase( const size_t size ) + : BufferBase( size, false ) + { + checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + } + virtual ~PinnedHostBufferBase() + { + checkCuda( cudaFreeHost( this->m_data ) ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + // A class encapsulating a CUDA device buffer + template + class DeviceBufferBase : public BufferBase + { + public: + DeviceBufferBase( const size_t size ) + : BufferBase( size, true ) + { + checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + } + virtual ~DeviceBufferBase() + { + checkCuda( cudaFree( this->m_data ) ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifndef __CUDACC__ + // A class encapsulating a C++ host buffer for a given number of events + template + class HostBuffer : public HostBufferBase, virtual private NumberOfEvents + { + public: + HostBuffer( const size_t nevt ) + : NumberOfEvents( nevt ) + , HostBufferBase( sizePerEvent * nevt ) {} + virtual ~HostBuffer() {} + virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + // A class encapsulating a CUDA pinned host buffer for a given number of events + template + class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents + { + public: + PinnedHostBuffer( const size_t nevt ) + : NumberOfEvents( nevt ) + , PinnedHostBufferBase( sizePerEvent * nevt ) {} + virtual ~PinnedHostBuffer() {} + virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + // A class encapsulating a CUDA device buffer for a given number of events + template + class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + { + public: + DeviceBuffer( const size_t nevt ) + : NumberOfEvents( nevt ) + , DeviceBufferBase( sizePerEvent * nevt ) {} + virtual ~DeviceBuffer() {} + virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } + }; +#endif + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for momenta random numbers + typedef BufferBase BufferRndNumMomenta; + + // The size (number of elements) per event in a memory buffer for momenta random numbers + constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; + +#ifndef __CUDACC__ + // A class encapsulating a C++ host buffer for momenta random numbers + typedef HostBuffer HostBufferRndNumMomenta; +#else + // A class encapsulating a CUDA pinned host buffer for momenta random numbers + typedef PinnedHostBuffer PinnedHostBufferRndNumMomenta; + // A class encapsulating a CUDA device buffer for momenta random numbers + typedef DeviceBuffer DeviceBufferRndNumMomenta; +#endif + + //-------------------------------------------------------------------------- + + /* + // A base class encapsulating 
a memory buffer with ONE fptype per event + typedef BufferBase BufferOneFp; + + // The size (number of elements) per event in a memory buffer with ONE fptype per event + constexpr size_t sizePerEventOneFp = 1; + +#ifndef __CUDACC__ + // A class encapsulating a C++ host buffer with ONE fptype per event + typedef HostBuffer HostBufferOneFp; +#else + // A class encapsulating a CUDA pinned host buffer for gs + typedef PinnedHostBuffer PinnedHostBufferOneFp; + // A class encapsulating a CUDA device buffer for gs + typedef DeviceBuffer DeviceBufferOneFp; +#endif + + // Memory buffers for Gs (related to the event-by-event strength of running coupling constant alphas QCD) + typedef BufferOneFp BufferGs; + typedef HostBufferOneFp HostBufferGs; + typedef PinnedHostBufferOneFp PinnedHostBufferGs; + typedef DeviceBufferOneFp DeviceBufferGs; + */ + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for Gs (related to the event-by-event strength of running coupling constant alphas QCD) + typedef BufferBase BufferGs; + + // The size (number of elements) per event in a memory buffer for Gs + constexpr size_t sizePerEventGs = 1; + +#ifndef __CUDACC__ + // A class encapsulating a C++ host buffer for gs + typedef HostBuffer HostBufferGs; +#else + // A class encapsulating a CUDA pinned host buffer for gs + typedef PinnedHostBuffer PinnedHostBufferGs; + // A class encapsulating a CUDA device buffer for gs + typedef DeviceBuffer DeviceBufferGs; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // A base class encapsulating a memory buffer for numerators (of the multichannel single-diagram enhancement factors) + typedef BufferBase BufferNumerators; + + // The size (number of elements) per event in a memory buffer for numerators + constexpr size_t sizePerEventNumerators = 1; + +#ifndef __CUDACC__ + // A class encapsulating a C++ host buffer for gs + typedef HostBuffer HostBufferNumerators; +#else + // A class encapsulating a CUDA pinned host buffer for gs + typedef PinnedHostBuffer PinnedHostBufferNumerators; + // A class encapsulating a CUDA device buffer for gs + typedef DeviceBuffer DeviceBufferNumerators; +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // A base class encapsulating a memory buffer for denominators (of the multichannel single-diagram enhancement factors) + typedef BufferBase BufferDenominators; + + // The size (number of elements) per event in a memory buffer for denominators + constexpr size_t sizePerEventDenominators = 1; + +#ifndef __CUDACC__ + // A class encapsulating a C++ host buffer for gs + typedef HostBuffer HostBufferDenominators; +#else + // A class encapsulating a CUDA pinned host buffer for gs + typedef PinnedHostBuffer PinnedHostBufferDenominators; + // A class encapsulating a CUDA device buffer for gs + typedef DeviceBuffer DeviceBufferDenominators; +#endif +#endif + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for couplings that depend on the event-by-event running coupling constant alphas QCD + typedef BufferBase BufferCouplings; + + // The size (number of elements) per event in a memory buffer for random numbers + constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; + +#ifndef __CUDACC__ + // A class encapsulating a 
C++ host buffer for gs + typedef HostBuffer HostBufferCouplings; +#else + // A class encapsulating a CUDA pinned host buffer for gs + typedef PinnedHostBuffer PinnedHostBufferCouplings; + // A class encapsulating a CUDA device buffer for gs + typedef DeviceBuffer DeviceBufferCouplings; +#endif + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for momenta + typedef BufferBase BufferMomenta; + + // The size (number of elements) per event in a memory buffer for momenta + constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; + +#ifndef __CUDACC__ + // A class encapsulating a C++ host buffer for momenta + typedef HostBuffer HostBufferMomenta; + //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! +#else + // A class encapsulating a CUDA pinned host buffer for momenta + typedef PinnedHostBuffer PinnedHostBufferMomenta; + // A class encapsulating a CUDA device buffer for momenta + typedef DeviceBuffer DeviceBufferMomenta; +#endif + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for sampling weights + typedef BufferBase BufferWeights; + + // The size (number of elements) per event in a memory buffer for sampling weights + constexpr size_t sizePerEventWeights = 1; + +#ifndef __CUDACC__ + // A class encapsulating a C++ host buffer for sampling weights + typedef HostBuffer HostBufferWeights; +#else + // A class encapsulating a CUDA pinned host buffer for sampling weights + typedef PinnedHostBuffer PinnedHostBufferWeights; + // A class encapsulating a CUDA device buffer for sampling weights + typedef DeviceBuffer DeviceBufferWeights; +#endif + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for matrix elements + typedef BufferBase BufferMatrixElements; + + // The size (number of elements) per event in a memory buffer for matrix elements + constexpr size_t sizePerEventMatrixElements = 1; + +#ifndef __CUDACC__ + // A class encapsulating a C++ host buffer for matrix elements + typedef HostBuffer HostBufferMatrixElements; +#else + // A class encapsulating a CUDA pinned host buffer for matrix elements + typedef PinnedHostBuffer PinnedHostBufferMatrixElements; + // A class encapsulating a CUDA device buffer for matrix elements + typedef DeviceBuffer DeviceBufferMatrixElements; +#endif + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for the helicity mask + typedef BufferBase BufferHelicityMask; + +#ifndef __CUDACC__ + // A class encapsulating a C++ host buffer for the helicity mask + typedef HostBufferBase HostBufferHelicityMask; +#else + // A class encapsulating a CUDA pinned host buffer for the helicity mask + typedef PinnedHostBufferBase PinnedHostBufferHelicityMask; + // A class encapsulating a CUDA device buffer for the helicity mask + typedef DeviceBufferBase DeviceBufferHelicityMask; +#endif + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for wavefunctions + typedef BufferBase BufferWavefunctions; + + // The size (number of elements) per event in a memory buffer for wavefunctions + constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; + +#ifndef __CUDACC__ + // A class encapsulating a C++ host buffer for wavefunctions + typedef 
HostBuffer HostBufferWavefunctions; +#else + // A class encapsulating a CUDA pinned host buffer for wavefunctions + typedef PinnedHostBuffer PinnedHostBufferWavefunctions; + // A class encapsulating a CUDA device buffer for wavefunctions + typedef DeviceBuffer DeviceBufferWavefunctions; +#endif + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for helicity random numbers + typedef BufferBase BufferRndNumHelicity; + + // The size (number of elements) per event in a memory buffer for helicity random numbers + constexpr size_t sizePerEventRndNumHelicity = 1; + +#ifndef __CUDACC__ + // A class encapsulating a C++ host buffer for helicity random numbers + typedef HostBuffer HostBufferRndNumHelicity; +#else + // A class encapsulating a CUDA pinned host buffer for helicity random numbers + typedef PinnedHostBuffer PinnedHostBufferRndNumHelicity; + // A class encapsulating a CUDA device buffer for helicity random numbers + typedef DeviceBuffer DeviceBufferRndNumHelicity; +#endif + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for color random numbers + typedef BufferBase BufferRndNumColor; + + // The size (number of elements) per event in a memory buffer for color random numbers + constexpr size_t sizePerEventRndNumColor = 1; + +#ifndef __CUDACC__ + // A class encapsulating a C++ host buffer for color random numbers + typedef HostBuffer HostBufferRndNumColor; +#else + // A class encapsulating a CUDA pinned host buffer for color random numbers + typedef PinnedHostBuffer PinnedHostBufferRndNumColor; + // A class encapsulating a CUDA device buffer for color random numbers + typedef DeviceBuffer DeviceBufferRndNumColor; +#endif + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for helicity selection + typedef BufferBase BufferSelectedHelicity; + + // The size (number of elements) per event in a memory buffer for helicity selection + constexpr size_t sizePerEventSelectedHelicity = 1; + +#ifndef __CUDACC__ + // A class encapsulating a C++ host buffer for helicity selection + typedef HostBuffer HostBufferSelectedHelicity; +#else + // A class encapsulating a CUDA pinned host buffer for helicity selection + typedef PinnedHostBuffer PinnedHostBufferSelectedHelicity; + // A class encapsulating a CUDA device buffer for helicity selection + typedef DeviceBuffer DeviceBufferSelectedHelicity; +#endif + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for color selection + typedef BufferBase BufferSelectedColor; + + // The size (number of elements) per event in a memory buffer for color selection + constexpr size_t sizePerEventSelectedColor = 1; + +#ifndef __CUDACC__ + // A class encapsulating a C++ host buffer for color selection + typedef HostBuffer HostBufferSelectedColor; +#else + // A class encapsulating a CUDA pinned host buffer for color selection + typedef PinnedHostBuffer PinnedHostBufferSelectedColor; + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer DeviceBufferSelectedColor; +#endif + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + template + void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy + { + if( dst.size() != src.size() ) + { + 
std::ostringstream sstr; + sstr << "Size (#elements) mismatch in copyDeviceFromHost: dst=" << dst.size() << ", src=" << src.size(); + throw std::runtime_error( sstr.str() ); + } + if( dst.bytes() != src.bytes() ) + { + std::ostringstream sstr; + sstr << "Size (#bytes) mismatch in copyDeviceFromHost: dst=" << dst.bytes() << ", src=" << src.bytes(); + throw std::runtime_error( sstr.str() ); + } + // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array + checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + template + void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy + { + if( dst.size() != src.size() ) + { + std::ostringstream sstr; + sstr << "Size (#elements) mismatch in copyHostFromDevice: dst=" << dst.size() << ", src=" << src.size(); + throw std::runtime_error( sstr.str() ); + } + if( dst.bytes() != src.bytes() ) + { + std::ostringstream sstr; + sstr << "Size (#bytes) mismatch in copyHostFromDevice: dst=" << dst.bytes() << ", src=" << src.bytes(); + throw std::runtime_error( sstr.str() ); + } + // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array + checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + } +#endif + + //-------------------------------------------------------------------------- +} + +#endif // MemoryBuffers_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/Bridge.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/Bridge.h new file mode 120000 index 0000000000..7afe008f47 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/Bridge.h @@ -0,0 +1 @@ +../Bridge.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/BridgeKernels.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/BridgeKernels.cc new file mode 120000 index 0000000000..4c8697458f --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/BridgeKernels.cc @@ -0,0 +1 @@ +../BridgeKernels.cc \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/BridgeKernels.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/BridgeKernels.h new file mode 120000 index 0000000000..f21b556a84 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/BridgeKernels.h @@ -0,0 +1 @@ +../BridgeKernels.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CMakeLists.txt b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CMakeLists.txt new file mode 100644 index 0000000000..4ac6c179d3 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CMakeLists.txt @@ -0,0 +1,24 @@ +get_filename_component(basename ${CMAKE_CURRENT_SOURCE_DIR} NAME) +string(TOLOWER ${basename} targadd) + 
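Stepping back briefly to MemoryBuffers.h above, before the subprocess build files: the buffer classes and the two copy helpers are meant to be used in matched host/device pairs, with the element and byte checks turning any mismatch into a std::runtime_error. Below is a minimal usage sketch, not part of the generated patch: it assumes a CUDA build (so the mg5amcGpu namespace and the pinned/device typedefs are available) and an include path that resolves MemoryBuffers.h; transferOneBatch is a hypothetical function name.

#include "MemoryBuffers.h"

void transferOneBatch( const size_t nevt )
{
  using namespace mg5amcGpu;
  // Pinned host buffer and device buffer for momenta: nevt * np4 * npar fptype elements each
  PinnedHostBufferMomenta hstMomenta( nevt );
  DeviceBufferMomenta devMomenta( nevt );
  // ... fill hstMomenta on the host (e.g. from a phase-space sampler) ...
  copyDeviceFromHost( devMomenta, hstMomenta ); // throws std::runtime_error on any size mismatch

  // Matrix elements travel the other way once the kernel has filled the device buffer
  PinnedHostBufferMatrixElements hstMEs( nevt );
  DeviceBufferMatrixElements devMEs( nevt );
  // ... launch the matrix-element kernel writing into devMEs.data() ...
  copyHostFromDevice( hstMEs, devMEs );
}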
+file(GLOB_RECURSE HEADERS "../*.h" CPPProcess.h) +set(SOURCES ../BridgeKernels.cc CPPProcess.cc ../CrossSectionKernels.cc + ../MatrixElementKernels.cc ../RamboSamplingKernels.cc + ../RandomNumberKernels.cc) + +set(libname mg5amc_cxx_${targadd}) +add_library(${libname} ${SOURCES} ${HEADERS}) +target_include_directories(${libname} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}" + "${PROJECT_SOURCE_DIR}/src" + "${PROJECT_GITROOT_DIR}/tools") + +set(execname check_${targadd}.exe) +add_executable(${execname} check_sa.cc) +target_link_libraries(${execname} PUBLIC mg5amc_common ${libname}) +target_include_directories(${execname} PRIVATE "${PROJECT_SOURCE_DIR}/src") + +# some XCode specific stuff to make the executable run +set_property(TARGET ${libname} PROPERTY XCODE_GENERATE_SCHEME TRUE) +set_property(TARGET ${execname} PROPERTY XCODE_GENERATE_SCHEME TRUE) +set_property(TARGET ${execname} PROPERTY XCODE_SCHEME_ARGUMENTS "--bridge" "8" "8" "32") +set_property(TARGET ${execname} PROPERTY XCODE_SCHEME_WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}") diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc new file mode 100644 index 0000000000..6fd28880a0 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc @@ -0,0 +1,2123 @@ +//========================================================================== +// This file has been automatically generated for CUDA/C++ standalone by +// MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-01-26 +// By the MadGraph5_aMC@NLO Development Team +// Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch +//========================================================================== + +#include "CPPProcess.h" + +#include "mgOnGpuConfig.h" + +#include "CudaRuntime.h" +#include "HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h" +#include "MemoryAccessAmplitudes.h" +#include "MemoryAccessCouplings.h" +#include "MemoryAccessCouplingsFixed.h" +#include "MemoryAccessGs.h" +#include "MemoryAccessMatrixElements.h" +#include "MemoryAccessMomenta.h" +#include "MemoryAccessWavefunctions.h" + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#include "MemoryAccessDenominators.h" +#include "MemoryAccessNumerators.h" +#include "coloramps.h" +#endif + +#include +#include +#include +#include +#include + +// Test ncu metrics for CUDA thread divergence +#undef MGONGPU_TEST_DIVERGENCE +//#define MGONGPU_TEST_DIVERGENCE 1 + +//========================================================================== +// Class member functions for calculating the matrix elements for +// Process: g g > t t~ t t~ WEIGHTED<=4 @1 + +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + using mgOnGpu::np4; // dimensions of 4-momenta (E,px,py,pz) + using mgOnGpu::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + using mgOnGpu::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + + using mgOnGpu::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + using mgOnGpu::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 
6 for e+ e- -> mu+ mu- (fermions and vectors) + + using Parameters_SMEFTsim_topU3l_MwScheme_UFO_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) + using Parameters_SMEFTsim_topU3l_MwScheme_UFO_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) + + // The number of colors + constexpr int ncolor = 12; + + // The number of SIMD vectors of events processed by calculate_wavefunction +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + constexpr int nParity = 2; +#else + constexpr int nParity = 1; +#endif + + // Physics parameters (masses, coupling, etc...) + // For CUDA performance, hardcoded constexpr's would be better: fewer registers and a tiny throughput increase + // However, physics parameters are user-defined through card files: use CUDA constant memory instead (issue #39) + // [NB if hardcoded parameters are used, it's better to define them here to avoid silent shadowing (issue #263)] +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const fptype cIPD[2] = { (fptype)Parameters_SMEFTsim_topU3l_MwScheme_UFO::mdl_MT, (fptype)Parameters_SMEFTsim_topU3l_MwScheme_UFO::mdl_WT }; + __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 +#else +#ifdef __CUDACC__ + __device__ __constant__ fptype cIPD[2]; + __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 +#else + static fptype cIPD[2]; + static fptype* cIPC = nullptr; // unused as nicoup=0 +#endif +#endif + + // Helicity combinations (and filtering of "good" helicity combinations) +#ifdef __CUDACC__ + __device__ __constant__ short cHel[ncomb][npar]; + __device__ __constant__ int cNGoodHel; + __device__ __constant__ int cGoodHel[ncomb]; +#else + static short cHel[ncomb][npar]; + static int cNGoodHel; + static int cGoodHel[ncomb]; +#endif + + //-------------------------------------------------------------------------- + + // Evaluate |M|^2 for each subprocess + // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) + // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) + // In CUDA, this device function computes the ME for a single event + // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + __device__ INLINE void /* clang-format off */ + calculate_wavefunctions( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: multichannel channel id (1 to #diagrams); 0 to disable channel enhancement + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities +#endif + fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) +#ifndef __CUDACC__ + , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) + //ALWAYS_INLINE // attributes are not permitted in a function definition + { +#ifdef __CUDACC__ + using namespace mg5amcGpu; + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes 
all events + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events +#endif +#else + using namespace mg5amcCpu; + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event + using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event + using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events +#endif +#endif /* clang-format on */ + mgDebug( 0, __FUNCTION__ ); + //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); +#ifndef __CUDACC__ + //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); +#endif + + // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) + // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] + // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need + // ** NB: to have large memory structures for wavefunctions/amplitudes in all events (no kernel splitting yet)! + //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) + cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram + + // Proof of concept for using fptype* in the interface + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); + fptype* amp_fp; + amp_fp = reinterpret_cast<fptype*>( amp_sv ); + + // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) + // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] + cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!)
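The "proof of concept for using fptype* in the interface" above works because each cxtype_sv buffer is a contiguous block of fptype values, two (real, imaginary) per complex component, so the reinterpret_cast exposes the same storage through a flat pointer. The standalone sketch below illustrates this for a scalar build in which cxtype_sv is a plain complex number (neppV == 1); it is not part of the generated file, and toy_fptype, toy_cxtype, NWF and NW6 are hypothetical stand-ins for fptype, cxtype_sv, nwf and nw6.

#include <complex>
#include <cstdio>

using toy_fptype = double;
using toy_cxtype = std::complex<toy_fptype>; // stand-in for cxtype_sv in a scalar (neppV == 1) build

int main()
{
  constexpr int NWF = 2; // stand-in for nwf
  constexpr int NW6 = 6; // stand-in for nw6
  toy_cxtype w_sv[NWF][NW6] = {}; // per-event wavefunction buffers, as in the kernel above
  toy_fptype* w_fp[NWF];          // flat fptype* views passed to the helas-like interface
  for( int iwf = 0; iwf < NWF; iwf++ )
    w_fp[iwf] = reinterpret_cast<toy_fptype*>( w_sv[iwf] ); // two fptypes (re, im) per complex component
  w_fp[0][0] = 1.; // writes the real part of w_sv[0][0]
  w_fp[0][1] = 2.; // writes the imaginary part of w_sv[0][0]
  std::printf( "w_sv[0][0] = (%f, %f)\n", w_sv[0][0].real(), w_sv[0][0].imag() ); // prints (1.000000, 2.000000)
  return 0;
}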
+ + // === Calculate wavefunctions and amplitudes for all diagrams in all processes === + // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed fptypes #537: float for color algebra and double elsewhere + // Delay color algebra and ME updates (only on even pages) + cxtype_sv jamp_sv_previous[ncolor] = {}; + fptype* MEs_previous = 0; +#endif + for( int iParity = 0; iParity < nParity; ++iParity ) + { // START LOOP ON IPARITY +#ifndef __CUDACC__ + const int ievt0 = ievt00 + iParity * neppV; +#endif + constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. <> +#endif + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event + for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef __CUDACC__ +#pragma nv_diagnostic pop + // CUDA kernels take input/output buffers with momenta/MEs for all events + const fptype* momenta = allmomenta; + const fptype* COUPs[nxcoup]; + for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; + fptype* MEs = allMEs; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; +#endif +#else + // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + const fptype* COUPs[nxcoup]; + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event + for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); +#endif +#endif + + // Reset color flows (reset jamp_sv) at the beginning of a new event or event page + for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); +#endif + + // *** DIAGRAM 1 OF 72 *** + + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + + vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + + oxxxxx( momenta, cIPD[0], cHel[ihel][4], +1, w_fp[4], 4 ); + + ixxxxx( momenta, cIPD[0], cHel[ihel][5], -1, w_fp[5], 5 ); + + VVV5P0_1( w_fp[0], w_fp[1], COUPs[0], 0., 0., w_fp[6] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 0., 0., w_fp[7] ); + FFV1_1( w_fp[4], w_fp[6], COUPs[1], cIPD[0], 
cIPD[1], w_fp[8] ); + + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[5], w_fp[8], w_fp[7], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[3] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[11] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + + // *** DIAGRAM 2 OF 72 *** + + // Wavefunction(s) for diagram number 2 + FFV1_2( w_fp[5], w_fp[6], COUPs[1], cIPD[0], cIPD[1], w_fp[9] ); + + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[9], w_fp[4], w_fp[7], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[3] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[11] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + + // *** DIAGRAM 3 OF 72 *** + + // Wavefunction(s) for diagram number 3 + FFV1P0_3( w_fp[5], w_fp[4], COUPs[1], 0., 0., w_fp[10] ); + + // Amplitude(s) for diagram number 3 + VVV5_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[1] += 1. / 2. * amp_sv[0]; + jamp_sv[2] -= 1. / 2. * amp_sv[0]; + jamp_sv[9] -= 1. / 2. * amp_sv[0]; + jamp_sv[10] += 1. / 2. * amp_sv[0]; + + // *** DIAGRAM 4 OF 72 *** + + // Wavefunction(s) for diagram number 4 + FFV1P0_3( w_fp[5], w_fp[2], COUPs[1], 0., 0., w_fp[11] ); + FFV1_2( w_fp[3], w_fp[6], COUPs[1], cIPD[0], cIPD[1], w_fp[12] ); + + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[12], w_fp[4], w_fp[11], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[2] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[8] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[10] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + + // *** DIAGRAM 5 OF 72 *** + + // Wavefunction(s) for diagram number 5 + // (none) + + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[3], w_fp[8], w_fp[11], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[2] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[10] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[11] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + + // *** DIAGRAM 6 OF 72 *** + + // Wavefunction(s) for diagram number 6 + FFV1P0_3( w_fp[3], w_fp[4], COUPs[1], 0., 0., w_fp[8] ); + + // Amplitude(s) for diagram number 6 + VVV5_0( w_fp[6], w_fp[11], w_fp[8], COUPs[0], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[0] -= 1. / 2. * amp_sv[0]; + jamp_sv[3] += 1. / 2. * amp_sv[0]; + jamp_sv[8] += 1. / 2. * amp_sv[0]; + jamp_sv[11] -= 1. / 2. 
* amp_sv[0]; + + // *** DIAGRAM 7 OF 72 *** + + // Wavefunction(s) for diagram number 7 + FFV1_1( w_fp[2], w_fp[6], COUPs[1], cIPD[0], cIPD[1], w_fp[13] ); + + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[5], w_fp[13], w_fp[8], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[1] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[8] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[9] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + + // *** DIAGRAM 8 OF 72 *** + + // Wavefunction(s) for diagram number 8 + // (none) + + // Amplitude(s) for diagram number 8 + FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[1] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[9] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[11] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + + // *** DIAGRAM 9 OF 72 *** + + // Wavefunction(s) for diagram number 9 + // (none) + + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[3], w_fp[13], w_fp[10], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[0] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[8] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + + // *** DIAGRAM 10 OF 72 *** + + // Wavefunction(s) for diagram number 10 + // (none) + + // Amplitude(s) for diagram number 10 + FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[0] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[8] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + + // *** DIAGRAM 11 OF 72 *** + + // Wavefunction(s) for diagram number 11 + FFV1_1( w_fp[2], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[12] ); + FFV1_2( w_fp[3], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[13] ); + FFV1P0_3( w_fp[5], w_fp[12], COUPs[1], 0., 0., w_fp[9] ); + + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[13], w_fp[4], w_fp[9], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[0] -= 1. / 2. * amp_sv[0]; + jamp_sv[5] += 1. / 6. * amp_sv[0]; + + // *** DIAGRAM 12 OF 72 *** + + // Wavefunction(s) for diagram number 12 + // (none) + + // Amplitude(s) for diagram number 12 + FFV1_0( w_fp[13], w_fp[12], w_fp[10], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[0] -= 1. / 6. * amp_sv[0]; + jamp_sv[5] += 1. / 2. 
* amp_sv[0]; + + // *** DIAGRAM 13 OF 72 *** + + // Wavefunction(s) for diagram number 13 + FFV1_1( w_fp[4], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[6] ); + FFV1P0_3( w_fp[3], w_fp[12], COUPs[1], 0., 0., w_fp[14] ); + + // Amplitude(s) for diagram number 13 + FFV1_0( w_fp[5], w_fp[6], w_fp[14], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[4] -= 1. / 6. * amp_sv[0]; + jamp_sv[5] += 1. / 2. * amp_sv[0]; + + // *** DIAGRAM 14 OF 72 *** + + // Wavefunction(s) for diagram number 14 + // (none) + + // Amplitude(s) for diagram number 14 + FFV1_0( w_fp[3], w_fp[6], w_fp[9], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[4] -= 1. / 2. * amp_sv[0]; + jamp_sv[5] += 1. / 6. * amp_sv[0]; + + // *** DIAGRAM 15 OF 72 *** + + // Wavefunction(s) for diagram number 15 + FFV1_2( w_fp[5], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[15] ); + + // Amplitude(s) for diagram number 15 + FFV1_0( w_fp[15], w_fp[4], w_fp[14], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[1] += 1. / 2. * amp_sv[0]; + jamp_sv[4] -= 1. / 6. * amp_sv[0]; + + // *** DIAGRAM 16 OF 72 *** + + // Wavefunction(s) for diagram number 16 + // (none) + + // Amplitude(s) for diagram number 16 + FFV1_0( w_fp[15], w_fp[12], w_fp[8], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[1] += 1. / 6. * amp_sv[0]; + jamp_sv[4] -= 1. / 2. * amp_sv[0]; + + // *** DIAGRAM 17 OF 72 *** + + // Wavefunction(s) for diagram number 17 + FFV1_1( w_fp[12], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[16] ); + + // Amplitude(s) for diagram number 17 + FFV1_0( w_fp[5], w_fp[16], w_fp[8], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[0] -= 1. / 2. * amp_sv[0]; + jamp_sv[1] += 1. / 6. * amp_sv[0]; + + // *** DIAGRAM 18 OF 72 *** + + // Wavefunction(s) for diagram number 18 + // (none) + + // Amplitude(s) for diagram number 18 + VVV5_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + + // *** DIAGRAM 19 OF 72 *** + + // Wavefunction(s) for diagram number 19 + // (none) + + // Amplitude(s) for diagram number 19 + FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[0] -= 1. / 6. * amp_sv[0]; + jamp_sv[1] += 1. / 2. 
* amp_sv[0]; + + // *** DIAGRAM 20 OF 72 *** + + // Wavefunction(s) for diagram number 20 + // (none) + + // Amplitude(s) for diagram number 20 + VVV5_0( w_fp[1], w_fp[10], w_fp[14], COUPs[0], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + + // *** DIAGRAM 21 OF 72 *** + + // Wavefunction(s) for diagram number 21 + FFV1_2( w_fp[3], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[14] ); + FFV1_1( w_fp[2], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[16] ); + FFV1P0_3( w_fp[14], w_fp[4], COUPs[1], 0., 0., w_fp[9] ); + + // Amplitude(s) for diagram number 21 + FFV1_0( w_fp[5], w_fp[16], w_fp[9], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[6] += 1. / 6. * amp_sv[0]; + jamp_sv[8] -= 1. / 2. * amp_sv[0]; + + // *** DIAGRAM 22 OF 72 *** + + // Wavefunction(s) for diagram number 22 + // (none) + + // Amplitude(s) for diagram number 22 + FFV1_0( w_fp[14], w_fp[16], w_fp[10], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[6] += 1. / 2. * amp_sv[0]; + jamp_sv[8] -= 1. / 6. * amp_sv[0]; + + // *** DIAGRAM 23 OF 72 *** + + // Wavefunction(s) for diagram number 23 + FFV1P0_3( w_fp[14], w_fp[2], COUPs[1], 0., 0., w_fp[12] ); + + // Amplitude(s) for diagram number 23 + FFV1_0( w_fp[5], w_fp[6], w_fp[12], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[4] -= 1. / 6. * amp_sv[0]; + jamp_sv[10] += 1. / 2. * amp_sv[0]; + + // *** DIAGRAM 24 OF 72 *** + + // Wavefunction(s) for diagram number 24 + // (none) + + // Amplitude(s) for diagram number 24 + FFV1_0( w_fp[14], w_fp[6], w_fp[11], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[4] -= 1. / 2. * amp_sv[0]; + jamp_sv[10] += 1. / 6. * amp_sv[0]; + + // *** DIAGRAM 25 OF 72 *** + + // Wavefunction(s) for diagram number 25 + // (none) + + // Amplitude(s) for diagram number 25 + FFV1_0( w_fp[15], w_fp[4], w_fp[12], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[4] -= 1. / 6. * amp_sv[0]; + jamp_sv[6] += 1. / 2. * amp_sv[0]; + + // *** DIAGRAM 26 OF 72 *** + + // Wavefunction(s) for diagram number 26 + // (none) + + // Amplitude(s) for diagram number 26 + FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[4] -= 1. / 2. * amp_sv[0]; + jamp_sv[6] += 1. / 6. 
* amp_sv[0]; + + // *** DIAGRAM 27 OF 72 *** + + // Wavefunction(s) for diagram number 27 + FFV1_2( w_fp[14], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[17] ); + + // Amplitude(s) for diagram number 27 + FFV1_0( w_fp[17], w_fp[4], w_fp[11], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[8] -= 1. / 2. * amp_sv[0]; + jamp_sv[10] += 1. / 6. * amp_sv[0]; + + // *** DIAGRAM 28 OF 72 *** + + // Wavefunction(s) for diagram number 28 + // (none) + + // Amplitude(s) for diagram number 28 + VVV5_0( w_fp[1], w_fp[11], w_fp[9], COUPs[0], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[4] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[8] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + + // *** DIAGRAM 29 OF 72 *** + + // Wavefunction(s) for diagram number 29 + // (none) + + // Amplitude(s) for diagram number 29 + FFV1_0( w_fp[17], w_fp[2], w_fp[10], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[8] -= 1. / 6. * amp_sv[0]; + jamp_sv[10] += 1. / 2. * amp_sv[0]; + + // *** DIAGRAM 30 OF 72 *** + + // Wavefunction(s) for diagram number 30 + // (none) + + // Amplitude(s) for diagram number 30 + VVV5_0( w_fp[1], w_fp[10], w_fp[12], COUPs[0], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[6] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + + // *** DIAGRAM 31 OF 72 *** + + // Wavefunction(s) for diagram number 31 + FFV1_1( w_fp[4], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[12] ); + FFV1P0_3( w_fp[3], w_fp[12], COUPs[1], 0., 0., w_fp[17] ); + + // Amplitude(s) for diagram number 31 + FFV1_0( w_fp[5], w_fp[16], w_fp[17], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[6] += 1. / 6. * amp_sv[0]; + jamp_sv[7] -= 1. / 2. * amp_sv[0]; + + // *** DIAGRAM 32 OF 72 *** + + // Wavefunction(s) for diagram number 32 + FFV1P0_3( w_fp[5], w_fp[12], COUPs[1], 0., 0., w_fp[9] ); + + // Amplitude(s) for diagram number 32 + FFV1_0( w_fp[3], w_fp[16], w_fp[9], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[6] += 1. / 2. * amp_sv[0]; + jamp_sv[7] -= 1. / 6. * amp_sv[0]; + + // *** DIAGRAM 33 OF 72 *** + + // Wavefunction(s) for diagram number 33 + // (none) + + // Amplitude(s) for diagram number 33 + FFV1_0( w_fp[13], w_fp[2], w_fp[9], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[2] += 1. / 2. * amp_sv[0]; + jamp_sv[7] -= 1. / 6. 
* amp_sv[0]; + + // *** DIAGRAM 34 OF 72 *** + + // Wavefunction(s) for diagram number 34 + // (none) + + // Amplitude(s) for diagram number 34 + FFV1_0( w_fp[13], w_fp[12], w_fp[11], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[2] += 1. / 6. * amp_sv[0]; + jamp_sv[7] -= 1. / 2. * amp_sv[0]; + + // *** DIAGRAM 35 OF 72 *** + + // Wavefunction(s) for diagram number 35 + // (none) + + // Amplitude(s) for diagram number 35 + FFV1_0( w_fp[15], w_fp[2], w_fp[17], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[3] -= 1. / 2. * amp_sv[0]; + jamp_sv[6] += 1. / 6. * amp_sv[0]; + + // *** DIAGRAM 36 OF 72 *** + + // Wavefunction(s) for diagram number 36 + // (none) + + // Amplitude(s) for diagram number 36 + FFV1_0( w_fp[15], w_fp[12], w_fp[7], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[3] -= 1. / 6. * amp_sv[0]; + jamp_sv[6] += 1. / 2. * amp_sv[0]; + + // *** DIAGRAM 37 OF 72 *** + + // Wavefunction(s) for diagram number 37 + FFV1_1( w_fp[12], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[14] ); + + // Amplitude(s) for diagram number 37 + FFV1_0( w_fp[5], w_fp[14], w_fp[7], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[2] += 1. / 2. * amp_sv[0]; + jamp_sv[3] -= 1. / 6. * amp_sv[0]; + + // *** DIAGRAM 38 OF 72 *** + + // Wavefunction(s) for diagram number 38 + // (none) + + // Amplitude(s) for diagram number 38 + VVV5_0( w_fp[1], w_fp[7], w_fp[9], COUPs[0], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[6] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + + // *** DIAGRAM 39 OF 72 *** + + // Wavefunction(s) for diagram number 39 + // (none) + + // Amplitude(s) for diagram number 39 + FFV1_0( w_fp[3], w_fp[14], w_fp[11], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[2] += 1. / 6. * amp_sv[0]; + jamp_sv[3] -= 1. / 2. * amp_sv[0]; + + // *** DIAGRAM 40 OF 72 *** + + // Wavefunction(s) for diagram number 40 + // (none) + + // Amplitude(s) for diagram number 40 + VVV5_0( w_fp[1], w_fp[11], w_fp[17], COUPs[0], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[7] -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + + // *** DIAGRAM 41 OF 72 *** + + // Wavefunction(s) for diagram number 41 + FFV1_2( w_fp[5], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[17] ); + FFV1P0_3( w_fp[17], w_fp[4], COUPs[1], 0., 0., w_fp[14] ); + + // Amplitude(s) for diagram number 41 + FFV1_0( w_fp[3], w_fp[16], w_fp[14], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[7] -= 1. / 6. * amp_sv[0]; + jamp_sv[9] += 1. / 2. * amp_sv[0]; + + // *** DIAGRAM 42 OF 72 *** + + // Wavefunction(s) for diagram number 42 + // (none) + + // Amplitude(s) for diagram number 42 + FFV1_0( w_fp[17], w_fp[16], w_fp[8], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[7] -= 1. / 2. * amp_sv[0]; + jamp_sv[9] += 1. / 6. * amp_sv[0]; + + // *** DIAGRAM 43 OF 72 *** + + // Wavefunction(s) for diagram number 43 + FFV1P0_3( w_fp[17], w_fp[2], COUPs[1], 0., 0., w_fp[9] ); + + // Amplitude(s) for diagram number 43 + FFV1_0( w_fp[13], w_fp[4], w_fp[9], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[5] += 1. / 6. * amp_sv[0]; + jamp_sv[7] -= 1. / 2. * amp_sv[0]; + + // *** DIAGRAM 44 OF 72 *** + + // Wavefunction(s) for diagram number 44 + // (none) + + // Amplitude(s) for diagram number 44 + FFV1_0( w_fp[13], w_fp[2], w_fp[14], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[5] += 1. / 2. * amp_sv[0]; + jamp_sv[7] -= 1. / 6. * amp_sv[0]; + + // *** DIAGRAM 45 OF 72 *** + + // Wavefunction(s) for diagram number 45 + // (none) + + // Amplitude(s) for diagram number 45 + FFV1_0( w_fp[3], w_fp[6], w_fp[9], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[5] += 1. / 6. * amp_sv[0]; + jamp_sv[11] -= 1. / 2. * amp_sv[0]; + + // *** DIAGRAM 46 OF 72 *** + + // Wavefunction(s) for diagram number 46 + // (none) + + // Amplitude(s) for diagram number 46 + FFV1_0( w_fp[17], w_fp[6], w_fp[7], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[5] += 1. / 2. * amp_sv[0]; + jamp_sv[11] -= 1. / 6. * amp_sv[0]; + + // *** DIAGRAM 47 OF 72 *** + + // Wavefunction(s) for diagram number 47 + FFV1_2( w_fp[17], w_fp[1], COUPs[1], cIPD[0], cIPD[1], w_fp[12] ); + + // Amplitude(s) for diagram number 47 + FFV1_0( w_fp[12], w_fp[4], w_fp[7], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[9] += 1. / 2. * amp_sv[0]; + jamp_sv[11] -= 1. / 6. * amp_sv[0]; + + // *** DIAGRAM 48 OF 72 *** + + // Wavefunction(s) for diagram number 48 + // (none) + + // Amplitude(s) for diagram number 48 + VVV5_0( w_fp[1], w_fp[7], w_fp[14], COUPs[0], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[5] -= 1. 
/ 2. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + + // *** DIAGRAM 49 OF 72 *** + + // Wavefunction(s) for diagram number 49 + // (none) + + // Amplitude(s) for diagram number 49 + FFV1_0( w_fp[12], w_fp[2], w_fp[8], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[9] += 1. / 6. * amp_sv[0]; + jamp_sv[11] -= 1. / 2. * amp_sv[0]; + + // *** DIAGRAM 50 OF 72 *** + + // Wavefunction(s) for diagram number 50 + // (none) + + // Amplitude(s) for diagram number 50 + VVV5_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[7] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[11] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + + // *** DIAGRAM 51 OF 72 *** + + // Wavefunction(s) for diagram number 51 + FFV1_1( w_fp[16], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[9] ); + + // Amplitude(s) for diagram number 51 + FFV1_0( w_fp[5], w_fp[9], w_fp[8], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[8] -= 1. / 2. * amp_sv[0]; + jamp_sv[9] += 1. / 6. * amp_sv[0]; + + // *** DIAGRAM 52 OF 72 *** + + // Wavefunction(s) for diagram number 52 + VVV5P0_1( w_fp[0], w_fp[8], COUPs[0], 0., 0., w_fp[12] ); + + // Amplitude(s) for diagram number 52 + FFV1_0( w_fp[5], w_fp[16], w_fp[12], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[7] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[8] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + + // *** DIAGRAM 53 OF 72 *** + + // Wavefunction(s) for diagram number 53 + // (none) + + // Amplitude(s) for diagram number 53 + FFV1_0( w_fp[3], w_fp[9], w_fp[10], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[8] -= 1. / 6. * amp_sv[0]; + jamp_sv[9] += 1. / 2. * amp_sv[0]; + + // *** DIAGRAM 54 OF 72 *** + + // Wavefunction(s) for diagram number 54 + VVV5P0_1( w_fp[0], w_fp[10], COUPs[0], 0., 0., w_fp[9] ); + + // Amplitude(s) for diagram number 54 + FFV1_0( w_fp[3], w_fp[16], w_fp[9], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[6] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[9] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + + // *** DIAGRAM 55 OF 72 *** + + // Wavefunction(s) for diagram number 55 + FFV1_2( w_fp[13], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[16] ); + + // Amplitude(s) for diagram number 55 + FFV1_0( w_fp[16], w_fp[4], w_fp[11], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[0] -= 1. / 2. * amp_sv[0]; + jamp_sv[2] += 1. / 6. 
* amp_sv[0]; + + // *** DIAGRAM 56 OF 72 *** + + // Wavefunction(s) for diagram number 56 + VVV5P0_1( w_fp[0], w_fp[11], COUPs[0], 0., 0., w_fp[14] ); + + // Amplitude(s) for diagram number 56 + FFV1_0( w_fp[13], w_fp[4], w_fp[14], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[7] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + + // *** DIAGRAM 57 OF 72 *** + + // Wavefunction(s) for diagram number 57 + // (none) + + // Amplitude(s) for diagram number 57 + FFV1_0( w_fp[16], w_fp[2], w_fp[10], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[0] -= 1. / 6. * amp_sv[0]; + jamp_sv[2] += 1. / 2. * amp_sv[0]; + + // *** DIAGRAM 58 OF 72 *** + + // Wavefunction(s) for diagram number 58 + // (none) + + // Amplitude(s) for diagram number 58 + FFV1_0( w_fp[13], w_fp[2], w_fp[9], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + + // *** DIAGRAM 59 OF 72 *** + + // Wavefunction(s) for diagram number 59 + FFV1_1( w_fp[6], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[13] ); + + // Amplitude(s) for diagram number 59 + FFV1_0( w_fp[5], w_fp[13], w_fp[7], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[10] += 1. / 2. * amp_sv[0]; + jamp_sv[11] -= 1. / 6. * amp_sv[0]; + + // *** DIAGRAM 60 OF 72 *** + + // Wavefunction(s) for diagram number 60 + VVV5P0_1( w_fp[0], w_fp[7], COUPs[0], 0., 0., w_fp[16] ); + + // Amplitude(s) for diagram number 60 + FFV1_0( w_fp[5], w_fp[6], w_fp[16], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[10] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + + // *** DIAGRAM 61 OF 72 *** + + // Wavefunction(s) for diagram number 61 + // (none) + + // Amplitude(s) for diagram number 61 + FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[10] += 1. / 6. * amp_sv[0]; + jamp_sv[11] -= 1. / 2. * amp_sv[0]; + + // *** DIAGRAM 62 OF 72 *** + + // Wavefunction(s) for diagram number 62 + // (none) + + // Amplitude(s) for diagram number 62 + FFV1_0( w_fp[3], w_fp[6], w_fp[14], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[11] += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + + // *** DIAGRAM 63 OF 72 *** + + // Wavefunction(s) for diagram number 63 + FFV1_2( w_fp[15], w_fp[0], COUPs[1], cIPD[0], cIPD[1], w_fp[6] ); + + // Amplitude(s) for diagram number 63 + FFV1_0( w_fp[6], w_fp[4], w_fp[7], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[1] += 1. / 2. * amp_sv[0]; + jamp_sv[3] -= 1. / 6. * amp_sv[0]; + + // *** DIAGRAM 64 OF 72 *** + + // Wavefunction(s) for diagram number 64 + // (none) + + // Amplitude(s) for diagram number 64 + FFV1_0( w_fp[15], w_fp[4], w_fp[16], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[1] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[6] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + + // *** DIAGRAM 65 OF 72 *** + + // Wavefunction(s) for diagram number 65 + // (none) + + // Amplitude(s) for diagram number 65 + FFV1_0( w_fp[6], w_fp[2], w_fp[8], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[1] += 1. / 6. * amp_sv[0]; + jamp_sv[3] -= 1. / 2. * amp_sv[0]; + + // *** DIAGRAM 66 OF 72 *** + + // Wavefunction(s) for diagram number 66 + // (none) + + // Amplitude(s) for diagram number 66 + FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[4] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + + // *** DIAGRAM 67 OF 72 *** + + // Wavefunction(s) for diagram number 67 + // (none) + + // Amplitude(s) for diagram number 67 + VVVV1_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[1] += 1. / 2. * amp_sv[0]; + jamp_sv[2] -= 1. / 2. * amp_sv[0]; + jamp_sv[9] -= 1. / 2. * amp_sv[0]; + jamp_sv[10] += 1. / 2. * amp_sv[0]; + VVVV9_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[1] += 1. / 2. * amp_sv[0]; + jamp_sv[5] -= 1. / 2. * amp_sv[0]; + jamp_sv[6] -= 1. / 2. * amp_sv[0]; + jamp_sv[10] += 1. / 2. * amp_sv[0]; + VVVV10_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[2] += 1. / 2. * amp_sv[0]; + jamp_sv[5] -= 1. / 2. * amp_sv[0]; + jamp_sv[6] -= 1. / 2. * amp_sv[0]; + jamp_sv[9] += 1. / 2. * amp_sv[0]; + + // *** DIAGRAM 68 OF 72 *** + + // Wavefunction(s) for diagram number 68 + // (none) + + // Amplitude(s) for diagram number 68 + VVV5_0( w_fp[1], w_fp[10], w_fp[16], COUPs[0], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[1] += 1. / 2. * amp_sv[0]; + jamp_sv[5] -= 1. / 2. * amp_sv[0]; + jamp_sv[6] -= 1. / 2. 
* amp_sv[0]; + jamp_sv[10] += 1. / 2. * amp_sv[0]; + + // *** DIAGRAM 69 OF 72 *** + + // Wavefunction(s) for diagram number 69 + // (none) + + // Amplitude(s) for diagram number 69 + VVV5_0( w_fp[1], w_fp[7], w_fp[9], COUPs[0], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[2] += 1. / 2. * amp_sv[0]; + jamp_sv[5] -= 1. / 2. * amp_sv[0]; + jamp_sv[6] -= 1. / 2. * amp_sv[0]; + jamp_sv[9] += 1. / 2. * amp_sv[0]; + + // *** DIAGRAM 70 OF 72 *** + + // Wavefunction(s) for diagram number 70 + // (none) + + // Amplitude(s) for diagram number 70 + VVVV1_0( w_fp[0], w_fp[1], w_fp[11], w_fp[8], COUPs[2], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[0] -= 1. / 2. * amp_sv[0]; + jamp_sv[3] += 1. / 2. * amp_sv[0]; + jamp_sv[8] += 1. / 2. * amp_sv[0]; + jamp_sv[11] -= 1. / 2. * amp_sv[0]; + VVVV9_0( w_fp[0], w_fp[1], w_fp[11], w_fp[8], COUPs[2], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[0] -= 1. / 2. * amp_sv[0]; + jamp_sv[4] += 1. / 2. * amp_sv[0]; + jamp_sv[7] += 1. / 2. * amp_sv[0]; + jamp_sv[11] -= 1. / 2. * amp_sv[0]; + VVVV10_0( w_fp[0], w_fp[1], w_fp[11], w_fp[8], COUPs[2], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[3] -= 1. / 2. * amp_sv[0]; + jamp_sv[4] += 1. / 2. * amp_sv[0]; + jamp_sv[7] += 1. / 2. * amp_sv[0]; + jamp_sv[8] -= 1. / 2. * amp_sv[0]; + + // *** DIAGRAM 71 OF 72 *** + + // Wavefunction(s) for diagram number 71 + // (none) + + // Amplitude(s) for diagram number 71 + VVV5_0( w_fp[1], w_fp[8], w_fp[14], COUPs[0], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[0] -= 1. / 2. * amp_sv[0]; + jamp_sv[4] += 1. / 2. * amp_sv[0]; + jamp_sv[7] += 1. / 2. * amp_sv[0]; + jamp_sv[11] -= 1. / 2. * amp_sv[0]; + + // *** DIAGRAM 72 OF 72 *** + + // Wavefunction(s) for diagram number 72 + // (none) + + // Amplitude(s) for diagram number 72 + VVV5_0( w_fp[1], w_fp[11], w_fp[12], COUPs[0], &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[3] -= 1. / 2. * amp_sv[0]; + jamp_sv[4] += 1. / 2. * amp_sv[0]; + jamp_sv[7] += 1. / 2. * amp_sv[0]; + jamp_sv[8] -= 1. / 2. * amp_sv[0]; + + // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color + if( jamp2_sv ) // disable color choice if nullptr + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[ncolor * iParity + icolC] += cxabs2( jamp_sv[icolC] ); + + // *** COLOR MATRIX BELOW *** + // (This method used to be called CPPProcess::matrix_1_gg_ttxttx()?) 
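The jamp2_sv weights accumulated just above (one |jamp|^2 per colour flow, summed over helicities) are meant to let the caller draw one leading-colour flow with probability proportional to its weight. The sketch below only illustrates such a cumulative-sum draw under that assumption; pickColorFlow and its arguments are hypothetical names, not part of the generated code, where the actual colour choice is made by the caller of calculate_wavefunctions.

#include <cstdio>

// Draw a colour flow index with probability proportional to jamp2[icol] (illustrative only)
int pickColorFlow( const double* jamp2, int ncol, double rndcol ) // rndcol: uniform random number in [0,1)
{
  double total = 0.;
  for( int icol = 0; icol < ncol; icol++ ) total += jamp2[icol];
  double cumulative = 0.;
  for( int icol = 0; icol < ncol; icol++ )
  {
    cumulative += jamp2[icol] / total;
    if( rndcol < cumulative ) return icol;
  }
  return ncol - 1; // guard against floating-point rounding
}

int main()
{
  const double jamp2[3] = { 0.1, 0.7, 0.2 }; // hypothetical weights for three colour flows
  std::printf( "chosen flow = %d\n", pickColorFlow( jamp2, 3, 0.5 ) ); // prints 1
  return 0;
}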
+ + // The color denominators (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 denom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] + + // The color matrix (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 cf[ncolor][ncolor] = { + { 48, 16, 16, 6, 0, 16, -2, 0, -6, -2, -2, 6 }, + { 16, 48, 6, 16, 16, 0, 0, -2, -2, -6, 6, -2 }, + { 16, 6, 48, 16, -2, 0, 0, 16, -2, 6, -6, -2 }, + { 6, 16, 16, 48, 0, -2, 16, 0, 6, -2, -2, -6 }, + { 0, 16, -2, 0, 48, 16, 16, 6, 0, -2, 16, 0 }, + { 16, 0, 0, -2, 16, 48, 6, 16, -2, 0, 0, 16 }, + { -2, 0, 0, 16, 16, 6, 48, 16, 16, 0, 0, -2 }, + { 0, -2, 16, 0, 6, 16, 16, 48, 0, 16, -2, 0 }, + { -6, -2, -2, 6, 0, -2, 16, 0, 48, 16, 16, 6 }, + { -2, -6, 6, -2, -2, 0, 0, 16, 16, 48, 6, 16 }, + { -2, 6, -6, -2, 16, 0, 0, -2, 16, 6, 48, 16 }, + { 6, -2, -2, -6, 0, 16, -2, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] + +#ifndef __CUDACC__ + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = cf[icol][icol] / denom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); +#endif + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages + { + // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV + for( int icol = 0; icol < ncolor; icol++ ) + jamp_sv_previous[icol] = jamp_sv[icol]; + MEs_previous = MEs; + continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + } + fptype_sv deltaMEs_previous = { 0 }; +#endif + + // Sum and square the color flows to get the matrix element + // (compute |M|^2 by squaring |M|, taking into account colours) + // Sum and square the color flows to get the matrix element + // (compute |M|^2 by squaring |M|, taking into account colours) + fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes + + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
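The rewrite motivated in the comments above (cf is real and symmetric, so the quadratic form reduces to AMA + BMB, and the "2*" and "/denom[icol]" factors can be folded into a pre-normalized upper-triangular matrix) can be checked on a toy example. The sketch below uses a hypothetical 2-colour matrix with equal denominators, like the arrays above where denom is 3 for every colour, and verifies numerically that the triangular form reproduces the naive double sum; it is an illustration, not part of the generated kernel.

#include <complex>
#include <cstdio>

int main()
{
  constexpr int ncol = 2;
  const double cf[ncol][ncol] = { { 48, 16 }, { 16, 48 } }; // hypothetical symmetric colour matrix
  const double denom[ncol] = { 3, 3 };                      // equal denominators, as in the arrays above
  const std::complex<double> jamp[ncol] = { { 1., 2. }, { -0.5, 0.25 } }; // hypothetical colour amplitudes

  // Naive form: sum_ij conj(J_i) * cf[i][j] * J_j / denom[i]
  double me2Naive = 0.;
  for( int i = 0; i < ncol; i++ )
    for( int j = 0; j < ncol; j++ )
      me2Naive += ( std::conj( jamp[i] ) * cf[i][j] * jamp[j] ).real() / denom[i];

  // Triangular form: A*M*A + B*M*B with A = Re(J), B = Im(J) and M pre-normalized
  // (diagonal cf[i][i]/denom[i], off-diagonal 2*cf[i][j]/denom[i] for j > i)
  double me2Tri = 0.;
  for( int i = 0; i < ncol; i++ )
  {
    double ztempR = ( cf[i][i] / denom[i] ) * jamp[i].real();
    double ztempI = ( cf[i][i] / denom[i] ) * jamp[i].imag();
    for( int j = i + 1; j < ncol; j++ )
    {
      ztempR += ( 2 * cf[i][j] / denom[i] ) * jamp[j].real();
      ztempI += ( 2 * cf[i][j] / denom[i] ) * jamp[j].imag();
    }
    me2Tri += jamp[i].real() * ztempR + jamp[i].imag() * ztempI;
  }
  std::printf( "naive = %f, triangular = %f\n", me2Naive, me2Tri ); // both print 82.333333
  return 0;
}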
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv jampR_sv[ncolor] = { 0 }; + fptype2_sv jampI_sv[ncolor] = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); + } +#endif + for( int icol = 0; icol < ncolor; icol++ ) + { +#ifndef __CUDACC__ + // === C++ START === + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs_previous += fpvsplit0( deltaMEs2 ); + deltaMEs += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + // === C++ END === +#else + // === CUDA START === + fptype2_sv ztempR_sv = { 0 }; + fptype2_sv ztempI_sv = { 0 }; + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); + fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); + ztempR_sv += cf[icol][jcol] * jampRj_sv; + ztempI_sv += cf[icol][jcol] * jampIj_sv; + } + deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; + // === CUDA END === +#endif + } + + // *** STORE THE RESULTS *** + + // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); + MEs_sv_previous += deltaMEs_previous; +#endif + /* +#ifdef __CUDACC__ + if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); +#else +#ifdef MGONGPU_CPPSIMD + if( cNGoodHel > 0 ) + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); +#else + if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); +#endif +#endif + */ + } // END LOOP ON IPARITY + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + CPPProcess::CPPProcess( bool verbose, + bool debug ) + : m_verbose( verbose ) + , m_debug( debug ) +#ifndef 
MGONGPU_HARDCODE_PARAM + , m_pars( 0 ) +#endif + , m_masses() + { + // Helicities for the process [NB do keep 'static' for this constexpr array, see issue #283] + // *** NB There is no automatic check yet that these are in the same order as Fortran! #569 *** + static constexpr short tHel[ncomb][mgOnGpu::npar] = { + { -1, -1, -1, 1, -1, 1 }, + { -1, -1, -1, 1, -1, -1 }, + { -1, -1, -1, 1, 1, 1 }, + { -1, -1, -1, 1, 1, -1 }, + { -1, -1, -1, -1, -1, 1 }, + { -1, -1, -1, -1, -1, -1 }, + { -1, -1, -1, -1, 1, 1 }, + { -1, -1, -1, -1, 1, -1 }, + { -1, -1, 1, 1, -1, 1 }, + { -1, -1, 1, 1, -1, -1 }, + { -1, -1, 1, 1, 1, 1 }, + { -1, -1, 1, 1, 1, -1 }, + { -1, -1, 1, -1, -1, 1 }, + { -1, -1, 1, -1, -1, -1 }, + { -1, -1, 1, -1, 1, 1 }, + { -1, -1, 1, -1, 1, -1 }, + { -1, 1, -1, 1, -1, 1 }, + { -1, 1, -1, 1, -1, -1 }, + { -1, 1, -1, 1, 1, 1 }, + { -1, 1, -1, 1, 1, -1 }, + { -1, 1, -1, -1, -1, 1 }, + { -1, 1, -1, -1, -1, -1 }, + { -1, 1, -1, -1, 1, 1 }, + { -1, 1, -1, -1, 1, -1 }, + { -1, 1, 1, 1, -1, 1 }, + { -1, 1, 1, 1, -1, -1 }, + { -1, 1, 1, 1, 1, 1 }, + { -1, 1, 1, 1, 1, -1 }, + { -1, 1, 1, -1, -1, 1 }, + { -1, 1, 1, -1, -1, -1 }, + { -1, 1, 1, -1, 1, 1 }, + { -1, 1, 1, -1, 1, -1 }, + { 1, -1, -1, 1, -1, 1 }, + { 1, -1, -1, 1, -1, -1 }, + { 1, -1, -1, 1, 1, 1 }, + { 1, -1, -1, 1, 1, -1 }, + { 1, -1, -1, -1, -1, 1 }, + { 1, -1, -1, -1, -1, -1 }, + { 1, -1, -1, -1, 1, 1 }, + { 1, -1, -1, -1, 1, -1 }, + { 1, -1, 1, 1, -1, 1 }, + { 1, -1, 1, 1, -1, -1 }, + { 1, -1, 1, 1, 1, 1 }, + { 1, -1, 1, 1, 1, -1 }, + { 1, -1, 1, -1, -1, 1 }, + { 1, -1, 1, -1, -1, -1 }, + { 1, -1, 1, -1, 1, 1 }, + { 1, -1, 1, -1, 1, -1 }, + { 1, 1, -1, 1, -1, 1 }, + { 1, 1, -1, 1, -1, -1 }, + { 1, 1, -1, 1, 1, 1 }, + { 1, 1, -1, 1, 1, -1 }, + { 1, 1, -1, -1, -1, 1 }, + { 1, 1, -1, -1, -1, -1 }, + { 1, 1, -1, -1, 1, 1 }, + { 1, 1, -1, -1, 1, -1 }, + { 1, 1, 1, 1, -1, 1 }, + { 1, 1, 1, 1, -1, -1 }, + { 1, 1, 1, 1, 1, 1 }, + { 1, 1, 1, 1, 1, -1 }, + { 1, 1, 1, -1, -1, 1 }, + { 1, 1, 1, -1, -1, -1 }, + { 1, 1, 1, -1, 1, 1 }, + { 1, 1, 1, -1, 1, -1 } }; +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * mgOnGpu::npar * sizeof( short ) ) ); +#else + memcpy( cHel, tHel, ncomb * mgOnGpu::npar * sizeof( short ) ); +#endif + } + + //-------------------------------------------------------------------------- + + CPPProcess::~CPPProcess() {} + + //-------------------------------------------------------------------------- + +#ifndef MGONGPU_HARDCODE_PARAM + // Initialize process (with parameters read from user cards) + void + CPPProcess::initProc( const std::string& param_card_name ) + { + // Instantiate the model class and set parameters that stay fixed during run + m_pars = Parameters_SMEFTsim_topU3l_MwScheme_UFO::getInstance(); + SLHAReader slha( param_card_name, m_verbose ); + m_pars->setIndependentParameters( slha ); + m_pars->setIndependentCouplings(); + //m_pars->setDependentParameters(); // now computed event-by-event (running alphas #373) + //m_pars->setDependentCouplings(); // now computed event-by-event (running alphas #373) + if( m_verbose ) + { + m_pars->printIndependentParameters(); + m_pars->printIndependentCouplings(); + //m_pars->printDependentParameters(); // now computed event-by-event (running alphas #373) + //m_pars->printDependentCouplings(); // now computed event-by-event (running alphas #373) + } + // Set external particle masses for this matrix element + m_masses.push_back( m_pars->ZERO ); + m_masses.push_back( m_pars->ZERO ); + m_masses.push_back( m_pars->mdl_MT ); + m_masses.push_back( 
m_pars->mdl_MT ); + m_masses.push_back( m_pars->mdl_MT ); + m_masses.push_back( m_pars->mdl_MT ); + // Read physics parameters like masses and couplings from user configuration files (static: initialize once) + // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory + const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; + //const cxtype tIPC[0] = { ... }; // nicoup=0 +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); + //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#else + memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); + //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 +#endif + //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + } +#else + // Initialize process (with hardcoded parameters) + void + CPPProcess::initProc( const std::string& /*param_card_name*/ ) + { + // Use hardcoded physics parameters + if( m_verbose ) + { + Parameters_SMEFTsim_topU3l_MwScheme_UFO::printIndependentParameters(); + Parameters_SMEFTsim_topU3l_MwScheme_UFO::printIndependentCouplings(); + //Parameters_SMEFTsim_topU3l_MwScheme_UFO::printDependentParameters(); // now computed event-by-event (running alphas #373) + //Parameters_SMEFTsim_topU3l_MwScheme_UFO::printDependentCouplings(); // now computed event-by-event (running alphas #373) + } + // Set external particle masses for this matrix element + m_masses.push_back( Parameters_SMEFTsim_topU3l_MwScheme_UFO::ZERO ); + m_masses.push_back( Parameters_SMEFTsim_topU3l_MwScheme_UFO::ZERO ); + m_masses.push_back( Parameters_SMEFTsim_topU3l_MwScheme_UFO::mdl_MT ); + m_masses.push_back( Parameters_SMEFTsim_topU3l_MwScheme_UFO::mdl_MT ); + m_masses.push_back( Parameters_SMEFTsim_topU3l_MwScheme_UFO::mdl_MT ); + m_masses.push_back( Parameters_SMEFTsim_topU3l_MwScheme_UFO::mdl_MT ); + } +#endif + + //-------------------------------------------------------------------------- + + // Retrieve the compiler that was used to build this module + const std::string + CPPProcess::getCompiler() + { + std::stringstream out; + // CUDA version (NVCC) + // [Use __NVCC__ instead of __CUDACC__ here!] + // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] + // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] +#ifdef __NVCC__ +#if defined __CUDACC_VER_MAJOR__ && defined __CUDACC_VER_MINOR__ && defined __CUDACC_VER_BUILD__ + out << "nvcc " << __CUDACC_VER_MAJOR__ << "." << __CUDACC_VER_MINOR__ << "." << __CUDACC_VER_BUILD__; +#else + out << "nvcc UNKNOWN"; +#endif + out << " ("; +#endif + // ICX version (either as CXX or as host compiler inside NVCC) +#if defined __INTEL_COMPILER +#error "icc is no longer supported: please use icx" +#elif defined __INTEL_LLVM_COMPILER // alternative: __INTEL_CLANG_COMPILER + out << "icx " << __INTEL_LLVM_COMPILER; +#ifdef __NVCC__ + out << ", "; +#else + out << " ("; +#endif +#endif + // CLANG version (either as CXX or as host compiler inside NVCC or inside ICX) +#if defined __clang__ +#if defined __clang_major__ && defined __clang_minor__ && defined __clang_patchlevel__ +#ifdef __APPLE__ + out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; +#else + out << "clang " << __clang_major__ << "." << __clang_minor__ << "." 
<< __clang_patchlevel__; + // GCC toolchain version inside CLANG + std::string tchainout; + std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; + std::unique_ptr<FILE, decltype( &pclose )> tchainpipe( popen( tchaincmd.c_str(), "r" ), pclose ); + if( !tchainpipe ) throw std::runtime_error( "`readelf ...` failed?" ); + std::array<char, 128> tchainbuf; + while( fgets( tchainbuf.data(), tchainbuf.size(), tchainpipe.get() ) != nullptr ) tchainout += tchainbuf.data(); + tchainout.pop_back(); // remove trailing newline +#if defined __NVCC__ or defined __INTEL_LLVM_COMPILER + out << ", gcc " << tchainout; +#else + out << " (gcc " << tchainout << ")"; +#endif +#endif +#else + out << "clang UNKNOWN"; +#endif +#else + // GCC version (either as CXX or as host compiler inside NVCC) +#if defined __GNUC__ && defined __GNUC_MINOR__ && defined __GNUC_PATCHLEVEL__ + out << "gcc " << __GNUC__ << "." << __GNUC_MINOR__ << "." << __GNUC_PATCHLEVEL__; +#else + out << "gcc UNKNOWN"; +#endif +#endif +#if defined __NVCC__ or defined __INTEL_LLVM_COMPILER + out << ")"; +#endif + return out.str(); + } + + //-------------------------------------------------------------------------- + + __global__ void /* clang-format off */ + computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] + fptype* allcouplings // output: couplings[nevt*ndcoup*2] +#ifndef __CUDACC__ + , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#endif + ) /* clang-format on */ + { +#ifdef __CUDACC__ + using namespace mg5amcGpu; + using G_ACCESS = DeviceAccessGs; + using C_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings ); +#else + using namespace mg5amcCpu; + using G_ACCESS = HostAccessGs; + using C_ACCESS = HostAccessCouplings; + for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) + { + const int ievt0 = ipagV * neppV; + const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); + fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); + G2COUP<G_ACCESS, C_ACCESS>( gs, couplings ); + } +#endif + } + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ /* clang-format off */ + __global__ void + sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities +#endif + bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) + { /* clang-format on */ + fptype allMEsLast = 0; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + allMEs[ievt] = 0; + for( int ihel = 0; ihel < ncomb; ihel++ ) + { + // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) + constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + constexpr unsigned int channelId = 0; // disable single-diagram channel enhancement + calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); +#else + calculate_wavefunctions( ihel, allmomenta, 
allcouplings, allMEs, jamp2_sv ); +#endif + if( allMEs[ievt] != allMEsLast ) + { + //if ( !isGoodHel[ihel] ) std::cout << "sigmaKin_getGoodHel ihel=" << ihel << " TRUE" << std::endl; + isGoodHel[ihel] = true; + } + allMEsLast = allMEs[ievt]; // running sum up to helicity ihel for event ievt + } + } +#else + void + sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities +#endif + bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] + //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] + // Allocate arrays at build time to contain at least 16 events (or at least neppV events if neppV>16, e.g. in future VPUs) + constexpr int maxtry0 = std::max( 16, neppV ); // 16, but at least neppV (otherwise the npagV loop does not even start) + fptype allMEsLast[maxtry0] = { 0 }; // allocated at build time: maxtry0 must be a constexpr + // Loop over only nevt events if nevt is < 16 (note that nevt is always >= neppV) + assert( nevt >= neppV ); + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; +#endif +#else + for( int ipagV = 0; ipagV < npagV; ++ipagV ) + { + const int ievt0 = ipagV * neppV; + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv /= helcolDenominators[0]; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId > 0 ) + { + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); + MEs_sv *= numerators_sv / denominators_sv; + } +#endif + //for( int ieppV = 0; ieppV < neppV; ieppV++ ) + //{ + // const unsigned int ievt = ipagV * neppV + ieppV; + // printf( "sigmaKin: ievt=%2d me=%f\n", ievt, allMEs[ievt] ); + //} + } +#endif + mgDebugFinalise(); + } + + //-------------------------------------------------------------------------- + +} // end namespace + +//========================================================================== diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h new file mode 100644 index 0000000000..8e1aa66442 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h @@ -0,0 +1,166 @@ +//========================================================================== +// This file has been automatically generated for CUDA/C++ standalone by +// MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-01-26 +// By the MadGraph5_aMC@NLO Development Team +// Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch +//========================================================================== + +#ifndef MG5_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx_H +#define MG5_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "Parameters_SMEFTsim_topU3l_MwScheme_UFO.h" + +#include <vector> + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //========================================================================== + // A class for calculating the matrix elements for + // Process: g g > t t~ t t~ WEIGHTED<=4 @1 + //-------------------------------------------------------------------------- + + class CPPProcess + { + public: /* clang-format off */ + + // Constructor (from command line arguments) + CPPProcess( bool verbose = false, bool debug = false ); + + // Destructor + ~CPPProcess(); + + // Initialize process (read model parameters from file) + virtual void initProc( const std::string& param_card_name ); + + // Retrieve the compiler that was used to build this module + static const std::string getCompiler(); + + // Other methods of this instance (???) + //const std::vector<fptype>& getMasses() const { return m_masses; } + //virtual int code() const{ return 1; } + //void setInitial( int inid1, int inid2 ){ id1 = inid1; id2 = inid2; } + //int getDim() const { return dim; } + //int getNIOParticles() const { return nexternal; } // nexternal was nioparticles + + // Accessors (unused so far: add four of them only to fix a clang build warning) + //bool verbose() const { return m_verbose; } + bool debug() const { return m_debug; } + + public: /* clang-format on */ + + // Hardcoded parameters for this process (constant class variables) + // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] + //static const int ninitial = mgOnGpu::npari; + //static const int nexternal = 6; // mgOnGpu::npar (nexternal was nioparticles) + //static const int nwavefuncs = 6; // mgOnGpu::nwf + //static const int namplitudes = 76; + //static const int ncomb = 64; // mgOnGpu::ncomb + + private: + + // Command line arguments (constructor) + bool m_verbose; + bool m_debug; + + // Physics model parameters to be read from file (initProc function) +#ifndef MGONGPU_HARDCODE_PARAM + Parameters_SMEFTsim_topU3l_MwScheme_UFO* m_pars; +#endif + std::vector<fptype> m_masses; // external particle masses + + // Other variables of this instance (???) + //int id1, id2; // initial particle ids + //cxtype** amp; // ???
+ }; + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + __global__ void + computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] + fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] +#else + __global__ void + computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] + fptype* allcouplings, // output: couplings[nevt*ndcoup*2] + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#endif + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ /* clang-format off */ + __global__ void + sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities +#endif + bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) +#else + __global__ void + sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities +#endif + bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#endif /* clang-format on */ + + //-------------------------------------------------------------------------- + + int // output: nGoodHel (the number of good helicity combinations out of ncomb) + sigmaKin_setGoodHel( const bool* isGoodHel ); // input: isGoodHel[ncomb] - host array + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ /* clang-format off */ + __global__ void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection + const fptype* allrndcol, // input: random numbers[nevt] for color selection + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: multichannel channel id (1 to #diagrams); 0 to disable channel enhancement + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities +#endif + int* allselhel, // output: helicity selection[nevt] + int* allselcol // output: helicity selection[nevt] + ); +#else + __global__ void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection + const fptype* allrndcol, // input: random numbers[nevt] for color selection + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const 
unsigned int channelId, // input: multichannel channel id (1 to #diagrams); 0 to disable channel enhancement + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities +#endif + int* allselhel, // output: helicity selection[nevt] + int* allselcol, // output: helicity selection[nevt] + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#endif /* clang-format on */ + + //-------------------------------------------------------------------------- +} + +#endif // MG5_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CrossSectionKernels.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CrossSectionKernels.cc new file mode 120000 index 0000000000..d9cb57c4bb --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CrossSectionKernels.cc @@ -0,0 +1 @@ +../CrossSectionKernels.cc \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CrossSectionKernels.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CrossSectionKernels.h new file mode 120000 index 0000000000..125b8758e4 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CrossSectionKernels.h @@ -0,0 +1 @@ +../CrossSectionKernels.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CudaRuntime.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CudaRuntime.h new file mode 120000 index 0000000000..ce9e1a487a --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CudaRuntime.h @@ -0,0 +1 @@ +../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/EventStatistics.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/EventStatistics.h new file mode 120000 index 0000000000..34c1a31129 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/EventStatistics.h @@ -0,0 +1 @@ +../EventStatistics.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MadgraphTest.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MadgraphTest.h new file mode 120000 index 0000000000..13942d64c4 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MadgraphTest.h @@ -0,0 +1 @@ +../MadgraphTest.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MatrixElementKernels.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MatrixElementKernels.cc new file mode 120000 index 0000000000..f800cb9638 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MatrixElementKernels.cc @@ -0,0 +1 @@ 
+../MatrixElementKernels.cc \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MatrixElementKernels.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MatrixElementKernels.h new file mode 120000 index 0000000000..ac47855d4f --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MatrixElementKernels.h @@ -0,0 +1 @@ +../MatrixElementKernels.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessAmplitudes.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessAmplitudes.h new file mode 120000 index 0000000000..448995d3e5 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessAmplitudes.h @@ -0,0 +1 @@ +../MemoryAccessAmplitudes.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessCouplings.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessCouplings.h new file mode 120000 index 0000000000..388f907580 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessCouplings.h @@ -0,0 +1 @@ +../MemoryAccessCouplings.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessCouplingsFixed.h new file mode 120000 index 0000000000..c795c16465 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessCouplingsFixed.h @@ -0,0 +1 @@ +../MemoryAccessCouplingsFixed.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessDenominators.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessDenominators.h new file mode 120000 index 0000000000..4ab752bdad --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessDenominators.h @@ -0,0 +1 @@ +../MemoryAccessDenominators.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessGs.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessGs.h new file mode 120000 index 0000000000..9d5e237faf --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessGs.h @@ -0,0 +1 @@ +../MemoryAccessGs.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessHelpers.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessHelpers.h new file mode 120000 index 0000000000..3692f9e4da --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessHelpers.h @@ -0,0 +1 @@ 
+../MemoryAccessHelpers.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessMatrixElements.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessMatrixElements.h new file mode 120000 index 0000000000..b04a26e4f6 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessMatrixElements.h @@ -0,0 +1 @@ +../MemoryAccessMatrixElements.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessMomenta.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessMomenta.h new file mode 120000 index 0000000000..4a5e8b375d --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessMomenta.h @@ -0,0 +1 @@ +../MemoryAccessMomenta.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessNumerators.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessNumerators.h new file mode 120000 index 0000000000..a525b6607d --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessNumerators.h @@ -0,0 +1 @@ +../MemoryAccessNumerators.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessRandomNumbers.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessRandomNumbers.h new file mode 120000 index 0000000000..844de324e7 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessRandomNumbers.h @@ -0,0 +1 @@ +../MemoryAccessRandomNumbers.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessVectors.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessVectors.h new file mode 120000 index 0000000000..d890503974 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessVectors.h @@ -0,0 +1 @@ +../MemoryAccessVectors.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessWavefunctions.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessWavefunctions.h new file mode 120000 index 0000000000..61a331899b --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessWavefunctions.h @@ -0,0 +1 @@ +../MemoryAccessWavefunctions.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessWeights.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessWeights.h new file mode 120000 index 0000000000..ec10cd2e17 --- /dev/null +++ 
b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryAccessWeights.h @@ -0,0 +1 @@ +../MemoryAccessWeights.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryBuffers.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryBuffers.h new file mode 120000 index 0000000000..600b7ad779 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/MemoryBuffers.h @@ -0,0 +1 @@ +../MemoryBuffers.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/RamboSamplingKernels.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/RamboSamplingKernels.cc new file mode 120000 index 0000000000..033b20955e --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/RamboSamplingKernels.cc @@ -0,0 +1 @@ +../RamboSamplingKernels.cc \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/RamboSamplingKernels.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/RamboSamplingKernels.h new file mode 120000 index 0000000000..ca354ce496 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/RamboSamplingKernels.h @@ -0,0 +1 @@ +../RamboSamplingKernels.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/RandomNumberKernels.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/RandomNumberKernels.cc new file mode 120000 index 0000000000..09a0e03a16 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/RandomNumberKernels.cc @@ -0,0 +1 @@ +../RandomNumberKernels.cc \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/RandomNumberKernels.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/RandomNumberKernels.h new file mode 120000 index 0000000000..5e8526a6ae --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/RandomNumberKernels.h @@ -0,0 +1 @@ +../RandomNumberKernels.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/check_sa.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/check_sa.cc new file mode 100644 index 0000000000..41367fd70b --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/check_sa.cc @@ -0,0 +1,1120 @@ +#include "mgOnGpuConfig.h" + +#include "BridgeKernels.h" +#include "CPPProcess.h" +#include "CrossSectionKernels.h" +#include "MatrixElementKernels.h" +#include "MemoryAccessMatrixElements.h" +#include "MemoryAccessMomenta.h" +#include "MemoryAccessRandomNumbers.h" +#include "MemoryAccessWeights.h" +#include "MemoryBuffers.h" +#include "RamboSamplingKernels.h" +#include "RandomNumberKernels.h" +#include "epoch_process_id.h" +#include "ompnumthreads.h" +#include 
"timermap.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define STRINGIFY( s ) #s +#define XSTRINGIFY( s ) STRINGIFY( s ) + +#define SEP79 79 + +bool +is_number( const char* s ) +{ + const char* t = s; + while( *t != '\0' && isdigit( *t ) ) + ++t; + return (int)strlen( s ) == t - s; +} + +int +usage( char* argv0, int ret = 1 ) +{ + std::cout << "Usage: " << argv0 + << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--common] [--rmbhst|--rmbdev] [--bridge]" + << " [#gpuBlocksPerGrid #gpuThreadsPerBlock] #iterations" << std::endl; + std::cout << std::endl; + std::cout << "The number of events per iteration is #gpuBlocksPerGrid * #gpuThreadsPerBlock" << std::endl; + std::cout << "(also in CPU/C++ code, where only the product of these two parameters counts)" << std::endl; + std::cout << std::endl; + std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; + std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; +#ifndef __CUDACC__ +#ifdef _OPENMP + std::cout << std::endl; + std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; + std::cout << "(OMP multithreading will be disabled if OMP_NUM_THREADS is not set)" << std::endl; +#endif +#endif + return ret; +} + +int +main( int argc, char** argv ) +{ + // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) +#ifdef __CUDACC__ + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + // DEFAULTS FOR COMMAND LINE ARGUMENTS + bool verbose = false; + bool debug = false; + bool perf = false; + bool json = false; + unsigned int niter = 0; + unsigned int gpublocks = 1; + unsigned int gputhreads = 32; + unsigned int jsondate = 0; + unsigned int jsonrun = 0; + unsigned int numvec[5] = { 0, 0, 0, 0, 0 }; + int nnum = 0; + // Random number mode + enum class RandomNumberMode + { + CommonRandom = 0, + CurandHost = 1, + CurandDevice = 2 + }; +#ifdef __CUDACC__ + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU +#elif not defined MGONGPU_HAS_NO_CURAND + RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand +#endif + // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) + enum class RamboSamplingMode + { + RamboHost = 1, + RamboDevice = 2 + }; +#ifdef __CUDACC__ + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU +#else + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU +#endif + // Bridge emulation mode (NB Bridge implies RamboHost!) 
+ bool bridge = false; + + // READ COMMAND LINE ARGUMENTS + for( int argn = 1; argn < argc; ++argn ) + { + std::string arg = argv[argn]; + if( ( arg == "--verbose" ) || ( arg == "-v" ) ) + { + verbose = true; + } + else if( ( arg == "--debug" ) || ( arg == "-d" ) ) + { + debug = true; + } + else if( ( arg == "--performance" ) || ( arg == "-p" ) ) + { + perf = true; + } + else if( ( arg == "--json" ) || ( arg == "-j" ) ) + { + json = true; + } + else if( arg == "--curdev" ) + { +#ifdef __CUDACC__ + rndgen = RandomNumberMode::CurandDevice; +#else + throw std::runtime_error( "CurandDevice is not supported on CPUs" ); +#endif + } + else if( arg == "--curhst" ) + { +#ifndef MGONGPU_HAS_NO_CURAND + rndgen = RandomNumberMode::CurandHost; +#else + throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); +#endif + } + else if( arg == "--common" ) + { + rndgen = RandomNumberMode::CommonRandom; + } + else if( arg == "--rmbdev" ) + { +#ifdef __CUDACC__ + rmbsmp = RamboSamplingMode::RamboDevice; +#else + throw std::runtime_error( "RamboDevice is not supported on CPUs" ); +#endif + } + else if( arg == "--rmbhst" ) + { + rmbsmp = RamboSamplingMode::RamboHost; + } + else if( arg == "--bridge" ) + { + bridge = true; + } + else if( is_number( argv[argn] ) && nnum < 5 ) + { + numvec[nnum++] = strtoul( argv[argn], NULL, 0 ); + } + else + { + return usage( argv[0] ); + } + } + + if( nnum == 3 || nnum == 5 ) + { + gpublocks = numvec[0]; + gputhreads = numvec[1]; + niter = numvec[2]; + if( nnum == 5 ) + { + jsondate = numvec[3]; + jsonrun = numvec[4]; + } + } + else if( nnum == 1 ) + { + niter = numvec[0]; + } + else + { + return usage( argv[0] ); + } + + if( niter == 0 ) + return usage( argv[0] ); + + if( bridge && rmbsmp == RamboSamplingMode::RamboDevice ) + { + std::cout << "WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost" << std::endl; + rmbsmp = RamboSamplingMode::RamboHost; + } + + if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::CurandDevice ) + { +#if not defined MGONGPU_HAS_NO_CURAND + std::cout << "WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost" << std::endl; + rndgen = RandomNumberMode::CurandHost; +#else + std::cout << "WARNING! RamboHost selected: cannot use CurandDevice, will use CommonRandom" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#endif + } + + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout + + using mgOnGpu::ntpbMAX; + if( gputhreads > ntpbMAX ) + { + std::cout << "ERROR! #threads/block should be <= " << ntpbMAX << std::endl; + return usage( argv[0] ); + } + +#ifndef __CUDACC__ +#ifdef _OPENMP + ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) +#endif +#endif + +#ifndef __CUDACC__ + // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation + // Note: this prevents a crash on pmpe04 but not on some github CI nodes? + // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] 
+ if( !MatrixElementKernelHost::hostSupportsSIMD() ) return 1; +#endif + + const unsigned int ndim = gpublocks * gputhreads; // number of threads in one GPU grid + const unsigned int nevt = ndim; // number of events in one iteration == number of GPU threads + + if( verbose ) + std::cout << "# iterations: " << niter << std::endl; + + // *** START THE NEW TIMERS *** + mgOnGpu::TimerMap timermap; + + // === STEP 0 - INITIALISE + +#ifdef __CUDACC__ + + // --- 00. Initialise cuda + // Instantiate a CudaRuntime at the beginnining of the application's main to + // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + const std::string cdinKey = "00 CudaInit"; + timermap.start( cdinKey ); + CudaRuntime cudaRuntime( debug ); +#endif + + // --- 0a. Initialise physics process + const std::string procKey = "0a ProcInit"; + timermap.start( procKey ); + + // Create a process object + CPPProcess process( verbose ); + + // Read param_card and set parameters + process.initProc( "../../Cards/param_card.dat" ); + const fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) + //const fptype energy = 91.2; // Ecms = 91.2 GeV (Z peak) + //const fptype energy = 0.100; // Ecms = 100 MeV (well below the Z peak, pure em scattering) + const int meGeVexponent = -( 2 * mgOnGpu::npar - 8 ); + + // --- 0b. Allocate memory structures + const std::string alloKey = "0b MemAlloc"; + timermap.start( alloKey ); + + // Memory buffers for random numbers for momenta +#ifndef __CUDACC__ + HostBufferRndNumMomenta hstRndmom( nevt ); +#else + PinnedHostBufferRndNumMomenta hstRndmom( nevt ); + DeviceBufferRndNumMomenta devRndmom( nevt ); +#endif + + // Memory buffers for sampling weights +#ifndef __CUDACC__ + HostBufferWeights hstWeights( nevt ); +#else + PinnedHostBufferWeights hstWeights( nevt ); + DeviceBufferWeights devWeights( nevt ); +#endif + + // Memory buffers for momenta +#ifndef __CUDACC__ + HostBufferMomenta hstMomenta( nevt ); +#else + PinnedHostBufferMomenta hstMomenta( nevt ); + DeviceBufferMomenta devMomenta( nevt ); +#endif + + // Memory buffers for Gs +#ifndef __CUDACC__ + HostBufferGs hstGs( nevt ); +#else + PinnedHostBufferGs hstGs( nevt ); + DeviceBufferGs devGs( nevt ); +#endif + + // Hardcode Gs for now (eventually they should come from Fortran MadEvent) + for( unsigned int i = 0; i < nevt; ++i ) + { + constexpr fptype fixedG = 1.2177157847767195; // fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) + hstGs[i] = fixedG; + //if ( i > 0 ) hstGs[i] = 0; // try hardcoding G only for event 0 + //hstGs[i] = i; + } + + // Memory buffers for matrix elements +#ifndef __CUDACC__ + HostBufferMatrixElements hstMatrixElements( nevt ); +#else + PinnedHostBufferMatrixElements hstMatrixElements( nevt ); + DeviceBufferMatrixElements devMatrixElements( nevt ); +#endif + + // Memory buffers for random numbers for helicity selection + // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** +#ifndef __CUDACC__ + HostBufferRndNumHelicity hstRndHel( nevt ); +#else + PinnedHostBufferRndNumHelicity hstRndHel( nevt ); + DeviceBufferRndNumHelicity devRndHel( nevt ); +#endif + + // Memory buffers for random numbers for color selection + // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** +#ifndef __CUDACC__ + HostBufferRndNumColor hstRndCol( nevt ); +#else + PinnedHostBufferRndNumColor 
hstRndCol( nevt ); + DeviceBufferRndNumColor devRndCol( nevt ); +#endif + + // Memory buffers for helicity selection +#ifndef __CUDACC__ + HostBufferSelectedHelicity hstSelHel( nevt ); +#else + PinnedHostBufferSelectedHelicity hstSelHel( nevt ); + DeviceBufferSelectedHelicity devSelHel( nevt ); +#endif + + // Memory buffers for color selection +#ifndef __CUDACC__ + HostBufferSelectedColor hstSelCol( nevt ); +#else + PinnedHostBufferSelectedColor hstSelCol( nevt ); + DeviceBufferSelectedColor devSelCol( nevt ); +#endif + + std::unique_ptr<double[]> genrtimes( new double[niter] ); + std::unique_ptr<double[]> rambtimes( new double[niter] ); + std::unique_ptr<double[]> wavetimes( new double[niter] ); + std::unique_ptr<double[]> wv3atimes( new double[niter] ); + + // --- 0c. Create curand or common generator + const std::string cgenKey = "0c GenCreat"; + timermap.start( cgenKey ); + // Allocate the appropriate RandomNumberKernel + std::unique_ptr<RandomNumberKernelBase> prnk; + if( rndgen == RandomNumberMode::CommonRandom ) + { + prnk.reset( new CommonRandomNumberKernel( hstRndmom ) ); + } +#ifndef MGONGPU_HAS_NO_CURAND + else if( rndgen == RandomNumberMode::CurandHost ) + { + const bool onDevice = false; + prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); + } +#ifdef __CUDACC__ + else + { + const bool onDevice = true; + prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); + } +#else + else + { + throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + } +#endif +#else + else + { + throw std::logic_error( "This application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) + } +#endif + + // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment] + std::unique_ptr<SamplingKernelBase> prsk; + if( rmbsmp == RamboSamplingMode::RamboHost ) + { + prsk.reset( new RamboSamplingKernelHost( energy, hstRndmom, hstMomenta, hstWeights, nevt ) ); + } + else + { +#ifdef __CUDACC__ + prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); +#else + throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) +#endif + } + + // --- 0c. Create matrix element kernel [keep this in 0c for the moment] + std::unique_ptr<MatrixElementKernelBase> pmek; + if( !bridge ) + { +#ifdef __CUDACC__ + pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); +#else + pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); +#endif + } + else + { +#ifdef __CUDACC__ + pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); +#else + pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); +#endif + } + int nGoodHel = 0; // the number of good helicities (out of ncomb) + + // --- 0c.
Create cross section kernel [keep this in 0c for the moment] + EventStatistics hstStats; + CrossSectionKernelHost xsk( hstWeights, hstMatrixElements, hstStats, nevt ); + + // ************************************** + // *** START MAIN LOOP ON #ITERATIONS *** + // ************************************** + + for( unsigned long int iiter = 0; iiter < niter; ++iiter ) + { + //std::cout << "Iteration #" << iiter+1 << " of " << niter << std::endl; + + // === STEP 1 OF 3 + + // *** START THE OLD-STYLE TIMER FOR RANDOM GEN *** + double genrtime = 0; + + // --- 1a. Seed rnd generator (to get same results on host and device in curand) + // [NB This should not be necessary using the host API: "Generation functions + // can be called multiple times on the same generator to generate successive + // blocks of results. For pseudorandom generators, multiple calls to generation + // functions will yield the same result as a single call with a large size."] + const unsigned long long seed = 20200805; + const std::string sgenKey = "1a GenSeed "; + timermap.start( sgenKey ); + prnk->seedGenerator( seed + iiter ); + genrtime += timermap.stop(); + + // --- 1b. Generate all relevant numbers to build nevt events (i.e. nevt phase space points) on the host + const std::string rngnKey = "1b GenRnGen"; + timermap.start( rngnKey ); + prnk->generateRnarray(); + //std::cout << "Got random numbers" << std::endl; + +#ifdef __CUDACC__ + if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) + { + // --- 1c. Copy rndmom from host to device + const std::string htodKey = "1c CpHTDrnd"; + genrtime += timermap.start( htodKey ); + copyDeviceFromHost( devRndmom, hstRndmom ); + } +#endif + + // *** STOP THE OLD-STYLE TIMER FOR RANDOM GEN *** + genrtime += timermap.stop(); + + // === STEP 2 OF 3 + // Fill in particle momenta for each of nevt events on the device + + // *** START THE OLD-STYLE TIMER FOR RAMBO *** + double rambtime = 0; + + // --- 2a. Fill in momenta of initial state particles on the device + const std::string riniKey = "2a RamboIni"; + timermap.start( riniKey ); + prsk->getMomentaInitial(); + //std::cout << "Got initial momenta" << std::endl; + + // --- 2b. Fill in momenta of final state particles using the RAMBO algorithm on the device + // (i.e. map random numbers to final-state particle momenta for each of nevt events) + const std::string rfinKey = "2b RamboFin"; + rambtime += timermap.start( rfinKey ); + prsk->getMomentaFinal(); + //std::cout << "Got final momenta" << std::endl; + +#ifdef __CUDACC__ + if( rmbsmp == RamboSamplingMode::RamboDevice ) + { + // --- 2c. CopyDToH Weights + const std::string cwgtKey = "2c CpDTHwgt"; + rambtime += timermap.start( cwgtKey ); + copyHostFromDevice( hstWeights, devWeights ); + + // --- 2d. CopyDToH Momenta + const std::string cmomKey = "2d CpDTHmom"; + rambtime += timermap.start( cmomKey ); + copyHostFromDevice( hstMomenta, devMomenta ); + } + else // only if ( ! bridge ) ??? + { + // --- 2c. CopyHToD Weights + const std::string cwgtKey = "2c CpHTDwgt"; + rambtime += timermap.start( cwgtKey ); + copyDeviceFromHost( devWeights, hstWeights ); + + // --- 2d. CopyHToD Momenta + const std::string cmomKey = "2d CpHTDmom"; + rambtime += timermap.start( cmomKey ); + copyDeviceFromHost( devMomenta, hstMomenta ); + } +#endif + + // *** STOP THE OLD-STYLE TIMER FOR RAMBO *** + rambtime += timermap.stop(); + + // === STEP 3 OF 3 + // Evaluate matrix elements for all nevt events + // 0d. 
For Bridge only, transpose C2F [renamed as 0d: this is not initialisation, but I want it out of the ME timers (#371)] + // 0e. (Only on the first iteration) Get good helicities [renamed as 0e: this IS initialisation!] + // 3a. Evaluate MEs on the device (include transpose F2C for Bridge) + // 3b. Copy MEs back from device to host + + // --- 0d. TransC2F + if( bridge ) + { + const std::string tc2fKey = "0d TransC2F"; + timermap.start( tc2fKey ); + dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F(); + } + +#ifdef __CUDACC__ + // --- 2d. CopyHToD Momenta + const std::string gKey = "0.. CpHTDg"; + rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! + copyDeviceFromHost( devGs, hstGs ); +#endif + + // --- 0e. SGoodHel + if( iiter == 0 ) + { + const std::string ghelKey = "0e SGoodHel"; + timermap.start( ghelKey ); + nGoodHel = pmek->computeGoodHelicities(); + } + + // *** START THE OLD-STYLE TIMERS FOR MATRIX ELEMENTS (WAVEFUNCTIONS) *** + double wavetime = 0; // calc plus copy + double wv3atime = 0; // calc only + + // --- 3a. SigmaKin + const std::string skinKey = "3a SigmaKin"; + timermap.start( skinKey ); + constexpr unsigned int channelId = 0; // TEMPORARY? disable multi-channel in check.exe and gcheck.exe #466 + pmek->computeMatrixElements( channelId ); + + // *** STOP THE NEW OLD-STYLE TIMER FOR MATRIX ELEMENTS (WAVEFUNCTIONS) *** + wv3atime += timermap.stop(); // calc only + wavetime += wv3atime; // calc plus copy + +#ifdef __CUDACC__ + if( !bridge ) + { + // --- 3b. CopyDToH MEs + const std::string cmesKey = "3b CpDTHmes"; + timermap.start( cmesKey ); + copyHostFromDevice( hstMatrixElements, devMatrixElements ); + // *** STOP THE OLD OLD-STYLE TIMER FOR MATRIX ELEMENTS (WAVEFUNCTIONS) *** + wavetime += timermap.stop(); // calc plus copy + } +#endif + + // === STEP 4 FINALISE LOOP + // --- 4@ Update event statistics + const std::string updtKey = "4@ UpdtStat"; + timermap.start( updtKey ); + xsk.updateEventStatistics(); + + // --- 4a Dump within the loop + const std::string loopKey = "4a DumpLoop"; + timermap.start( loopKey ); + genrtimes[iiter] = genrtime; + rambtimes[iiter] = rambtime; + wavetimes[iiter] = wavetime; + wv3atimes[iiter] = wv3atime; + + if( verbose ) + { + std::cout << std::string( SEP79, '*' ) << std::endl + << "Iteration #" << iiter + 1 << " of " << niter << std::endl; + if( perf ) std::cout << "Wave function time: " << wavetime << std::endl; + } + + for( unsigned int ievt = 0; ievt < nevt; ++ievt ) // Loop over all events in this iteration + { + if( verbose ) + { + // Display momenta + std::cout << "Momenta:" << std::endl; + for( int ipar = 0; ipar < mgOnGpu::npar; ipar++ ) + { + // NB: 'setw' affects only the next field (of any type) + std::cout << std::scientific // fixed format: affects all floats (default precision: 6) + << std::setw( 4 ) << ipar + 1 + << std::setw( 14 ) << MemoryAccessMomenta::ieventAccessIp4IparConst( hstMomenta.data(), ievt, 0, ipar ) + << std::setw( 14 ) << MemoryAccessMomenta::ieventAccessIp4IparConst( hstMomenta.data(), ievt, 1, ipar ) + << std::setw( 14 ) << MemoryAccessMomenta::ieventAccessIp4IparConst( hstMomenta.data(), ievt, 2, ipar ) + << std::setw( 14 ) << MemoryAccessMomenta::ieventAccessIp4IparConst( hstMomenta.data(), ievt, 3, ipar ) + << std::endl + << std::defaultfloat; // default format: affects all floats + } + std::cout << std::string( SEP79, '-' ) << std::endl; + // Display matrix elements + std::cout << " Matrix element = " << MemoryAccessMatrixElements::ieventAccessConst( hstMatrixElements.data(), ievt ) + <<
" GeV^" << meGeVexponent << std::endl; + std::cout << std::string( SEP79, '-' ) << std::endl; + } + } + + if( !( verbose || debug || perf ) ) + { + std::cout << "."; + } + } + + // ************************************** + // *** END MAIN LOOP ON #ITERATIONS *** + // ************************************** + + // === STEP 8 ANALYSIS + // --- 8a Analysis: compute stats after the loop + const std::string statKey = "8a CompStat"; + timermap.start( statKey ); + + double sumgtim = 0; + //double sqsgtim = 0; + double mingtim = genrtimes[0]; + double maxgtim = genrtimes[0]; + for( unsigned int iiter = 0; iiter < niter; ++iiter ) + { + sumgtim += genrtimes[iiter]; + //sqsgtim += genrtimes[iiter]*genrtimes[iiter]; + mingtim = std::min( mingtim, genrtimes[iiter] ); + maxgtim = std::max( maxgtim, genrtimes[iiter] ); + } + + double sumrtim = 0; + //double sqsrtim = 0; + double minrtim = rambtimes[0]; + double maxrtim = rambtimes[0]; + for( unsigned int iiter = 0; iiter < niter; ++iiter ) + { + sumrtim += rambtimes[iiter]; + //sqsrtim += rambtimes[iiter]*rambtimes[iiter]; + minrtim = std::min( minrtim, rambtimes[iiter] ); + maxrtim = std::max( maxrtim, rambtimes[iiter] ); + } + + double sumwtim = 0; + //double sqswtim = 0; + double minwtim = wavetimes[0]; + double maxwtim = wavetimes[0]; + for( unsigned int iiter = 0; iiter < niter; ++iiter ) + { + sumwtim += wavetimes[iiter]; + //sqswtim += wavetimes[iiter]*wavetimes[iiter]; + minwtim = std::min( minwtim, wavetimes[iiter] ); + maxwtim = std::max( maxwtim, wavetimes[iiter] ); + } + double meanwtim = sumwtim / niter; + //double stdwtim = std::sqrt( sqswtim / niter - meanwtim * meanwtim ); + + double sumw3atim = 0; + //double sqsw3atim = 0; + double minw3atim = wv3atimes[0]; + double maxw3atim = wv3atimes[0]; + for( unsigned int iiter = 0; iiter < niter; ++iiter ) + { + sumw3atim += wv3atimes[iiter]; + //sqsw3atim += wv3atimes[iiter]*wv3atimes[iiter]; + minw3atim = std::min( minw3atim, wv3atimes[iiter] ); + maxw3atim = std::max( maxw3atim, wv3atimes[iiter] ); + } + double meanw3atim = sumw3atim / niter; + //double stdw3atim = std::sqrt( sqsw3atim / niter - meanw3atim * meanw3atim ); + + const unsigned int nevtALL = hstStats.nevtALL; // total number of ALL events in all iterations + if( nevtALL != niter * nevt ) + std::cout << "ERROR! nevtALL mismatch " << nevtALL << " != " << niter * nevt << std::endl; // SANITY CHECK + int nabn = hstStats.nevtABN; + int nzero = hstStats.nevtZERO; + + // === STEP 9 FINALISE + + std::string rndgentxt; + if( rndgen == RandomNumberMode::CommonRandom ) + rndgentxt = "COMMON RANDOM HOST"; + else if( rndgen == RandomNumberMode::CurandHost ) + rndgentxt = "CURAND HOST"; + else if( rndgen == RandomNumberMode::CurandDevice ) + rndgentxt = "CURAND DEVICE"; +#ifdef __CUDACC__ + rndgentxt += " (CUDA code)"; +#else + rndgentxt += " (C++ code)"; +#endif + + // Workflow description summary + std::string wrkflwtxt; + // -- CUDA or C++? +#ifdef __CUDACC__ + wrkflwtxt += "CUD:"; +#else + wrkflwtxt += "CPP:"; +#endif + // -- DOUBLE or FLOAT? +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) +#elif defined MGONGPU_FPTYPE_DOUBLE + wrkflwtxt += "DBL+"; +#elif defined MGONGPU_FPTYPE_FLOAT + wrkflwtxt += "FLT+"; +#else + wrkflwtxt += "???+"; // no path to this statement +#endif + // -- CUCOMPLEX or THRUST or STD complex numbers? 
+#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_CUCOMPLEX + wrkflwtxt += "CUX:"; +#elif defined MGONGPU_CUCXTYPE_THRUST + wrkflwtxt += "THX:"; +#elif defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif +#else +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX + wrkflwtxt += "STX:"; +#elif defined MGONGPU_CPPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif +#endif + // -- COMMON or CURAND HOST or CURAND DEVICE random numbers? + if( rndgen == RandomNumberMode::CommonRandom ) + wrkflwtxt += "COMMON+"; + else if( rndgen == RandomNumberMode::CurandHost ) + wrkflwtxt += "CURHST+"; + else if( rndgen == RandomNumberMode::CurandDevice ) + wrkflwtxt += "CURDEV+"; + else + wrkflwtxt += "??????+"; // no path to this statement + // -- HOST or DEVICE rambo sampling? + if( rmbsmp == RamboSamplingMode::RamboHost ) + wrkflwtxt += "RMBHST+"; + else if( rmbsmp == RamboSamplingMode::RamboDevice ) + wrkflwtxt += "RMBDEV+"; + else + wrkflwtxt += "??????+"; // no path to this statement +#ifdef __CUDACC__ + // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? + if( !bridge ) + wrkflwtxt += "MESDEV"; + else + wrkflwtxt += "BRDDEV"; +#else + if( !bridge ) + wrkflwtxt += "MESHST"; // FIXME! allow this also in CUDA (eventually with various simd levels) + else + wrkflwtxt += "BRDHST"; +#endif + // -- SIMD matrix elements? +#if !defined MGONGPU_CPPSIMD + wrkflwtxt += "/none"; +#elif defined __AVX512VL__ +#ifdef MGONGPU_PVW512 + wrkflwtxt += "/512z"; +#else + wrkflwtxt += "/512y"; +#endif +#elif defined __AVX2__ + wrkflwtxt += "/avx2"; +#elif defined __SSE4_2__ +#ifdef __PPC__ + wrkflwtxt += "/ppcv"; +#elif defined __ARM_NEON__ + wrkflwtxt += "/neon"; +#else + wrkflwtxt += "/sse4"; +#endif +#else + wrkflwtxt += "/????"; // no path to this statement +#endif + // -- Has cxtype_v::operator[] bracket with non-const reference? +#if defined MGONGPU_CPPSIMD +#ifdef MGONGPU_HAS_CPPCXTYPEV_BRK + wrkflwtxt += "+CXVBRK"; +#else + wrkflwtxt += "+NOVBRK"; +#endif +#else + wrkflwtxt += "+NAVBRK"; // N/A +#endif + + // --- 9a Dump to screen + const std::string dumpKey = "9a DumpScrn"; + timermap.start( dumpKey ); + + if( !( verbose || debug || perf ) ) + { + std::cout << std::endl; + } + + if( perf ) + { +#ifndef __CUDACC__ +#ifdef _OPENMP + // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) + std::string nprocall; + std::unique_ptr nprocpipe( popen( "nproc --all", "r" ), pclose ); + if( !nprocpipe ) throw std::runtime_error( "`nproc --all` failed?" 
); + std::array nprocbuf; + while( fgets( nprocbuf.data(), nprocbuf.size(), nprocpipe.get() ) != nullptr ) nprocall += nprocbuf.data(); +#endif +#endif +#ifdef MGONGPU_CPPSIMD +#ifdef MGONGPU_HAS_CPPCXTYPEV_BRK + const std::string cxtref = " [cxtype_ref=YES]"; +#else + const std::string cxtref = " [cxtype_ref=NO]"; +#endif +#endif + // Dump all configuration parameters and all results + std::cout << std::string( SEP79, '*' ) << std::endl +#ifdef __CUDACC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#else + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" +#endif + << " [" << process.getCompiler() << "]" +#ifdef MGONGPU_INLINE_HELAMPS + << " [inlineHel=1]" +#else + << " [inlineHel=0]" +#endif +#ifdef MGONGPU_HARDCODE_PARAM + << " [hardcodePARAM=1]" << std::endl +#else + << " [hardcodePARAM=0]" << std::endl +#endif + << "NumBlocksPerGrid = " << gpublocks << std::endl + << "NumThreadsPerBlock = " << gputhreads << std::endl + << "NumIterations = " << niter << std::endl + << std::string( SEP79, '-' ) << std::endl; + std::cout << "Workflow summary = " << wrkflwtxt << std::endl +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + << "FP precision = MIXED (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl +#elif defined MGONGPU_FPTYPE_DOUBLE + << "FP precision = DOUBLE (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl +#elif defined MGONGPU_FPTYPE_FLOAT + << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl +#endif +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_CUCOMPLEX + << "Complex type = CUCOMPLEX" << std::endl +#elif defined MGONGPU_CUCXTYPE_THRUST + << "Complex type = THRUST::COMPLEX" << std::endl +#endif +#else + << "Complex type = STD::COMPLEX" << std::endl +#endif + << "RanNumb memory layout = AOSOA[" << neppR << "]" + << ( neppR == 1 ? " == AOS" : "" ) + << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl + << "Momenta memory layout = AOSOA[" << neppM << "]" + << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl +#ifdef __CUDACC__ + //<< "Wavefunction GPU memory = LOCAL" << std::endl +#else +#if !defined MGONGPU_CPPSIMD + << "Internal loops fptype_sv = SCALAR ('none': ~vector[" << neppV + << "], no SIMD)" << std::endl +#elif defined __AVX512VL__ +#ifdef MGONGPU_PVW512 + << "Internal loops fptype_sv = VECTOR[" << neppV + << "] ('512z': AVX512, 512bit)" << cxtref << std::endl +#else + << "Internal loops fptype_sv = VECTOR[" << neppV + << "] ('512y': AVX512, 256bit)" << cxtref << std::endl +#endif +#elif defined __AVX2__ + << "Internal loops fptype_sv = VECTOR[" << neppV + << "] ('avx2': AVX2, 256bit)" << cxtref << std::endl +#elif defined __SSE4_2__ + << "Internal loops fptype_sv = VECTOR[" << neppV +#ifdef __PPC__ + << "] ('sse4': PPC VSX, 128bit)" << cxtref << std::endl +#elif defined __ARM_NEON__ + << "] ('sse4': ARM NEON, 128bit)" << cxtref << std::endl +#else + << "] ('sse4': SSE4.2, 128bit)" << cxtref << std::endl +#endif +#else +#error Internal error: unknown SIMD build configuration +#endif +#endif + << "Random number generation = " << rndgentxt << std::endl +#ifndef __CUDACC__ +#ifdef _OPENMP + << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline +#endif +#endif + //<< "MatrixElements compiler = " << process.getCompiler() << std::endl + << std::string( SEP79, '-' ) << std::endl + << "HelicityComb Good/Tot = " << nGoodHel << "/" << mgOnGpu::ncomb << std::endl + << std::string( SEP79, '-' ) << std::endl + << "NumberOfEntries = " << niter << std::endl + << std::scientific // fixed format: affects all floats (default precision: 6) + << "TotalTime[Rnd+Rmb+ME] (123) = ( " << sumgtim + sumrtim + sumwtim << std::string( 16, ' ' ) << " ) sec" << std::endl + << "TotalTime[Rambo+ME] (23) = ( " << sumrtim + sumwtim << std::string( 16, ' ' ) << " ) sec" << std::endl + << "TotalTime[RndNumGen] (1) = ( " << sumgtim << std::string( 16, ' ' ) << " ) sec" << std::endl + << "TotalTime[Rambo] (2) = ( " << sumrtim << std::string( 16, ' ' ) << " ) sec" << std::endl + << "TotalTime[MatrixElems] (3) = ( " << sumwtim << std::string( 16, ' ' ) << " ) sec" << std::endl + << "MeanTimeInMatrixElems = ( " << meanwtim << std::string( 16, ' ' ) << " ) sec" << std::endl + << "[Min,Max]TimeInMatrixElems = [ " << minwtim + << " , " << maxwtim << " ] sec" << std::endl + //<< "StdDevTimeInMatrixElems = ( " << stdwtim << std::string(16, ' ') << " ) sec" << std::endl + << "TotalTime[MECalcOnly] (3a) = ( " << sumw3atim << std::string( 16, ' ' ) << " ) sec" << std::endl + << "MeanTimeInMECalcOnly = ( " << meanw3atim << std::string( 16, ' ' ) << " ) sec" << std::endl + << "[Min,Max]TimeInMECalcOnly = [ " << minw3atim + << " , " << maxw3atim << " ] sec" << std::endl + //<< "StdDevTimeInMECalcOnly = ( " << stdw3atim << std::string(16, ' ') << " ) sec" << std::endl + << std::string( SEP79, '-' ) << std::endl + //<< "ProcessID: = " << getpid() << std::endl + //<< "NProcesses = " << process.nprocesses << std::endl // assume nprocesses == 1 (#272 and #343) + << "TotalEventsComputed = " << nevtALL << std::endl + << "EvtsPerSec[Rnd+Rmb+ME](123) = ( " << nevtALL / ( sumgtim + sumrtim + sumwtim ) + << std::string( 16, ' ' ) << " ) sec^-1" << std::endl + << "EvtsPerSec[Rmb+ME] (23) = ( " << nevtALL / ( sumrtim + sumwtim ) + << std::string( 16, ' ' ) << " ) sec^-1" << std::endl + //<< "EvtsPerSec[RndNumGen] (1) = ( " << nevtALL/sumgtim + //<< std::string(16, ' ') << " ) sec^-1" << std::endl + //<< "EvtsPerSec[Rambo] (2) = ( " << nevtALL/sumrtim + //<< 
std::string(16, ' ') << " ) sec^-1" << std::endl + << "EvtsPerSec[MatrixElems] (3) = ( " << nevtALL / sumwtim + << std::string( 16, ' ' ) << " ) sec^-1" << std::endl + << "EvtsPerSec[MECalcOnly] (3a) = ( " << nevtALL / sumw3atim + << std::string( 16, ' ' ) << " ) sec^-1" << std::endl + << std::defaultfloat; // default format: affects all floats + std::cout << std::string( SEP79, '*' ) << std::endl + << hstStats; + } + + // --- 9b Dump to json + const std::string jsonKey = "9b DumpJson"; + timermap.start( jsonKey ); + + if( json ) + { + std::string jsonFileName = std::to_string( jsondate ) + "-perf-test-run" + std::to_string( jsonrun ) + ".json"; + jsonFileName = "./perf/data/" + jsonFileName; + + //Checks if file exists + std::ifstream fileCheck; + bool fileExists = false; + fileCheck.open( jsonFileName ); + if( fileCheck ) + { + fileExists = true; + fileCheck.close(); + } + + std::ofstream jsonFile; + jsonFile.open( jsonFileName, std::ios_base::app ); + if( !fileExists ) + { + jsonFile << "[" << std::endl; + } + else + { + //deleting the last bracket and outputting a ", " + std::string temp = "truncate -s-1 " + jsonFileName; + const char* command = temp.c_str(); + if( system( command ) != 0 ) + std::cout << "WARNING! Command '" << temp << "' failed" << std::endl; + jsonFile << ", " << std::endl; + } + + jsonFile << "{" << std::endl + << "\"NumIterations\": " << niter << ", " << std::endl + << "\"NumThreadsPerBlock\": " << gputhreads << ", " << std::endl + << "\"NumBlocksPerGrid\": " << gpublocks << ", " << std::endl +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + << "\"FP precision\": " + << "\"MIXED (NaN/abnormal=" << nabn << ")\"," << std::endl +#elif defined MGONGPU_FPTYPE_DOUBLE + << "\"FP precision\": " + << "\"DOUBLE (NaN/abnormal=" << nabn << ")\"," << std::endl +#elif defined MGONGPU_FPTYPE_FLOAT + << "\"FP precision\": " + << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl +#endif + << "\"Complex type\": " +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_CUCOMPLEX + << "\"CUCOMPLEX\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_THRUST + << "\"THRUST::COMPLEX\"," << std::endl +#endif +#else + << "\"STD::COMPLEX\"," << std::endl +#endif + << "\"RanNumb memory layout\": " + << "\"AOSOA[" << neppR << "]\"" + << ( neppR == 1 ? " == AOS" : "" ) << ", " << std::endl + << "\"Momenta memory layout\": " + << "\"AOSOA[" << neppM << "]\"" + << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl +#ifdef __CUDACC__ + //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl +#endif + << "\"Curand generation\": " + << "\"" << rndgentxt << "\"," << std::endl; + + double minelem = hstStats.minME; + double maxelem = hstStats.maxME; + double meanelem = hstStats.meanME(); + double stdelem = hstStats.stdME(); + + jsonFile << "\"NumberOfEntries\": " << niter << "," << std::endl + //<< std::scientific // Not sure about this + << "\"TotalTime[Rnd+Rmb+ME] (123)\": \"" + << std::to_string( sumgtim + sumrtim + sumwtim ) << " sec\"," + << std::endl + << "\"TotalTime[Rambo+ME] (23)\": \"" + << std::to_string( sumrtim + sumwtim ) << " sec\"," << std::endl + << "\"TotalTime[RndNumGen] (1)\": \"" + << std::to_string( sumgtim ) << " sec\"," << std::endl + << "\"TotalTime[Rambo] (2)\": \"" + << std::to_string( sumrtim ) << " sec\"," << std::endl + << "\"TotalTime[MatrixElems] (3)\": \"" + << std::to_string( sumwtim ) << " sec\"," << std::endl + << "\"MeanTimeInMatrixElems\": \"" + << std::to_string( meanwtim ) << " sec\"," << std::endl + << "\"MinTimeInMatrixElems\": \"" + << std::to_string( minwtim ) << " sec\"," << std::endl + << "\"MaxTimeInMatrixElems\": \"" + << std::to_string( maxwtim ) << " sec\"," << std::endl + //<< "ProcessID: = " << getpid() << std::endl + //<< "NProcesses = " << process.nprocesses << std::endl // assume nprocesses == 1 (#272 and #343) + << "\"TotalEventsComputed\": " << nevtALL << "," << std::endl + << "\"EvtsPerSec[Rnd+Rmb+ME](123)\": \"" + << std::to_string( nevtALL / ( sumgtim + sumrtim + sumwtim ) ) << " sec^-1\"," << std::endl + << "\"EvtsPerSec[Rmb+ME] (23)\": \"" + << std::to_string( nevtALL / ( sumrtim + sumwtim ) ) << " sec^-1\"," << std::endl + << "\"EvtsPerSec[MatrixElems] (3)\": \"" + << std::to_string( nevtALL / sumwtim ) << " sec^-1\"," << std::endl + << "\"EvtsPerSec[MECalcOnly] (3)\": \"" + << std::to_string( nevtALL / sumw3atim ) << " sec^-1\"," << std::endl + << "\"NumMatrixElems(notAbnormal)\": " << nevtALL - nabn << "," << std::endl + << std::scientific + << "\"MeanMatrixElemValue\": " + << "\"" << std::to_string( meanelem ) << " GeV^" + << std::to_string( meGeVexponent ) << "\"," << std::endl + << "\"StdErrMatrixElemValue\": " + << "\"" << std::to_string( stdelem / sqrt( nevtALL ) ) << " GeV^" + << std::to_string( meGeVexponent ) << "\"," << std::endl + << "\"StdDevMatrixElemValue\": " + << "\"" << std::to_string( stdelem ) + << " GeV^" << std::to_string( meGeVexponent ) << "\"," << std::endl + << "\"MinMatrixElemValue\": " + << "\"" << std::to_string( minelem ) << " GeV^" + << std::to_string( meGeVexponent ) << "\"," << std::endl + << "\"MaxMatrixElemValue\": " + << "\"" << std::to_string( maxelem ) << " GeV^" + << std::to_string( meGeVexponent ) << "\"," << std::endl; + + timermap.dump( jsonFile, true ); // NB For the active json timer this dumps a partial total + + jsonFile << "}" << std::endl; + jsonFile << "]"; + jsonFile.close(); + } + + // *** STOP THE NEW TIMERS *** + timermap.stop(); + if( perf ) + { + std::cout << std::string( SEP79, '*' ) << std::endl; + timermap.dump(); + std::cout << std::string( SEP79, '*' ) << std::endl; + } + + // [NB some resources like curand generators will be deleted here when stack-allocated classes go out of scope] + //std::cout << "ALL OK" << std::endl; + return 0; +} diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/cudacpp.mk 
b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/cudacpp.mk new file mode 120000 index 0000000000..252b38e27a --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/cudacpp.mk @@ -0,0 +1 @@ +../cudacpp.mk \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/epoch_process_id.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/epoch_process_id.h new file mode 100644 index 0000000000..064373c38c --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/epoch_process_id.h @@ -0,0 +1,11 @@ +#ifndef EPOCH_PROCESS_ID_H +#define EPOCH_PROCESS_ID_H 1 + +// No need to indicate EPOCHX_ any longer for auto-generated code +// However, keep the name of the file as it may be useful again for new manual developments +#define MG_EPOCH_PROCESS_ID SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX + +// For simplicity, define here the name of the process-dependent reference file for tests +#define MG_EPOCH_REFERENCE_FILE_NAME "../../../../../test/ref/dump_CPUTest.Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx.txt" + +#endif // EPOCH_PROCESS_ID_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/fbridge.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/fbridge.cc new file mode 120000 index 0000000000..cbcc1f579f --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/fbridge.cc @@ -0,0 +1 @@ +../fbridge.cc \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/fbridge.inc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/fbridge.inc new file mode 120000 index 0000000000..69598a6d2f --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/fbridge.inc @@ -0,0 +1 @@ +../fbridge.inc \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/fcheck_sa.f b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/fcheck_sa.f new file mode 100644 index 0000000000..0320b590a7 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/fcheck_sa.f @@ -0,0 +1,84 @@ + PROGRAM FCHECK_SA + IMPLICIT NONE + INCLUDE 'fsampler.inc' + INCLUDE 'fbridge.inc' + INTEGER*8 SAMPLER, BRIDGE ! 64bit memory addresses + INTEGER NEVTMAX, NEXTERNAL, NP4 + PARAMETER(NEVTMAX=2048*256, NEXTERNAL=6, NP4=4) + CHARACTER*32 ARG0, ARG1, ARG2, ARG3 + INTEGER NARG1, NARG2, NARG3 + INTEGER NEVT, NITER + INTEGER IEVT, IITER +c INTEGER IEXTERNAL + DOUBLE PRECISION MOMENTA(0:NP4-1, NEXTERNAL, NEVTMAX) ! c-array momenta[nevt][nexternal][np4] + DOUBLE PRECISION GS(NEVTMAX) + DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used + DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used + INTEGER*4 CHANID + PARAMETER(CHANID=0) ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 + DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 SELHEL(NEVTMAX) ! not yet used + INTEGER*4 SELCOL(NEVTMAX) ! not yet used + DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision + INTEGER NEVTOK ! 
exclude nan/abnormal MEs +C +C READ COMMAND LINE ARGUMENTS +C (NB: most errors will crash the program !) +C + IF ( COMMAND_ARGUMENT_COUNT() == 3 ) THEN + CALL GET_COMMAND_ARGUMENT(1,ARG1) + CALL GET_COMMAND_ARGUMENT(2,ARG2) + CALL GET_COMMAND_ARGUMENT(3,ARG3) + READ (ARG1,'(I4)') NARG1 + READ (ARG2,'(I4)') NARG2 + READ (ARG3,'(I4)') NARG3 + WRITE(6,*) "GPUBLOCKS= ", NARG1 + WRITE(6,*) "GPUTHREADS= ", NARG2 + WRITE(6,*) "NITERATIONS=", NARG3 + NEVT = NARG1 * NARG2 + NITER = NARG3 + IF ( NEVT > NEVTMAX ) THEN + WRITE(6,*) "ERROR! NEVT>NEVTMAX" + STOP + ENDIF + ELSE + CALL GET_COMMAND_ARGUMENT(0,ARG0) + WRITE(6,*) "Usage: ", TRIM(ARG0), + & " gpublocks gputhreads niterations" + STOP + ENDIF +C +C USE SAMPLER AND BRIDGE +C + NEVTOK = 0 + MES_SUM = 0 + CALL FBRIDGECREATE(BRIDGE, NEVT, NEXTERNAL, NP4) ! this must be at the beginning as it initialises the CUDA device + CALL FSAMPLERCREATE(SAMPLER, NEVT, NEXTERNAL, NP4) + DO IITER = 1, NITER + CALL FSAMPLERSEQUENCE(SAMPLER, MOMENTA) + DO IEVT = 1, NEVT + GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) + END DO + CALL FBRIDGESEQUENCE(BRIDGE, MOMENTA, GS, + & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL) + DO IEVT = 1, NEVT +c DO IEXTERNAL = 1, NEXTERNAL +c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, +c & MOMENTA(0, IEXTERNAL, IEVT), +c & MOMENTA(1, IEXTERNAL, IEVT), +c & MOMENTA(2, IEXTERNAL, IEVT), +c & MOMENTA(3, IEXTERNAL, IEVT) +c END DO +c WRITE(6,*) 'MES ', IEVT, MES(IEVT) +c WRITE(6,*) + IF ( .NOT. ISNAN(MES(IEVT)) ) THEN + NEVTOK = NEVTOK + 1 + MES_SUM = MES_SUM + MES(IEVT) + ENDIF + END DO + END DO + CALL FSAMPLERDELETE(SAMPLER) + CALL FBRIDGEDELETE(BRIDGE) ! this must be at the end as it shuts down the CUDA device + WRITE(6,*) 'Average Matrix Element:', MES_SUM/NEVT/NITER + WRITE(6,*) 'Abnormal MEs:', NEVT*NITER - NEVTOK + END PROGRAM FCHECK_SA diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/fsampler.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/fsampler.cc new file mode 120000 index 0000000000..521c828d41 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/fsampler.cc @@ -0,0 +1 @@ +../fsampler.cc \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/fsampler.inc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/fsampler.inc new file mode 120000 index 0000000000..4b0f3c2656 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/fsampler.inc @@ -0,0 +1 @@ +../fsampler.inc \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/gBridgeKernels.cu b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/gBridgeKernels.cu new file mode 120000 index 0000000000..12c1d49d13 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/gBridgeKernels.cu @@ -0,0 +1 @@ +BridgeKernels.cc \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/gCPPProcess.cu b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/gCPPProcess.cu new file mode 120000 index 
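The hard-coded coupling GS(IEVT) = 1.2177157847767195 above is simply g_s = sqrt(4*pi*alpha_s) evaluated at alpha_s = 0.118, as the inline comment notes; a standalone check of that arithmetic (not part of the generated code):

```cpp
#include <cmath>
#include <cstdio>

int main()
{
  const double alphas = 0.118;                    // fixed aS used by the drivers
  const double pi = std::acos( -1. );
  const double gs = std::sqrt( 4. * pi * alphas ); // g_s = sqrt(4*pi*alpha_s)
  std::printf( "%.16f\n", gs );                    // ~1.2177157847767195, the value hard-coded above
  return 0;
}
```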
0000000000..1fc8661d4e --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/gCPPProcess.cu @@ -0,0 +1 @@ +CPPProcess.cc \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/gCrossSectionKernels.cu b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/gCrossSectionKernels.cu new file mode 120000 index 0000000000..9a05a7b55a --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/gCrossSectionKernels.cu @@ -0,0 +1 @@ +CrossSectionKernels.cc \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/gMatrixElementKernels.cu b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/gMatrixElementKernels.cu new file mode 120000 index 0000000000..82415576cc --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/gMatrixElementKernels.cu @@ -0,0 +1 @@ +MatrixElementKernels.cc \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/gRamboSamplingKernels.cu b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/gRamboSamplingKernels.cu new file mode 120000 index 0000000000..8dbfaa6493 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/gRamboSamplingKernels.cu @@ -0,0 +1 @@ +RamboSamplingKernels.cc \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/gRandomNumberKernels.cu b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/gRandomNumberKernels.cu new file mode 120000 index 0000000000..26580cf106 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/gRandomNumberKernels.cu @@ -0,0 +1 @@ +RandomNumberKernels.cc \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/gcheck_sa.cu b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/gcheck_sa.cu new file mode 120000 index 0000000000..b99171c25e --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/gcheck_sa.cu @@ -0,0 +1 @@ +check_sa.cc \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/makefile b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/makefile new file mode 120000 index 0000000000..cd937e1d9e --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/makefile @@ -0,0 +1 @@ +cudacpp.mk \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/nvtx.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/nvtx.h new file mode 120000 index 0000000000..a2f268fa94 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/nvtx.h @@ -0,0 +1 @@ +../nvtx.h \ No newline at 
end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/ompnumthreads.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/ompnumthreads.h new file mode 120000 index 0000000000..4385e53fca --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/ompnumthreads.h @@ -0,0 +1 @@ +../ompnumthreads.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/perf.py b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/perf.py new file mode 120000 index 0000000000..b7d410aefa --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/perf.py @@ -0,0 +1 @@ +../perf.py \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/profile.sh b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/profile.sh new file mode 120000 index 0000000000..01080a084d --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/profile.sh @@ -0,0 +1 @@ +../profile.sh \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/runTest.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/runTest.cc new file mode 120000 index 0000000000..32afd3ca34 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/runTest.cc @@ -0,0 +1 @@ +../runTest.cc \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/testmisc.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/testmisc.cc new file mode 120000 index 0000000000..3b553cf3f8 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/testmisc.cc @@ -0,0 +1 @@ +../testmisc.cc \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/testxxx.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/testxxx.cc new file mode 120000 index 0000000000..045b2f10ea --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/testxxx.cc @@ -0,0 +1 @@ +../testxxx.cc \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/testxxx_cc_ref.txt b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/testxxx_cc_ref.txt new file mode 120000 index 0000000000..51764d98ac --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/testxxx_cc_ref.txt @@ -0,0 +1 @@ +../testxxx_cc_ref.txt \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/timer.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/timer.h new file mode 120000 index 0000000000..e161ad9e27 --- /dev/null +++ 
b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/timer.h @@ -0,0 +1 @@ +../timer.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/timermap.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/timermap.h new file mode 120000 index 0000000000..1479de7fc0 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/timermap.h @@ -0,0 +1 @@ +../timermap.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/RamboSamplingKernels.cc new file mode 100644 index 0000000000..ed2e042427 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/RamboSamplingKernels.cc @@ -0,0 +1,178 @@ +#include "RamboSamplingKernels.h" + +#include "CudaRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryAccessRandomNumbers.h" +#include "MemoryAccessWeights.h" +#include "MemoryBuffers.h" +#include "rambo.h" // inline implementation of RAMBO algorithms and kernels + +#include <sstream> + +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + + RamboSamplingKernelHost::RamboSamplingKernelHost( const fptype energy, // input: energy + const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] + BufferMomenta& momenta, // output: momenta + BufferWeights& weights, // output: weights + const size_t nevt ) + : SamplingKernelBase( energy, rndmom, momenta, weights ) + , NumberOfEvents( nevt ) + { + if( m_rndmom.isOnDevice() ) throw std::runtime_error( "RamboSamplingKernelHost: rndmom must be a host array" ); + if( m_momenta.isOnDevice() ) throw std::runtime_error( "RamboSamplingKernelHost: momenta must be a host array" ); + if( m_weights.isOnDevice() ) throw std::runtime_error( "RamboSamplingKernelHost: weights must be a host array" ); + if( this->nevt() != m_rndmom.nevt() ) throw std::runtime_error( "RamboSamplingKernelHost: nevt mismatch with rndmom" ); + if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "RamboSamplingKernelHost: nevt mismatch with momenta" ); + if( this->nevt() != m_weights.nevt() ) throw std::runtime_error( "RamboSamplingKernelHost: nevt mismatch with weights" ); + // Sanity checks for memory access (momenta buffer) + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); + if( nevt % neppM != 0 ) + { + std::ostringstream sstr; + sstr << "RamboSamplingKernelHost: nevt should be a multiple of neppM=" << neppM; + throw std::runtime_error( sstr.str() ); + } + // Sanity checks for memory access (random number buffer) + constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout + static_assert( ispoweroftwo( neppR ), "neppR is not a power of 2" ); + if( nevt % neppR != 0 ) + { + std::ostringstream sstr; + sstr << "RamboSamplingKernelHost: nevt should be a multiple of neppR=" << neppR; + throw std::runtime_error( sstr.str() ); + } + } + + //-------------------------------------------------------------------------- + + void + RamboSamplingKernelHost::getMomentaInitial() + { + constexpr auto getMomentaInitial = ramboGetMomentaInitial; + // ** START LOOP ON IEVT ** + for( size_t ievt = 0; ievt < nevt(); ++ievt ) + { + // NB all KernelLaunchers assume that memory
access can be decomposed as "accessField = decodeRecord( accessRecord )" + fptype* ievtMomenta = MemoryAccessMomenta::ieventAccessRecord( m_momenta.data(), ievt ); + getMomentaInitial( m_energy, ievtMomenta ); + } + // ** END LOOP ON IEVT ** + } + + //-------------------------------------------------------------------------- + + void + RamboSamplingKernelHost::getMomentaFinal() + { + constexpr auto getMomentaFinal = ramboGetMomentaFinal; + // ** START LOOP ON IEVT ** + for( size_t ievt = 0; ievt < nevt(); ++ievt ) + { + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + const fptype* ievtRndmom = MemoryAccessRandomNumbers::ieventAccessRecordConst( m_rndmom.data(), ievt ); + fptype* ievtMomenta = MemoryAccessMomenta::ieventAccessRecord( m_momenta.data(), ievt ); + fptype* ievtWeights = MemoryAccessWeights::ieventAccessRecord( m_weights.data(), ievt ); + getMomentaFinal( m_energy, ievtRndmom, ievtMomenta, ievtWeights ); + } + // ** END LOOP ON IEVT ** + } + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy + const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] + BufferMomenta& momenta, // output: momenta + BufferWeights& weights, // output: weights + const size_t gpublocks, + const size_t gputhreads ) + : SamplingKernelBase( energy, rndmom, momenta, weights ) + , NumberOfEvents( gpublocks * gputhreads ) + , m_gpublocks( gpublocks ) + , m_gputhreads( gputhreads ) + { + if( !m_rndmom.isOnDevice() ) throw std::runtime_error( "RamboSamplingKernelDevice: rndmom must be a device array" ); + if( !m_momenta.isOnDevice() ) throw std::runtime_error( "RamboSamplingKernelDevice: momenta must be a device array" ); + if( !m_weights.isOnDevice() ) throw std::runtime_error( "RamboSamplingKernelDevice: weights must be a device array" ); + if( m_gpublocks == 0 ) throw std::runtime_error( "RamboSamplingKernelDevice: gpublocks must be > 0" ); + if( m_gputhreads == 0 ) throw std::runtime_error( "RamboSamplingKernelDevice: gputhreads must be > 0" ); + if( this->nevt() != m_rndmom.nevt() ) throw std::runtime_error( "RamboSamplingKernelDevice: nevt mismatch with rndmom" ); + if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "RamboSamplingKernelDevice: nevt mismatch with momenta" ); + if( this->nevt() != m_weights.nevt() ) throw std::runtime_error( "RamboSamplingKernelDevice: nevt mismatch with weights" ); + // Sanity checks for memory access (momenta buffer) + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); + if( m_gputhreads % neppM != 0 ) + { + std::ostringstream sstr; + sstr << "RamboSamplingKernelHost: gputhreads should be a multiple of neppM=" << neppM; + throw std::runtime_error( sstr.str() ); + } + // Sanity checks for memory access (random number buffer) + constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout + static_assert( ispoweroftwo( neppR ), "neppR is not a power of 2" ); + if( m_gputhreads % neppR != 0 ) + { + std::ostringstream sstr; + sstr << "RamboSamplingKernelDevice: gputhreads should be a multiple of neppR=" << neppR; + throw std::runtime_error( sstr.str() ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + __global__ void + getMomentaInitialDevice( const fptype energy, + fptype* 
momenta ) + { + constexpr auto getMomentaInitial = ramboGetMomentaInitial; + return getMomentaInitial( energy, momenta ); + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + void + RamboSamplingKernelDevice::getMomentaInitial() + { + getMomentaInitialDevice<<<m_gpublocks, m_gputhreads>>>( m_energy, m_momenta.data() ); + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + __global__ void + getMomentaFinalDevice( const fptype energy, + const fptype* rndmom, + fptype* momenta, + fptype* wgts ) + { + constexpr auto getMomentaFinal = ramboGetMomentaFinal; + return getMomentaFinal( energy, rndmom, momenta, wgts ); + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + void + RamboSamplingKernelDevice::getMomentaFinal() + { + getMomentaFinalDevice<<<m_gpublocks, m_gputhreads>>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + } +#endif + + //-------------------------------------------------------------------------- +} diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/RamboSamplingKernels.h new file mode 100644 index 0000000000..f40433af4a --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/RamboSamplingKernels.h @@ -0,0 +1,129 @@ +#ifndef RAMBOSAMPLINGKERNELS_H +#define RAMBOSAMPLINGKERNELS_H 1 + +#include "mgOnGpuConfig.h" + +#include "MemoryBuffers.h" + +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + + // A base class encapsulating phase space sampling on a CPU host or on a GPU device + class SamplingKernelBase //: virtual public ISamplingKernel + { + protected: + + // Constructor from existing input and output buffers + SamplingKernelBase( const fptype energy, // input: energy + const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] + BufferMomenta& momenta, // output: momenta + BufferWeights& weights ) // output: weights + : m_energy( energy ) + , m_rndmom( rndmom ) + , m_momenta( momenta ) + , m_weights( weights ) + { + } + + public: + + // Destructor + virtual ~SamplingKernelBase() {} + + // Get momenta of initial state particles + virtual void getMomentaInitial() = 0; + + // Get momenta of final state particles and weights + virtual void getMomentaFinal() = 0; + + // Is this a host or device kernel?
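The device kernels above are launched over a grid of m_gpublocks x m_gputhreads threads, and the NumberOfEvents( gpublocks * gputhreads ) base in the device kernel's constructor implies one event per GPU thread. A self-contained sketch of that launch shape with a toy kernel (the kernel body is illustrative only, it is not the RAMBO sampling code):

```cpp
#include <cstdio>
#include <cuda_runtime.h>

__global__ void initWeightsKernel( double* wgts ) // toy stand-in for getMomentaFinalDevice
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per thread
  wgts[ievt] = 1.; // a real kernel would compute the phase-space weight here
}

int main()
{
  const int gpublocks = 4, gputhreads = 256;
  const int nevt = gpublocks * gputhreads;
  double* devWgts = nullptr;
  cudaMalloc( &devWgts, nevt * sizeof( double ) );
  initWeightsKernel<<<gpublocks, gputhreads>>>( devWgts ); // same <<<blocks, threads>>> shape as above
  cudaDeviceSynchronize();
  cudaFree( devWgts );
  std::printf( "launched %d x %d = %d threads\n", gpublocks, gputhreads, nevt );
  return 0;
}
```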
+ virtual bool isOnDevice() const = 0; + + protected: + + // The energy + const fptype m_energy; + + // The buffer for the input random numbers + const BufferRndNumMomenta& m_rndmom; + + // The buffer for the output momenta + BufferMomenta& m_momenta; + + // The buffer for the output weights + BufferWeights& m_weights; + }; + + //-------------------------------------------------------------------------- + + // A class encapsulating RAMBO phase space sampling on a CPU host + class RamboSamplingKernelHost final : public SamplingKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + RamboSamplingKernelHost( const fptype energy, // input: energy + const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] + BufferMomenta& momenta, // output: momenta + BufferWeights& weights, // output: weights + const size_t nevt ); + + // Destructor + virtual ~RamboSamplingKernelHost() {} + + // Get momenta of initial state particles + void getMomentaInitial() override final; + + // Get momenta of final state particles and weights + void getMomentaFinal() override final; + + // Is this a host or device kernel? + bool isOnDevice() const override final { return false; } + }; + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + // A class encapsulating RAMBO phase space sampling on a GPU device + class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + RamboSamplingKernelDevice( const fptype energy, // input: energy + const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] + BufferMomenta& momenta, // output: momenta + BufferWeights& weights, // output: weights + const size_t gpublocks, + const size_t gputhreads ); + + // Destructor + virtual ~RamboSamplingKernelDevice() {} + + // Get momenta of initial state particles + void getMomentaInitial() override final; + + // Get momenta of final state particles and weights + void getMomentaFinal() override final; + + // Is this a host or device kernel? 
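The sampling-kernel constructors above insist that the number of events (or gputhreads on the device) is a multiple of neppM and neppR because the momenta and random-number buffers are AOSOA arrays built from fixed-size "pages" of events. A sketch of the kind of index decomposition such a layout implies (the ordering, names and neppM value are illustrative assumptions, not the actual MemoryAccessMomenta implementation):

```cpp
#include <cstdio>

// Hypothetical flat index for an AOSOA buffer momenta[npagM][npar][np4][neppM]:
// the event index splits into a "page" index and an offset inside the page.
inline int aosoaIndex( int ievt, int ipar, int ip4, int npar, int np4, int neppM )
{
  const int ipagM = ievt / neppM; // page containing this event
  const int ieppM = ievt % neppM; // position of this event inside the page
  return ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM;
}

int main()
{
  const int npar = 6, np4 = 4, neppM = 4; // 6 external particles for gg->ttxttx; neppM=4 is an illustrative value
  // with this layout the same component of neppM consecutive events is contiguous,
  // enabling SIMD loads on the host and coalesced accesses on the GPU -
  // hence the requirement that the number of events be a multiple of neppM
  std::printf( "event 5, particle 2, component 0 -> flat index %d\n", aosoaIndex( 5, 2, 0, npar, np4, neppM ) );
  return 0;
}
```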
+ bool isOnDevice() const override final { return true; } + + private: + + // The number of blocks in the GPU grid + size_t m_gpublocks; + + // The number of threads in the GPU grid + size_t m_gputhreads; + }; +#endif + + //-------------------------------------------------------------------------- +} +#endif // RAMBOSAMPLINGKERNELS_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/RandomNumberKernels.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/RandomNumberKernels.cc new file mode 100644 index 0000000000..eb8bc09ea9 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/RandomNumberKernels.cc @@ -0,0 +1,149 @@ +#include "RandomNumberKernels.h" + +#include "CommonRandomNumbers.h" +#include "CudaRuntime.h" +#include "MemoryBuffers.h" + +#include <cassert> + +#ifndef MGONGPU_HAS_NO_CURAND /* clang-format off */ +#define checkCurand( code ){ assertCurand( code, __FILE__, __LINE__ ); } +inline void assertCurand( curandStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != CURAND_STATUS_SUCCESS ) + { + printf( "CurandAssert: %s:%d code=%d\n", file, line, code ); + if ( abort ) assert( code == CURAND_STATUS_SUCCESS ); + } +} +#endif /* clang-format on */ + +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + + CommonRandomNumberKernel::CommonRandomNumberKernel( BufferRndNumMomenta& rnarray ) + : RandomNumberKernelBase( rnarray ) + , m_seed( 20211220 ) + { + if( m_rnarray.isOnDevice() ) + throw std::runtime_error( "CommonRandomNumberKernel on host with a device random number array" ); + } + + //-------------------------------------------------------------------------- + + void CommonRandomNumberKernel::generateRnarray() + { + std::vector<double> rnd = CommonRandomNumbers::generate<double>( m_rnarray.size(), m_seed ); // NB: generate as double (HARDCODED) + std::copy( rnd.begin(), rnd.end(), m_rnarray.data() ); // NB: copy may imply a double-to-float conversion + } + + //-------------------------------------------------------------------------- + +#ifndef MGONGPU_HAS_NO_CURAND + CurandRandomNumberKernel::CurandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ) + : RandomNumberKernelBase( rnarray ) + , m_isOnDevice( onDevice ) + { + if( m_isOnDevice ) + { +#ifdef __CUDACC__ + if( !m_rnarray.isOnDevice() ) + throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); +#else + throw std::runtime_error( "CurandRandomNumberKernel does not support CurandDevice on CPU host" ); +#endif + } + else + { + if( m_rnarray.isOnDevice() ) + throw std::runtime_error( "CurandRandomNumberKernel on host with a device random number array" ); + } + createGenerator(); + } + + //-------------------------------------------------------------------------- + + CurandRandomNumberKernel::~CurandRandomNumberKernel() + { + destroyGenerator(); + } + + //-------------------------------------------------------------------------- + + void CurandRandomNumberKernel::seedGenerator( const unsigned int seed ) + { + if( m_isOnDevice ) + { + destroyGenerator(); // workaround for #429 + createGenerator(); // workaround for #429 + } + //printf( "seedGenerator: seed %d\n", seed ); + checkCurand( curandSetPseudoRandomGeneratorSeed( m_rnGen, seed ) ); + } + + //-------------------------------------------------------------------------- + + void CurandRandomNumberKernel::createGenerator() + { + // [NB Timings are for GenRnGen host|device (cpp|cuda) generation
of 256*32*1 events with nproc=1: rn(0) is host=0.0012s] + const curandRngType_t type = CURAND_RNG_PSEUDO_MTGP32; // 0.00082s | 0.00064s (FOR FAST TESTS) + //const curandRngType_t type = CURAND_RNG_PSEUDO_XORWOW; // 0.049s | 0.0016s + //const curandRngType_t type = CURAND_RNG_PSEUDO_MRG32K3A; // 0.71s | 0.0012s (better but slower, especially in c++) + //const curandRngType_t type = CURAND_RNG_PSEUDO_MT19937; // 21s | 0.021s + //const curandRngType_t type = CURAND_RNG_PSEUDO_PHILOX4_32_10; // 0.024s | 0.00026s (used to segfault?) + if( m_isOnDevice ) + { + checkCurand( curandCreateGenerator( &m_rnGen, type ) ); + } + else + { + checkCurand( curandCreateGeneratorHost( &m_rnGen, type ) ); + } + //checkCurand( curandSetGeneratorOrdering( *&m_rnGen, CURAND_ORDERING_PSEUDO_LEGACY ) ); // fails with code=104 (see #429) + checkCurand( curandSetGeneratorOrdering( *&m_rnGen, CURAND_ORDERING_PSEUDO_BEST ) ); + //checkCurand( curandSetGeneratorOrdering( *&m_rnGen, CURAND_ORDERING_PSEUDO_DYNAMIC ) ); // fails with code=104 (see #429) + //checkCurand( curandSetGeneratorOrdering( *&m_rnGen, CURAND_ORDERING_PSEUDO_SEEDED ) ); // fails with code=104 (see #429) + } + + //-------------------------------------------------------------------------- + + void CurandRandomNumberKernel::destroyGenerator() + { + checkCurand( curandDestroyGenerator( m_rnGen ) ); + } + + //-------------------------------------------------------------------------- + + void CurandRandomNumberKernel::generateRnarray() + { +#if defined MGONGPU_FPTYPE_DOUBLE + checkCurand( curandGenerateUniformDouble( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#elif defined MGONGPU_FPTYPE_FLOAT + checkCurand( curandGenerateUniform( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#endif + /* + printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); + fptype* data = m_rnarray.data(); +#ifdef __CUDACC__ + if( m_rnarray.isOnDevice() ) + { + data = new fptype[m_rnarray.size()](); + checkCuda( cudaMemcpy( data, m_rnarray.data(), m_rnarray.bytes(), cudaMemcpyDeviceToHost ) ); + } +#endif + for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) + printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); +#ifdef __CUDACC__ + if( m_rnarray.isOnDevice() ) delete[] data; +#endif + */ + } + + //-------------------------------------------------------------------------- +#endif +} diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/RandomNumberKernels.h new file mode 100644 index 0000000000..4d55f3d449 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/RandomNumberKernels.h @@ -0,0 +1,146 @@ +#ifndef RANDOMNUMBERKERNELS_H +#define RANDOMNUMBERKERNELS_H 1 + +#include "mgOnGpuConfig.h" + +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +#ifndef MGONGPU_HAS_NO_CURAND +#include "curand.h" +#endif + +#include "MemoryBuffers.h" + +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + + /* + // An interface encapsulating random number generation on a CPU host or on a GPU device + class IRandomNumberKernel + { + public: + + // Destructor + virtual ~IRandomNumberKernel(){} + + // Seed the random number generator + virtual void seedGenerator( const unsigned int seed ) = 0; + + // Generate the random number array + virtual void 
generateRnarray() = 0; + + // Is this a host or device kernel? + virtual bool isOnDevice() const = 0; + + }; + */ + + //-------------------------------------------------------------------------- + + // A base class encapsulating random number generation on a CPU host or on a GPU device + class RandomNumberKernelBase //: virtual public IRandomNumberKernel + { + + protected: + + // Constructor from an existing output buffer + RandomNumberKernelBase( BufferRndNumMomenta& rnarray ) + : m_rnarray( rnarray ) {} + + public: + + // Destructor + virtual ~RandomNumberKernelBase() {} + + // Seed the random number generator + virtual void seedGenerator( const unsigned int seed ) = 0; + + // Generate the random number array + virtual void generateRnarray() = 0; + + // Is this a host or device kernel? + virtual bool isOnDevice() const = 0; + + protected: + + // The buffer for the output random numbers + BufferRndNumMomenta& m_rnarray; + }; + + //-------------------------------------------------------------------------- + + // A class encapsulating common random number generation on a CPU host + class CommonRandomNumberKernel final : public RandomNumberKernelBase + { + public: + + // Constructor from an existing output buffer + CommonRandomNumberKernel( BufferRndNumMomenta& rnarray ); + + // Destructor + ~CommonRandomNumberKernel() {} + + // Seed the random number generator + void seedGenerator( const unsigned int seed ) override final { m_seed = seed; }; + + // Generate the random number array + void generateRnarray() override final; + + // Is this a host or device kernel? + bool isOnDevice() const override final { return false; } + + private: + + // The generator seed + unsigned int m_seed; + }; + + //-------------------------------------------------------------------------- + +#ifndef MGONGPU_HAS_NO_CURAND + // A class encapsulating CURAND random number generation on a CPU host or on a GPU device + class CurandRandomNumberKernel final : public RandomNumberKernelBase + { + public: + + // Constructor from an existing output buffer + CurandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ); + + // Destructor + ~CurandRandomNumberKernel(); + + // Seed the random number generator + void seedGenerator( const unsigned int seed ) override final; + + // Generate the random number array + void generateRnarray() override final; + + // Is this a host or device kernel? + bool isOnDevice() const override final { return m_isOnDevice; } + + private: + + // Create the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor) + void createGenerator(); + + // Destroy the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor) + void destroyGenerator(); + + private: + + // Is this a host or device kernel? + const bool m_isOnDevice; + + // The curand generator + curandGenerator_t m_rnGen; + }; + +#endif + + //-------------------------------------------------------------------------- +} +#endif // RANDOMNUMBERKERNELS_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk new file mode 100644 index 0000000000..2155495366 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk @@ -0,0 +1,798 @@ +#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) +#=== NB: different names (e.g. 
cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories + +CUDACPP_MAKEFILE = $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) +CUDACPP_SRC_MAKEFILE = cudacpp_src.mk + +#------------------------------------------------------------------------------- + +#=== Use bash in the Makefile (https://www.gnu.org/software/make/manual/html_node/Choosing-the-Shell.html) + +SHELL := /bin/bash + +#------------------------------------------------------------------------------- + +#=== Detect O/S and architecture (assuming uname is available, https://en.wikipedia.org/wiki/Uname) + +# Detect O/S kernel (Linux, Darwin...) +UNAME_S := $(shell uname -s) +###$(info UNAME_S='$(UNAME_S)') + +# Detect architecture (x86_64, ppc64le...) +UNAME_P := $(shell uname -p) +###$(info UNAME_P='$(UNAME_P)') + +#------------------------------------------------------------------------------- + +#=== Configure common compiler flags for C++ and CUDA + +INCFLAGS = -I. +OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here + +# Dependency on src directory +MG5AMC_COMMONLIB = mg5amc_common +LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +INCFLAGS += -I../../src + +# Dependency on tools directory +TOOLSDIR = ../../../../../tools +INCFLAGS += -I$(TOOLSDIR) + +# Dependency on test directory +TESTDIR = ../../../../../test +GTESTLIBDIR = $(TESTDIR)/googletest/build/lib/ +GTESTLIBS = $(GTESTLIBDIR)/libgtest.a $(GTESTLIBDIR)/libgtest_main.a + +#------------------------------------------------------------------------------- + +#=== Configure the C++ compiler + +CXXFLAGS = $(OPTFLAGS) -std=c++17 $(INCFLAGS) $(USE_NVTX) -Wall -Wshadow -Wextra +ifeq ($(shell $(CXX) --version | grep ^nvc++),) +CXXFLAGS+= -ffast-math # see issue #117 +endif +###CXXFLAGS+= -Ofast # performance is not different from --fast-math +###CXXFLAGS+= -g # FOR DEBUGGING ONLY + +# Optionally add debug flags to display the full list of flags (eg on Darwin) +###CXXFLAGS+= -v + +# Note: AR, CXX and FC are implicitly defined if not set externally +# See https://www.gnu.org/software/make/manual/html_node/Implicit-Variables.html + +#------------------------------------------------------------------------------- + +#=== Configure the CUDA compiler + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) +# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled +endif + +# If CUDA_HOME is not set, try to set it from the location of nvcc +ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") +endif + +# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + NVCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). 
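The MADGRAPH_CUDA_ARCHITECTURE selection described above controls which -gencode SASS/PTX targets get embedded in the binaries; at run time the device's compute capability must be covered by one of those targets (or be JIT-compilable from the embedded PTX). A standalone snippet, not part of cudacpp.mk or of the generated code, that prints what the current device reports:

```cpp
#include <cstdio>
#include <cuda_runtime.h>

int main()
{
  cudaDeviceProp prop;
  if( cudaGetDeviceProperties( &prop, 0 ) != cudaSuccess )
  {
    std::printf( "no CUDA device visible\n" );
    return 1;
  }
  // e.g. prop.major=7, prop.minor=0 on a V100 matches the default MADGRAPH_CUDA_ARCHITECTURE=70
  std::printf( "compute capability %d.%d -> include %d%d in MADGRAPH_CUDA_ARCHITECTURE\n",
               prop.major, prop.minor, prop.major, prop.minor );
  return 0;
}
```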
+ MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CULIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! + CUOPTFLAGS = -lineinfo + CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) +else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) +else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override NVCC= + override USE_NVTX= + override CULIBFLAGS= +endif + +# Set the host C++ compiler for nvcc via "-ccbin " +# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." 
is not supported) +CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + +# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) +ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) +CUFLAGS += -allow-unsupported-compiler +endif + +#------------------------------------------------------------------------------- + +#=== Configure ccache for C++ and CUDA builds + +# Enable ccache if USECCACHE=1 +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX:=ccache $(CXX) +endif +#ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) +# override AR:=ccache $(AR) +#endif +ifneq ($(NVCC),) + ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) + override NVCC:=ccache $(NVCC) + endif +endif + +#------------------------------------------------------------------------------- + +#=== Configure PowerPC-specific compiler flags for C++ and CUDA + +# PowerPC-specific CXX compiler flags (being reviewed) +ifeq ($(UNAME_P),ppc64le) + CXXFLAGS+= -mcpu=power9 -mtune=power9 # gains ~2-3% both for none and sse4 + # Throughput references without the extra flags below: none=1.41-1.42E6, sse4=2.15-2.19E6 + ###CXXFLAGS+= -DNO_WARN_X86_INTRINSICS # no change + ###CXXFLAGS+= -fpeel-loops # no change + ###CXXFLAGS+= -funroll-loops # gains ~1% for none, loses ~1% for sse4 + ###CXXFLAGS+= -ftree-vectorize # no change + ###CXXFLAGS+= -flto # would increase to none=4.08-4.12E6, sse4=4.99-5.03E6! +else + ###CXXFLAGS+= -flto # also on Intel this would increase throughputs by a factor 2 to 4... + ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) +endif + +# PowerPC-specific CUDA compiler flags (to be reviewed!) +ifeq ($(UNAME_P),ppc64le) + CUFLAGS+= -Xcompiler -mno-float128 +endif + +#------------------------------------------------------------------------------- + +#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN + +# Set the default OMPFLAGS choice +ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +override OMPFLAGS = -fopenmp +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) +override OMPFLAGS = -fopenmp +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) +else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) +override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) +else +override OMPFLAGS = -fopenmp +###override OMPFLAGS = # disable OpenMP MT (default before #575) +endif + +# Set the default AVX (vectorization) choice +ifeq ($(AVX),) + ifeq ($(UNAME_P),ppc64le) + ###override AVX = none + override AVX = sse4 + else ifeq ($(UNAME_P),arm) + ###override AVX = none + override AVX = sse4 + else ifeq ($(wildcard /proc/cpuinfo),) + override AVX = none + $(warning Using AVX='$(AVX)' because host SIMD features cannot be read from /proc/cpuinfo) + else ifeq ($(shell grep -m1 -c avx512vl /proc/cpuinfo)$(shell $(CXX) --version | grep ^clang),1) + override AVX = 512y + ###$(info Using AVX='$(AVX)' as no user input exists) + else + override AVX = avx2 + ifneq ($(shell grep -m1 -c avx512vl /proc/cpuinfo),1) + $(warning Using AVX='$(AVX)' because host does not support avx512vl) + else + $(warning Using AVX='$(AVX)' because this is faster than avx512vl for clang) + endif + endif +else + ###$(info Using AVX='$(AVX)' according 
to user input) +endif + +# Set the default FPTYPE (floating point type) choice +ifeq ($(FPTYPE),) + override FPTYPE = d +endif + +# Set the default HELINL (inline helicities?) choice +ifeq ($(HELINL),) + override HELINL = 0 +endif + +# Set the default HRDCOD (hardcode cIPD physics parameters?) choice +ifeq ($(HRDCOD),) + override HRDCOD = 0 +endif + +# Set the default RNDGEN (random number generator) choice +ifeq ($(NVCC),) + override RNDGEN = hasNoCurand +else ifeq ($(RNDGEN),) + override RNDGEN = hasCurand +endif + +# Export AVX, FPTYPE, HELINL, HRDCOD, RNDGEN, OMPFLAGS so that it is not necessary to pass them to the src Makefile too +export AVX +export FPTYPE +export HELINL +export HRDCOD +export RNDGEN +export OMPFLAGS + +#------------------------------------------------------------------------------- + +#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN + +# Set the build flags appropriate to OMPFLAGS +$(info OMPFLAGS=$(OMPFLAGS)) +CXXFLAGS += $(OMPFLAGS) + +# Set the build flags appropriate to each AVX choice (example: "make AVX=none") +# [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] +# [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] +$(info AVX=$(AVX)) +ifeq ($(UNAME_P),ppc64le) + ifeq ($(AVX),sse4) + override AVXFLAGS = -D__SSE4_2__ # Power9 VSX with 128 width (VSR registers) + else ifneq ($(AVX),none) + $(error Unknown AVX='$(AVX)': only 'none' and 'sse4' are supported on PowerPC for the moment) + endif +else ifeq ($(UNAME_P),arm) + ifeq ($(AVX),sse4) + override AVXFLAGS = -D__SSE4_2__ # ARM NEON with 128 width (Q/quadword registers) + else ifneq ($(AVX),none) + $(error Unknown AVX='$(AVX)': only 'none' and 'sse4' are supported on ARM for the moment) + endif +else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 + ifeq ($(AVX),none) + override AVXFLAGS = -mno-sse3 # no SIMD + else ifeq ($(AVX),sse4) + override AVXFLAGS = -mno-avx # SSE4.2 with 128 width (xmm registers) + else ifeq ($(AVX),avx2) + override AVXFLAGS = -march=haswell # AVX2 with 256 width (ymm registers) [DEFAULT for clang] + else ifeq ($(AVX),512y) + override AVXFLAGS = -march=skylake -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] + else ifeq ($(AVX),512z) + override AVXFLAGS = -march=skylake -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) + else + $(error Unknown AVX='$(AVX)': only 'none', 'sse4', 'avx2', '512y' and '512z' are supported) + endif +else + ifeq ($(AVX),none) + override AVXFLAGS = -march=x86-64 # no SIMD (see #588) + else ifeq ($(AVX),sse4) + override AVXFLAGS = -march=nehalem # SSE4.2 with 128 width (xmm registers) + else ifeq ($(AVX),avx2) + override AVXFLAGS = -march=haswell # AVX2 with 256 width (ymm registers) [DEFAULT for clang] + else ifeq ($(AVX),512y) + override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] + else ifeq ($(AVX),512z) + override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) + else + $(error Unknown AVX='$(AVX)': only 'none', 'sse4', 'avx2', '512y' and '512z' are supported) + endif +endif +# For the moment, use AVXFLAGS everywhere: eventually, use them only in encapsulated implementations? 
+CXXFLAGS+= $(AVXFLAGS) + +# Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") +$(info FPTYPE=$(FPTYPE)) +ifeq ($(FPTYPE),d) + CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE +else ifeq ($(FPTYPE),f) + CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT +else ifeq ($(FPTYPE),m) + CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT +else + $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) +endif + +# Set the build flags appropriate to each HELINL choice (example: "make HELINL=1") +$(info HELINL=$(HELINL)) +ifeq ($(HELINL),1) + CXXFLAGS += -DMGONGPU_INLINE_HELAMPS + CUFLAGS += -DMGONGPU_INLINE_HELAMPS +else ifneq ($(HELINL),0) + $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) +endif + +# Set the build flags appropriate to each HRDCOD choice (example: "make HRDCOD=1") +$(info HRDCOD=$(HRDCOD)) +ifeq ($(HRDCOD),1) + CXXFLAGS += -DMGONGPU_HARDCODE_PARAM + CUFLAGS += -DMGONGPU_HARDCODE_PARAM +else ifneq ($(HRDCOD),0) + $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) +endif + +# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") +$(info RNDGEN=$(RNDGEN)) +ifeq ($(RNDGEN),hasNoCurand) + CXXFLAGS += -DMGONGPU_HAS_NO_CURAND +else ifneq ($(RNDGEN),hasCurand) + $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) +endif + +#------------------------------------------------------------------------------- + +#=== Configure build directories and build lockfiles === + +# Build directory "short" tag (defines target and path to the optional build directory) +# (Rationale: keep directory names shorter, e.g. do not include random number generator choice) +override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) + +# Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) +# (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) + +# Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 +ifeq ($(USEBUILDDIR),1) + override BUILDDIR = build.$(DIRTAG) + override LIBDIR = ../../lib/$(BUILDDIR) + override LIBDIRRPATH = '$$ORIGIN/../$(LIBDIR)' + $(info Building in BUILDDIR=$(BUILDDIR) for tag=$(TAG) (USEBUILDDIR is set = 1)) +else + override BUILDDIR = . 
+ override LIBDIR = ../../lib + override LIBDIRRPATH = '$$ORIGIN/$(LIBDIR)' + $(info Building in BUILDDIR=$(BUILDDIR) for tag=$(TAG) (USEBUILDDIR is not set)) +endif +###override INCDIR = ../../include +###$(info Building in BUILDDIR=$(BUILDDIR) for tag=$(TAG)) + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables or shared libraries ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override CXXLIBFLAGSRPATH = + override CULIBFLAGSRPATH = + override CXXLIBFLAGSRPATH2 = + override CULIBFLAGSRPATH2 = +else + # RPATH to cuda/cpp libs when linking executables + override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + # RPATH to common lib when linking cuda/cpp libs + override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' +endif + +# Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) +override RUNTIME = + +#=============================================================================== +#=== Makefile TARGETS and build rules below +#=============================================================================== + +cxx_main=$(BUILDDIR)/check.exe +fcxx_main=$(BUILDDIR)/fcheck.exe + +ifneq ($(NVCC),) +cu_main=$(BUILDDIR)/gcheck.exe +fcu_main=$(BUILDDIR)/fgcheck.exe +else +cu_main= +fcu_main= +endif + +testmain=$(BUILDDIR)/runTest.exe + +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_main) $(cxx_main) $(testmain) $(fcu_main) $(fcxx_main) + +# Target (and build options): debug +MAKEDEBUG= +debug: OPTFLAGS = -g -O0 -DDEBUG2 +debug: CUOPTFLAGS = -G +debug: MAKEDEBUG := debug +debug: all.$(TAG) + +# Target: tag-specific build lockfiles +override oldtagsb=`if [ -d $(BUILDDIR) ]; then find $(BUILDDIR) -maxdepth 1 -name '.build.*' ! -name '.build.$(TAG)' -exec echo $(shell pwd)/{} \; ; fi` +$(BUILDDIR)/.build.$(TAG): + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + @if [ "$(oldtagsb)" != "" ]; then echo "Cannot build for tag=$(TAG) as old builds exist for other tags:"; echo " $(oldtagsb)"; echo "Please run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi + @touch $(BUILDDIR)/.build.$(TAG) + +# Generic target and build rules: objects from CUDA compilation +ifneq ($(NVCC),) +$(BUILDDIR)/%.o : %.cu *.h ../../src/*.h + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + +$(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ +endif + +# Generic target and build rules: objects from C++ compilation +$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h + @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CUINC) -fPIC -c $< -o $@ + +# Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117) +ifeq ($(shell $(CXX) --version | grep ^nvc++),) +$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math +$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math +ifneq ($(NVCC),) +$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +endif +endif + +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) +ifneq ($(NVCC),) +CUFLAGS += -Xcompiler -Wno-deprecated-builtins +endif +endif + +# Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) +# This patch does remove the warning, but I prefer to keep it disabled for the moment... +###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) +###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option +###ifneq ($(NVCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###endif +###endif + +#### Apply special build flags only to CPPProcess.cc (-flto) +###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += -flto + +#### Apply special build flags only to CPPProcess.cc (AVXFLAGS) +###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += $(AVXFLAGS) + +#------------------------------------------------------------------------------- + +# Target (and build rules): common (src) library +commonlib : $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so + +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so: ../../src/*.h ../../src/*.cc + $(MAKE) -C ../../src $(MAKEDEBUG) -f $(CUDACPP_SRC_MAKEFILE) + +#------------------------------------------------------------------------------- + +processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') +###$(info processid_short=$(processid_short)) + +MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp +cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o +cxx_objects_exe=$(BUILDDIR)/RandomNumberKernels.o $(BUILDDIR)/RamboSamplingKernels.o + +ifneq ($(NVCC),) +MG5AMC_CULIB = mg5amc_$(processid_short)_cuda +cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o +cu_objects_exe=$(BUILDDIR)/gRandomNumberKernels.o $(BUILDDIR)/gRamboSamplingKernels.o +endif + +# Target (and build rules): C++ and CUDA shared libraries +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) + $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + +ifneq ($(NVCC),) +$(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o +$(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o +$(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) + $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +endif + +#------------------------------------------------------------------------------- + +# Target (and build rules): Fortran include files +###$(INCDIR)/%.inc : ../%.inc +### @if [ ! 
-d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi +### \cp $< $@ + +#------------------------------------------------------------------------------- + +# Target (and build rules): C++ and CUDA standalone executables +$(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) + $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(CULIBFLAGS) + +ifneq ($(NVCC),) +ifneq ($(shell $(CXX) --version | grep ^Intel),) +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 +$(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +endif +$(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) + $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(CULIBFLAGS) +endif + +#------------------------------------------------------------------------------- + +# Generic target and build rules: objects from Fortran compilation +$(BUILDDIR)/%.o : %.f *.inc + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(FC) -I. -c $< -o $@ + +# Generic target and build rules: objects from Fortran compilation +###$(BUILDDIR)/%.o : %.f *.inc +### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi +### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi +### $(FC) -I. 
-I$(INCDIR) -c $< -o $@ + +# Target (and build rules): Fortran standalone executables +###$(BUILDDIR)/fcheck_sa.o : $(INCDIR)/fbridge.inc + +ifeq ($(UNAME_S),Darwin) +$(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +endif +$(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(CULIBFLAGS) + +ifneq ($(NVCC),) +ifneq ($(shell $(CXX) --version | grep ^Intel),) +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +endif +ifeq ($(UNAME_S),Darwin) +$(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +endif +$(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) + $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(CULIBFLAGS) +endif + +#------------------------------------------------------------------------------- + +# Target (and build rules): test objects and test executable +$(BUILDDIR)/testxxx.o: $(GTESTLIBS) +$(BUILDDIR)/testxxx.o: INCFLAGS += -I$(TESTDIR)/googletest/googletest/include +$(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt +$(testmain): $(BUILDDIR)/testxxx.o +$(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions + +ifneq ($(NVCC),) +$(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) +$(BUILDDIR)/testxxx_cu.o: INCFLAGS += -I$(TESTDIR)/googletest/googletest/include +$(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt +$(testmain): $(BUILDDIR)/testxxx_cu.o +$(testmain): cu_objects_exe += $(BUILDDIR)/testxxx_cu.o # Comment out this line to skip the CUDA test of xxx functions +endif + +$(BUILDDIR)/testmisc.o: $(GTESTLIBS) +$(BUILDDIR)/testmisc.o: INCFLAGS += -I$(TESTDIR)/googletest/googletest/include +$(testmain): $(BUILDDIR)/testmisc.o +$(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests + +ifneq ($(NVCC),) +$(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) +$(BUILDDIR)/testmisc_cu.o: INCFLAGS += -I$(TESTDIR)/googletest/googletest/include +$(testmain): $(BUILDDIR)/testmisc_cu.o +$(testmain): cu_objects_exe += $(BUILDDIR)/testmisc_cu.o # Comment out this line to skip the CUDA miscellaneous tests +endif + +$(BUILDDIR)/runTest.o: $(GTESTLIBS) +$(BUILDDIR)/runTest.o: INCFLAGS += -I$(TESTDIR)/googletest/googletest/include +$(testmain): $(BUILDDIR)/runTest.o +$(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o + +ifneq ($(NVCC),) +$(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) +$(BUILDDIR)/runTest_cu.o: INCFLAGS += -I$(TESTDIR)/googletest/googletest/include +ifneq ($(shell $(CXX) --version | grep ^Intel),) +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +else ifneq ($(shell 
$(CXX) --version | grep ^nvc++),) # support nvc++ #531 +$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +endif +$(testmain): $(BUILDDIR)/runTest_cu.o +$(testmain): cu_objects_exe += $(BUILDDIR)/runTest_cu.o +endif + +$(testmain): $(GTESTLIBS) +$(testmain): INCFLAGS += -I$(TESTDIR)/googletest/googletest/include +$(testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest -lgtest_main + +ifneq ($(OMPFLAGS),) +ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +$(testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +$(testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 +###else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) +###$(testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) +else +$(testmain): LIBFLAGS += -lgomp +endif +endif + +ifeq ($(NVCC),) # link only runTest.o +$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) + $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) +else # link both runTest.o and runTest_cu.o +$(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) + $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda $(CULIBFLAGS) +endif + +# Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 +$(GTESTLIBS): +ifneq ($(shell which flock 2>/dev/null),) + flock $(BUILDDIR)/.make_test.lock $(MAKE) -C $(TESTDIR) +else + $(MAKE) -C $(TESTDIR) +endif + +#------------------------------------------------------------------------------- + +# Target: build all targets in all AVX modes (each AVX mode in a separate build directory) +# Split the avxall target into five separate targets to allow parallel 'make -j avxall' builds +# (Hack: add a fbridge.inc dependency to avxall, to ensure it is only copied once for all AVX modes) +avxnone: + @echo + $(MAKE) USEBUILDDIR=1 AVX=none -f $(CUDACPP_MAKEFILE) + +avxsse4: + @echo + $(MAKE) USEBUILDDIR=1 AVX=sse4 -f $(CUDACPP_MAKEFILE) + +avxavx2: + @echo + $(MAKE) USEBUILDDIR=1 AVX=avx2 -f $(CUDACPP_MAKEFILE) + +avx512y: + @echo + $(MAKE) USEBUILDDIR=1 AVX=512y -f $(CUDACPP_MAKEFILE) + +avx512z: + @echo + $(MAKE) USEBUILDDIR=1 AVX=512z -f $(CUDACPP_MAKEFILE) + +ifeq ($(UNAME_P),ppc64le) +###avxall: $(INCDIR)/fbridge.inc avxnone avxsse4 +avxall: avxnone avxsse4 +else ifeq ($(UNAME_P),arm) +###avxall: $(INCDIR)/fbridge.inc avxnone avxsse4 +avxall: avxnone avxsse4 +else +###avxall: $(INCDIR)/fbridge.inc avxnone avxsse4 avxavx2 avx512y avx512z +avxall: avxnone avxsse4 avxavx2 avx512y avx512z +endif + +#------------------------------------------------------------------------------- + +# Target: clean the builds +.PHONY: clean + +clean: +ifeq ($(USEBUILDDIR),1) + rm -rf $(BUILDDIR) +else + rm -f $(BUILDDIR)/.build.* $(BUILDDIR)/*.o $(BUILDDIR)/*.exe + rm -f $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(LIBDIR)/lib$(MG5AMC_CULIB).so +endif + $(MAKE) -C ../../src clean -f $(CUDACPP_SRC_MAKEFILE) +### rm -rf $(INCDIR) + 
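For context on the FPTYPE switch handled near the top of this makefile: the build system only adds -D defines to CXXFLAGS and CUFLAGS, and the C++ sources decide what those defines mean. A minimal standalone sketch of that pattern is shown below (illustrative only, not code from this patch; the project's own mapping in ../../src/mgOnGpuConfig.h is richer and also covers the MGONGPU_FPTYPE2_* defines used for mixed precision):

// fptype_sketch.cc (hypothetical file name, for illustration only)
// Compile e.g. with "g++ -DMGONGPU_FPTYPE_FLOAT fptype_sketch.cc" or with no -D flag at all
#include <iostream>

#if defined MGONGPU_FPTYPE_FLOAT
typedef float fptype; // selected by 'make FPTYPE=f'
#else
typedef double fptype; // selected by 'make FPTYPE=d' or 'make FPTYPE=m' (and by default here if no flag is passed)
#endif

int main()
{
  std::cout << "sizeof(fptype) = " << sizeof( fptype ) << std::endl; // 4 for float, 8 for double
  return 0;
}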
+cleanall: + @echo + $(MAKE) USEBUILDDIR=0 clean -f $(CUDACPP_MAKEFILE) + @echo + $(MAKE) USEBUILDDIR=0 -C ../../src cleanall -f $(CUDACPP_SRC_MAKEFILE) + rm -rf build.* + +# Target: clean the builds as well as the googletest installation +distclean: cleanall + $(MAKE) -C $(TESTDIR) clean + +#------------------------------------------------------------------------------- + +# Target: show system and compiler information +info: + @echo "" + @uname -spn # e.g. Linux nodename.cern.ch x86_64 +ifeq ($(UNAME_S),Darwin) + @sysctl -a | grep -i brand + @sysctl -a | grep machdep.cpu | grep features || true + @sysctl -a | grep hw.physicalcpu: + @sysctl -a | grep hw.logicalcpu: +else + @cat /proc/cpuinfo | grep "model name" | sort -u + @cat /proc/cpuinfo | grep "flags" | sort -u + @cat /proc/cpuinfo | grep "cpu cores" | sort -u + @cat /proc/cpuinfo | grep "physical id" | sort -u +endif + @echo "" +ifneq ($(shell which nvidia-smi 2>/dev/null),) + nvidia-smi -L + @echo "" +endif + @echo USECCACHE=$(USECCACHE) +ifeq ($(USECCACHE),1) + ccache --version | head -1 +endif + @echo "" + @echo NVCC=$(NVCC) +ifneq ($(NVCC),) + $(NVCC) --version +endif + @echo "" + @echo CXX=$(CXX) +ifneq ($(shell $(CXX) --version | grep ^clang),) + @echo $(CXX) -v + @$(CXX) -v |& egrep -v '(Found|multilib)' + @readelf -p .comment `$(CXX) -print-libgcc-file-name` |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print "GCC toolchain:",$$5}' +else + $(CXX) --version +endif + @echo "" + @echo FC=$(FC) + $(FC) --version + +#------------------------------------------------------------------------------- + +# Target: check (run the C++ test executable) +# [NB THIS IS WHAT IS USED IN THE GITHUB CI!] +ifneq ($(NVCC),) +check: runTest cmpFcheck cmpFGcheck +else +check: runTest cmpFcheck +endif + +# Target: runTest (run the C++ test executable runTest.exe) +runTest: all.$(TAG) + $(RUNTIME) $(BUILDDIR)/runTest.exe + +# Target: runCheck (run the C++ standalone executable check.exe, with a small number of events) +runCheck: all.$(TAG) + $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 + +# Target: runGcheck (run the CUDA standalone executable gcheck.exe, with a small number of events) +runGcheck: all.$(TAG) + $(RUNTIME) $(BUILDDIR)/gcheck.exe -p 2 32 2 + +# Target: runFcheck (run the Fortran standalone executable - with C++ MEs - fcheck.exe, with a small number of events) +runFcheck: all.$(TAG) + $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 + +# Target: runFGcheck (run the Fortran standalone executable - with CUDA MEs - fgcheck.exe, with a small number of events) +runFGcheck: all.$(TAG) + $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 + +# Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) +cmpFcheck: all.$(TAG) + @echo + @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + +# Target: cmpFGcheck (compare ME results from the CUDA and Fortran with CUDA MEs standalone executables, with a small number of events) +cmpFGcheck: all.$(TAG) + @echo + @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/CUDA) = $${me1}"; echo "Avg ME (F77/CUDA) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/CUDA) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + +# Target: memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) +memcheck: all.$(TAG) + $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/gcheck.exe -p 2 32 2 + +#------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.cc new file mode 100644 index 0000000000..9c9287e0c5 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.cc @@ -0,0 +1,126 @@ +#include "Bridge.h" +#include "CPPProcess.h" +#include "CudaRuntime.h" + +extern "C" +{ + /** + * The namespace where the Bridge class is taken from. + * + * In the current implementation, two separate shared libraries are created for the GPU/CUDA and CPU/C++ implementations. + * Actually, two shared libraries for GPU and CPU are created for each of the five SIMD implementations on CPUs (none, sse4, avx2, 512y, 512z). + * A single fcreatebridge_ symbol is created in each library with the same name, connected to the appropriate Bridge on CPU or GPU. + * The Fortran MadEvent code is always the same: the choice whether to use a CPU or GPU implementation is done by linking the appropriate library. + * As the names of the two CPU/GPU libraries are the same in the five SIMD implementations, the choice of SIMD is done by setting LD_LIBRARY_PATH. + * + * In a future implementation, a single heterogeneous shared library may be created, with the same interface. + * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations. + * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. + */ +#ifdef __CUDACC__ + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + /** + * The floating point precision used in Fortran arrays. + * This is presently hardcoded to double precision (REAL*8).
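+ * Note that the internal cudacpp floating point type ('fptype', selected by the FPTYPE build flag) may differ from FORTRANFPTYPE: any conversion between the two is performed inside the Bridge implementation.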
+ */ + using FORTRANFPTYPE = double; // for Fortran double precision (REAL*8) arrays + //using FORTRANFPTYPE = float; // for Fortran single precision (REAL*4) arrays + + /** + * Create a Bridge and return its pointer. + * This is a C symbol that should be called from the Fortran code (in auto_dsig1.f). + * + * @param ppbridge the pointer to the Bridge pointer (the Bridge pointer is handled in Fortran as an INTEGER*8 variable) + * @param nevtF the pointer to the number of events in the Fortran arrays + * @param nparF the pointer to the number of external particles in the Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F the pointer to the number of momenta components, usually 4, in the Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + */ + void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) + { +#ifdef __CUDACC__ + CudaRuntime::setUp(); +#endif + // Create a process object, read parm card and set parameters + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? + CPPProcess process( /*verbose=*/false ); + process.initProc( "../../Cards/param_card.dat" ); + // FIXME: disable OMP in Bridge when called from Fortran + *ppbridge = new Bridge( *pnevtF, *pnparF, *pnp4F ); + } + + /** + * Delete a Bridge. + * This is a C symbol that should be called from the Fortran code (in auto_dsig1.f). + * + * @param ppbridge the pointer to the Bridge pointer (the Bridge pointer is handled in Fortran as an INTEGER*8 variable) + */ + void fbridgedelete_( CppObjectInFortran** ppbridge ) + { + Bridge* pbridge = dynamic_cast*>( *ppbridge ); + if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); + delete pbridge; +#ifdef __CUDACC__ + CudaRuntime::tearDown(); +#endif + } + + /** + * Execute the matrix-element calculation "sequence" via a Bridge on GPU/CUDA or CUDA/C++. + * This is a C symbol that should be called from the Fortran code (in auto_dsig1.f). 
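+ * For illustration only, the intended call order, transliterated to C++ with placeholder sizes (the array arguments are not declared here; in production these calls are issued from Fortran through the interfaces in fbridge.inc):
+ *   CppObjectInFortran* pbridge = nullptr;
+ *   int nevt = 16, npar = 6, np4 = 4; // placeholder values
+ *   unsigned int channelId = 0; // 0 disables multi-channel
+ *   fbridgecreate_( &pbridge, &nevt, &npar, &np4 ); // once per run
+ *   fbridgesequence_( &pbridge, momenta, gs, rndhel, rndcol, &channelId, mes, selhel, selcol ); // once per event batch
+ *   fbridgedelete_( &pbridge ); // once at the end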
+ * + * @param ppbridge the pointer to the Bridge pointer (the Bridge pointer is handled in Fortran as an INTEGER*8 variable) + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant alphas) + * @param rndhel the pointer to the input random numbers for helicity selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelId the pointer to the input Feynman diagram to enhance in multi-channel mode if 1 to n (disable multi-channel if 0) + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + */ + void fbridgesequence_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int* pchannelId, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol ) + { + Bridge* pbridge = dynamic_cast*>( *ppbridge ); + if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); +#ifdef __CUDACC__ + // Use the device/GPU implementation in the CUDA library + // (there is also a host implementation in this library) + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); +#else + // Use the host/CPU implementation in the C++ library + // (there is no device implementation in this library) + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); +#endif + } + + /** + * Retrieve the number of good helicities for helicity filtering in the Bridge. + * This is a C symbol that should be called from the Fortran code (in auto_dsig1.f). + * + * @param ppbridge the pointer to the Bridge pointer (the Bridge pointer is handled in Fortran as an INTEGER*8 variable) + * @param pngoodhel the pointer to the output number of good helicities + * @param pntothel the pointer to the output total number of helicities + */ + void fbridgegetngoodhel_( CppObjectInFortran** ppbridge, + unsigned int* pngoodhel, + unsigned int* pntothel ) + { + Bridge* pbridge = dynamic_cast*>( *ppbridge ); + if( pbridge == 0 ) throw std::runtime_error( "fbridgegetngoodhel_: invalid Bridge address" ); + *pngoodhel = pbridge->nGoodHel(); + *pntothel = pbridge->nTotHel(); + } +} diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.inc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.inc new file mode 100644 index 0000000000..f140b660fc --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fbridge.inc @@ -0,0 +1,66 @@ +C +C Create a Bridge and return its pointer +C - PBRIDGE: the memory address of the C++ Bridge +C - NEVT: the number of events in the Fortran arrays +C - NPAR: the number of external particles in the Fortran arrays (KEPT FOR SANITY CHECKS ONLY: remove it?) +C - NP4: the number of momenta components, usually 4, in the Fortran arrays (KEPT FOR SANITY CHECKS ONLY: remove it?) +C + INTERFACE + SUBROUTINE FBRIDGECREATE(PBRIDGE, NEVT, NPAR, NP4) + INTEGER*8 PBRIDGE + INTEGER*4 NEVT + INTEGER*4 NPAR + INTEGER*4 NP4 + END SUBROUTINE FBRIDGECREATE + END INTERFACE + +C +C Delete a Bridge. +C - PBRIDGE: the memory address of the C++ Bridge +C + INTERFACE + SUBROUTINE FBRIDGEDELETE(PBRIDGE) + INTEGER*8 PBRIDGE + END SUBROUTINE FBRIDGEDELETE + END INTERFACE + +C +C Execute the matrix-element calculation "sequence" via a Bridge on GPU/CUDA or CUDA/C++. 
+C - PBRIDGE: the memory address of the C++ Bridge +C - MOMENTA: the input 4-momenta Fortran array +C - GS: the input Gs (running QCD coupling constant alphas) Fortran array +C - RNDHEL: the input random number Fortran array for helicity selection +C - RNDCOL: the input random number Fortran array for color selection +C - CHANID: the input Feynman diagram to enhance in multi-channel mode if 1 to n (disable multi-channel if 0) +C - MES: the output matrix element Fortran array +C - SELHEL: the output selected helicity Fortran array +C - SELCOL: the output selected color Fortran array +C + INTERFACE + SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS, + & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL) + INTEGER*8 PBRIDGE + DOUBLE PRECISION MOMENTA(*) + DOUBLE PRECISION GS(*) + DOUBLE PRECISION RNDHEL(*) + DOUBLE PRECISION RNDCOL(*) + INTEGER*4 CHANID + DOUBLE PRECISION MES(*) + INTEGER*4 SELHEL(*) + INTEGER*4 SELCOL(*) + END SUBROUTINE FBRIDGESEQUENCE + END INTERFACE + +C +C Retrieve the number of good helicities for helicity filtering in the Bridge. +C - PBRIDGE: the memory address of the C++ Bridge +C - NGOODHEL: the output number of good helicities +C - NTOTHEL: the output total number of helicities in cudacpp (aka NCOMB in Fortran) +C + INTERFACE + SUBROUTINE FBRIDGEGETNGOODHEL(PBRIDGE, NGOODHEL, NTOTHEL) + INTEGER*8 PBRIDGE + INTEGER*4 NGOODHEL + INTEGER*4 NTOTHEL + END SUBROUTINE FBRIDGEGETNGOODHEL + END INTERFACE diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fsampler.cc new file mode 100644 index 0000000000..bc90937f47 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fsampler.cc @@ -0,0 +1,159 @@ +#include "mgOnGpuConfig.h" + +#include "Bridge.h" +#include "MemoryBuffers.h" +#include "RamboSamplingKernels.h" +#include "RandomNumberKernels.h" + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + template + class Sampler final : public CppObjectInFortran + { + public: + // Constructor + // @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran arrays + // @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY: remove it?) + // @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY: remove it?) 
+ Sampler( int nevtF, int nparF, int np4F ); + // Destructor + virtual ~Sampler() {} + // Delete copy/move constructors and assignment operators + Sampler( const Sampler& ) = delete; + Sampler( Sampler&& ) = delete; + Sampler& operator=( const Sampler& ) = delete; + Sampler& operator=( Sampler&& ) = delete; + // Draw random numbers and convert them to momenta in C++, then transpose them to Fortran momenta + void samplerHostSequence( FORTRANFPTYPE* fortranMomenta ); + private: + const int m_nevt; // The number of events in each iteration + int m_iiter; // The iteration counter (for random number seeding) +#ifndef __CUDACC__ + HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers + HostBufferMomenta m_hstMomenta; // Memory buffers for momenta + HostBufferWeights m_hstWeights; // Memory buffers for sampling weights +#else + PinnedHostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers + PinnedHostBufferMomenta m_hstMomenta; // Memory buffers for momenta + PinnedHostBufferWeights m_hstWeights; // Memory buffers for sampling weights +#endif + std::unique_ptr m_prnk; // The appropriate RandomNumberKernel + std::unique_ptr m_prsk; // The appropriate SamplingKernel + // HARDCODED DEFAULTS + static constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) + }; + + template + Sampler::Sampler( int nevtF, int nparF, int np4F ) + : m_nevt( nevtF ) + , m_iiter( 0 ) + , m_hstRndmom( nevtF ) + , m_hstMomenta( nevtF ) + , m_hstWeights( nevtF ) + , m_prnk( new CommonRandomNumberKernel( m_hstRndmom ) ) + , m_prsk( new RamboSamplingKernelHost( energy, m_hstRndmom, m_hstMomenta, m_hstWeights, nevtF ) ) + { + if( nparF != mgOnGpu::npar ) throw std::runtime_error( "Sampler constructor: npar mismatch" ); + if( np4F != mgOnGpu::np4 ) throw std::runtime_error( "Sampler constructor: np4 mismatch" ); + std::cout << "WARNING! Instantiate host Sampler (nevt=" << m_nevt << ")" << std::endl; + } + + // Draw random numbers and convert them to momenta in C++, then transpose them to Fortran momenta + template + void Sampler::samplerHostSequence( FORTRANFPTYPE* fortranMomenta ) + { + std::cout << "Iteration #" << m_iiter + 1 << std::endl; + // === STEP 1 OF 3 + // --- 1a. Seed rnd generator (to get same results on host and device in curand) + // [NB This should not be necessary using the host API: "Generation functions + // can be called multiple times on the same generator to generate successive + // blocks of results. For pseudorandom generators, multiple calls to generation + // functions will yield the same result as a single call with a large size."] + // *** NB! REMEMBER THAT THE FORTRAN SAMPLER ALWAYS USES COMMON RANDOM NUMBERS! *** + constexpr unsigned long long seed = 20200805; + m_prnk->seedGenerator( seed + m_iiter ); + m_iiter++; + // --- 1b. Generate all relevant numbers to build nevt events (i.e. nevt phase space points) on the host + m_prnk->generateRnarray(); + //std::cout << "Got random numbers" << std::endl; + // === STEP 2 OF 3 + // --- 2a. Fill in momenta of initial state particles on the device + m_prsk->getMomentaInitial(); + //std::cout << "Got initial momenta" << std::endl; + // --- 2b. Fill in momenta of final state particles using the RAMBO algorithm on the device + // (i.e. map random numbers to final-state particle momenta for each of nevt events) + m_prsk->getMomentaFinal(); + //std::cout << "Got final momenta" << std::endl; + // --- 2c. 
TransposeC2F + hst_transposeMomentaC2F( m_hstMomenta.data(), fortranMomenta, m_nevt ); + } +} + +//-------------------------------------------------------------------------- + +extern "C" +{ +#ifdef __CUDACC__ + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + /** + * The floating point precision used in Fortran arrays. + * This is presently hardcoded to double precision (REAL*8). + */ + using FORTRANFPTYPE = double; // for Fortran double precision (REAL*8) arrays + //using FORTRANFPTYPE = float; // for Fortran single precision (REAL*4) arrays + + /** + * Create a Sampler and return its pointer. + * This is a C symbol that should be called from the Fortran code (in auto_dsig1.f). + * + * @param ppsampler the pointer to the Sampler pointer (the Sampler pointer is handled in Fortran as an INTEGER*8 variable) + * @param nevtF the pointer to the number of events in the Fortran arrays + * @param nparF the pointer to the number of external particles in the Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F the pointer to the number of momenta components, usually 4, in the Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + */ + void fsamplercreate_( CppObjectInFortran** ppsampler, const int* pnevtF, const int* pnparF, const int* pnp4F ) + { + *ppsampler = new Sampler( *pnevtF, *pnparF, *pnp4F ); + } + + /** + * Delete a Sampler. + * This is a C symbol that should be called from the Fortran code (in auto_dsig1.f). + * + * @param ppsampler the pointer to the Sampler pointer (the Sampler pointer is handled in Fortran as an INTEGER*8 variable) + */ + void fsamplerdelete_( CppObjectInFortran** ppsampler ) + { + Sampler* psampler = dynamic_cast*>( *ppsampler ); + if( psampler == 0 ) throw std::runtime_error( "fsamplerdelete_: invalid Sampler address" ); + delete psampler; + } + + /** + * Execute the matrix-element calculation "sequence" via a Sampler on GPU/CUDA or CUDA/C++. + * This is a C symbol that should be called from the Fortran code (in auto_dsig1.f). + * + * @param ppsampler the pointer to the Sampler pointer (the Sampler pointer is handled in Fortran as an INTEGER*8 variable) + * @param momenta the pointer to the input 4-momenta + * @param mes the pointer to the output matrix elements + */ + void fsamplersequence_( CppObjectInFortran** ppsampler, FORTRANFPTYPE* momenta ) + { + Sampler* psampler = dynamic_cast*>( *ppsampler ); + if( psampler == 0 ) throw std::runtime_error( "fsamplersequence_: invalid Sampler address" ); + // Use the host/CPU implementation (there is no device implementation) + psampler->samplerHostSequence( momenta ); + } +} + +//-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fsampler.inc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fsampler.inc new file mode 100644 index 0000000000..d4895df206 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/fsampler.inc @@ -0,0 +1,37 @@ +C +C Create a Sampler and return its pointer +C - PSAMPLER: the memory address of the C++ Sampler +C - NEVT: the number of events in the Fortran arrays +C - NPAR: the number of external particles in the Fortran arrays (KEPT FOR SANITY CHECKS ONLY: remove it?) +C - NP4: the number of momenta components, usually 4, in the Fortran arrays (KEPT FOR SANITY CHECKS ONLY: remove it?) 
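+C - (Typical call order, for illustration: FSAMPLERCREATE once per run, then FSAMPLERSEQUENCE
+C   once per iteration to fill MOMENTA, then FSAMPLERDELETE at the end, e.g. in the fcheck_sa.f driver)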
+C + INTERFACE + SUBROUTINE FSAMPLERCREATE(PSAMPLER, NEVT, NPAR, NP4) + INTEGER*8 PSAMPLER + INTEGER*4 NEVT + INTEGER*4 NPAR + INTEGER*4 NP4 + END SUBROUTINE FSAMPLERCREATE + END INTERFACE + +C +C Delete a Sampler. +C - PSAMPLER: the memory address of the C++ Sampler +C + INTERFACE + SUBROUTINE FSAMPLERDELETE(PSAMPLER) + INTEGER*8 PSAMPLER + END SUBROUTINE FSAMPLERDELETE + END INTERFACE + +C +C Execute the matrix-element calculation "sequence" via a Sampler on GPU/CUDA or CUDA/C++. +C - PSAMPLER: the memory address of the C++ Sampler +C - MOMENTA: the output 4-momenta Fortran array +C + INTERFACE + SUBROUTINE FSAMPLERSEQUENCE(PSAMPLER, MOMENTA) + INTEGER*8 PSAMPLER + DOUBLE PRECISION MOMENTA(*) + END SUBROUTINE FSAMPLERSEQUENCE + END INTERFACE diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/nvtx.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/nvtx.h new file mode 100644 index 0000000000..e206b8e075 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/nvtx.h @@ -0,0 +1,69 @@ +#ifndef MGONGPUNVTX_H +#define MGONGPUNVTX_H 1 + +// Provides macros for simply use of NVTX, if a compiler macro USE_NVTX is defined. +// Original author Peter Heywood +// With a few modifications by Andrea Valassi + +//------------------------------------------- +// NVTX is enabled +//------------------------------------------- + +#ifdef USE_NVTX + +#include + +// This assumes CUDA 10.0+ +#include "nvtx3/nvToolsExt.h" + +// Scope some things into a namespace +namespace nvtx +{ + + // Colour palette (RGB): https://colorbrewer2.org/#type=qualitative&scheme=Paired&n=12 + const uint32_t palette[] = { 0xffa6cee3, 0xff1f78b4, 0xffb2df8a, 0xff33a02c, 0xfffb9a99, 0xffe31a1c, 0xfffdbf6f, 0xffff7f00, 0xffcab2d6, 0xff6a3d9a, 0xffffff99, 0xffb15928 }; + const uint32_t colourCount = sizeof( palette ) / sizeof( uint32_t ); + + // Inline method to push an nvtx range + inline void push( const char* str, const uint32_t nextColourIdx ) + { + // Get the wrapped colour index + uint32_t colourIdx = nextColourIdx % colourCount; + // Build/populate the struct of nvtx event attributes + nvtxEventAttributes_t eventAttrib = { 0 }; // zero-out the struct (see https://nvidia.github.io/NVTX/doxygen/structnvtx_event_attributes__v2.html) + eventAttrib.version = NVTX_VERSION; + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; + eventAttrib.colorType = NVTX_COLOR_ARGB; + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; + eventAttrib.color = palette[colourIdx]; + eventAttrib.message.ascii = str; + // Push the custom event. 
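+    // (each push must later be matched by a pop, via the NVTX_POP macro below, to close the range)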
+ nvtxRangePushEx( &eventAttrib ); + } + + // Inline method to pop an nvtx range + inline void pop() + { + nvtxRangePop(); + } + +} + +// Macro to push an arbitrary nvtx marker +#define NVTX_PUSH( str, idx ) nvtx::push( str, idx ) + +// Macro to pop an arbitrary nvtx marker +#define NVTX_POP() nvtx::pop() + +//------------------------------------------- +// NVTX is not enabled +//------------------------------------------- + +#else + +#define NVTX_PUSH( str, idx ) +#define NVTX_POP() + +#endif + +#endif // MGONGPUNVTX_H 1 diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/ompnumthreads.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/ompnumthreads.h new file mode 100644 index 0000000000..9f8dbbb7f9 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/ompnumthreads.h @@ -0,0 +1,58 @@ +#ifndef OMPNUMTHREADS_H +#define OMPNUMTHREADS_H 1 + +#ifdef _OPENMP + +#include + +#include + +// The OMP_NUM_THREADS environment variable is used to control OMP multi-threading +// By default, all available $(nproc) threads are used if OMP_NUM_THREADS is not set: +// if ompnumthreadsNotSetMeansOneThread is called, only one thread is used instead +inline void +ompnumthreadsNotSetMeansOneThread( int debuglevel ) // quiet(-1), info(0), debug(1) +{ + // Set OMP_NUM_THREADS equal to 1 if it is not yet set + char* ompnthr = getenv( "OMP_NUM_THREADS" ); + if( debuglevel == 1 ) + { + std::cout << "DEBUG: entering ompnumthreadsNotSetMeansOneThread" << std::endl; + std::cout << "DEBUG: omp_get_num_threads() = " + << omp_get_num_threads() << std::endl; // always == 1 here! + std::cout << "DEBUG: omp_get_max_threads() = " + << omp_get_max_threads() << std::endl; + std::cout << "DEBUG: ${OMP_NUM_THREADS} = '" + << ( ompnthr == 0 ? "[not set]" : ompnthr ) << "'" << std::endl; + } + if( ompnthr == NULL || + std::string( ompnthr ).find_first_not_of( "0123456789" ) != std::string::npos || + atol( ompnthr ) == 0 ) + { + if( ompnthr != NULL ) + std::cout << "(ompnumthreadsNotSetMeansOneThread) " + << "WARNING! OMP_NUM_THREADS is invalid: will use only 1 thread" << std::endl; + else if( debuglevel >= 0 ) + std::cout << "(ompnumthreadsNotSetMeansOneThread) " + << "DEBUG: OMP_NUM_THREADS is not set: will use only 1 thread" << std::endl; + omp_set_num_threads( 1 ); // https://stackoverflow.com/a/22816325 + if( debuglevel == 1 ) + { + std::cout << "DEBUG: omp_get_num_threads() = " + << omp_get_num_threads() << std::endl; // always == 1 here! 
+ std::cout << "DEBUG: omp_get_max_threads() = " + << omp_get_max_threads() << std::endl; + } + } + else if( debuglevel >= 0 ) + std::cout << "(ompnumthreadsNotSetMeansOneThread) " + << "DEBUG: OMP_NUM_THREADS = " << ompnthr << std::endl; + if( debuglevel >= 0 ) + std::cout << "(ompnumthreadsNotSetMeansOneThread) " + << "omp_get_max_threads() = " << omp_get_max_threads() << std::endl; + if( debuglevel == 1 ) + std::cout << "DEBUG: exiting ompnumthreadsNotSetMeansOneThread" << std::endl; +} +#endif + +#endif // OMPNUMTHREADS_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/perf.py b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/perf.py new file mode 100644 index 0000000000..63f4c714a7 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/perf.py @@ -0,0 +1,346 @@ +#!/usr/bin/env python3 + +from optparse import OptionParser +from datetime import datetime +from mpl_toolkits.mplot3d import Axes3D # noqa: F401 +import matplotlib.pyplot as plt +from matplotlib import cm +from matplotlib.ticker import ScalarFormatter +import numpy as np +import copy +import sys +import json +from operator import itemgetter + + +class Perf(): + + def __init__(self, date, run, x, y, z, xrem, yrem, loc): + perffile = '%s/%s-perf-test-run%s.json' % (loc, date, run) + data = open(perffile, 'r') + readJson = json.loads(data.read()) + data.close() + self.axesn = [x, y, z] + self.axesr = [xrem, yrem] # remove outer bands from axes + self.axesv = [[], [], []] + self.data = self.prepData(readJson) + + def prepData(self, jsonData): + for data in jsonData: + for i in data: + if isinstance(data[i], type('test')): + idx = -1 + if data[i].find("sec") != -1: + idx = data[i].find("sec") + elif data[i].find("GEV") != -1: + idx = data[i].find("GeV") + + if idx != -1: + data[i] = float(data[i][:idx - 1]) + return jsonData + + def prepAxes3D(self): + for d in self.data: + ks = list(d.keys()) + for ax in self.axesn: + idx = self.axesn.index(ax) + axlist = self.axesv[idx] + if ax in ks: + axval = d[ax] + if axval not in axlist: + axlist.append(axval) + else: + print('Error: cannot find axes name %s in %s' % (ax, d)) + if len(self.axesv[0]) * len(self.axesv[1]) != len(self.axesv[2]): + print('Error: axes don\'t match x * y != z (%d * %d != %d' % + (len(self.axesv[0]), len(self.axesv[1]), len(self.axesv[2]))) + self.axesv[0].sort() + self.axesv[1].sort() + self.axesv[0] = self.axesv[0][self.axesr[0]:] # sr + self.axesv[1] = self.axesv[1][self.axesr[1]:] # sr + + def prepData3D(self): + xlen = len(self.axesv[0]) + ylen = len(self.axesv[1]) + self.data2d = [] + ylist = [0] * ylen + for i in range(xlen): + self.data2d.append(copy.deepcopy(ylist)) + for d in self.data: + xpos = -1 + ypos = -1 + if d[self.axesn[0]] in self.axesv[0]: + xpos = self.axesv[0].index(d[self.axesn[0]]) + if d[self.axesn[1]] in self.axesv[1]: + ypos = self.axesv[1].index(d[self.axesn[1]]) + if xpos != -1 and ypos != -1: + zval = d[self.axesn[2]] + self.data2d[xpos][ypos] = zval + + def plot3D(self): + self.prepAxes3D() + self.prepData3D() + + data_array = np.array(self.data2d) + fig = plt.figure() + ax = fig.add_subplot(111, projection='3d') + x_data, y_data = np.meshgrid(np.arange(data_array.shape[1]), + np.arange(data_array.shape[0])) + xticks = x_data[0] + yticks = np.array(list(range(len(y_data)))) + x_data = x_data.flatten() + y_data = y_data.flatten() + z_data = data_array.flatten() + ax.set_xlabel(self.axesn[1], {'fontsize': 'small'}) + ax.set_xticks(xticks) + # consider 'fontsize': 'small' for dict also yticklabels + 
ax.set_xticklabels(self.axesv[1], {'rotation': 45, 'fontsize': 'small'}) + ax.set_ylabel(self.axesn[0], {'fontsize': 'small'}) + ax.set_yticks(yticks) + # consider 'fontsize': 'small' for dict + ax.set_yticklabels(self.axesv[0], {'rotation': 45, 'fontsize': 'small'}) + ax.set_zlabel(self.axesn[2], {'fontsize': 'small'}) + # ax.set_zscale('log') + # z_data = np.log10(z_data) + ax.bar3d(x_data, y_data, np.zeros(len(z_data)), 1, 1, z_data) + plt.show() + + def prepData2D(self): + self.dataDict2D = {} + xname = self.axesn[0] + yname = self.axesn[1] + zname = self.axesn[2] + + for d in self.data: + xval = d[xname] + yval = d[yname] + zval = d[zname] + dim = xval * yval + tick = '%s/%s' % (str(xval), str(yval)) + vallist = [float(str(zval).split()[0]), tick] + if dim not in self.dataDict2D: + self.dataDict2D[dim] = [vallist] + else: + self.dataDict2D[dim].append(vallist) + + def plot2D(self): + self.prepData2D() + + # use this value to plot a flat line for the cpu values to compare with + cpuval = 0 + # cpuval = 79766.84 # tot + # cpuval = 427251.1 # rmb + me + # cpuval = 472578.7 # me + + cmap = {'32': 'red', '64': 'orange', '128': 'blue', '256': 'green'} + smap = {'32': 20, '64': 40, '128': 80, '256': 160} + + dims = list(self.dataDict2D.keys()) + dims.sort() + xlist = list(range(1, len(dims) + 1)) + ylist = [] + clist = [] + slist = [] + ylabels = [] + for d in dims: + ysublist = [] + for y in self.dataDict2D[d]: + ysublist.append(y) # y[0] + ysublist = sorted(ysublist, key=itemgetter(0), reverse=True) + clist.append([cmap[x[1].split('/')[0]] for x in ysublist]) + slist.append([smap[x[1].split('/')[0]] for x in ysublist]) + # Temporary conversion for total time for events -> events per sec + # ysublist[0][0] = d / ysublist[0][0] + ylabels.append([x[1] for x in ysublist]) + ylist.append([x[0] for x in ysublist]) + + fig, ax = plt.subplots() + print(xlist) + print(ylist) + for xe, ye, ce, se in zip(xlist, ylist, clist, slist): + print([xe] * len(ye)) + ax.scatter([xe] * len(ye), ye, s=se, facecolors='none', + edgecolors=ce) + if cpuval: + ax.scatter(xe, cpuval, marker='+', c='dimgrey') + + ax.set_xticks(xlist) + ax.set_xlabel('%s * %s' % (self.axesn[0], self.axesn[1])) + ax.set_ylabel('%s' % (self.axesn[2])) + ax.set_yscale('log') + ax.set_xticklabels(dims, rotation=45) + ax.yaxis.set_major_formatter(ScalarFormatter()) + plt.ticklabel_format(axis="y", style="sci", scilimits=(0, 0)) + # Commenting only for the current example due to an overlap of the + # product labels + # xpos = 1 + # for y in ylabels: + # xstr = '' + # for x in y: + # # xstr += x.replace('/', '\n') + # xstr += x + # xstr += '\n' + # ax.text(xpos, 1, xstr, {'fontsize': 'xx-small', + # 'ha': 'center', + # 'va': 'bottom'}) + # xpos += 1 + + handlelist = [] + for k in cmap: + handlelist.append(plt.scatter([], [], s=smap[k], marker='o', + color=cmap[k], facecolor='none')) + + print(handlelist) + plt.legend(handlelist, [str(x) for x in cmap.keys()], + title="# threads / block") + + plt.show() + + def plotStack(self, threads=32): + collist = ['Purples', 'Blues', 'Greens', 'Oranges', 'Reds', 'Greys'] + # collist = ['tab20b', 'tab20c'] + + bars = {} + blocks = [] + for d in self.data: + if d['NumThreadsPerBlock'] == threads: + blocks.append(d['NumBlocksPerGrid']) + for k in d: + if k[0].isdigit(): + if k not in bars: + bars[k] = [] + + barks = list(bars.keys()) + barks.sort() + blocks.sort() + + for d in self.data: + if d['NumThreadsPerBlock'] == threads: + for b in barks: + if b in d: + bars[b].append(d[b]) + else: + bars[b].append(0) 
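+        # at this point each entry of 'bars' holds one value per run with the selected NumThreadsPerBlock; the code below plots them as stacked bars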
+ + ind = np.arange(len(bars[barks[0]])) + width = 0.35 + + plts = [] + ci = -1 + cj = 0.5 + plts.append(plt.bar(ind, bars[barks[0]], width, edgecolor='black', + color='white')) + bot = [0] * len(bars[barks[0]]) + for i in range(1, len(barks)): + colcod = barks[i][:2] + if colcod[1] == 'a': + ci += 1 + cj = 0.5 + else: + cj += 0.1 + print(colcod, ci, cj, bot[-1], barks[i]) + col = cm.get_cmap(collist[ci])(cj) + sumlist = [] + for (l1, l2) in zip(bot, bars[barks[i - 1]]): + sumlist.append(l1 + l2) + bot = sumlist + plts.append(plt.bar(ind, bars[barks[i]], width, + bottom=bot, color=col, edgecolor=col)) + + plt.ylabel('seconds') + plts.reverse() + barks.reverse() + plt.xticks(ind, [str(x) for x in blocks], rotation=45) + plt.legend([x[0] for x in plts], barks) + + plt.show() + + +# import numpy as np +# import matplotlib.pyplot as plt +# +# N = 5 +# menMeans = (20, 35, 30, 35, 27) +# womenMeans = (25, 32, 34, 20, 25) +# menStd = (2, 3, 4, 1, 2) +# womenStd = (3, 5, 2, 3, 3) +# ind = np.arange(N) # the x locations for the groups +# width = 0.35 # the width of the bars: can also be len(x) sequence +# +# p1 = plt.bar(ind, menMeans, width, yerr=menStd) +# p2 = plt.bar(ind, womenMeans, width, +# bottom=menMeans, yerr=womenStd) +# +# plt.ylabel('Scores') +# plt.title('Scores by group and gender') +# plt.xticks(ind, ('G1', 'G2', 'G3', 'G4', 'G5')) +# plt.yticks(np.arange(0, 81, 10)) +# plt.legend((p1[0], p2[0]), ('Men', 'Women')) +# +# plt.show() + +def print_keys(loc, date, run): + perffile = '%s/%s-perf-test-run%s.json' % (loc, date, run) + data = open(perffile, 'r') + readJson = json.loads(data.read()) + data.close() + for k in list(readJson[0].keys()): + print(k) + + +if __name__ == '__main__': + + n = datetime.now() + today = str(n.year) + str(n.month).rjust(2, '0') + str(n.day).rjust(2, '0') + parser = OptionParser() + parser.add_option('-l', '--location', dest='dir', default='data', + help='directory with data (default: data)') + parser.add_option('-d', '--date', dest='date', default=today, + help='date of data files YYYYMMDD (default: today)') + parser.add_option('-r', '--run', default='1', dest='run', + help='run number (default: 1)') + parser.add_option('-x', dest='xax', default='NumThreadsPerBlock', + help='variable name for x axis \ + (default: NumThreadsPerBlock)') + parser.add_option('-y', dest='yax', default='NumBlocksPerGrid', + help='variable name for y axis \ + (default: NumBlocksPerGrid)') + parser.add_option('-z', dest='zax', default='TotalTimeInWaveFuncs', + help='variable name for z axis \ + (default: TotalTimeInWaveFuncs)') + parser.add_option('--xrm', dest='xrm', default=0, + help='# of outer x dimensions to remove') + parser.add_option('--yrm', dest='yrm', default=0, + help='# of outer y dimensions to remove') + parser.add_option('-k', '--keys', dest='keys', action='store_true', + help='print available keys from data') + + (op, ar) = parser.parse_args() + + plotnames = ['2D', '3D', 'STACK'] + plot = '2D' + + xrm = 0 + yrm = 0 + if op.xrm: + xrm = int(op.xrm) + if op.yrm: + yrm = int(op.yrm) + + if op.keys: + print_keys(op.dir, op.date, op.run) + sys.exit(0) + + if (len(ar) == 1 and ar[0].upper() not in plotnames) or len(ar) > 1: + print(parser.print_help()) + sys.exit(1) + elif len(ar) == 1: + plot = ar[0].upper() + + p = Perf(op.date, op.run, op.xax, op.yax, op.zax, xrm, yrm, op.dir) + if plot == '3D': + p.plot3D() + if plot == '2D': + p.plot2D() + if plot == 'STACK': + p.plotStack() diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/profile.sh 
b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/profile.sh new file mode 100755 index 0000000000..1d60fa3542 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/profile.sh @@ -0,0 +1,182 @@ +#!/bin/bash + +usage(){ + echo "Usage (GUI analysis): $0 -l label [-cc] [-p #blocks #threads #iterations]" + echo "Usage (CL analysis): $0 -nogui [-p #blocks #threads #iterations]" + exit 1 +} + +# Default options +tag=cu +###cuargs="16384 32 12" # NEW DEFAULT 2020.08.10 (faster on local, and allows comparison to global and shared memory) +###ccargs=" 256 32 12" # Similar to cuda config, but faster than using "16384 32 12" +##cuargs="16384 32 2" # faster tests +##ccargs=" 256 32 2" # faster tests +cuargs="2048 256 1" # NEW DEFAULT 2021.04.06 (matches "-p 2048 256 12" but only one iteration) +ccargs="2048 256 1" # NEW DEFAULT 2021.04.06 (matches "-p 2048 256 12" but only one iteration) +args= +label= + +# Command line arguments +while [ "$1" != "" ]; do + # Profile C++ instead of cuda + if [ "$1" == "-cc" ]; then + if [ "$tag" != "nogui" ]; then + tag=cc + shift + else + echo "ERROR! Incompatible options -gui and -cc" + usage + fi + # Fast no-GUI profiling with ncu + elif [ "$1" == "-nogui" ]; then + if [ "$tag" != "cc" ]; then + tag=nogui + shift + else + echo "ERROR! Incompatible options -gui and -cc" + usage + fi + # Override blocks/threads/iterations + # (NB do not exceed 12 iterations: profiling overhead per iteration is huge) + elif [ "$1" == "-p" ]; then + if [ "$4" != "" ]; then + args="$2 $3 $4" + shift 4 + else + usage + fi + # Label + elif [ "$1" == "-l" ]; then + if [ "$2" != "" ]; then + label="$2" + shift 2 + else + usage + fi + # Invalid arguments + else + usage + fi +done + +if [ "$tag" == "cc" ]; then + if [ "$args" == "" ]; then args=$ccargs; fi + cmd="./check.exe -p $args" + make +else + if [ "$args" == "" ]; then args=$cuargs; fi + cmd="./gcheck.exe -p $args" + make +fi + +ncu="ncu" +nsys="nsys" +ncugui="ncu-ui &" +nsysgui="nsight-sys &" + +# Settings specific to CERN condor/batch nodes +###host=$(hostname) +###if [ "${host%%cern.ch}" != "${host}" ] && [ "${host##b}" != "${host}" ]; then +### ncu=/usr/local/cuda-11.0/bin/ncu +### ###nsys=/usr/local/cuda-10.1/bin/nsys +### ###nsys=/usr/local/cuda-10.2/bin/nsys +### nsys=/cvmfs/sft.cern.ch/lcg/releases/cuda/11.0RC-d9c38/x86_64-centos7-gcc62-opt/bin/nsys +### ncugui="Launch the Nsight Compute GUI from Windows" +### nsysgui="Launch the Nsight System GUI from Windows" +###fi + +# Settings specific to CERN IT/SC nodes +# (nsys 11.4 and 11.5 fail with 'boost::wrapexcept') +host=$(hostname) +if [ "${host%%cern.ch}" != "${host}" ] && [ "${host##itsc}" != "${host}" ]; then + CUDA_NSIGHT_HOME=/usr/local/cuda-11.1 + echo "Using Nsight from ${CUDA_NSIGHT_HOME}" + ncu=${CUDA_NSIGHT_HOME}/bin/ncu + nsys=${CUDA_NSIGHT_HOME}/bin/nsys + ncugui="${CUDA_NSIGHT_HOME}/bin/ncu-ui &" + nsysgui="${CUDA_NSIGHT_HOME}/bin/nsight-sys &" +fi + +# Set the ncu sampling period (default is auto) +# The value is in the range [0..31], the actual period is 2**(5+value) cycles. 
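+# (e.g. value 0 gives 2**(5+0) = 32 cycles between samples, i.e. the maximum sampling frequency, while value 31 gives 2**36 cycles, i.e. the minimum)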
+###ncu="${ncu} --sampling-interval 0" # MAX sampling frequency +###ncu="${ncu} --sampling-interval 31" # MIN sampling frequency + +# METRICS FOR COALESCED MEMORY ACCESS (AOSOA etc) +# See https://developer.nvidia.com/blog/using-nsight-compute-to-inspect-your-kernels/ +# These used to be called gld_transactions and global_load_requests +# See also https://docs.nvidia.com/nsight-compute/2019.5/NsightComputeCli/index.html#nvprof-metric-comparison +# See also https://stackoverflow.com/questions/60535867 +metrics=l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum + +# METRICS FOR REGISTER PRESSURE +metrics+=,launch__registers_per_thread + +# METRICS FOR DIVERGENCE +metrics+=,sm__sass_average_branch_targets_threads_uniform.pct + +# GUI analysis +if [ "$tag" != "nogui" ]; then + + if [ "$label" == "" ]; then + echo "ERROR! You must specify a label" + usage + fi + + arg1=$(echo $args | cut -d' ' -f1) + arg2=$(echo $args | cut -d' ' -f2) + arg3=$(echo $args | cut -d' ' -f3) + + ###if [ "${host%%raplab*}" != "${host}" ]; then + ### logs=nsight_logs_raplab + ###elif [ "${host%%cern.ch}" != "${host}" ] && [ "${host##b}" != "${host}" ]; then + ### logs=nsight_logs_lxbatch + ###else + ### logs=nsight_logs + ###fi + logs=nsight_logs + + if [ ! -d $logs ]; then mkdir -p $logs; fi + trace=$logs/Sigma_sm_gg_ttxgg_${tag}_`date +%m%d_%H%M`_b${arg1}_t${arg2}_i${arg3} + if [ "$label" != "" ]; then trace=${trace}_${label}; fi + + echo + echo "PROFILING: ${cmd}" + echo "OUTPUT: ${trace}.*" + echo + + \rm -f ${trace}.* + + hostname > ${trace}.txt + echo "nproc=$(nproc)" >> ${trace}.txt + echo >> ${trace}.txt + ( time ${cmd} ) 2>&1 | tee -a ${trace}.txt + nvidia-smi -q -d CLOCK >> ${trace}.txt + + if [ "$tag" == "cu" ]; then + echo + echo "${ncu} --set full --metrics ${metrics} -o ${trace} ${cmd}" + echo + ${ncu} --set full --metrics ${metrics} -o ${trace} ${cmd} + fi + echo + echo "${nsys} profile -o ${trace} ${cmd}" + echo + ${nsys} profile -o ${trace} ${cmd} + echo "" + echo "TO ANALYSE TRACE FILES:" + echo " ${ncugui}" + echo " ${nsysgui}" + +# NO-GUI analysis +else + + echo + echo "PROFILING: ${cmd}" + echo "${ncu} --metrics ${metrics} ${cmd}" + echo + echo sudo LD_LIBRARY_PATH=${LD_LIBRARY_PATH} $(which ${ncu}) --metrics ${metrics} --target-processes all ${cmd} + sudo LD_LIBRARY_PATH=${LD_LIBRARY_PATH} $(which ${ncu}) --metrics ${metrics} --target-processes all ${cmd} + +fi diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/runTest.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/runTest.cc new file mode 100644 index 0000000000..a1cec39ced --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/runTest.cc @@ -0,0 +1,251 @@ +#include "mgOnGpuConfig.h" + +#include "CPPProcess.h" +#include "MadgraphTest.h" +#include "MatrixElementKernels.h" +#include "MemoryAccessMatrixElements.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" +#include "RamboSamplingKernels.h" +#include "RandomNumberKernels.h" +#include "epoch_process_id.h" + +#ifdef __CUDACC__ +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +struct CUDA_CPU_TestBase : public TestDriverBase +{ + static constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + static constexpr int np4 = mgOnGpu::np4; + static constexpr int npar = mgOnGpu::npar; + static_assert( gputhreads % neppM == 0, "ERROR! #threads/block should be a multiple of neppM" ); + static_assert( gputhreads <= mgOnGpu::ntpbMAX, "ERROR! 
#threads/block should be <= ntpbMAX" ); + CUDA_CPU_TestBase( const std::string& refFileName ) + : TestDriverBase( npar, refFileName ) {} +}; + +#ifndef __CUDACC__ +struct CPUTest : public CUDA_CPU_TestBase +{ + // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) + // [NB the hst/dev memory arrays must be initialised in the constructor, see issue #290] + CPPProcess process; + HostBufferRndNumMomenta hstRndmom; + HostBufferMomenta hstMomenta; + HostBufferGs hstGs; + HostBufferRndNumHelicity hstRndHel; + HostBufferRndNumColor hstRndCol; + HostBufferWeights hstWeights; + HostBufferMatrixElements hstMatrixElements; + HostBufferSelectedHelicity hstSelHel; + HostBufferSelectedColor hstSelCol; + HostBufferHelicityMask hstIsGoodHel; + + // Create a process object + // Read param_card and set parameters + // ** WARNING EVIL EVIL ** + // The CPPProcess constructor has side effects on the globals Proc::cHel, which is needed in ME calculations. + // Don't remove! + CPUTest( const std::string& refFileName ) + : CUDA_CPU_TestBase( refFileName ) + , process( /*verbose=*/false ) + , hstRndmom( nevt ) + , hstMomenta( nevt ) + , hstGs( nevt ) + , hstRndHel( nevt ) + , hstRndCol( nevt ) + , hstWeights( nevt ) + , hstMatrixElements( nevt ) + , hstSelHel( nevt ) + , hstSelCol( nevt ) + , hstIsGoodHel( mgOnGpu::ncomb ) + { + process.initProc( "../../Cards/param_card.dat" ); + } + + virtual ~CPUTest() {} + + void prepareRandomNumbers( unsigned int iiter ) override + { + CommonRandomNumberKernel rnk( hstRndmom ); + rnk.seedGenerator( 1337 + iiter ); + rnk.generateRnarray(); + } + + void prepareMomenta( fptype energy ) override + { + RamboSamplingKernelHost rsk( energy, hstRndmom, hstMomenta, hstWeights, nevt ); + // --- 2a. Fill in momenta of initial state particles on the device + rsk.getMomentaInitial(); + // --- 2b. Fill in momenta of final state particles using the RAMBO algorithm on the device + // (i.e. map random numbers to final-state particle momenta for each of nevt events) + rsk.getMomentaFinal(); + } + + void runSigmaKin( std::size_t iiter ) override + { + constexpr fptype fixedG = 1.2177157847767195; // fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) + for( unsigned int i = 0; i < nevt; ++i ) hstGs[i] = fixedG; + MatrixElementKernelHost mek( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ); + if( iiter == 0 ) mek.computeGoodHelicities(); + constexpr unsigned int channelId = 0; // TEMPORARY? disable multi-channel in runTest.exe #466 + mek.computeMatrixElements( channelId ); + } + + fptype getMomentum( std::size_t ievt, unsigned int ipar, unsigned int ip4 ) const override + { + assert( ipar < npar ); + assert( ip4 < np4 ); + return MemoryAccessMomenta::ieventAccessIp4IparConst( hstMomenta.data(), ievt, ip4, ipar ); + } + + fptype getMatrixElement( std::size_t ievt ) const override + { + return MemoryAccessMatrixElements::ieventAccessConst( hstMatrixElements.data(), ievt ); + } +}; +#endif + +#ifdef __CUDACC__ +struct CUDATest : public CUDA_CPU_TestBase +{ + // Reset the device when our test goes out of scope. Note that this should happen after + // the frees, i.e. be declared before the pointers to device memory. 
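+  // (C++ destroys data members in reverse order of declaration, so a member declared first is destroyed last)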
+ struct DeviceReset + { + ~DeviceReset() + { + checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + } + } deviceResetter; + + // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) + // [NB the hst/dev memory arrays must be initialised in the constructor, see issue #290] + CPPProcess process; + PinnedHostBufferRndNumMomenta hstRndmom; + PinnedHostBufferMomenta hstMomenta; + PinnedHostBufferGs hstGs; + PinnedHostBufferRndNumHelicity hstRndHel; + PinnedHostBufferRndNumColor hstRndCol; + PinnedHostBufferWeights hstWeights; + PinnedHostBufferMatrixElements hstMatrixElements; + PinnedHostBufferSelectedHelicity hstSelHel; + PinnedHostBufferSelectedColor hstSelCol; + PinnedHostBufferHelicityMask hstIsGoodHel; + DeviceBufferRndNumMomenta devRndmom; + DeviceBufferMomenta devMomenta; + DeviceBufferGs devGs; + DeviceBufferRndNumHelicity devRndHel; + DeviceBufferRndNumColor devRndCol; + DeviceBufferWeights devWeights; + DeviceBufferMatrixElements devMatrixElements; + DeviceBufferSelectedHelicity devSelHel; + DeviceBufferSelectedColor devSelCol; + DeviceBufferHelicityMask devIsGoodHel; + + // Create a process object + // Read param_card and set parameters + // ** WARNING EVIL EVIL ** + // The CPPProcess constructor has side effects on the globals Proc::cHel, which is needed in ME calculations. + // Don't remove! + CUDATest( const std::string& refFileName ) + : CUDA_CPU_TestBase( refFileName ) + , process( /*verbose=*/false ) + , hstRndmom( nevt ) + , hstMomenta( nevt ) + , hstGs( nevt ) + , hstRndHel( nevt ) + , hstRndCol( nevt ) + , hstWeights( nevt ) + , hstMatrixElements( nevt ) + , hstSelHel( nevt ) + , hstSelCol( nevt ) + , hstIsGoodHel( mgOnGpu::ncomb ) + , devRndmom( nevt ) + , devMomenta( nevt ) + , devGs( nevt ) + , devRndHel( nevt ) + , devRndCol( nevt ) + , devWeights( nevt ) + , devMatrixElements( nevt ) + , devSelHel( nevt ) + , devSelCol( nevt ) + , devIsGoodHel( mgOnGpu::ncomb ) + { + process.initProc( "../../Cards/param_card.dat" ); + } + + virtual ~CUDATest() {} + + void prepareRandomNumbers( unsigned int iiter ) override + { + CommonRandomNumberKernel rnk( hstRndmom ); + rnk.seedGenerator( 1337 + iiter ); + rnk.generateRnarray(); + copyDeviceFromHost( devRndmom, hstRndmom ); + } + + void prepareMomenta( fptype energy ) override + { + RamboSamplingKernelDevice rsk( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ); + // --- 2a. Fill in momenta of initial state particles on the device + rsk.getMomentaInitial(); + // --- 2b. Fill in momenta of final state particles using the RAMBO algorithm on the device + // (i.e. map random numbers to final-state particle momenta for each of nevt events) + rsk.getMomentaFinal(); + // --- 2c. CopyDToH Weights + copyHostFromDevice( hstWeights, devWeights ); + // --- 2d. CopyDToH Momenta + copyHostFromDevice( hstMomenta, devMomenta ); + } + + void runSigmaKin( std::size_t iiter ) override + { + constexpr fptype fixedG = 1.2177157847767195; // fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) + for( unsigned int i = 0; i < nevt; ++i ) hstGs[i] = fixedG; + copyDeviceFromHost( devGs, hstGs ); // BUG FIX #566 + MatrixElementKernelDevice mek( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ); + if( iiter == 0 ) mek.computeGoodHelicities(); + constexpr unsigned int channelId = 0; // TEMPORARY? 
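The hard-coded value 1.2177157847767195 used for fixedG in runSigmaKin is, as the "aS=0.118" comment suggests, the QCD coupling g_s = sqrt( 4 * pi * alpha_s ) evaluated at alpha_s = 0.118. A small stand-alone check, not part of the patch, reproduces the constant:

#include <cmath>
#include <cstdio>

// Stand-alone check (assumption: fixedG is g_s = sqrt( 4 * pi * alpha_s ) at alpha_s = 0.118)
int main()
{
  const double pi = 3.141592653589793;
  const double aS = 0.118;
  const double fixedG = std::sqrt( 4.0 * pi * aS );
  std::printf( "%.16f\n", fixedG ); // ~1.2177157847767195
  return 0;
}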
disable multi-channel in runTest.exe #466 + mek.computeMatrixElements( channelId ); + copyHostFromDevice( hstMatrixElements, devMatrixElements ); + } + + fptype getMomentum( std::size_t ievt, unsigned int ipar, unsigned int ip4 ) const override + { + assert( ipar < npar ); + assert( ip4 < np4 ); + return MemoryAccessMomenta::ieventAccessIp4IparConst( hstMomenta.data(), ievt, ip4, ipar ); + } + + fptype getMatrixElement( std::size_t ievt ) const override + { + return MemoryAccessMatrixElements::ieventAccessConst( hstMatrixElements.data(), ievt ); + } +}; +#endif + +// Use two levels of macros to force stringification at the right level +// (see https://gcc.gnu.org/onlinedocs/gcc-3.0.1/cpp_3.html#SEC17 and https://stackoverflow.com/a/3419392) +// Google macro is in https://github.com/google/googletest/blob/master/googletest/include/gtest/gtest-param-test.h +#define TESTID_CPU( s ) s##_CPU +#define XTESTID_CPU( s ) TESTID_CPU( s ) +#define MG_INSTANTIATE_TEST_SUITE_CPU( prefix, test_suite_name ) \ +INSTANTIATE_TEST_SUITE_P( prefix, \ + test_suite_name, \ + testing::Values( new CPUTest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); +#define TESTID_GPU( s ) s##_GPU +#define XTESTID_GPU( s ) TESTID_GPU( s ) +#define MG_INSTANTIATE_TEST_SUITE_GPU( prefix, test_suite_name ) \ +INSTANTIATE_TEST_SUITE_P( prefix, \ + test_suite_name, \ + testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); + +#ifdef __CUDACC__ +MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); +#else +MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); +#endif diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/testmisc.cc new file mode 100644 index 0000000000..5fa8ac70fe --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/testmisc.cc @@ -0,0 +1,217 @@ +// Use ./runTest.exe --gtest_filter=*misc to run only this test + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "epoch_process_id.h" + +#include + +#include +#include + +#ifdef __CUDACC__ +#define TESTID( s ) s##_GPU_MISC +#else +#define TESTID( s ) s##_CPU_MISC +#endif + +#define XTESTID( s ) TESTID( s ) + +#ifdef MGONGPU_CPPSIMD /* clang-format off */ +bool maskand( const bool_v& mask ){ bool out = true; for ( int i=0; i=1] + EXPECT_TRUE( ( f[i] == 0 ) ); // equals 0, not 1 + } +#endif + } + +#ifdef MGONGPU_CPPSIMD + // Vector initialization for cxtype_sv - demonstrate fix for bug #339 + { + fptype_sv f1 = fptype_v{ 0 } + 1; + EXPECT_TRUE_sv( f1 == 1 ); + cxtype_v c12 = cxmake( f1, 2 ); + //std::cout << c12 << std::endl << boolTF( c12.real() == 1 ) << std::endl << boolTF( c12.imag() == 2 ) << std::endl; + EXPECT_TRUE_sv( c12.real() == 1 ); + EXPECT_TRUE_sv( c12.imag() == 2 ); + cxtype_v c21 = cxmake( 2, f1 ); + //std::cout << c21 << std::endl << boolTF( c21.real() == 2 ) << std::endl << boolTF( c21.imag() == 1 ) << std::endl; + EXPECT_TRUE_sv( c21.real() == 2 ); + EXPECT_TRUE_sv( c21.imag() == 1 ); + } +#endif + + // Vector initialization for cxtype_sv + { + cxtype_sv c = cxzero_sv(); + EXPECT_TRUE_sv( c.real() == 0 ); + EXPECT_TRUE_sv( c.imag() == 0 ); + } + { + cxtype_sv c = cxmake( 1, fptype_sv{ 0 } ); // here was a bug #339 + EXPECT_TRUE_sv( c.real() == 1 ); + EXPECT_TRUE_sv( c.imag() == 0 ); + } + { + cxtype_sv c = cxmake( fptype_sv{ 0 }, 1 ); // here was a bug #339 + EXPECT_TRUE_sv( c.real() == 0 ); + EXPECT_TRUE_sv( c.imag() == 1 ); + } + + // Array initialization for cxtype_sv array (example: jamp_sv in 
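The "= {}" and "{}" initializers exercised in the next two blocks matter because a local array with no initializer is default-initialized, leaving plain scalar members with indeterminate values, whereas an empty brace initializer value-initializes every element to zero. A minimal stand-alone illustration, using a hypothetical MyCx aggregate as a stand-in for a scalar complex type rather than the project's own cxtype:

#include <cstdio>

// Hypothetical stand-in for a scalar complex type (not the project's cxtype)
struct MyCx { double re; double im; };

int main()
{
  MyCx a[2];      // default-initialized: re/im hold indeterminate values
  MyCx b[2] = {}; // value-initialized: every element is (0,0)
  MyCx c[2]{};    // same effect with the alternative brace syntax
  std::printf( "b[0]=(%g,%g) c[1]=(%g,%g)\n", b[0].re, b[0].im, c[1].re, c[1].im ); // all zeros
  (void)a; // reading a[] here would be undefined behaviour, so it is deliberately not printed
  return 0;
}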
CPPProcess.cc) + { + cxtype_sv array[2] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxype is NOT, if "= {}" is missing!) + //std::cout << array[0].real() << std::endl; std::cout << boolTF( array[0].real() == 0 ) << std::endl; + EXPECT_TRUE_sv( array[0].real() == 0 ); + EXPECT_TRUE_sv( array[0].imag() == 0 ); + EXPECT_TRUE_sv( array[1].real() == 0 ); + EXPECT_TRUE_sv( array[1].imag() == 0 ); + } + + // Alternative array initialization for cxtype_sv array (example: was used for outwf in testxxx.cc) + { + cxtype_sv array[2]{}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxype is NOT, if "{}" is missing!) + //std::cout << array[0].real() << std::endl; std::cout << boolTF( array[0].real() == 0 ) << std::endl; + EXPECT_TRUE_sv( array[0].real() == 0 ); + EXPECT_TRUE_sv( array[0].imag() == 0 ); + EXPECT_TRUE_sv( array[1].real() == 0 ); + EXPECT_TRUE_sv( array[1].imag() == 0 ); + } + + //-------------------------------------------------------------------------- + + // Scalar complex references + { + using namespace mgOnGpu; + // Refs to f1, f2 + fptype f1 = 1; + fptype f2 = 2; + cxtype_ref r12( f1, f2 ); // copy refs + //cxtype_ref r12a( r12 ); //deleted + cxtype_ref r12a( cxtype_ref( f1, f2 ) ); // copy refs + //cxtype_ref r12b = r12; // deleted + cxtype_ref r12b = cxtype_ref( f1, f2 ); // copy refs + EXPECT_TRUE( cxtype( r12 ).real() == 1 ); + EXPECT_TRUE( cxtype( r12 ).imag() == 2 ); + EXPECT_TRUE( cxtype( r12a ).real() == 1 ); + EXPECT_TRUE( cxtype( r12a ).imag() == 2 ); + EXPECT_TRUE( cxtype( r12b ).real() == 1 ); + EXPECT_TRUE( cxtype( r12b ).imag() == 2 ); + // Refs to f1c, f2c + fptype f1c = 0; + fptype f2c = 0; + cxtype_ref r12c( f1c, f2c ); + EXPECT_TRUE( cxtype( r12c ).real() == 0 ); + EXPECT_TRUE( cxtype( r12c ).imag() == 0 ); + //r12c = r12; // deleted + r12c = cxtype( r12 ); // copy values + EXPECT_TRUE( cxtype( r12c ).real() == 1 ); + EXPECT_TRUE( cxtype( r12c ).imag() == 2 ); + // Update f1, f2 + f1 = 10; + f2 = 20; + EXPECT_TRUE( cxtype( r12 ).real() == 10 ); + EXPECT_TRUE( cxtype( r12 ).imag() == 20 ); + EXPECT_TRUE( cxtype( r12a ).real() == 10 ); + EXPECT_TRUE( cxtype( r12a ).imag() == 20 ); + EXPECT_TRUE( cxtype( r12b ).real() == 10 ); + EXPECT_TRUE( cxtype( r12b ).imag() == 20 ); + EXPECT_TRUE( cxtype( r12c ).real() == 1 ); // points to f1c, not to f1 + EXPECT_TRUE( cxtype( r12c ).imag() == 2 ); // points to f2c, not to f2 + } + + // Vector complex references + { + using namespace mgOnGpu; + // Refs to f1, f2 + fptype_sv f1 = fptype_sv{ 0 } + 1; + fptype_sv f2 = fptype_sv{ 0 } + 2; + cxtype_sv_ref r12( f1, f2 ); // copy refs + //cxtype_sv_ref r12a( r12 ); //deleted + cxtype_sv_ref r12a( cxtype_sv_ref( f1, f2 ) ); // copy refs + //cxtype_sv_ref r12b = r12; // deleted + cxtype_sv_ref r12b = cxtype_sv_ref( f1, f2 ); // copy refs + EXPECT_TRUE_sv( cxtype_sv( r12 ).real() == 1 ); + EXPECT_TRUE_sv( cxtype_sv( r12 ).imag() == 2 ); + EXPECT_TRUE_sv( cxtype_sv( r12a ).real() == 1 ); + EXPECT_TRUE_sv( cxtype_sv( r12a ).imag() == 2 ); + EXPECT_TRUE_sv( cxtype_sv( r12b ).real() == 1 ); + EXPECT_TRUE_sv( cxtype_sv( r12b ).imag() == 2 ); + // Refs to f1c, f2c + fptype_sv f1c = fptype_sv{ 0 }; + fptype_sv f2c = fptype_sv{ 0 }; + cxtype_sv_ref r12c( f1c, f2c ); + EXPECT_TRUE_sv( cxtype_sv( r12c ).real() == 0 ); + EXPECT_TRUE_sv( cxtype_sv( r12c ).imag() == 0 ); + //r12c = r12; // deleted + r12c = cxtype_sv( r12 ); // copy values + EXPECT_TRUE_sv( cxtype_sv( r12c ).real() == 1 ); + EXPECT_TRUE_sv( cxtype_sv( r12c ).imag() == 2 ); + // Update f1, 
f2 + f1 = fptype_sv{ 0 } + 10; + f2 = fptype_sv{ 0 } + 20; + EXPECT_TRUE_sv( cxtype_sv( r12 ).real() == 10 ); + EXPECT_TRUE_sv( cxtype_sv( r12 ).imag() == 20 ); + EXPECT_TRUE_sv( cxtype_sv( r12a ).real() == 10 ); + EXPECT_TRUE_sv( cxtype_sv( r12a ).imag() == 20 ); + EXPECT_TRUE_sv( cxtype_sv( r12b ).real() == 10 ); + EXPECT_TRUE_sv( cxtype_sv( r12b ).imag() == 20 ); + EXPECT_TRUE_sv( cxtype_sv( r12c ).real() == 1 ); // points to f1c, not to f1 + EXPECT_TRUE_sv( cxtype_sv( r12c ).imag() == 2 ); // points to f2c, not to f2 + } + + //-------------------------------------------------------------------------- +} diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/testxxx.cc new file mode 100644 index 0000000000..849678acca --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/testxxx.cc @@ -0,0 +1,323 @@ +#include "mgOnGpuConfig.h" + +#include "CPPProcess.h" +#include "HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h" +#include "MemoryAccessMomenta.h" +#include "MemoryAccessWavefunctions.h" +#include "MemoryBuffers.h" +#include "epoch_process_id.h" + +#include + +#include +#include +#include +#include +#include +#include +#ifdef __CUDACC__ +#define TESTID( s ) s##_GPU_XXX +#else +#define TESTID( s ) s##_CPU_XXX +#endif + +#define XTESTID( s ) TESTID( s ) + +TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) +{ + constexpr bool dumpEvents = false; // dump the expected output of the test? + constexpr bool testEvents = !dumpEvents; // run the test? + constexpr fptype toleranceXXXs = std::is_same::value ? 1.E-15 : 1.E-5; + // Constant parameters + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + using mgOnGpu::neppV; + using mgOnGpu::np4; + using mgOnGpu::npar; + const int nevt = 16; // 12 independent tests plus 4 duplicates (need a multiple of 8 for floats or for '512z') + assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM + assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV + // Fill in the input momenta +#ifdef __CUDACC__ + mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] +#else + mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] +#endif /* clang-format off */ + const fptype par0[np4 * nevt] = // AOS[nevt][np4] + { + 500, 0, 0, 500, // #0 (m=0 pT=0 E=pz>0) + 500, 0, 0, -500, // #1 (m=0 pT=0 -E=pz<0) + 500, 300, 400, 0, // #2 (m=0 pT>0 pz=0) + 500, 180, 240, 400, // #3 (m=0 pT>0 pz>0) + 500, 180, 240, -400, // #4 (m=0 pT>0 pz<0) + 500, 0, 0, 0, // #5 (m=50>0 pT=0 pz=0) + 500, 0, 0, 300, // #6 (m=40>0 pT=0 pz>0) + 500, 0, 0, -300, // #7 (m=40>0 pT=0 pz<0) + 500, 180, 240, 0, // #8 (m=40>0 pT>0 pz=0) + 500, -240, -180, 0, // #9 (m=40>0 pT>0 pz=0) + 500, 180, 192, 144, // #10 (m=40>0 pT>0 pz>0) + 500, 180, 192, -144, // #11 (m=40>0 pT>0 pz<0) + 500, 0, 0, 500, // DUPLICATE #12 == #0 (m=0 pT=0 E=pz>0) + 500, 0, 0, -500, // DUPLICATE #13 == #1 (m=0 pT=0 -E=pz<0) + 500, 300, 400, 0, // DUPLICATE #14 == #2 (m=0 pT>0 pz=0) + 500, 180, 240, 400 // DUPLICATE #15 == #3 (m=0 pT>0 pz>0) + }; /* clang-format on */ + // Array initialization: zero-out as "{0}" (C and C++) or as "{}" (C++ only) + // See https://en.cppreference.com/w/c/language/array_initialization#Notes + fptype mass0[nevt] = {}; + bool ispzgt0[nevt] = {}; + bool ispzlt0[nevt] = {}; + bool isptgt0[nevt] = {}; + for( int ievt = 0; ievt < nevt; ievt++ ) + { + const fptype p0 = par0[ievt * np4 + 0]; + const fptype p1 = par0[ievt * np4 + 1]; + const fptype p2 = 
par0[ievt * np4 + 2]; + const fptype p3 = par0[ievt * np4 + 3]; + mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 ); + ispzgt0[ievt] = ( p3 > 0 ); + ispzlt0[ievt] = ( p3 < 0 ); + isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 ); + } + const int ipar0 = 0; // use only particle0 for this test + for( int ievt = 0; ievt < nevt; ievt++ ) + { + for( int ip4 = 0; ip4 < np4; ip4++ ) + { + MemoryAccessMomenta::ieventAccessIp4Ipar( hstMomenta.data(), ievt, ip4, ipar0 ) = par0[ievt * np4 + ip4]; // AOS to AOSOA + } + } + // Expected output wavefunctions + std::vector> expwfs; +#include "testxxx_cc_ref.txt" // expwfs.push_back( {...} ); + std::string dumpFileName = "testxxx_cc_ref.txt.new"; + // Compute the output wavefunctions + // Dump new reference file if requested + using mgOnGpu::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + int itest = 0; // index on the expected output vector + std::ofstream dumpFile; + if( dumpEvents ) dumpFile.open( dumpFileName, std::ios::trunc ); + auto dumpwf6 = [&]( std::ostream& out, const cxtype_sv wf[6], const char* xxx, int ievt, int nsp, fptype mass ) + { + out << std::setprecision( 15 ) << std::scientific; + out << " expwfs.push_back( {"; + out << " // ---------" << std::endl; + for( int iw6 = 0; iw6 < nw6; iw6++ ) + { +#ifdef MGONGPU_CPPSIMD + const int ieppV = ievt % neppV; // #event in the current event vector in this iteration +#ifdef MGONGPU_HAS_CPPCXTYPEV_BRK + out << std::setw( 26 ) << cxreal( wf[iw6][ieppV] ) << ", "; + out << std::setw( 22 ) << cximag( wf[iw6][ieppV] ); +#else + out << std::setw( 26 ) << wf[iw6].real()[ieppV] << ", "; + out << std::setw( 22 ) << wf[iw6].imag()[ieppV]; +#endif +#else + out << std::setw( 26 ) << wf[iw6].real(); + out << ", " << std::setw( 22 ) << wf[iw6].imag(); +#endif + if( iw6 < nw6 - 1 ) + out << ", "; + else + out << " } );"; + out << " // itest=" << itest << ": " << xxx << "#" << ievt; + out << " nsp=" << nsp << " mass=" << (int)mass << std::endl; + } + out << std::defaultfloat; + }; + auto testwf6 = [&]( const cxtype_sv wf[6], const char* xxx, int ievt, int nsp, fptype mass ) + { + if( dumpEvents ) dumpwf6( dumpFile, wf, xxx, ievt, nsp, mass ); + if( testEvents ) + { + std::array& expwf = expwfs[itest]; + //std::cout << "Testing " << std::setw(3) << itest << ": " << xxx << " #" << ievt << std::endl; + ////for ( int iw6 = 0; iw6( outwfI ); // proof of concept for using fptype* in the interface + fptype* fp_outwfO = reinterpret_cast( outwfO ); // proof of concept for using fptype* in the interface + fptype* fp_outwf = reinterpret_cast( outwf ); // proof of concept for using fptype* in the interface + fptype* fp_outwf3 = reinterpret_cast( outwf3 ); // proof of concept for using fptype* in the interface + const int nhel = 1; + for( auto nsp: { -1, +1 } ) // antifermion/fermion (or initial/final for scalar and vector) + { + for( int ievt = 0; ievt < nevt; ievt++ ) + { +#ifdef __CUDACC__ + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + if( false ) + { + std::cout << std::endl; + for( int ip4 = 0; ip4 < np4; ip4++ ) std::cout << par0[ievt * np4 + ip4] << ", "; + std::cout << std::endl; + } + const int ipagV = ievt / neppV; // #event vector in this iteration + const fptype* ievt0Momenta = MemoryAccessMomenta::ieventAccessRecordConst( hstMomenta.data(), ipagV * neppV ); + // Test ixxxxx - NO ASSUMPTIONS + { + const fptype fmass = mass0[ievt]; + ixxxxx( ievt0Momenta, fmass, nhel, nsp, fp_outwfI, ipar0 ); + testwf6( outwfI, "ixxxxx", 
ievt, nsp, fmass ); + ixxxxx( ievt0Momenta, -fmass, nhel, nsp, fp_outwfI, ipar0 ); + testwf6( outwfI, "ixxxxx", ievt, nsp, -fmass ); + } + // Test ipzxxx - ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == +PZ > 0) + if( mass0[ievt] == 0 && !isptgt0[ievt] && ispzgt0[ievt] ) + { + ipzxxx( ievt0Momenta, nhel, nsp, fp_outwf, ipar0 ); + testwf6two( outwf, outwfI, "ipzxxx", ievt ); + testwf6( outwf, "ipzxxx", ievt, nsp, 0 ); + } + // Test imzxxx - ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == -PZ > 0) + if( mass0[ievt] == 0 && !isptgt0[ievt] && ispzlt0[ievt] ) + { + imzxxx( ievt0Momenta, nhel, nsp, fp_outwf, ipar0 ); + testwf6two( outwf, outwfI, "imzxxx", ievt ); + testwf6( outwf, "imzxxx", ievt, nsp, 0 ); + } + // Test ixzxxx - ASSUMPTIONS: (FMASS == 0) and (PT > 0) + if( mass0[ievt] == 0 && isptgt0[ievt] ) + { + ixzxxx( ievt0Momenta, nhel, nsp, fp_outwf, ipar0 ); + testwf6two( outwf, outwfI, "ixzxxx", ievt ); + testwf6( outwf, "ixzxxx", ievt, nsp, 0 ); + } + // Test vxxxxx - NO ASSUMPTIONS + { + const fptype vmass = mass0[ievt]; + vxxxxx( ievt0Momenta, vmass, nhel, nsp, fp_outwf, ipar0 ); + testwf6( outwf, "vxxxxx", ievt, nsp, vmass ); + vxxxxx( ievt0Momenta, -vmass, nhel, nsp, fp_outwf, ipar0 ); + testwf6( outwf, "vxxxxx", ievt, nsp, -vmass ); + } + // Test sxxxxx - NO ASSUMPTIONS + { + const fptype smass = mass0[ievt]; + sxxxxx( ievt0Momenta, nsp, fp_outwf3, ipar0 ); // no mass, no helicity (was "smass>0") + testwf6( outwf3, "sxxxxx", ievt, nsp, smass ); + sxxxxx( ievt0Momenta, nsp, fp_outwf3, ipar0 ); // no mass, no helicity (was "smass<0") + testwf6( outwf3, "sxxxxx", ievt, nsp, -smass ); + } + // Test oxxxxx - NO ASSUMPTIONS + { + const fptype fmass = mass0[ievt]; + oxxxxx( ievt0Momenta, fmass, nhel, nsp, fp_outwfO, ipar0 ); + testwf6( outwfO, "oxxxxx", ievt, nsp, fmass ); + oxxxxx( ievt0Momenta, -fmass, nhel, nsp, fp_outwfO, ipar0 ); + testwf6( outwfO, "oxxxxx", ievt, nsp, -fmass ); + } + // Test opzxxx - ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == +PZ > 0) + if( mass0[ievt] == 0 && !isptgt0[ievt] && ispzgt0[ievt] ) + { + opzxxx( ievt0Momenta, nhel, nsp, fp_outwf, ipar0 ); + testwf6two( outwf, outwfO, "opzxxx", ievt ); + testwf6( outwf, "opzxxx", ievt, nsp, 0 ); + } + // Test omzxxx - ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == -PZ > 0) + if( mass0[ievt] == 0 && !isptgt0[ievt] && ispzlt0[ievt] ) + { + omzxxx( ievt0Momenta, nhel, nsp, fp_outwf, ipar0 ); + testwf6two( outwf, outwfO, "omzxxx", ievt ); + testwf6( outwf, "omzxxx", ievt, nsp, 0 ); + } + // Test oxzxxx - ASSUMPTIONS: (FMASS == 0) and (PT > 0) + if( mass0[ievt] == 0 && isptgt0[ievt] ) + { + oxzxxx( ievt0Momenta, nhel, nsp, reinterpret_cast( outwf ), ipar0 ); + testwf6two( outwf, outwfO, "oxzxxx", ievt ); + testwf6( outwf, "oxzxxx", ievt, nsp, 0 ); + } + } + } + if( dumpEvents ) + { + dumpFile.close(); + std::cout << "INFO: New reference data dumped to file '" << dumpFileName << "'" << std::endl; + } +} + +//========================================================================== diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/testxxx_cc_ref.txt b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/testxxx_cc_ref.txt new file mode 100644 index 0000000000..8bc0384a68 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/testxxx_cc_ref.txt @@ -0,0 +1,2044 @@ + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=0: ixxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=0: ixxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 
-0.000000000000000e+00, // itest=0: ixxxxx#0 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=0: ixxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=0: ixxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=0: ixxxxx#0 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=1: ixxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=1: ixxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=1: ixxxxx#0 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=1: ixxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=1: ixxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=1: ixxxxx#0 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=2: ipzxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=2: ipzxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=2: ipzxxx#0 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=2: ipzxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=2: ipzxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=2: ipzxxx#0 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=3: vxxxxx#0 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=3: vxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=3: vxxxxx#0 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=3: vxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=3: vxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=3: vxxxxx#0 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=4: vxxxxx#0 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=4: vxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=4: vxxxxx#0 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=4: vxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=4: vxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=4: vxxxxx#0 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=5: sxxxxx#0 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=5: sxxxxx#0 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=5: sxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=5: sxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=5: sxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=5: sxxxxx#0 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=6: sxxxxx#0 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=6: sxxxxx#0 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=6: sxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=6: sxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=6: sxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=6: sxxxxx#0 nsp=-1 
mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=7: oxxxxx#0 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=7: oxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=7: oxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=7: oxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=7: oxxxxx#0 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=7: oxxxxx#0 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=8: oxxxxx#0 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=8: oxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=8: oxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=8: oxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=8: oxxxxx#0 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=8: oxxxxx#0 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=9: opzxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=9: opzxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=9: opzxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=9: opzxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=9: opzxxx#0 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=9: opzxxx#0 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=10: ixxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=10: ixxxxx#1 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=10: ixxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=10: ixxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=10: ixxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=10: ixxxxx#1 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=11: ixxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=11: ixxxxx#1 nsp=-1 mass=0 + -3.162277660168379e+01, -0.000000000000000e+00, // itest=11: ixxxxx#1 nsp=-1 mass=0 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=11: ixxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=11: ixxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=11: ixxxxx#1 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=12: imzxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=12: imzxxx#1 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=12: imzxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=12: imzxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=12: imzxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=12: imzxxx#1 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=13: vxxxxx#1 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=13: vxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=13: vxxxxx#1 nsp=-1 
mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=13: vxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=13: vxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=13: vxxxxx#1 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=14: vxxxxx#1 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=14: vxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=14: vxxxxx#1 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=14: vxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=14: vxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=14: vxxxxx#1 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=15: sxxxxx#1 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=15: sxxxxx#1 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=15: sxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=15: sxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=15: sxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=15: sxxxxx#1 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=16: sxxxxx#1 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=16: sxxxxx#1 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=16: sxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=16: sxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=16: sxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=16: sxxxxx#1 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=17: oxxxxx#1 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=17: oxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=17: oxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=17: oxxxxx#1 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=17: oxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=17: oxxxxx#1 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=18: oxxxxx#1 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=18: oxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=18: oxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=18: oxxxxx#1 nsp=-1 mass=0 + -3.162277660168379e+01, -0.000000000000000e+00, // itest=18: oxxxxx#1 nsp=-1 mass=0 + -0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=18: oxxxxx#1 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=19: omzxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=19: omzxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=19: omzxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=19: omzxxx#1 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=19: omzxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=19: omzxxx#1 nsp=-1 mass=0 + 
expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=20: ixxxxx#2 nsp=-1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=20: ixxxxx#2 nsp=-1 mass=0 + 1.341640786499874e+01, -1.788854381999832e+01, // itest=20: ixxxxx#2 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=20: ixxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=20: ixxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=20: ixxxxx#2 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=21: ixxxxx#2 nsp=-1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=21: ixxxxx#2 nsp=-1 mass=0 + 1.341640786499874e+01, -1.788854381999832e+01, // itest=21: ixxxxx#2 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=21: ixxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=21: ixxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=21: ixxxxx#2 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=22: ixzxxx#2 nsp=-1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=22: ixzxxx#2 nsp=-1 mass=0 + 1.341640786499874e+01, -1.788854381999832e+01, // itest=22: ixzxxx#2 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=22: ixzxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=22: ixzxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=22: ixzxxx#2 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=23: vxxxxx#2 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=23: vxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=23: vxxxxx#2 nsp=-1 mass=0 + -0.000000000000000e+00, 5.656854249492381e-01, // itest=23: vxxxxx#2 nsp=-1 mass=0 + -0.000000000000000e+00, -4.242640687119285e-01, // itest=23: vxxxxx#2 nsp=-1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=23: vxxxxx#2 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=24: vxxxxx#2 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=24: vxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=24: vxxxxx#2 nsp=-1 mass=0 + -0.000000000000000e+00, 5.656854249492381e-01, // itest=24: vxxxxx#2 nsp=-1 mass=0 + -0.000000000000000e+00, -4.242640687119285e-01, // itest=24: vxxxxx#2 nsp=-1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=24: vxxxxx#2 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=25: sxxxxx#2 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=25: sxxxxx#2 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=25: sxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=25: sxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=25: sxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=25: sxxxxx#2 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=26: sxxxxx#2 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=26: sxxxxx#2 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=26: 
sxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=26: sxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=26: sxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=26: sxxxxx#2 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=27: oxxxxx#2 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=27: oxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=27: oxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=27: oxxxxx#2 nsp=-1 mass=0 + 1.341640786499874e+01, 1.788854381999832e+01, // itest=27: oxxxxx#2 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=27: oxxxxx#2 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=28: oxxxxx#2 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=28: oxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=28: oxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=28: oxxxxx#2 nsp=-1 mass=0 + 1.341640786499874e+01, 1.788854381999832e+01, // itest=28: oxxxxx#2 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=28: oxxxxx#2 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=29: oxzxxx#2 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=29: oxzxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=29: oxzxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=29: oxzxxx#2 nsp=-1 mass=0 + 1.341640786499874e+01, 1.788854381999832e+01, // itest=29: oxzxxx#2 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=29: oxzxxx#2 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=30: ixxxxx#3 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=30: ixxxxx#3 nsp=-1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=30: ixxxxx#3 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00, // itest=30: ixxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=30: ixxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=30: ixxxxx#3 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=31: ixxxxx#3 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=31: ixxxxx#3 nsp=-1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=31: ixxxxx#3 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00, // itest=31: ixxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=31: ixxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=31: ixxxxx#3 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=32: ixzxxx#3 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=32: ixzxxx#3 nsp=-1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=32: ixzxxx#3 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00, // itest=32: ixzxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=32: ixzxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=32: ixzxxx#3 nsp=-1 
mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=33: vxxxxx#3 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=33: vxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=33: vxxxxx#3 nsp=-1 mass=0 + -3.394112549695428e-01, 5.656854249492381e-01, // itest=33: vxxxxx#3 nsp=-1 mass=0 + -4.525483399593904e-01, -4.242640687119285e-01, // itest=33: vxxxxx#3 nsp=-1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=33: vxxxxx#3 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=34: vxxxxx#3 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=34: vxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=34: vxxxxx#3 nsp=-1 mass=0 + -3.394112549695428e-01, 5.656854249492381e-01, // itest=34: vxxxxx#3 nsp=-1 mass=0 + -4.525483399593904e-01, -4.242640687119285e-01, // itest=34: vxxxxx#3 nsp=-1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=34: vxxxxx#3 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=35: sxxxxx#3 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=35: sxxxxx#3 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=35: sxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=35: sxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=35: sxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=35: sxxxxx#3 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=36: sxxxxx#3 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=36: sxxxxx#3 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=36: sxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=36: sxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=36: sxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=36: sxxxxx#3 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=37: oxxxxx#3 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=37: oxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=37: oxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=37: oxxxxx#3 nsp=-1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00, // itest=37: oxxxxx#3 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00 } ); // itest=37: oxxxxx#3 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=38: oxxxxx#3 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=38: oxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=38: oxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=38: oxxxxx#3 nsp=-1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00, // itest=38: oxxxxx#3 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00 } ); // itest=38: oxxxxx#3 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=39: oxzxxx#3 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=39: oxzxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // 
itest=39: oxzxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=39: oxzxxx#3 nsp=-1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00, // itest=39: oxzxxx#3 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00 } ); // itest=39: oxzxxx#3 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=40: ixxxxx#4 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=40: ixxxxx#4 nsp=-1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=40: ixxxxx#4 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00, // itest=40: ixxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=40: ixxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=40: ixxxxx#4 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=41: ixxxxx#4 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=41: ixxxxx#4 nsp=-1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=41: ixxxxx#4 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00, // itest=41: ixxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=41: ixxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=41: ixxxxx#4 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=42: ixzxxx#4 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=42: ixzxxx#4 nsp=-1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=42: ixzxxx#4 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00, // itest=42: ixzxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=42: ixzxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=42: ixzxxx#4 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=43: vxxxxx#4 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=43: vxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=43: vxxxxx#4 nsp=-1 mass=0 + 3.394112549695428e-01, 5.656854249492381e-01, // itest=43: vxxxxx#4 nsp=-1 mass=0 + 4.525483399593904e-01, -4.242640687119285e-01, // itest=43: vxxxxx#4 nsp=-1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=43: vxxxxx#4 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=44: vxxxxx#4 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=44: vxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=44: vxxxxx#4 nsp=-1 mass=0 + 3.394112549695428e-01, 5.656854249492381e-01, // itest=44: vxxxxx#4 nsp=-1 mass=0 + 4.525483399593904e-01, -4.242640687119285e-01, // itest=44: vxxxxx#4 nsp=-1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=44: vxxxxx#4 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=45: sxxxxx#4 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=45: sxxxxx#4 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=45: sxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=45: sxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=45: sxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=45: sxxxxx#4 
nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=46: sxxxxx#4 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=46: sxxxxx#4 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=46: sxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=46: sxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=46: sxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=46: sxxxxx#4 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=47: oxxxxx#4 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=47: oxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=47: oxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=47: oxxxxx#4 nsp=-1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01, // itest=47: oxxxxx#4 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=47: oxxxxx#4 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=48: oxxxxx#4 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=48: oxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=48: oxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=48: oxxxxx#4 nsp=-1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01, // itest=48: oxxxxx#4 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=48: oxxxxx#4 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=49: oxzxxx#4 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=49: oxzxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=49: oxzxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=49: oxzxxx#4 nsp=-1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01, // itest=49: oxzxxx#4 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=49: oxzxxx#4 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=50: ixxxxx#5 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=50: ixxxxx#5 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=50: ixxxxx#5 nsp=-1 mass=500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=50: ixxxxx#5 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=50: ixxxxx#5 nsp=-1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=50: ixxxxx#5 nsp=-1 mass=500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=51: ixxxxx#5 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=51: ixxxxx#5 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=51: ixxxxx#5 nsp=-1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=51: ixxxxx#5 nsp=-1 mass=-500 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=51: ixxxxx#5 nsp=-1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=51: ixxxxx#5 nsp=-1 mass=-500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=52: vxxxxx#5 nsp=-1 mass=500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=52: vxxxxx#5 nsp=-1 mass=500 + 
0.000000000000000e+00, 0.000000000000000e+00, // itest=52: vxxxxx#5 nsp=-1 mass=500 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=52: vxxxxx#5 nsp=-1 mass=500 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=52: vxxxxx#5 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=52: vxxxxx#5 nsp=-1 mass=500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=53: vxxxxx#5 nsp=-1 mass=-500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=53: vxxxxx#5 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=53: vxxxxx#5 nsp=-1 mass=-500 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=53: vxxxxx#5 nsp=-1 mass=-500 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=53: vxxxxx#5 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=53: vxxxxx#5 nsp=-1 mass=-500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=54: sxxxxx#5 nsp=-1 mass=500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=54: sxxxxx#5 nsp=-1 mass=500 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=54: sxxxxx#5 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=54: sxxxxx#5 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=54: sxxxxx#5 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=54: sxxxxx#5 nsp=-1 mass=500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=55: sxxxxx#5 nsp=-1 mass=-500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=55: sxxxxx#5 nsp=-1 mass=-500 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=55: sxxxxx#5 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=55: sxxxxx#5 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=55: sxxxxx#5 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=55: sxxxxx#5 nsp=-1 mass=-500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=56: oxxxxx#5 nsp=-1 mass=500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=56: oxxxxx#5 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=56: oxxxxx#5 nsp=-1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=56: oxxxxx#5 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=56: oxxxxx#5 nsp=-1 mass=500 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=56: oxxxxx#5 nsp=-1 mass=500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=57: oxxxxx#5 nsp=-1 mass=-500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=57: oxxxxx#5 nsp=-1 mass=-500 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=57: oxxxxx#5 nsp=-1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=57: oxxxxx#5 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=57: oxxxxx#5 nsp=-1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=57: oxxxxx#5 nsp=-1 mass=-500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=58: ixxxxx#6 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=58: ixxxxx#6 nsp=-1 mass=400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=58: ixxxxx#6 nsp=-1 mass=400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=58: ixxxxx#6 nsp=-1 mass=400 + 
-0.000000000000000e+00, 0.000000000000000e+00, // itest=58: ixxxxx#6 nsp=-1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00 } ); // itest=58: ixxxxx#6 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=59: ixxxxx#6 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=59: ixxxxx#6 nsp=-1 mass=-400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=59: ixxxxx#6 nsp=-1 mass=-400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=59: ixxxxx#6 nsp=-1 mass=-400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=59: ixxxxx#6 nsp=-1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00 } ); // itest=59: ixxxxx#6 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=60: vxxxxx#6 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=60: vxxxxx#6 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=60: vxxxxx#6 nsp=-1 mass=400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=60: vxxxxx#6 nsp=-1 mass=400 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=60: vxxxxx#6 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=60: vxxxxx#6 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=61: vxxxxx#6 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=61: vxxxxx#6 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=61: vxxxxx#6 nsp=-1 mass=-400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=61: vxxxxx#6 nsp=-1 mass=-400 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=61: vxxxxx#6 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=61: vxxxxx#6 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=62: sxxxxx#6 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=62: sxxxxx#6 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=62: sxxxxx#6 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=62: sxxxxx#6 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=62: sxxxxx#6 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=62: sxxxxx#6 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=63: sxxxxx#6 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=63: sxxxxx#6 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=63: sxxxxx#6 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=63: sxxxxx#6 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=63: sxxxxx#6 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=63: sxxxxx#6 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=64: oxxxxx#6 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=64: oxxxxx#6 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=64: oxxxxx#6 nsp=-1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=64: oxxxxx#6 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=64: oxxxxx#6 nsp=-1 mass=400 + -2.828427124746190e+01, -0.000000000000000e+00 } ); // itest=64: oxxxxx#6 nsp=-1 
mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=65: oxxxxx#6 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=65: oxxxxx#6 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=65: oxxxxx#6 nsp=-1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=65: oxxxxx#6 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=65: oxxxxx#6 nsp=-1 mass=-400 + -2.828427124746190e+01, -0.000000000000000e+00 } ); // itest=65: oxxxxx#6 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=66: ixxxxx#7 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=66: ixxxxx#7 nsp=-1 mass=400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=66: ixxxxx#7 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=66: ixxxxx#7 nsp=-1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=66: ixxxxx#7 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=66: ixxxxx#7 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=67: ixxxxx#7 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=67: ixxxxx#7 nsp=-1 mass=-400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=67: ixxxxx#7 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=67: ixxxxx#7 nsp=-1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=67: ixxxxx#7 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00 } ); // itest=67: ixxxxx#7 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=68: vxxxxx#7 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=68: vxxxxx#7 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=68: vxxxxx#7 nsp=-1 mass=400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=68: vxxxxx#7 nsp=-1 mass=400 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=68: vxxxxx#7 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=68: vxxxxx#7 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=69: vxxxxx#7 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=69: vxxxxx#7 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=69: vxxxxx#7 nsp=-1 mass=-400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=69: vxxxxx#7 nsp=-1 mass=-400 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=69: vxxxxx#7 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=69: vxxxxx#7 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=70: sxxxxx#7 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=70: sxxxxx#7 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=70: sxxxxx#7 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=70: sxxxxx#7 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=70: sxxxxx#7 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=70: sxxxxx#7 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=71: sxxxxx#7 nsp=-1 mass=-400 + -0.000000000000000e+00, 
-0.000000000000000e+00, // itest=71: sxxxxx#7 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=71: sxxxxx#7 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=71: sxxxxx#7 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=71: sxxxxx#7 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=71: sxxxxx#7 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=72: oxxxxx#7 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=72: oxxxxx#7 nsp=-1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=72: oxxxxx#7 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=72: oxxxxx#7 nsp=-1 mass=400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=72: oxxxxx#7 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00 } ); // itest=72: oxxxxx#7 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=73: oxxxxx#7 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=73: oxxxxx#7 nsp=-1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=73: oxxxxx#7 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=73: oxxxxx#7 nsp=-1 mass=-400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=73: oxxxxx#7 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00 } ); // itest=73: oxxxxx#7 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=74: ixxxxx#8 nsp=-1 mass=400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=74: ixxxxx#8 nsp=-1 mass=400 + 1.200000000000000e+01, -1.600000000000000e+01, // itest=74: ixxxxx#8 nsp=-1 mass=400 + -2.000000000000000e+01, -0.000000000000000e+00, // itest=74: ixxxxx#8 nsp=-1 mass=400 + -5.999999999999999e+00, 7.999999999999999e+00, // itest=74: ixxxxx#8 nsp=-1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=74: ixxxxx#8 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=75: ixxxxx#8 nsp=-1 mass=-400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=75: ixxxxx#8 nsp=-1 mass=-400 + 1.200000000000000e+01, -1.600000000000000e+01, // itest=75: ixxxxx#8 nsp=-1 mass=-400 + -2.000000000000000e+01, -0.000000000000000e+00, // itest=75: ixxxxx#8 nsp=-1 mass=-400 + 5.999999999999999e+00, -7.999999999999999e+00, // itest=75: ixxxxx#8 nsp=-1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00 } ); // itest=75: ixxxxx#8 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=76: vxxxxx#8 nsp=-1 mass=400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=76: vxxxxx#8 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=76: vxxxxx#8 nsp=-1 mass=400 + 0.000000000000000e+00, 5.656854249492381e-01, // itest=76: vxxxxx#8 nsp=-1 mass=400 + 0.000000000000000e+00, -4.242640687119285e-01, // itest=76: vxxxxx#8 nsp=-1 mass=400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=76: vxxxxx#8 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=77: vxxxxx#8 nsp=-1 mass=-400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=77: vxxxxx#8 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=77: vxxxxx#8 nsp=-1 mass=-400 + 
-0.000000000000000e+00, 5.656854249492381e-01, // itest=77: vxxxxx#8 nsp=-1 mass=-400 + -0.000000000000000e+00, -4.242640687119285e-01, // itest=77: vxxxxx#8 nsp=-1 mass=-400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=77: vxxxxx#8 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=78: sxxxxx#8 nsp=-1 mass=400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=78: sxxxxx#8 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=78: sxxxxx#8 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=78: sxxxxx#8 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=78: sxxxxx#8 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=78: sxxxxx#8 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=79: sxxxxx#8 nsp=-1 mass=-400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=79: sxxxxx#8 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=79: sxxxxx#8 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=79: sxxxxx#8 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=79: sxxxxx#8 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=79: sxxxxx#8 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=80: oxxxxx#8 nsp=-1 mass=400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=80: oxxxxx#8 nsp=-1 mass=400 + -5.999999999999999e+00, -7.999999999999999e+00, // itest=80: oxxxxx#8 nsp=-1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=80: oxxxxx#8 nsp=-1 mass=400 + 1.200000000000000e+01, 1.600000000000000e+01, // itest=80: oxxxxx#8 nsp=-1 mass=400 + -2.000000000000000e+01, -0.000000000000000e+00 } ); // itest=80: oxxxxx#8 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=81: oxxxxx#8 nsp=-1 mass=-400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=81: oxxxxx#8 nsp=-1 mass=-400 + 5.999999999999999e+00, 7.999999999999999e+00, // itest=81: oxxxxx#8 nsp=-1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=81: oxxxxx#8 nsp=-1 mass=-400 + 1.200000000000000e+01, 1.600000000000000e+01, // itest=81: oxxxxx#8 nsp=-1 mass=-400 + -2.000000000000000e+01, -0.000000000000000e+00 } ); // itest=81: oxxxxx#8 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=82: ixxxxx#9 nsp=-1 mass=400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=82: ixxxxx#9 nsp=-1 mass=400 + -1.600000000000000e+01, 1.200000000000000e+01, // itest=82: ixxxxx#9 nsp=-1 mass=400 + -2.000000000000000e+01, -0.000000000000000e+00, // itest=82: ixxxxx#9 nsp=-1 mass=400 + 7.999999999999999e+00, -5.999999999999999e+00, // itest=82: ixxxxx#9 nsp=-1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=82: ixxxxx#9 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=83: ixxxxx#9 nsp=-1 mass=-400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=83: ixxxxx#9 nsp=-1 mass=-400 + -1.600000000000000e+01, 1.200000000000000e+01, // itest=83: ixxxxx#9 nsp=-1 mass=-400 + -2.000000000000000e+01, -0.000000000000000e+00, // itest=83: ixxxxx#9 nsp=-1 mass=-400 + -7.999999999999999e+00, 5.999999999999999e+00, // itest=83: ixxxxx#9 nsp=-1 
mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00 } ); // itest=83: ixxxxx#9 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=84: vxxxxx#9 nsp=-1 mass=400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=84: vxxxxx#9 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=84: vxxxxx#9 nsp=-1 mass=400 + 0.000000000000000e+00, -4.242640687119285e-01, // itest=84: vxxxxx#9 nsp=-1 mass=400 + 0.000000000000000e+00, 5.656854249492381e-01, // itest=84: vxxxxx#9 nsp=-1 mass=400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=84: vxxxxx#9 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=85: vxxxxx#9 nsp=-1 mass=-400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=85: vxxxxx#9 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=85: vxxxxx#9 nsp=-1 mass=-400 + 0.000000000000000e+00, -4.242640687119285e-01, // itest=85: vxxxxx#9 nsp=-1 mass=-400 + 0.000000000000000e+00, 5.656854249492381e-01, // itest=85: vxxxxx#9 nsp=-1 mass=-400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=85: vxxxxx#9 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=86: sxxxxx#9 nsp=-1 mass=400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=86: sxxxxx#9 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=86: sxxxxx#9 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=86: sxxxxx#9 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=86: sxxxxx#9 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=86: sxxxxx#9 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=87: sxxxxx#9 nsp=-1 mass=-400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=87: sxxxxx#9 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=87: sxxxxx#9 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=87: sxxxxx#9 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=87: sxxxxx#9 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=87: sxxxxx#9 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=88: oxxxxx#9 nsp=-1 mass=400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=88: oxxxxx#9 nsp=-1 mass=400 + 7.999999999999999e+00, 5.999999999999999e+00, // itest=88: oxxxxx#9 nsp=-1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=88: oxxxxx#9 nsp=-1 mass=400 + -1.600000000000000e+01, -1.200000000000000e+01, // itest=88: oxxxxx#9 nsp=-1 mass=400 + -2.000000000000000e+01, -0.000000000000000e+00 } ); // itest=88: oxxxxx#9 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=89: oxxxxx#9 nsp=-1 mass=-400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=89: oxxxxx#9 nsp=-1 mass=-400 + -7.999999999999999e+00, -5.999999999999999e+00, // itest=89: oxxxxx#9 nsp=-1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=89: oxxxxx#9 nsp=-1 mass=-400 + -1.600000000000000e+01, -1.200000000000000e+01, // itest=89: oxxxxx#9 nsp=-1 mass=-400 + -2.000000000000000e+01, -0.000000000000000e+00 } ); // itest=89: oxxxxx#9 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 
1.440000000000000e+02, // itest=90: ixxxxx#10 nsp=-1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=90: ixxxxx#10 nsp=-1 mass=400 + 9.863939238321439e+00, -1.052153518754287e+01, // itest=90: ixxxxx#10 nsp=-1 mass=400 + -2.433105012119288e+01, -0.000000000000000e+00, // itest=90: ixxxxx#10 nsp=-1 mass=400 + -4.931969619160719e+00, 5.260767593771432e+00, // itest=90: ixxxxx#10 nsp=-1 mass=400 + 1.216552506059644e+01, 0.000000000000000e+00 } ); // itest=90: ixxxxx#10 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=91: ixxxxx#10 nsp=-1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=91: ixxxxx#10 nsp=-1 mass=-400 + 9.863939238321439e+00, -1.052153518754287e+01, // itest=91: ixxxxx#10 nsp=-1 mass=-400 + -2.433105012119288e+01, -0.000000000000000e+00, // itest=91: ixxxxx#10 nsp=-1 mass=-400 + 4.931969619160719e+00, -5.260767593771432e+00, // itest=91: ixxxxx#10 nsp=-1 mass=-400 + -1.216552506059644e+01, -0.000000000000000e+00 } ); // itest=91: ixxxxx#10 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=92: vxxxxx#10 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=92: vxxxxx#10 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=92: vxxxxx#10 nsp=-1 mass=400 + -2.321373168788980e-01, 5.158607041753289e-01, // itest=92: vxxxxx#10 nsp=-1 mass=400 + -2.476131380041579e-01, -4.836194101643708e-01, // itest=92: vxxxxx#10 nsp=-1 mass=400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=92: vxxxxx#10 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=93: vxxxxx#10 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=93: vxxxxx#10 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=93: vxxxxx#10 nsp=-1 mass=-400 + -2.321373168788980e-01, 5.158607041753289e-01, // itest=93: vxxxxx#10 nsp=-1 mass=-400 + -2.476131380041579e-01, -4.836194101643708e-01, // itest=93: vxxxxx#10 nsp=-1 mass=-400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=93: vxxxxx#10 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=94: sxxxxx#10 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=94: sxxxxx#10 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=94: sxxxxx#10 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=94: sxxxxx#10 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=94: sxxxxx#10 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=94: sxxxxx#10 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=95: sxxxxx#10 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=95: sxxxxx#10 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=95: sxxxxx#10 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=95: sxxxxx#10 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=95: sxxxxx#10 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=95: sxxxxx#10 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=96: oxxxxx#10 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=96: oxxxxx#10 
nsp=-1 mass=400 + -4.931969619160719e+00, -5.260767593771432e+00, // itest=96: oxxxxx#10 nsp=-1 mass=400 + 1.216552506059644e+01, 0.000000000000000e+00, // itest=96: oxxxxx#10 nsp=-1 mass=400 + 9.863939238321439e+00, 1.052153518754287e+01, // itest=96: oxxxxx#10 nsp=-1 mass=400 + -2.433105012119288e+01, -0.000000000000000e+00 } ); // itest=96: oxxxxx#10 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=97: oxxxxx#10 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=97: oxxxxx#10 nsp=-1 mass=-400 + 4.931969619160719e+00, 5.260767593771432e+00, // itest=97: oxxxxx#10 nsp=-1 mass=-400 + -1.216552506059644e+01, -0.000000000000000e+00, // itest=97: oxxxxx#10 nsp=-1 mass=-400 + 9.863939238321439e+00, 1.052153518754287e+01, // itest=97: oxxxxx#10 nsp=-1 mass=-400 + -2.433105012119288e+01, -0.000000000000000e+00 } ); // itest=97: oxxxxx#10 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=98: ixxxxx#11 nsp=-1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=98: ixxxxx#11 nsp=-1 mass=400 + 1.664100588675688e+01, -1.775040627920733e+01, // itest=98: ixxxxx#11 nsp=-1 mass=400 + -1.442220510185596e+01, -0.000000000000000e+00, // itest=98: ixxxxx#11 nsp=-1 mass=400 + -8.320502943378436e+00, 8.875203139603666e+00, // itest=98: ixxxxx#11 nsp=-1 mass=400 + 7.211102550927978e+00, 0.000000000000000e+00 } ); // itest=98: ixxxxx#11 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=99: ixxxxx#11 nsp=-1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=99: ixxxxx#11 nsp=-1 mass=-400 + 1.664100588675688e+01, -1.775040627920733e+01, // itest=99: ixxxxx#11 nsp=-1 mass=-400 + -1.442220510185596e+01, -0.000000000000000e+00, // itest=99: ixxxxx#11 nsp=-1 mass=-400 + 8.320502943378436e+00, -8.875203139603666e+00, // itest=99: ixxxxx#11 nsp=-1 mass=-400 + -7.211102550927978e+00, -0.000000000000000e+00 } ); // itest=99: ixxxxx#11 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=100: vxxxxx#11 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=100: vxxxxx#11 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=100: vxxxxx#11 nsp=-1 mass=400 + 2.321373168788980e-01, 5.158607041753289e-01, // itest=100: vxxxxx#11 nsp=-1 mass=400 + 2.476131380041579e-01, -4.836194101643708e-01, // itest=100: vxxxxx#11 nsp=-1 mass=400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=100: vxxxxx#11 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=101: vxxxxx#11 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=101: vxxxxx#11 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=101: vxxxxx#11 nsp=-1 mass=-400 + 2.321373168788980e-01, 5.158607041753289e-01, // itest=101: vxxxxx#11 nsp=-1 mass=-400 + 2.476131380041579e-01, -4.836194101643708e-01, // itest=101: vxxxxx#11 nsp=-1 mass=-400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=101: vxxxxx#11 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=102: sxxxxx#11 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=102: sxxxxx#11 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=102: sxxxxx#11 nsp=-1 mass=400 + 
0.000000000000000e+00, 0.000000000000000e+00, // itest=102: sxxxxx#11 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=102: sxxxxx#11 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=102: sxxxxx#11 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=103: sxxxxx#11 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=103: sxxxxx#11 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=103: sxxxxx#11 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=103: sxxxxx#11 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=103: sxxxxx#11 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=103: sxxxxx#11 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=104: oxxxxx#11 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=104: oxxxxx#11 nsp=-1 mass=400 + -8.320502943378436e+00, -8.875203139603666e+00, // itest=104: oxxxxx#11 nsp=-1 mass=400 + 7.211102550927978e+00, 0.000000000000000e+00, // itest=104: oxxxxx#11 nsp=-1 mass=400 + 1.664100588675688e+01, 1.775040627920733e+01, // itest=104: oxxxxx#11 nsp=-1 mass=400 + -1.442220510185596e+01, -0.000000000000000e+00 } ); // itest=104: oxxxxx#11 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=105: oxxxxx#11 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=105: oxxxxx#11 nsp=-1 mass=-400 + 8.320502943378436e+00, 8.875203139603666e+00, // itest=105: oxxxxx#11 nsp=-1 mass=-400 + -7.211102550927978e+00, -0.000000000000000e+00, // itest=105: oxxxxx#11 nsp=-1 mass=-400 + 1.664100588675688e+01, 1.775040627920733e+01, // itest=105: oxxxxx#11 nsp=-1 mass=-400 + -1.442220510185596e+01, -0.000000000000000e+00 } ); // itest=105: oxxxxx#11 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=106: ixxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=106: ixxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=106: ixxxxx#12 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=106: ixxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=106: ixxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=106: ixxxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=107: ixxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=107: ixxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=107: ixxxxx#12 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=107: ixxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=107: ixxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=107: ixxxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=108: ipzxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=108: ipzxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=108: ipzxxx#12 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=108: ipzxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=108: 
ipzxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=108: ipzxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=109: vxxxxx#12 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=109: vxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=109: vxxxxx#12 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=109: vxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=109: vxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=109: vxxxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=110: vxxxxx#12 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=110: vxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=110: vxxxxx#12 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=110: vxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=110: vxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=110: vxxxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=111: sxxxxx#12 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=111: sxxxxx#12 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=111: sxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=111: sxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=111: sxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=111: sxxxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=112: sxxxxx#12 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=112: sxxxxx#12 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=112: sxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=112: sxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=112: sxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=112: sxxxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=113: oxxxxx#12 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=113: oxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=113: oxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=113: oxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=113: oxxxxx#12 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=113: oxxxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=114: oxxxxx#12 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=114: oxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=114: oxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=114: oxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=114: oxxxxx#12 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=114: oxxxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 
-5.000000000000000e+02, // itest=115: opzxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=115: opzxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=115: opzxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=115: opzxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=115: opzxxx#12 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=115: opzxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=116: ixxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=116: ixxxxx#13 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=116: ixxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=116: ixxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=116: ixxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=116: ixxxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=117: ixxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=117: ixxxxx#13 nsp=-1 mass=0 + -3.162277660168379e+01, -0.000000000000000e+00, // itest=117: ixxxxx#13 nsp=-1 mass=0 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=117: ixxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=117: ixxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=117: ixxxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=118: imzxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=118: imzxxx#13 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=118: imzxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=118: imzxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=118: imzxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=118: imzxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=119: vxxxxx#13 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=119: vxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=119: vxxxxx#13 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=119: vxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=119: vxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=119: vxxxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=120: vxxxxx#13 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=120: vxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=120: vxxxxx#13 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=120: vxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=120: vxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=120: vxxxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=121: sxxxxx#13 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=121: sxxxxx#13 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // 
itest=121: sxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=121: sxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=121: sxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=121: sxxxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=122: sxxxxx#13 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=122: sxxxxx#13 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=122: sxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=122: sxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=122: sxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=122: sxxxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=123: oxxxxx#13 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=123: oxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=123: oxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=123: oxxxxx#13 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=123: oxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=123: oxxxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=124: oxxxxx#13 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=124: oxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=124: oxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=124: oxxxxx#13 nsp=-1 mass=0 + -3.162277660168379e+01, -0.000000000000000e+00, // itest=124: oxxxxx#13 nsp=-1 mass=0 + -0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=124: oxxxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=125: omzxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=125: omzxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=125: omzxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=125: omzxxx#13 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=125: omzxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=125: omzxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=126: ixxxxx#14 nsp=-1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=126: ixxxxx#14 nsp=-1 mass=0 + 1.341640786499874e+01, -1.788854381999832e+01, // itest=126: ixxxxx#14 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=126: ixxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=126: ixxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=126: ixxxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=127: ixxxxx#14 nsp=-1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=127: ixxxxx#14 nsp=-1 mass=0 + 1.341640786499874e+01, -1.788854381999832e+01, // itest=127: ixxxxx#14 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=127: ixxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=127: ixxxxx#14 nsp=-1 
mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=127: ixxxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=128: ixzxxx#14 nsp=-1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=128: ixzxxx#14 nsp=-1 mass=0 + 1.341640786499874e+01, -1.788854381999832e+01, // itest=128: ixzxxx#14 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=128: ixzxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=128: ixzxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=128: ixzxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=129: vxxxxx#14 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=129: vxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=129: vxxxxx#14 nsp=-1 mass=0 + -0.000000000000000e+00, 5.656854249492381e-01, // itest=129: vxxxxx#14 nsp=-1 mass=0 + -0.000000000000000e+00, -4.242640687119285e-01, // itest=129: vxxxxx#14 nsp=-1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=129: vxxxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=130: vxxxxx#14 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=130: vxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=130: vxxxxx#14 nsp=-1 mass=0 + -0.000000000000000e+00, 5.656854249492381e-01, // itest=130: vxxxxx#14 nsp=-1 mass=0 + -0.000000000000000e+00, -4.242640687119285e-01, // itest=130: vxxxxx#14 nsp=-1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=130: vxxxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=131: sxxxxx#14 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=131: sxxxxx#14 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=131: sxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=131: sxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=131: sxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=131: sxxxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=132: sxxxxx#14 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=132: sxxxxx#14 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=132: sxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=132: sxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=132: sxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=132: sxxxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=133: oxxxxx#14 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=133: oxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=133: oxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=133: oxxxxx#14 nsp=-1 mass=0 + 1.341640786499874e+01, 1.788854381999832e+01, // itest=133: oxxxxx#14 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=133: oxxxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=134: 
oxxxxx#14 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=134: oxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=134: oxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=134: oxxxxx#14 nsp=-1 mass=0 + 1.341640786499874e+01, 1.788854381999832e+01, // itest=134: oxxxxx#14 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=134: oxxxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=135: oxzxxx#14 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=135: oxzxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=135: oxzxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=135: oxzxxx#14 nsp=-1 mass=0 + 1.341640786499874e+01, 1.788854381999832e+01, // itest=135: oxzxxx#14 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=135: oxzxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=136: ixxxxx#15 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=136: ixxxxx#15 nsp=-1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=136: ixxxxx#15 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00, // itest=136: ixxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=136: ixxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=136: ixxxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=137: ixxxxx#15 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=137: ixxxxx#15 nsp=-1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=137: ixxxxx#15 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00, // itest=137: ixxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=137: ixxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=137: ixxxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=138: ixzxxx#15 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=138: ixzxxx#15 nsp=-1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=138: ixzxxx#15 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00, // itest=138: ixzxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=138: ixzxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=138: ixzxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=139: vxxxxx#15 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=139: vxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=139: vxxxxx#15 nsp=-1 mass=0 + -3.394112549695428e-01, 5.656854249492381e-01, // itest=139: vxxxxx#15 nsp=-1 mass=0 + -4.525483399593904e-01, -4.242640687119285e-01, // itest=139: vxxxxx#15 nsp=-1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=139: vxxxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=140: vxxxxx#15 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=140: vxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=140: vxxxxx#15 nsp=-1 mass=0 + 
-3.394112549695428e-01, 5.656854249492381e-01, // itest=140: vxxxxx#15 nsp=-1 mass=0 + -4.525483399593904e-01, -4.242640687119285e-01, // itest=140: vxxxxx#15 nsp=-1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=140: vxxxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=141: sxxxxx#15 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=141: sxxxxx#15 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=141: sxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=141: sxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=141: sxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=141: sxxxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=142: sxxxxx#15 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=142: sxxxxx#15 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=142: sxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=142: sxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=142: sxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=142: sxxxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=143: oxxxxx#15 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=143: oxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=143: oxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=143: oxxxxx#15 nsp=-1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00, // itest=143: oxxxxx#15 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00 } ); // itest=143: oxxxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=144: oxxxxx#15 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=144: oxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=144: oxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=144: oxxxxx#15 nsp=-1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00, // itest=144: oxxxxx#15 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00 } ); // itest=144: oxxxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=145: oxzxxx#15 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=145: oxzxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=145: oxzxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=145: oxzxxx#15 nsp=-1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00, // itest=145: oxzxxx#15 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00 } ); // itest=145: oxzxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=146: ixxxxx#0 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=146: ixxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=146: ixxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=146: ixxxxx#0 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=146: ixxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 
0.000000000000000e+00 } ); // itest=146: ixxxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=147: ixxxxx#0 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=147: ixxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=147: ixxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=147: ixxxxx#0 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=147: ixxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=147: ixxxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=148: ipzxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=148: ipzxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=148: ipzxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=148: ipzxxx#0 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=148: ipzxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=148: ipzxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=149: vxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=149: vxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=149: vxxxxx#0 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=149: vxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=149: vxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=149: vxxxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=150: vxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=150: vxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=150: vxxxxx#0 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=150: vxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=150: vxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=150: vxxxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=151: sxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=151: sxxxxx#0 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=151: sxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=151: sxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=151: sxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=151: sxxxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=152: sxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=152: sxxxxx#0 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=152: sxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=152: sxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=152: sxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=152: sxxxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=153: oxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=153: oxxxxx#0 nsp=1 mass=0 + 
3.162277660168379e+01, 0.000000000000000e+00, // itest=153: oxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=153: oxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=153: oxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=153: oxxxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=154: oxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=154: oxxxxx#0 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=154: oxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=154: oxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=154: oxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=154: oxxxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=155: opzxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=155: opzxxx#0 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=155: opzxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=155: opzxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=155: opzxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=155: opzxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=156: ixxxxx#1 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=156: ixxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=156: ixxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=156: ixxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=156: ixxxxx#1 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=156: ixxxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=157: ixxxxx#1 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=157: ixxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=157: ixxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=157: ixxxxx#1 nsp=1 mass=0 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=157: ixxxxx#1 nsp=1 mass=0 + -3.162277660168379e+01, -0.000000000000000e+00 } ); // itest=157: ixxxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=158: imzxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=158: imzxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=158: imzxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=158: imzxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=158: imzxxx#1 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=158: imzxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=159: vxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=159: vxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=159: vxxxxx#1 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=159: vxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=159: vxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 
0.000000000000000e+00 } ); // itest=159: vxxxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=160: vxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=160: vxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=160: vxxxxx#1 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=160: vxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=160: vxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=160: vxxxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=161: sxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=161: sxxxxx#1 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=161: sxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=161: sxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=161: sxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=161: sxxxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=162: sxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=162: sxxxxx#1 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=162: sxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=162: sxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=162: sxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=162: sxxxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=163: oxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=163: oxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=163: oxxxxx#1 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=163: oxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=163: oxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=163: oxxxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=164: oxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=164: oxxxxx#1 nsp=1 mass=0 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=164: oxxxxx#1 nsp=1 mass=0 + -3.162277660168379e+01, -0.000000000000000e+00, // itest=164: oxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=164: oxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=164: oxxxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=165: omzxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=165: omzxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=165: omzxxx#1 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=165: omzxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=165: omzxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=165: omzxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=166: ixxxxx#2 nsp=1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=166: ixxxxx#2 nsp=1 mass=0 + 
0.000000000000000e+00, 0.000000000000000e+00, // itest=166: ixxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=166: ixxxxx#2 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=166: ixxxxx#2 nsp=1 mass=0 + 1.341640786499874e+01, 1.788854381999832e+01 } ); // itest=166: ixxxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=167: ixxxxx#2 nsp=1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=167: ixxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=167: ixxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=167: ixxxxx#2 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=167: ixxxxx#2 nsp=1 mass=0 + 1.341640786499874e+01, 1.788854381999832e+01 } ); // itest=167: ixxxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=168: ixzxxx#2 nsp=1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=168: ixzxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=168: ixzxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=168: ixzxxx#2 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=168: ixzxxx#2 nsp=1 mass=0 + 1.341640786499874e+01, 1.788854381999832e+01 } ); // itest=168: ixzxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=169: vxxxxx#2 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=169: vxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=169: vxxxxx#2 nsp=1 mass=0 + -0.000000000000000e+00, -5.656854249492381e-01, // itest=169: vxxxxx#2 nsp=1 mass=0 + -0.000000000000000e+00, 4.242640687119285e-01, // itest=169: vxxxxx#2 nsp=1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=169: vxxxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=170: vxxxxx#2 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=170: vxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=170: vxxxxx#2 nsp=1 mass=0 + -0.000000000000000e+00, -5.656854249492381e-01, // itest=170: vxxxxx#2 nsp=1 mass=0 + -0.000000000000000e+00, 4.242640687119285e-01, // itest=170: vxxxxx#2 nsp=1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=170: vxxxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=171: sxxxxx#2 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=171: sxxxxx#2 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=171: sxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=171: sxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=171: sxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=171: sxxxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=172: sxxxxx#2 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=172: sxxxxx#2 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=172: sxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=172: sxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=172: sxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 
0.000000000000000e+00 } ); // itest=172: sxxxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=173: oxxxxx#2 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=173: oxxxxx#2 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=173: oxxxxx#2 nsp=1 mass=0 + 1.341640786499874e+01, -1.788854381999832e+01, // itest=173: oxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=173: oxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=173: oxxxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=174: oxxxxx#2 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=174: oxxxxx#2 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=174: oxxxxx#2 nsp=1 mass=0 + 1.341640786499874e+01, -1.788854381999832e+01, // itest=174: oxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=174: oxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=174: oxxxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=175: oxzxxx#2 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=175: oxzxxx#2 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=175: oxzxxx#2 nsp=1 mass=0 + 1.341640786499874e+01, -1.788854381999832e+01, // itest=175: oxzxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=175: oxzxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=175: oxzxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=176: ixxxxx#3 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=176: ixxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=176: ixxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=176: ixxxxx#3 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=176: ixxxxx#3 nsp=1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=176: ixxxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=177: ixxxxx#3 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=177: ixxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=177: ixxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=177: ixxxxx#3 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=177: ixxxxx#3 nsp=1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=177: ixxxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=178: ixzxxx#3 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=178: ixzxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=178: ixzxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=178: ixzxxx#3 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=178: ixzxxx#3 nsp=1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=178: ixzxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=179: vxxxxx#3 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=179: vxxxxx#3 nsp=1 mass=0 + 
0.000000000000000e+00, 0.000000000000000e+00, // itest=179: vxxxxx#3 nsp=1 mass=0 + -3.394112549695428e-01, -5.656854249492381e-01, // itest=179: vxxxxx#3 nsp=1 mass=0 + -4.525483399593904e-01, 4.242640687119285e-01, // itest=179: vxxxxx#3 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=179: vxxxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=180: vxxxxx#3 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=180: vxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=180: vxxxxx#3 nsp=1 mass=0 + -3.394112549695428e-01, -5.656854249492381e-01, // itest=180: vxxxxx#3 nsp=1 mass=0 + -4.525483399593904e-01, 4.242640687119285e-01, // itest=180: vxxxxx#3 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=180: vxxxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=181: sxxxxx#3 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=181: sxxxxx#3 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=181: sxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=181: sxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=181: sxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=181: sxxxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=182: sxxxxx#3 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=182: sxxxxx#3 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=182: sxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=182: sxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=182: sxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=182: sxxxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=183: oxxxxx#3 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=183: oxxxxx#3 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=183: oxxxxx#3 nsp=1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=183: oxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=183: oxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=183: oxxxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=184: oxxxxx#3 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=184: oxxxxx#3 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=184: oxxxxx#3 nsp=1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=184: oxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=184: oxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=184: oxxxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=185: oxzxxx#3 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=185: oxzxxx#3 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=185: oxzxxx#3 nsp=1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=185: oxzxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=185: oxzxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 
0.000000000000000e+00 } ); // itest=185: oxzxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=186: ixxxxx#4 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=186: ixxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=186: ixxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=186: ixxxxx#4 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=186: ixxxxx#4 nsp=1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01 } ); // itest=186: ixxxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=187: ixxxxx#4 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=187: ixxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=187: ixxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=187: ixxxxx#4 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=187: ixxxxx#4 nsp=1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01 } ); // itest=187: ixxxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=188: ixzxxx#4 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=188: ixzxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=188: ixzxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=188: ixzxxx#4 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=188: ixzxxx#4 nsp=1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01 } ); // itest=188: ixzxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=189: vxxxxx#4 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=189: vxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=189: vxxxxx#4 nsp=1 mass=0 + 3.394112549695428e-01, -5.656854249492381e-01, // itest=189: vxxxxx#4 nsp=1 mass=0 + 4.525483399593904e-01, 4.242640687119285e-01, // itest=189: vxxxxx#4 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=189: vxxxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=190: vxxxxx#4 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=190: vxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=190: vxxxxx#4 nsp=1 mass=0 + 3.394112549695428e-01, -5.656854249492381e-01, // itest=190: vxxxxx#4 nsp=1 mass=0 + 4.525483399593904e-01, 4.242640687119285e-01, // itest=190: vxxxxx#4 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=190: vxxxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=191: sxxxxx#4 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=191: sxxxxx#4 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=191: sxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=191: sxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=191: sxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=191: sxxxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=192: sxxxxx#4 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=192: sxxxxx#4 nsp=1 mass=0 + 
1.000000000000000e+00, 0.000000000000000e+00, // itest=192: sxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=192: sxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=192: sxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=192: sxxxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=193: oxxxxx#4 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=193: oxxxxx#4 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=193: oxxxxx#4 nsp=1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=193: oxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=193: oxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=193: oxxxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=194: oxxxxx#4 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=194: oxxxxx#4 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=194: oxxxxx#4 nsp=1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=194: oxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=194: oxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=194: oxxxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=195: oxzxxx#4 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=195: oxzxxx#4 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=195: oxzxxx#4 nsp=1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=195: oxzxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=195: oxzxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=195: oxzxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=196: ixxxxx#5 nsp=1 mass=500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=196: ixxxxx#5 nsp=1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=196: ixxxxx#5 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=196: ixxxxx#5 nsp=1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=196: ixxxxx#5 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=196: ixxxxx#5 nsp=1 mass=500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=197: ixxxxx#5 nsp=1 mass=-500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=197: ixxxxx#5 nsp=1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=197: ixxxxx#5 nsp=1 mass=-500 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=197: ixxxxx#5 nsp=1 mass=-500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=197: ixxxxx#5 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=197: ixxxxx#5 nsp=1 mass=-500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=198: vxxxxx#5 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=198: vxxxxx#5 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=198: vxxxxx#5 nsp=1 mass=500 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=198: vxxxxx#5 nsp=1 mass=500 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=198: vxxxxx#5 
nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=198: vxxxxx#5 nsp=1 mass=500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=199: vxxxxx#5 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=199: vxxxxx#5 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=199: vxxxxx#5 nsp=1 mass=-500 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=199: vxxxxx#5 nsp=1 mass=-500 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=199: vxxxxx#5 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=199: vxxxxx#5 nsp=1 mass=-500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=200: sxxxxx#5 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=200: sxxxxx#5 nsp=1 mass=500 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=200: sxxxxx#5 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=200: sxxxxx#5 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=200: sxxxxx#5 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=200: sxxxxx#5 nsp=1 mass=500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=201: sxxxxx#5 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=201: sxxxxx#5 nsp=1 mass=-500 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=201: sxxxxx#5 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=201: sxxxxx#5 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=201: sxxxxx#5 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=201: sxxxxx#5 nsp=1 mass=-500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=202: oxxxxx#5 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=202: oxxxxx#5 nsp=1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=202: oxxxxx#5 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=202: oxxxxx#5 nsp=1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=202: oxxxxx#5 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=202: oxxxxx#5 nsp=1 mass=500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=203: oxxxxx#5 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=203: oxxxxx#5 nsp=1 mass=-500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=203: oxxxxx#5 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=203: oxxxxx#5 nsp=1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=203: oxxxxx#5 nsp=1 mass=-500 + -0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=203: oxxxxx#5 nsp=1 mass=-500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=204: ixxxxx#6 nsp=1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=204: ixxxxx#6 nsp=1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=204: ixxxxx#6 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=204: ixxxxx#6 nsp=1 mass=400 + 2.828427124746190e+01, 0.000000000000000e+00, // itest=204: ixxxxx#6 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=204: ixxxxx#6 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // 
itest=205: ixxxxx#6 nsp=1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=205: ixxxxx#6 nsp=1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=205: ixxxxx#6 nsp=1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=205: ixxxxx#6 nsp=1 mass=-400 + 2.828427124746190e+01, 0.000000000000000e+00, // itest=205: ixxxxx#6 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=205: ixxxxx#6 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=206: vxxxxx#6 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=206: vxxxxx#6 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=206: vxxxxx#6 nsp=1 mass=400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=206: vxxxxx#6 nsp=1 mass=400 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=206: vxxxxx#6 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=206: vxxxxx#6 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=207: vxxxxx#6 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=207: vxxxxx#6 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=207: vxxxxx#6 nsp=1 mass=-400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=207: vxxxxx#6 nsp=1 mass=-400 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=207: vxxxxx#6 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=207: vxxxxx#6 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=208: sxxxxx#6 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=208: sxxxxx#6 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=208: sxxxxx#6 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=208: sxxxxx#6 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=208: sxxxxx#6 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=208: sxxxxx#6 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=209: sxxxxx#6 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=209: sxxxxx#6 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=209: sxxxxx#6 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=209: sxxxxx#6 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=209: sxxxxx#6 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=209: sxxxxx#6 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=210: oxxxxx#6 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=210: oxxxxx#6 nsp=1 mass=400 + 2.828427124746190e+01, 0.000000000000000e+00, // itest=210: oxxxxx#6 nsp=1 mass=400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=210: oxxxxx#6 nsp=1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=210: oxxxxx#6 nsp=1 mass=400 + 0.000000000000000e+00, -0.000000000000000e+00 } ); // itest=210: oxxxxx#6 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=211: oxxxxx#6 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=211: oxxxxx#6 nsp=1 mass=-400 + 2.828427124746190e+01, 0.000000000000000e+00, // itest=211: oxxxxx#6 
nsp=1 mass=-400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=211: oxxxxx#6 nsp=1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=211: oxxxxx#6 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=211: oxxxxx#6 nsp=1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=212: ixxxxx#7 nsp=1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=212: ixxxxx#7 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=212: ixxxxx#7 nsp=1 mass=400 + -1.414213562373095e+01, 0.000000000000000e+00, // itest=212: ixxxxx#7 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=212: ixxxxx#7 nsp=1 mass=400 + -2.828427124746190e+01, 0.000000000000000e+00 } ); // itest=212: ixxxxx#7 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=213: ixxxxx#7 nsp=1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=213: ixxxxx#7 nsp=1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=213: ixxxxx#7 nsp=1 mass=-400 + 1.414213562373095e+01, -0.000000000000000e+00, // itest=213: ixxxxx#7 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=213: ixxxxx#7 nsp=1 mass=-400 + -2.828427124746190e+01, 0.000000000000000e+00 } ); // itest=213: ixxxxx#7 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=214: vxxxxx#7 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=214: vxxxxx#7 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=214: vxxxxx#7 nsp=1 mass=400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=214: vxxxxx#7 nsp=1 mass=400 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=214: vxxxxx#7 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=214: vxxxxx#7 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=215: vxxxxx#7 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=215: vxxxxx#7 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=215: vxxxxx#7 nsp=1 mass=-400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=215: vxxxxx#7 nsp=1 mass=-400 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=215: vxxxxx#7 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=215: vxxxxx#7 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=216: sxxxxx#7 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=216: sxxxxx#7 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=216: sxxxxx#7 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=216: sxxxxx#7 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=216: sxxxxx#7 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=216: sxxxxx#7 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=217: sxxxxx#7 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=217: sxxxxx#7 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=217: sxxxxx#7 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=217: sxxxxx#7 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=217: sxxxxx#7 nsp=1 
mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=217: sxxxxx#7 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=218: oxxxxx#7 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=218: oxxxxx#7 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=218: oxxxxx#7 nsp=1 mass=400 + -2.828427124746190e+01, 0.000000000000000e+00, // itest=218: oxxxxx#7 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=218: oxxxxx#7 nsp=1 mass=400 + -1.414213562373095e+01, 0.000000000000000e+00 } ); // itest=218: oxxxxx#7 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=219: oxxxxx#7 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=219: oxxxxx#7 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=219: oxxxxx#7 nsp=1 mass=-400 + -2.828427124746190e+01, 0.000000000000000e+00, // itest=219: oxxxxx#7 nsp=1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=219: oxxxxx#7 nsp=1 mass=-400 + 1.414213562373095e+01, -0.000000000000000e+00 } ); // itest=219: oxxxxx#7 nsp=1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=220: ixxxxx#8 nsp=1 mass=400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=220: ixxxxx#8 nsp=1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=220: ixxxxx#8 nsp=1 mass=400 + 5.999999999999999e+00, 7.999999999999999e+00, // itest=220: ixxxxx#8 nsp=1 mass=400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=220: ixxxxx#8 nsp=1 mass=400 + 1.200000000000000e+01, 1.600000000000000e+01 } ); // itest=220: ixxxxx#8 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=221: ixxxxx#8 nsp=1 mass=-400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=221: ixxxxx#8 nsp=1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=221: ixxxxx#8 nsp=1 mass=-400 + -5.999999999999999e+00, -7.999999999999999e+00, // itest=221: ixxxxx#8 nsp=1 mass=-400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=221: ixxxxx#8 nsp=1 mass=-400 + 1.200000000000000e+01, 1.600000000000000e+01 } ); // itest=221: ixxxxx#8 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=222: vxxxxx#8 nsp=1 mass=400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=222: vxxxxx#8 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=222: vxxxxx#8 nsp=1 mass=400 + 0.000000000000000e+00, -5.656854249492381e-01, // itest=222: vxxxxx#8 nsp=1 mass=400 + 0.000000000000000e+00, 4.242640687119285e-01, // itest=222: vxxxxx#8 nsp=1 mass=400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=222: vxxxxx#8 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=223: vxxxxx#8 nsp=1 mass=-400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=223: vxxxxx#8 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=223: vxxxxx#8 nsp=1 mass=-400 + -0.000000000000000e+00, -5.656854249492381e-01, // itest=223: vxxxxx#8 nsp=1 mass=-400 + -0.000000000000000e+00, 4.242640687119285e-01, // itest=223: vxxxxx#8 nsp=1 mass=-400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=223: vxxxxx#8 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 
0.000000000000000e+00, // itest=224: sxxxxx#8 nsp=1 mass=400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=224: sxxxxx#8 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=224: sxxxxx#8 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=224: sxxxxx#8 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=224: sxxxxx#8 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=224: sxxxxx#8 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=225: sxxxxx#8 nsp=1 mass=-400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=225: sxxxxx#8 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=225: sxxxxx#8 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=225: sxxxxx#8 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=225: sxxxxx#8 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=225: sxxxxx#8 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=226: oxxxxx#8 nsp=1 mass=400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=226: oxxxxx#8 nsp=1 mass=400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=226: oxxxxx#8 nsp=1 mass=400 + 1.200000000000000e+01, -1.600000000000000e+01, // itest=226: oxxxxx#8 nsp=1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=226: oxxxxx#8 nsp=1 mass=400 + 5.999999999999999e+00, -7.999999999999999e+00 } ); // itest=226: oxxxxx#8 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=227: oxxxxx#8 nsp=1 mass=-400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=227: oxxxxx#8 nsp=1 mass=-400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=227: oxxxxx#8 nsp=1 mass=-400 + 1.200000000000000e+01, -1.600000000000000e+01, // itest=227: oxxxxx#8 nsp=1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=227: oxxxxx#8 nsp=1 mass=-400 + -5.999999999999999e+00, 7.999999999999999e+00 } ); // itest=227: oxxxxx#8 nsp=1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=228: ixxxxx#9 nsp=1 mass=400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=228: ixxxxx#9 nsp=1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=228: ixxxxx#9 nsp=1 mass=400 + -7.999999999999999e+00, -5.999999999999999e+00, // itest=228: ixxxxx#9 nsp=1 mass=400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=228: ixxxxx#9 nsp=1 mass=400 + -1.600000000000000e+01, -1.200000000000000e+01 } ); // itest=228: ixxxxx#9 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=229: ixxxxx#9 nsp=1 mass=-400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=229: ixxxxx#9 nsp=1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=229: ixxxxx#9 nsp=1 mass=-400 + 7.999999999999999e+00, 5.999999999999999e+00, // itest=229: ixxxxx#9 nsp=1 mass=-400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=229: ixxxxx#9 nsp=1 mass=-400 + -1.600000000000000e+01, -1.200000000000000e+01 } ); // itest=229: ixxxxx#9 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=230: vxxxxx#9 nsp=1 mass=400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=230: vxxxxx#9 nsp=1 mass=400 + 0.000000000000000e+00, 
0.000000000000000e+00, // itest=230: vxxxxx#9 nsp=1 mass=400 + 0.000000000000000e+00, 4.242640687119285e-01, // itest=230: vxxxxx#9 nsp=1 mass=400 + 0.000000000000000e+00, -5.656854249492381e-01, // itest=230: vxxxxx#9 nsp=1 mass=400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=230: vxxxxx#9 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=231: vxxxxx#9 nsp=1 mass=-400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=231: vxxxxx#9 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=231: vxxxxx#9 nsp=1 mass=-400 + 0.000000000000000e+00, 4.242640687119285e-01, // itest=231: vxxxxx#9 nsp=1 mass=-400 + 0.000000000000000e+00, -5.656854249492381e-01, // itest=231: vxxxxx#9 nsp=1 mass=-400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=231: vxxxxx#9 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=232: sxxxxx#9 nsp=1 mass=400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=232: sxxxxx#9 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=232: sxxxxx#9 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=232: sxxxxx#9 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=232: sxxxxx#9 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=232: sxxxxx#9 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=233: sxxxxx#9 nsp=1 mass=-400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=233: sxxxxx#9 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=233: sxxxxx#9 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=233: sxxxxx#9 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=233: sxxxxx#9 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=233: sxxxxx#9 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=234: oxxxxx#9 nsp=1 mass=400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=234: oxxxxx#9 nsp=1 mass=400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=234: oxxxxx#9 nsp=1 mass=400 + -1.600000000000000e+01, 1.200000000000000e+01, // itest=234: oxxxxx#9 nsp=1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=234: oxxxxx#9 nsp=1 mass=400 + -7.999999999999999e+00, 5.999999999999999e+00 } ); // itest=234: oxxxxx#9 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=235: oxxxxx#9 nsp=1 mass=-400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=235: oxxxxx#9 nsp=1 mass=-400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=235: oxxxxx#9 nsp=1 mass=-400 + -1.600000000000000e+01, 1.200000000000000e+01, // itest=235: oxxxxx#9 nsp=1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=235: oxxxxx#9 nsp=1 mass=-400 + 7.999999999999999e+00, -5.999999999999999e+00 } ); // itest=235: oxxxxx#9 nsp=1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=236: ixxxxx#10 nsp=1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=236: ixxxxx#10 nsp=1 mass=400 + 1.216552506059644e+01, 0.000000000000000e+00, // itest=236: ixxxxx#10 nsp=1 mass=400 + 4.931969619160719e+00, 5.260767593771432e+00, // itest=236: ixxxxx#10 nsp=1 mass=400 + 2.433105012119288e+01, 
0.000000000000000e+00, // itest=236: ixxxxx#10 nsp=1 mass=400 + 9.863939238321439e+00, 1.052153518754287e+01 } ); // itest=236: ixxxxx#10 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=237: ixxxxx#10 nsp=1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=237: ixxxxx#10 nsp=1 mass=-400 + -1.216552506059644e+01, -0.000000000000000e+00, // itest=237: ixxxxx#10 nsp=1 mass=-400 + -4.931969619160719e+00, -5.260767593771432e+00, // itest=237: ixxxxx#10 nsp=1 mass=-400 + 2.433105012119288e+01, 0.000000000000000e+00, // itest=237: ixxxxx#10 nsp=1 mass=-400 + 9.863939238321439e+00, 1.052153518754287e+01 } ); // itest=237: ixxxxx#10 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=238: vxxxxx#10 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=238: vxxxxx#10 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=238: vxxxxx#10 nsp=1 mass=400 + -2.321373168788980e-01, -5.158607041753289e-01, // itest=238: vxxxxx#10 nsp=1 mass=400 + -2.476131380041579e-01, 4.836194101643708e-01, // itest=238: vxxxxx#10 nsp=1 mass=400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=238: vxxxxx#10 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=239: vxxxxx#10 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=239: vxxxxx#10 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=239: vxxxxx#10 nsp=1 mass=-400 + -2.321373168788980e-01, -5.158607041753289e-01, // itest=239: vxxxxx#10 nsp=1 mass=-400 + -2.476131380041579e-01, 4.836194101643708e-01, // itest=239: vxxxxx#10 nsp=1 mass=-400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=239: vxxxxx#10 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=240: sxxxxx#10 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=240: sxxxxx#10 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=240: sxxxxx#10 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=240: sxxxxx#10 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=240: sxxxxx#10 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=240: sxxxxx#10 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=241: sxxxxx#10 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=241: sxxxxx#10 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=241: sxxxxx#10 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=241: sxxxxx#10 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=241: sxxxxx#10 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=241: sxxxxx#10 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=242: oxxxxx#10 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=242: oxxxxx#10 nsp=1 mass=400 + 2.433105012119288e+01, 0.000000000000000e+00, // itest=242: oxxxxx#10 nsp=1 mass=400 + 9.863939238321439e+00, -1.052153518754287e+01, // itest=242: oxxxxx#10 nsp=1 mass=400 + 1.216552506059644e+01, 0.000000000000000e+00, // itest=242: oxxxxx#10 nsp=1 mass=400 + 4.931969619160719e+00, -5.260767593771432e+00 } ); // itest=242: oxxxxx#10 nsp=1 mass=400 
+ expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=243: oxxxxx#10 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=243: oxxxxx#10 nsp=1 mass=-400 + 2.433105012119288e+01, 0.000000000000000e+00, // itest=243: oxxxxx#10 nsp=1 mass=-400 + 9.863939238321439e+00, -1.052153518754287e+01, // itest=243: oxxxxx#10 nsp=1 mass=-400 + -1.216552506059644e+01, -0.000000000000000e+00, // itest=243: oxxxxx#10 nsp=1 mass=-400 + -4.931969619160719e+00, 5.260767593771432e+00 } ); // itest=243: oxxxxx#10 nsp=1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=244: ixxxxx#11 nsp=1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=244: ixxxxx#11 nsp=1 mass=400 + 7.211102550927978e+00, 0.000000000000000e+00, // itest=244: ixxxxx#11 nsp=1 mass=400 + 8.320502943378436e+00, 8.875203139603666e+00, // itest=244: ixxxxx#11 nsp=1 mass=400 + 1.442220510185596e+01, 0.000000000000000e+00, // itest=244: ixxxxx#11 nsp=1 mass=400 + 1.664100588675688e+01, 1.775040627920733e+01 } ); // itest=244: ixxxxx#11 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=245: ixxxxx#11 nsp=1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=245: ixxxxx#11 nsp=1 mass=-400 + -7.211102550927978e+00, -0.000000000000000e+00, // itest=245: ixxxxx#11 nsp=1 mass=-400 + -8.320502943378436e+00, -8.875203139603666e+00, // itest=245: ixxxxx#11 nsp=1 mass=-400 + 1.442220510185596e+01, 0.000000000000000e+00, // itest=245: ixxxxx#11 nsp=1 mass=-400 + 1.664100588675688e+01, 1.775040627920733e+01 } ); // itest=245: ixxxxx#11 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=246: vxxxxx#11 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=246: vxxxxx#11 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=246: vxxxxx#11 nsp=1 mass=400 + 2.321373168788980e-01, -5.158607041753289e-01, // itest=246: vxxxxx#11 nsp=1 mass=400 + 2.476131380041579e-01, 4.836194101643708e-01, // itest=246: vxxxxx#11 nsp=1 mass=400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=246: vxxxxx#11 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=247: vxxxxx#11 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=247: vxxxxx#11 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=247: vxxxxx#11 nsp=1 mass=-400 + 2.321373168788980e-01, -5.158607041753289e-01, // itest=247: vxxxxx#11 nsp=1 mass=-400 + 2.476131380041579e-01, 4.836194101643708e-01, // itest=247: vxxxxx#11 nsp=1 mass=-400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=247: vxxxxx#11 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=248: sxxxxx#11 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=248: sxxxxx#11 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=248: sxxxxx#11 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=248: sxxxxx#11 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=248: sxxxxx#11 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=248: sxxxxx#11 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=249: sxxxxx#11 nsp=1 mass=-400 + 1.800000000000000e+02, 
1.920000000000000e+02, // itest=249: sxxxxx#11 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=249: sxxxxx#11 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=249: sxxxxx#11 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=249: sxxxxx#11 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=249: sxxxxx#11 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=250: oxxxxx#11 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=250: oxxxxx#11 nsp=1 mass=400 + 1.442220510185596e+01, 0.000000000000000e+00, // itest=250: oxxxxx#11 nsp=1 mass=400 + 1.664100588675688e+01, -1.775040627920733e+01, // itest=250: oxxxxx#11 nsp=1 mass=400 + 7.211102550927978e+00, 0.000000000000000e+00, // itest=250: oxxxxx#11 nsp=1 mass=400 + 8.320502943378436e+00, -8.875203139603666e+00 } ); // itest=250: oxxxxx#11 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=251: oxxxxx#11 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=251: oxxxxx#11 nsp=1 mass=-400 + 1.442220510185596e+01, 0.000000000000000e+00, // itest=251: oxxxxx#11 nsp=1 mass=-400 + 1.664100588675688e+01, -1.775040627920733e+01, // itest=251: oxxxxx#11 nsp=1 mass=-400 + -7.211102550927978e+00, -0.000000000000000e+00, // itest=251: oxxxxx#11 nsp=1 mass=-400 + -8.320502943378436e+00, 8.875203139603666e+00 } ); // itest=251: oxxxxx#11 nsp=1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=252: ixxxxx#12 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=252: ixxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=252: ixxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=252: ixxxxx#12 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=252: ixxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=252: ixxxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=253: ixxxxx#12 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=253: ixxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=253: ixxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=253: ixxxxx#12 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=253: ixxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=253: ixxxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=254: ipzxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=254: ipzxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=254: ipzxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=254: ipzxxx#12 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=254: ipzxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=254: ipzxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=255: vxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=255: vxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=255: vxxxxx#12 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // 
itest=255: vxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=255: vxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=255: vxxxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=256: vxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=256: vxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=256: vxxxxx#12 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=256: vxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=256: vxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=256: vxxxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=257: sxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=257: sxxxxx#12 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=257: sxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=257: sxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=257: sxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=257: sxxxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=258: sxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=258: sxxxxx#12 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=258: sxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=258: sxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=258: sxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=258: sxxxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=259: oxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=259: oxxxxx#12 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=259: oxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=259: oxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=259: oxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=259: oxxxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=260: oxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=260: oxxxxx#12 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=260: oxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=260: oxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=260: oxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=260: oxxxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=261: opzxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=261: opzxxx#12 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=261: opzxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=261: opzxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=261: opzxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=261: opzxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + 
-5.000000000000000e+02, 5.000000000000000e+02, // itest=262: ixxxxx#13 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=262: ixxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=262: ixxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=262: ixxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=262: ixxxxx#13 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=262: ixxxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=263: ixxxxx#13 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=263: ixxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=263: ixxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=263: ixxxxx#13 nsp=1 mass=0 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=263: ixxxxx#13 nsp=1 mass=0 + -3.162277660168379e+01, -0.000000000000000e+00 } ); // itest=263: ixxxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=264: imzxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=264: imzxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=264: imzxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=264: imzxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=264: imzxxx#13 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=264: imzxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=265: vxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=265: vxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=265: vxxxxx#13 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=265: vxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=265: vxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=265: vxxxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=266: vxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=266: vxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=266: vxxxxx#13 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=266: vxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=266: vxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=266: vxxxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=267: sxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=267: sxxxxx#13 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=267: sxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=267: sxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=267: sxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=267: sxxxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=268: sxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=268: sxxxxx#13 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=268: sxxxxx#13 
nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=268: sxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=268: sxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=268: sxxxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=269: oxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=269: oxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=269: oxxxxx#13 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=269: oxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=269: oxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=269: oxxxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=270: oxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=270: oxxxxx#13 nsp=1 mass=0 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=270: oxxxxx#13 nsp=1 mass=0 + -3.162277660168379e+01, -0.000000000000000e+00, // itest=270: oxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=270: oxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=270: oxxxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=271: omzxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=271: omzxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=271: omzxxx#13 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=271: omzxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=271: omzxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=271: omzxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=272: ixxxxx#14 nsp=1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=272: ixxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=272: ixxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=272: ixxxxx#14 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=272: ixxxxx#14 nsp=1 mass=0 + 1.341640786499874e+01, 1.788854381999832e+01 } ); // itest=272: ixxxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=273: ixxxxx#14 nsp=1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=273: ixxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=273: ixxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=273: ixxxxx#14 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=273: ixxxxx#14 nsp=1 mass=0 + 1.341640786499874e+01, 1.788854381999832e+01 } ); // itest=273: ixxxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=274: ixzxxx#14 nsp=1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=274: ixzxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=274: ixzxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=274: ixzxxx#14 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=274: ixzxxx#14 nsp=1 mass=0 + 1.341640786499874e+01, 1.788854381999832e+01 } ); // 
itest=274: ixzxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=275: vxxxxx#14 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=275: vxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=275: vxxxxx#14 nsp=1 mass=0 + -0.000000000000000e+00, -5.656854249492381e-01, // itest=275: vxxxxx#14 nsp=1 mass=0 + -0.000000000000000e+00, 4.242640687119285e-01, // itest=275: vxxxxx#14 nsp=1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=275: vxxxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=276: vxxxxx#14 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=276: vxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=276: vxxxxx#14 nsp=1 mass=0 + -0.000000000000000e+00, -5.656854249492381e-01, // itest=276: vxxxxx#14 nsp=1 mass=0 + -0.000000000000000e+00, 4.242640687119285e-01, // itest=276: vxxxxx#14 nsp=1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=276: vxxxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=277: sxxxxx#14 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=277: sxxxxx#14 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=277: sxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=277: sxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=277: sxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=277: sxxxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=278: sxxxxx#14 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=278: sxxxxx#14 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=278: sxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=278: sxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=278: sxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=278: sxxxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=279: oxxxxx#14 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=279: oxxxxx#14 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=279: oxxxxx#14 nsp=1 mass=0 + 1.341640786499874e+01, -1.788854381999832e+01, // itest=279: oxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=279: oxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=279: oxxxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=280: oxxxxx#14 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=280: oxxxxx#14 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=280: oxxxxx#14 nsp=1 mass=0 + 1.341640786499874e+01, -1.788854381999832e+01, // itest=280: oxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=280: oxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=280: oxxxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=281: oxzxxx#14 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=281: oxzxxx#14 nsp=1 mass=0 + 
2.236067977499790e+01, 0.000000000000000e+00, // itest=281: oxzxxx#14 nsp=1 mass=0 + 1.341640786499874e+01, -1.788854381999832e+01, // itest=281: oxzxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=281: oxzxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=281: oxzxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=282: ixxxxx#15 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=282: ixxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=282: ixxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=282: ixxxxx#15 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=282: ixxxxx#15 nsp=1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=282: ixxxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=283: ixxxxx#15 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=283: ixxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=283: ixxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=283: ixxxxx#15 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=283: ixxxxx#15 nsp=1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=283: ixxxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=284: ixzxxx#15 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=284: ixzxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=284: ixzxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=284: ixzxxx#15 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=284: ixzxxx#15 nsp=1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=284: ixzxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=285: vxxxxx#15 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=285: vxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=285: vxxxxx#15 nsp=1 mass=0 + -3.394112549695428e-01, -5.656854249492381e-01, // itest=285: vxxxxx#15 nsp=1 mass=0 + -4.525483399593904e-01, 4.242640687119285e-01, // itest=285: vxxxxx#15 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=285: vxxxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=286: vxxxxx#15 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=286: vxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=286: vxxxxx#15 nsp=1 mass=0 + -3.394112549695428e-01, -5.656854249492381e-01, // itest=286: vxxxxx#15 nsp=1 mass=0 + -4.525483399593904e-01, 4.242640687119285e-01, // itest=286: vxxxxx#15 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=286: vxxxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=287: sxxxxx#15 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=287: sxxxxx#15 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=287: sxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=287: sxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=287: sxxxxx#15 
nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=287: sxxxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=288: sxxxxx#15 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=288: sxxxxx#15 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=288: sxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=288: sxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=288: sxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=288: sxxxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=289: oxxxxx#15 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=289: oxxxxx#15 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=289: oxxxxx#15 nsp=1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=289: oxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=289: oxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=289: oxxxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=290: oxxxxx#15 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=290: oxxxxx#15 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=290: oxxxxx#15 nsp=1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=290: oxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=290: oxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=290: oxxxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=291: oxzxxx#15 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=291: oxzxxx#15 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=291: oxzxxx#15 nsp=1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=291: oxzxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=291: oxzxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=291: oxzxxx#15 nsp=1 mass=0 diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/timer.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/timer.h new file mode 100644 index 0000000000..14d7a4d892 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/timer.h @@ -0,0 +1,67 @@ +#ifndef MGONGPUTIMER_H +#define MGONGPUTIMER_H 1 + +#include +#include + +namespace mgOnGpu +{ + + /* + high_resolution_clock + steady_clock + system_clock + + from https://www.modernescpp.com/index.php/the-three-clocks + and https://codereview.stackexchange.com/questions/196245/extremely-simple-timer-class-in-c + */ + + template + class Timer + { + public: + Timer() + : m_StartTime( T::now() ) {} + virtual ~Timer() {} + void Start(); + float GetDuration(); + void Info(); + private: + typedef typename T::time_point TTP; + TTP m_StartTime; + }; + + template + void + Timer::Start() + { + m_StartTime = T::now(); + } + + template + float + Timer::GetDuration() + { + std::chrono::duration duration = T::now() - m_StartTime; + return duration.count(); + } + + template + void + Timer::Info() + { + typedef typename T::period TPER; + typedef typename std::ratio_multiply MilliSec; + typedef typename std::ratio_multiply MicroSec; + std::cout << std::boolalpha << std::endl; + std::cout << 
"clock info: " << std::endl; + std::cout << " is steady: " << T::is_steady << std::endl; + std::cout << " precision: " << TPER::num << "/" << TPER::den << " second " << std::endl; + std::cout << std::fixed; + std::cout << " " << static_cast( MilliSec::num ) / MilliSec::den << " milliseconds " << std::endl; + std::cout << " " << static_cast( MicroSec::num ) / MicroSec::den << " microseconds " << std::endl; + std::cout << std::endl; + } + +} +#endif // MGONGPUTIMER_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/timermap.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/timermap.h new file mode 100644 index 0000000000..60d8c51021 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/timermap.h @@ -0,0 +1,156 @@ +#ifndef MGONGPUTIMERMAP_H +#define MGONGPUTIMERMAP_H 1 + +#include +#include +#include +#include +#include + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +#include "nvtx.h" +#pragma GCC diagnostic pop + +#include "timer.h" +#define TIMERTYPE std::chrono::high_resolution_clock + +namespace mgOnGpu +{ + class TimerMap + { + + public: + + TimerMap() + : m_timer(), m_active( "" ), m_partitionTimers(), m_partitionIds() {} + virtual ~TimerMap() {} + + // Start the timer for a specific partition (key must be a non-empty string) + // Stop the timer for the current partition if there is one active + float start( const std::string& key ) + { + assert( key != "" ); + // Close the previously active partition + float last = stop(); + // Switch to a new partition + m_timer.Start(); + m_active = key; + if( m_partitionTimers.find( key ) == m_partitionTimers.end() ) + { + m_partitionIds[key] = m_partitionTimers.size(); + m_partitionTimers[key] = 0; + } + // Open a new Cuda NVTX range + NVTX_PUSH( key.c_str(), m_partitionIds[key] ); + // Return last duration + return last; + } + + // Stop the timer for the current partition if there is one active + float stop() + { + // Close the previously active partition + float last = 0; + if( m_active != "" ) + { + last = m_timer.GetDuration(); + m_partitionTimers[m_active] += last; + } + m_active = ""; + // Close the current Cuda NVTX range + NVTX_POP(); + // Return last duration + return last; + } + + // Dump the overall results + void dump( std::ostream& ostr = std::cout, bool json = false ) + { + // Improve key formatting + const std::string totalKey = "TOTAL "; // "TOTAL (ANY)"? 
+ //const std::string totalBut2Key = "TOTAL (n-2)"; + const std::string total123Key = "TOTAL (123)"; + const std::string total23Key = "TOTAL (23)"; + const std::string total1Key = "TOTAL (1)"; + const std::string total2Key = "TOTAL (2)"; + const std::string total3Key = "TOTAL (3)"; + const std::string total3aKey = "TOTAL (3a)"; + size_t maxsize = 0; + for( auto ip: m_partitionTimers ) + maxsize = std::max( maxsize, ip.first.size() ); + maxsize = std::max( maxsize, totalKey.size() ); + // Compute the overall total + //size_t ipart = 0; + float total = 0; + //float totalBut2 = 0; + float total123 = 0; + float total23 = 0; + float total1 = 0; + float total2 = 0; + float total3 = 0; + float total3a = 0; + for( auto ip: m_partitionTimers ) + { + total += ip.second; + //if ( ipart != 0 && ipart+1 != m_partitionTimers.size() ) totalBut2 += ip.second; + if( ip.first[0] == '1' || ip.first[0] == '2' || ip.first[0] == '3' ) total123 += ip.second; + if( ip.first[0] == '2' || ip.first[0] == '3' ) total23 += ip.second; + if( ip.first[0] == '1' ) total1 += ip.second; + if( ip.first[0] == '2' ) total2 += ip.second; + if( ip.first[0] == '3' ) total3 += ip.second; + if( ip.first[0] == '3' && ip.first[1] == 'a' ) total3a += ip.second; + //ipart++; + } + // Dump individual partition timers and the overall total + if( json ) + { + std::string s1 = "\"", s2 = "\" : \"", s3 = " sec\","; + ostr << std::setprecision( 6 ); // set precision (default=6): affects all floats + ostr << std::fixed; // fixed format: affects all floats + for( auto ip: m_partitionTimers ) + ostr << s1 << ip.first << s2 << ip.second << s3 << std::endl; + ostr << s1 << totalKey << s2 << total << s3 << std::endl + << s1 << total123Key << s2 << total123 << s3 << std::endl + << s1 << total23Key << s2 << total23 << s3 << std::endl + << s1 << total3Key << s2 << total3 << s3 << std::endl + << s1 << total3aKey << s2 << total3a << " sec \"" << std::endl; + ostr << std::defaultfloat; // default format: affects all floats + } + else + { + // NB: 'setw' affects only the next field (of any type) + ostr << std::setprecision( 6 ); // set precision (default=6): affects all floats + ostr << std::fixed; // fixed format: affects all floats + for( auto ip: m_partitionTimers ) + ostr << std::setw( maxsize ) << ip.first << " : " + << std::setw( 12 ) << ip.second << " sec" << std::endl; + ostr << std::setw( maxsize ) << totalKey << " : " + << std::setw( 12 ) << total << " sec" << std::endl + << std::setw( maxsize ) << total123Key << " : " + << std::setw( 12 ) << total123 << " sec" << std::endl + << std::setw( maxsize ) << total23Key << " : " + << std::setw( 12 ) << total23 << " sec" << std::endl + << std::setw( maxsize ) << total1Key << " : " + << std::setw( 12 ) << total1 << " sec" << std::endl + << std::setw( maxsize ) << total2Key << " : " + << std::setw( 12 ) << total2 << " sec" << std::endl + << std::setw( maxsize ) << total3Key << " : " + << std::setw( 12 ) << total3 << " sec" << std::endl + << std::setw( maxsize ) << total3aKey << " : " + << std::setw( 12 ) << total3a << " sec" << std::endl; + ostr << std::defaultfloat; // default format: affects all floats + } + } + + private: + + Timer m_timer; + std::string m_active; + std::map m_partitionTimers; + std::map m_partitionIds; + }; + +} + +#endif // MGONGPUTIMERMAP_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/CMakeLists.txt b/epochX/cudacpp/smeft_gg_tttt.sa/src/CMakeLists.txt new file mode 100644 index 0000000000..bb6d5ee85d --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/CMakeLists.txt @@ -0,0 
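[Editorial aside, not part of the patch] The TimerMap above keys each wall-clock partition by a string and uses the key's first character ('1', '2', '3', plus the '3a' prefix) to build the aggregate totals printed by dump(). A minimal usage sketch follows; the partition names and the surrounding main() are illustrative only, and it assumes the sibling nvtx.h header from the same SubProcesses directory is available when compiling.

  #include <iostream>
  #include "timermap.h"

  int main()
  {
    mgOnGpu::TimerMap timermap;
    timermap.start( "1a ProcInit" ); // opens partition "1a ProcInit" (and an NVTX range)
    // ... initialisation work ...
    timermap.start( "2a SigmaKin" ); // closes "1a ProcInit", opens "2a SigmaKin"
    // ... compute work ...
    timermap.stop();                 // closes "2a SigmaKin"
    timermap.dump( std::cout );      // per-partition times plus TOTAL and the per-group totals
    //timermap.dump( std::cout, true ); // same information as JSON-style key/value lines
    return 0;
  }

Each start() call stops the currently active partition (if any) and returns its last duration, so consecutive start() calls are enough to time back-to-back phases without explicit stop() calls in between.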
+1,5 @@ +file(GLOB_RECURSE HEADERS "*.h") +add_library(mg5amc_common Parameters_sm.cc read_slha.cc ${HEADERS}) + +# some XCode specific stuff to make the executable run +set_property(TARGET mg5amc_common PROPERTY XCODE_GENERATE_SCHEME TRUE) diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h b/epochX/cudacpp/smeft_gg_tttt.sa/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h new file mode 100644 index 0000000000..9e80ae076e --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h @@ -0,0 +1,1174 @@ +//========================================================================== +// This file has been automatically generated for CUDA/C++ standalone by +// MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-01-26 +// By the MadGraph5_aMC@NLO Development Team +// Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch +//========================================================================== + +#ifndef HelAmps_SMEFTsim_topU3l_MwScheme_UFO_H +#define HelAmps_SMEFTsim_topU3l_MwScheme_UFO_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "Parameters_SMEFTsim_topU3l_MwScheme_UFO.h" + +//#include +//#include +//#include +//#include + +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_INLINE_HELAMPS +#define INLINE inline +#define ALWAYS_INLINE __attribute__( ( always_inline ) ) +#else +#define INLINE +#define ALWAYS_INLINE +#endif + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fi[6] from the input momenta[npar*4*nevt] + template + __host__ __device__ INLINE void + ixxxxx( const fptype momenta[], // input: momenta + const fptype fmass, // input: fermion mass + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + fptype wavefunctions[], // output: wavefunctions + const int ipar // input: particle# out of npar + ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fi[6] from the input momenta[npar*4*nevt] + // ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == +PZ > 0) + template + __host__ __device__ INLINE void + ipzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + fptype wavefunctions[], // output: wavefunctions + const int ipar // input: particle# out of npar + ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fi[6] from the input momenta[npar*4*nevt] + // ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == -PZ > 0) + template + __host__ __device__ INLINE void + imzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + fptype wavefunctions[], // output: wavefunctions + const int ipar // input: particle# out of npar + ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fi[6] from the input momenta[npar*4*nevt] + // ASSUMPTIONS: (FMASS == 0) and (PT > 0) + 
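// [Editorial note, not part of the generated header] ipzxxx, imzxxx and ixzxxx
// (and the opzxxx, omzxxx, oxzxxx counterparts declared below) are fast-path
// specialisations of the general ixxxxx/oxxxxx spinors: they are only valid
// under the kinematic ASSUMPTIONS stated in the comment above each declaration
// (massless fermion, and either purely longitudinal momentum along +/-z or
// non-zero transverse momentum, respectively), and the caller is responsible
// for guaranteeing those assumptions. The INLINE / ALWAYS_INLINE tokens in
// these declarations expand to "inline" / "__attribute__( ( always_inline ) )"
// only when MGONGPU_INLINE_HELAMPS is defined (see the #ifdef block above),
// and to nothing otherwise.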
template + __host__ __device__ INLINE void + ixzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + fptype wavefunctions[], // output: wavefunctions + const int ipar // input: particle# out of npar + ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction vc[6] from the input momenta[npar*4*nevt] + template + __host__ __device__ INLINE void + vxxxxx( const fptype momenta[], // input: momenta + const fptype vmass, // input: vector boson mass + const int nhel, // input: -1, 0 (only if vmass!=0) or +1 (helicity of vector boson) + const int nsv, // input: +1 (final) or -1 (initial) + fptype wavefunctions[], // output: wavefunctions + const int ipar // input: particle# out of npar + ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction sc[3] from the input momenta[npar*4*nevt] + template + __host__ __device__ INLINE void + sxxxxx( const fptype momenta[], // input: momenta + //const fptype, // WARNING: input "smass" unused (missing in Fortran) - scalar boson mass + //const int, // WARNING: input "nhel" unused (missing in Fortran) - scalar has no helicity! + const int nss, // input: +1 (final) or -1 (initial) + fptype wavefunctions[], // output: wavefunctions + const int ipar // input: particle# out of npar + ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fo[6] from the input momenta[npar*4*nevt] + template + __host__ __device__ INLINE void + oxxxxx( const fptype momenta[], // input: momenta + const fptype fmass, // input: fermion mass + const int nhel, // input: -1, 0 (only if vmass!=0) or +1 (helicity of vector boson) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + fptype wavefunctions[], // output: wavefunctions + const int ipar // input: particle# out of npar + ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fo[6] from the input momenta[npar*4*nevt] + // ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == +PZ > 0) + template + __host__ __device__ INLINE void + opzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + fptype wavefunctions[], // output: wavefunctions + const int ipar // input: particle# out of npar + ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fo[6] from the input momenta[npar*4*nevt] + // ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == -PZ > 0) + template + __host__ __device__ INLINE void + omzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + fptype wavefunctions[], // output: wavefunctions + const int ipar // input: particle# out of npar + ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fo[6] from the input 
momenta[npar*4*nevt] + template + __host__ __device__ INLINE void + oxzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + fptype wavefunctions[], // output: wavefunctions + const int ipar // input: particle# out of npar + ) ALWAYS_INLINE; + + //========================================================================== + + // Compute the output wavefunction fi[6] from the input momenta[npar*4*nevt] + template + __host__ __device__ void + ixxxxx( const fptype momenta[], // input: momenta + const fptype fmass, // input: fermion mass + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + fptype wavefunctions[], // output: wavefunctions + const int ipar ) // input: particle# out of npar + { + mgDebug( 0, __FUNCTION__ ); + const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); + const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); + const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); + const fptype_sv& pvec3 = M_ACCESS::kernelAccessIp4IparConst( momenta, 3, ipar ); + cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions ); + fi[0] = cxmake( -pvec0 * (fptype)nsf, -pvec3 * (fptype)nsf ); + fi[1] = cxmake( -pvec1 * (fptype)nsf, -pvec2 * (fptype)nsf ); + const int nh = nhel * nsf; + if( fmass != 0. ) + { + const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) ); +#ifndef MGONGPU_CPPSIMD + if( pp == 0. ) + { + // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! + fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0. }; // possibility of negative fermion masses + //sqm[1] = ( fmass < 0. ? -abs( sqm[0] ) : abs( sqm[0] ) ); // AV: why abs here? + sqm[1] = ( fmass < 0. ? -sqm[0] : sqm[0] ); // AV: removed an abs here + const int ip = ( 1 + nh ) / 2; // NB: Fortran sqm(0:1) also has indexes 0,1 as in C++ + const int im = ( 1 - nh ) / 2; // NB: Fortran sqm(0:1) also has indexes 0,1 as in C++ + fi[2] = cxmake( ip * sqm[ip], 0 ); + fi[3] = cxmake( im * nsf * sqm[ip], 0 ); + fi[4] = cxmake( ip * nsf * sqm[im], 0 ); + fi[5] = cxmake( im * sqm[im], 0 ); + } + else + { + const fptype sf[2] = { fptype( 1 + nsf + ( 1 - nsf ) * nh ) * (fptype)0.5, + fptype( 1 + nsf - ( 1 - nsf ) * nh ) * (fptype)0.5 }; + fptype omega[2] = { fpsqrt( pvec0 + pp ), 0. }; + omega[1] = fmass / omega[0]; + const int ip = ( 1 + nh ) / 2; // NB: Fortran is (3+nh)/2 because omega(2) has indexes 1,2 and not 0,1 + const int im = ( 1 - nh ) / 2; // NB: Fortran is (3-nh)/2 because omega(2) has indexes 1,2 and not 0,1 + const fptype sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] }; + const fptype pp3 = fpmax( pp + pvec3, 0. ); + const cxtype chi[2] = { cxmake( fpsqrt( pp3 * (fptype)0.5 / pp ), 0. ), + ( pp3 == 0. ? cxmake( -nh, 0. ) : cxmake( nh * pvec1, pvec2 ) / fpsqrt( 2. * pp * pp3 ) ) }; + fi[2] = sfomega[0] * chi[im]; + fi[3] = sfomega[0] * chi[ip]; + fi[4] = sfomega[1] * chi[im]; + fi[5] = sfomega[1] * chi[ip]; + } +#else + const int ip = ( 1 + nh ) / 2; + const int im = ( 1 - nh ) / 2; + // Branch A: pp == 0. + // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! + fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses (NB: SCALAR!) + sqm[1] = ( fmass < 0 ? 
-sqm[0] : sqm[0] ); // AV: removed an abs here (as above) + const cxtype fiA_2 = ip * sqm[ip]; // scalar cxtype: real part initialised from fptype, imag part = 0 + const cxtype fiA_3 = im * nsf * sqm[ip]; // scalar cxtype: real part initialised from fptype, imag part = 0 + const cxtype fiA_4 = ip * nsf * sqm[im]; // scalar cxtype: real part initialised from fptype, imag part = 0 + const cxtype fiA_5 = im * sqm[im]; // scalar cxtype: real part initialised from fptype, imag part = 0 + // Branch B: pp != 0. + const fptype sf[2] = { fptype( 1 + nsf + ( 1 - nsf ) * nh ) * (fptype)0.5, + fptype( 1 + nsf - ( 1 - nsf ) * nh ) * (fptype)0.5 }; + fptype_v omega[2] = { fpsqrt( pvec0 + pp ), 0 }; + omega[1] = fmass / omega[0]; + const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] }; + const fptype_v pp3 = fpmax( pp + pvec3, 0 ); + const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / pp ), 0 ), + cxternary( ( pp3 == 0. ), + cxmake( -nh, 0 ), + cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * pp * pp3 ) ) }; + const cxtype_v fiB_2 = sfomega[0] * chi[im]; + const cxtype_v fiB_3 = sfomega[0] * chi[ip]; + const cxtype_v fiB_4 = sfomega[1] * chi[im]; + const cxtype_v fiB_5 = sfomega[1] * chi[ip]; + // Choose between the results from branch A and branch B + const bool_v mask = ( pp == 0. ); + fi[2] = cxternary( mask, fiA_2, fiB_2 ); + fi[3] = cxternary( mask, fiA_3, fiB_3 ); + fi[4] = cxternary( mask, fiA_4, fiB_4 ); + fi[5] = cxternary( mask, fiA_5, fiB_5 ); +#endif + } + else + { + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); + const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), cxternary( ( sqp0p3 == 0. ), cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ), cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) }; + if( nh == 1 ) + { + fi[2] = cxzero_sv(); + fi[3] = cxzero_sv(); + fi[4] = chi[0]; + fi[5] = chi[1]; + } + else + { + fi[2] = chi[1]; + fi[3] = chi[0]; + fi[4] = cxzero_sv(); + fi[5] = cxzero_sv(); + } + } + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fi[6] from the input momenta[npar*4*nevt] + // ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == +PZ > 0) + template + __host__ __device__ void + ipzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + fptype wavefunctions[], // output: wavefunctions + const int ipar ) // input: particle# out of npar + { + mgDebug( 0, __FUNCTION__ ); + const fptype_sv& pvec3 = M_ACCESS::kernelAccessIp4IparConst( momenta, 3, ipar ); + cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions ); + fi[0] = cxmake( -pvec3 * (fptype)nsf, -pvec3 * (fptype)nsf ); + fi[1] = cxzero_sv(); + const int nh = nhel * nsf; + const cxtype_sv sqp0p3 = cxmake( fpsqrt( 2. * pvec3 ) * (fptype)nsf, 0. 
); + fi[2] = fi[1]; + if( nh == 1 ) + { + fi[3] = fi[1]; + fi[4] = sqp0p3; + } + else + { + fi[3] = sqp0p3; + fi[4] = fi[1]; + } + fi[5] = fi[1]; + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fi[6] from the input momenta[npar*4*nevt] + // ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == -PZ > 0) + template + __host__ __device__ void + imzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + fptype wavefunctions[], // output: wavefunctions + const int ipar ) // input: particle# out of npar + { + mgDebug( 0, __FUNCTION__ ); + const fptype_sv& pvec3 = M_ACCESS::kernelAccessIp4IparConst( momenta, 3, ipar ); + cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions ); + fi[0] = cxmake( pvec3 * (fptype)nsf, -pvec3 * (fptype)nsf ); + fi[1] = cxzero_sv(); + const int nh = nhel * nsf; + const cxtype_sv chi = cxmake( -(fptype)nhel * fpsqrt( -2. * pvec3 ), 0. ); + fi[3] = cxzero_sv(); + fi[4] = cxzero_sv(); + if( nh == 1 ) + { + fi[2] = cxzero_sv(); + fi[5] = chi; + } + else + { + fi[2] = chi; + fi[5] = cxzero_sv(); + } + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fi[6] from the input momenta[npar*4*nevt] + // ASSUMPTIONS: (FMASS == 0) and (PT > 0) + template + __host__ __device__ void + ixzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + fptype wavefunctions[], // output: wavefunctions + const int ipar ) // input: particle# out of npar + { + mgDebug( 0, __FUNCTION__ ); + const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); + const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); + const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); + const fptype_sv& pvec3 = M_ACCESS::kernelAccessIp4IparConst( momenta, 3, ipar ); + cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions ); + //fi[0] = cxmake( -pvec0 * nsf, -pvec2 * nsf ); // AV: BUG! not the same as ixxxxx + //fi[1] = cxmake( -pvec0 * nsf, -pvec1 * nsf ); // AV: BUG! not the same as ixxxxx + fi[0] = cxmake( -pvec0 * (fptype)nsf, -pvec3 * (fptype)nsf ); // AV: BUG FIX + fi[1] = cxmake( -pvec1 * (fptype)nsf, -pvec2 * (fptype)nsf ); // AV: BUG FIX + const int nh = nhel * nsf; + //const float sqp0p3 = sqrtf( pvec0 + pvec3 ) * nsf; // AV: why force a float here? + const fptype_sv sqp0p3 = fpsqrt( pvec0 + pvec3 ) * (fptype)nsf; + const cxtype_sv chi0 = cxmake( sqp0p3, 0. 
); + const cxtype_sv chi1 = cxmake( (fptype)nh * pvec1 / sqp0p3, pvec2 / sqp0p3 ); + if( nh == 1 ) + { + fi[2] = cxzero_sv(); + fi[3] = cxzero_sv(); + fi[4] = chi0; + fi[5] = chi1; + } + else + { + fi[2] = chi1; + fi[3] = chi0; + fi[4] = cxzero_sv(); + fi[5] = cxzero_sv(); + } + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction vc[6] from the input momenta[npar*4*nevt] + template + __host__ __device__ void + vxxxxx( const fptype momenta[], // input: momenta + const fptype vmass, // input: vector boson mass + const int nhel, // input: -1, 0 (only if vmass!=0) or +1 (helicity of vector boson) + const int nsv, // input: +1 (final) or -1 (initial) + fptype wavefunctions[], // output: wavefunctions + const int ipar ) // input: particle# out of npar + { + mgDebug( 0, __FUNCTION__ ); + const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); + const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); + const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); + const fptype_sv& pvec3 = M_ACCESS::kernelAccessIp4IparConst( momenta, 3, ipar ); + cxtype_sv* vc = W_ACCESS::kernelAccess( wavefunctions ); + const fptype sqh = fpsqrt( 0.5 ); // AV this is > 0! + const fptype hel = nhel; + vc[0] = cxmake( pvec0 * (fptype)nsv, pvec3 * (fptype)nsv ); + vc[1] = cxmake( pvec1 * (fptype)nsv, pvec2 * (fptype)nsv ); + if( vmass != 0. ) + { + const int nsvahl = nsv * std::abs( hel ); + const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); + const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) ); + const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); + const fptype hel0 = 1. - std::abs( hel ); +#ifndef MGONGPU_CPPSIMD + if( pp == 0. ) + { + vc[2] = cxmake( 0., 0. ); + vc[3] = cxmake( -hel * sqh, 0. ); + vc[4] = cxmake( 0., nsvahl * sqh ); + vc[5] = cxmake( hel0, 0. ); + } + else + { + const fptype emp = pvec0 / ( vmass * pp ); + vc[2] = cxmake( hel0 * pp / vmass, 0. ); + vc[5] = cxmake( hel0 * pvec3 * emp + hel * pt / pp * sqh, 0. ); + if( pt != 0. ) + { + const fptype pzpt = pvec3 / ( pp * pt ) * sqh * hel; + vc[3] = cxmake( hel0 * pvec1 * emp - pvec1 * pzpt, -nsvahl * pvec2 / pt * sqh ); + vc[4] = cxmake( hel0 * pvec2 * emp - pvec2 * pzpt, nsvahl * pvec1 / pt * sqh ); + } + else + { + vc[3] = cxmake( -hel * sqh, 0. ); + // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! + //vc[4] = cxmake( 0., nsvahl * ( pvec3 < 0. ? -std::abs( sqh ) : std::abs( sqh ) ) ); // AV: why abs here? + vc[4] = cxmake( 0., nsvahl * ( pvec3 < 0. ? -sqh : sqh ) ); // AV: removed an abs here + } + } +#else + // Branch A: pp == 0. + const cxtype vcA_2 = cxmake( 0, 0 ); + const cxtype vcA_3 = cxmake( -hel * sqh, 0 ); + const cxtype vcA_4 = cxmake( 0, nsvahl * sqh ); + const cxtype vcA_5 = cxmake( hel0, 0 ); + // Branch B: pp != 0. + const fptype_v emp = pvec0 / ( vmass * pp ); + const cxtype_v vcB_2 = cxmake( hel0 * pp / vmass, 0 ); + const cxtype_v vcB_5 = cxmake( hel0 * pvec3 * emp + hel * pt / pp * sqh, 0 ); + // Branch B1: pp != 0. and pt != 0. + const fptype_v pzpt = pvec3 / ( pp * pt ) * sqh * hel; + const cxtype_v vcB1_3 = cxmake( hel0 * pvec1 * emp - pvec1 * pzpt, -(fptype)nsvahl * pvec2 / pt * sqh ); + const cxtype_v vcB1_4 = cxmake( hel0 * pvec2 * emp - pvec2 * pzpt, (fptype)nsvahl * pvec1 / pt * sqh ); + // Branch B2: pp != 0. and pt == 0. + const cxtype vcB2_3 = cxmake( -hel * sqh, 0. 
); + const cxtype_v vcB2_4 = cxmake( 0., (fptype)nsvahl * fpternary( ( pvec3 < 0 ), -sqh, sqh ) ); // AV: removed an abs here + // Choose between the results from branch A and branch B (and from branch B1 and branch B2) + const bool_v mask = ( pp == 0. ); + const bool_v maskB = ( pt != 0. ); + vc[2] = cxternary( mask, vcA_2, vcB_2 ); + vc[3] = cxternary( mask, vcA_3, cxternary( maskB, vcB1_3, vcB2_3 ) ); + vc[4] = cxternary( mask, vcA_4, cxternary( maskB, vcB1_4, vcB2_4 ) ); + vc[5] = cxternary( mask, vcA_5, vcB_5 ); +#endif + } + else + { + const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0 + const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) ); + vc[2] = cxzero_sv(); + vc[5] = cxmake( hel * pt / pp * sqh, 0. ); +#ifndef MGONGPU_CPPSIMD + if( pt != 0. ) + { + const fptype pzpt = pvec3 / ( pp * pt ) * sqh * hel; + vc[3] = cxmake( -pvec1 * pzpt, -nsv * pvec2 / pt * sqh ); + vc[4] = cxmake( -pvec2 * pzpt, nsv * pvec1 / pt * sqh ); + } + else + { + vc[3] = cxmake( -hel * sqh, 0. ); + // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! + //vc[4] = cxmake( 0, nsv * ( pvec3 < 0. ? -std::abs( sqh ) : std::abs( sqh ) ) ); // AV why abs here? + vc[4] = cxmake( 0., nsv * ( pvec3 < 0. ? -sqh : sqh ) ); // AV: removed an abs here + } +#else + // Branch A: pt != 0. + const fptype_v pzpt = pvec3 / ( pp * pt ) * sqh * hel; + const cxtype_v vcA_3 = cxmake( -pvec1 * pzpt, -(fptype)nsv * pvec2 / pt * sqh ); + const cxtype_v vcA_4 = cxmake( -pvec2 * pzpt, (fptype)nsv * pvec1 / pt * sqh ); + // Branch B: pt == 0. + const cxtype vcB_3 = cxmake( -(fptype)hel * sqh, 0 ); + const cxtype_v vcB_4 = cxmake( 0, (fptype)nsv * fpternary( ( pvec3 < 0 ), -sqh, sqh ) ); // AV: removed an abs here + // Choose between the results from branch A and branch B + const bool_v mask = ( pt != 0. ); + vc[3] = cxternary( mask, vcA_3, vcB_3 ); + vc[4] = cxternary( mask, vcA_4, vcB_4 ); +#endif + } + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction sc[3] from the input momenta[npar*4*nevt] + template + __host__ __device__ void + sxxxxx( const fptype momenta[], // input: momenta + //const fptype, // WARNING: input "smass" unused (missing in Fortran) - scalar boson mass + //const int, // WARNING: input "nhel" unused (missing in Fortran) - scalar has no helicity! 
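// [Editorial note, not part of the generated header] In the SIMD (#else) paths
// of ixxxxx and vxxxxx above (and similarly oxxxxx below), both candidate
// results are always computed ("Branch A" for pp == 0, "Branch B" otherwise)
// and then blended lane by lane with cxternary( mask, A, B ) and
// fpternary( mask, a, b ), the vector analogue of the scalar
// "( pp == 0. ? resultA : resultB )"; this keeps every lane of a
// fptype_v / cxtype_v on the same instruction path instead of branching
// per event.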
+ const int nss, // input: +1 (final) or -1 (initial) + fptype wavefunctions[], // output: wavefunctions + const int ipar ) // input: particle# out of npar + { + mgDebug( 0, __FUNCTION__ ); + const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); + const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); + const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); + const fptype_sv& pvec3 = M_ACCESS::kernelAccessIp4IparConst( momenta, 3, ipar ); + cxtype_sv* sc = W_ACCESS::kernelAccess( wavefunctions ); + sc[2] = cxmake( 1 + fptype_sv{ 0 }, 0 ); + sc[0] = cxmake( pvec0 * (fptype)nss, pvec3 * (fptype)nss ); + sc[1] = cxmake( pvec1 * (fptype)nss, pvec2 * (fptype)nss ); + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fo[6] from the input momenta[npar*4*nevt] + template + __host__ __device__ void + oxxxxx( const fptype momenta[], // input: momenta + const fptype fmass, // input: fermion mass + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + fptype wavefunctions[], // output: wavefunctions + const int ipar ) // input: particle# out of npar + { + mgDebug( 0, __FUNCTION__ ); + const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); + const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); + const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); + const fptype_sv& pvec3 = M_ACCESS::kernelAccessIp4IparConst( momenta, 3, ipar ); + cxtype_sv* fo = W_ACCESS::kernelAccess( wavefunctions ); + fo[0] = cxmake( pvec0 * (fptype)nsf, pvec3 * (fptype)nsf ); + fo[1] = cxmake( pvec1 * (fptype)nsf, pvec2 * (fptype)nsf ); + const int nh = nhel * nsf; + if( fmass != 0. ) + { + const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); +#ifndef MGONGPU_CPPSIMD + if( pp == 0. ) + { + // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! + fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0. }; // possibility of negative fermion masses + //sqm[1] = ( fmass < 0. ? -abs( sqm[0] ) : abs( sqm[0] ) ); // AV: why abs here? + sqm[1] = ( fmass < 0. ? -sqm[0] : sqm[0] ); // AV: removed an abs here + const int ip = -( ( 1 - nh ) / 2 ) * nhel; // NB: Fortran sqm(0:1) also has indexes 0,1 as in C++ + const int im = ( 1 + nh ) / 2 * nhel; // NB: Fortran sqm(0:1) also has indexes 0,1 as in C++ + fo[2] = cxmake( im * sqm[std::abs( ip )], 0 ); + fo[3] = cxmake( ip * nsf * sqm[std::abs( ip )], 0 ); + fo[4] = cxmake( im * nsf * sqm[std::abs( im )], 0 ); + fo[5] = cxmake( ip * sqm[std::abs( im )], 0 ); + } + else + { + const fptype sf[2] = { fptype( 1 + nsf + ( 1 - nsf ) * nh ) * (fptype)0.5, + fptype( 1 + nsf - ( 1 - nsf ) * nh ) * (fptype)0.5 }; + fptype omega[2] = { fpsqrt( pvec0 + pp ), 0. }; + omega[1] = fmass / omega[0]; + const int ip = ( 1 + nh ) / 2; // NB: Fortran is (3+nh)/2 because omega(2) has indexes 1,2 and not 0,1 + const int im = ( 1 - nh ) / 2; // NB: Fortran is (3-nh)/2 because omega(2) has indexes 1,2 and not 0,1 + const fptype sfomeg[2] = { sf[0] * omega[ip], sf[1] * omega[im] }; + const fptype pp3 = fpmax( pp + pvec3, 0. ); + const cxtype chi[2] = { cxmake( fpsqrt( pp3 * (fptype)0.5 / pp ), 0. ), + ( ( pp3 == 0. ) ? cxmake( -nh, 0. ) + : cxmake( nh * pvec1, -pvec2 ) / fpsqrt( 2. 
* pp * pp3 ) ) }; + fo[2] = sfomeg[1] * chi[im]; + fo[3] = sfomeg[1] * chi[ip]; + fo[4] = sfomeg[0] * chi[im]; + fo[5] = sfomeg[0] * chi[ip]; + } +#else + // Branch A: pp == 0. + // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! + fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses + sqm[1] = ( fmass < 0 ? -sqm[0] : sqm[0] ); // AV: removed an abs here (as above) + const int ipA = -( ( 1 - nh ) / 2 ) * nhel; + const int imA = ( 1 + nh ) / 2 * nhel; + const cxtype foA_2 = imA * sqm[std::abs( ipA )]; + const cxtype foA_3 = ipA * nsf * sqm[std::abs( ipA )]; + const cxtype foA_4 = imA * nsf * sqm[std::abs( imA )]; + const cxtype foA_5 = ipA * sqm[std::abs( imA )]; + // Branch B: pp != 0. + const fptype sf[2] = { fptype( 1 + nsf + ( 1 - nsf ) * nh ) * (fptype)0.5, + fptype( 1 + nsf - ( 1 - nsf ) * nh ) * (fptype)0.5 }; + fptype_v omega[2] = { fpsqrt( pvec0 + pp ), 0 }; + omega[1] = fmass / omega[0]; + const int ipB = ( 1 + nh ) / 2; + const int imB = ( 1 - nh ) / 2; + const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] }; + const fptype_v pp3 = fpmax( pp + pvec3, 0. ); + const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / pp ), 0. ), + ( cxternary( ( pp3 == 0. ), + cxmake( -nh, 0. ), + cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * pp * pp3 ) ) ) }; + const cxtype_v foB_2 = sfomeg[1] * chi[imB]; + const cxtype_v foB_3 = sfomeg[1] * chi[ipB]; + const cxtype_v foB_4 = sfomeg[0] * chi[imB]; + const cxtype_v foB_5 = sfomeg[0] * chi[ipB]; + // Choose between the results from branch A and branch B + const bool_v mask = ( pp == 0. ); + fo[2] = cxternary( mask, foA_2, foB_2 ); + fo[3] = cxternary( mask, foA_3, foB_3 ); + fo[4] = cxternary( mask, foA_4, foB_4 ); + fo[5] = cxternary( mask, foA_5, foB_5 ); +#endif + } + else + { + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ), + 0, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); + const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), + cxternary( ( sqp0p3 == 0. ), + cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ), + cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) }; + if( nh == 1 ) + { + fo[2] = chi[0]; + fo[3] = chi[1]; + fo[4] = cxzero_sv(); + fo[5] = cxzero_sv(); + } + else + { + fo[2] = cxzero_sv(); + fo[3] = cxzero_sv(); + fo[4] = chi[1]; + fo[5] = chi[0]; + } + } + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fo[6] from the input momenta[npar*4*nevt] + // ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == +PZ > 0) + template + __host__ __device__ void + opzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + fptype wavefunctions[], // output: wavefunctions + const int ipar ) // input: particle# out of npar + { + mgDebug( 0, __FUNCTION__ ); + const fptype_sv& pvec3 = M_ACCESS::kernelAccessIp4IparConst( momenta, 3, ipar ); + cxtype_sv* fo = W_ACCESS::kernelAccess( wavefunctions ); + fo[0] = cxmake( pvec3 * (fptype)nsf, pvec3 * (fptype)nsf ); + fo[1] = cxzero_sv(); + const int nh = nhel * nsf; + const cxtype_sv csqp0p3 = cxmake( fpsqrt( 2. * pvec3 ) * (fptype)nsf, 0. 
); + fo[3] = cxzero_sv(); + fo[4] = cxzero_sv(); + if( nh == 1 ) + { + fo[2] = csqp0p3; + fo[5] = cxzero_sv(); + } + else + { + fo[2] = cxzero_sv(); + fo[5] = csqp0p3; + } + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fo[6] from the input momenta[npar*4*nevt] + // ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == -PZ > 0) + template + __host__ __device__ void + omzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + fptype wavefunctions[], // output: wavefunctions + const int ipar ) // input: particle# out of npar + { + mgDebug( 0, __FUNCTION__ ); + const fptype_sv& pvec3 = M_ACCESS::kernelAccessIp4IparConst( momenta, 3, ipar ); + cxtype_sv* fo = W_ACCESS::kernelAccess( wavefunctions ); + fo[0] = cxmake( -pvec3 * (fptype)nsf, pvec3 * (fptype)nsf ); // remember pvec0 == -pvec3 + fo[1] = cxzero_sv(); + const int nh = nhel * nsf; + const cxtype_sv chi1 = cxmake( -nhel, 0. ) * fpsqrt( -2. * pvec3 ); + if( nh == 1 ) + { + fo[2] = cxzero_sv(); + fo[3] = chi1; + fo[4] = cxzero_sv(); + fo[5] = cxzero_sv(); + } + else + { + fo[2] = cxzero_sv(); + fo[3] = cxzero_sv(); + fo[4] = chi1; + //fo[5] = chi1; // AV: BUG! + fo[5] = cxzero_sv(); // AV: BUG FIX + } + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fo[6] from the input momenta[npar*4*nevt] + // ASSUMPTIONS: (FMASS == 0) and (PT > 0) + template + __host__ __device__ void + oxzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + fptype wavefunctions[], // output: wavefunctions + const int ipar ) // input: particle# out of npar + { + mgDebug( 0, __FUNCTION__ ); + const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); + const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); + const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); + const fptype_sv& pvec3 = M_ACCESS::kernelAccessIp4IparConst( momenta, 3, ipar ); + cxtype_sv* fo = W_ACCESS::kernelAccess( wavefunctions ); + fo[0] = cxmake( pvec0 * (fptype)nsf, pvec3 * (fptype)nsf ); + fo[1] = cxmake( pvec1 * (fptype)nsf, pvec2 * (fptype)nsf ); + const int nh = nhel * nsf; + //const float sqp0p3 = sqrtf( pvec0 + pvec3 ) * nsf; // AV: why force a float here? + const fptype_sv sqp0p3 = fpsqrt( pvec0 + pvec3 ) * (fptype)nsf; + const cxtype_sv chi0 = cxmake( sqp0p3, 0. 
); + const cxtype_sv chi1 = cxmake( (fptype)nh * pvec1 / sqp0p3, -pvec2 / sqp0p3 ); + if( nh == 1 ) + { + fo[2] = chi0; + fo[3] = chi1; + fo[4] = cxzero_sv(); + fo[5] = cxzero_sv(); + } + else + { + fo[2] = cxzero_sv(); + fo[3] = cxzero_sv(); + fo[4] = chi1; + fo[5] = chi0; + } + mgDebug( 1, __FUNCTION__ ); + return; + } + + //========================================================================== + + // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] + template + __device__ INLINE void + VVV5_0( const fptype allV1[], + const fptype allV2[], + const fptype allV3[], + const fptype allCOUP[], + fptype allvertexes[] ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] + template + __device__ INLINE void + VVV5P0_1( const fptype allV2[], + const fptype allV3[], + const fptype allCOUP[], + const fptype M1, + const fptype W1, + fptype allV1[] ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] + template + __device__ INLINE void + FFV1_0( const fptype allF1[], + const fptype allF2[], + const fptype allV3[], + const fptype allCOUP[], + fptype allvertexes[] ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] + template + __device__ INLINE void + FFV1_1( const fptype allF2[], + const fptype allV3[], + const fptype allCOUP[], + const fptype M1, + const fptype W1, + fptype allF1[] ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] + template + __device__ INLINE void + FFV1_2( const fptype allF1[], + const fptype allV3[], + const fptype allCOUP[], + const fptype M2, + const fptype W2, + fptype allF2[] ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] + template + __device__ INLINE void + FFV1P0_3( const fptype allF1[], + const fptype allF2[], + const fptype allCOUP[], + const fptype M3, + const fptype W3, + fptype allV3[] ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] + template + __device__ INLINE void + VVVV1_0( const fptype allV1[], + const fptype allV2[], + const fptype allV3[], + const fptype allV4[], + const fptype allCOUP[], + fptype allvertexes[] ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] + template + __device__ INLINE void + VVVV9_0( const fptype allV1[], + const fptype allV2[], + const fptype allV3[], + const fptype allV4[], + const fptype allCOUP[], + fptype allvertexes[] ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] + template + __device__ INLINE void + VVVV10_0( const fptype allV1[], + const fptype allV2[], + const fptype 
allV3[], + const fptype allV4[], + const fptype allCOUP[], + fptype allvertexes[] ) ALWAYS_INLINE; + + //========================================================================== + + // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] + template + __device__ void + VVV5_0( const fptype allV1[], + const fptype allV2[], + const fptype allV3[], + const fptype allCOUP[], + fptype allvertexes[] ) + { + mgDebug( 0, __FUNCTION__ ); + const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); + const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); + const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); + const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); + const cxtype cI = cxmake( 0., 1. ); + const fptype_sv P1[4] = { +cxreal( V1[0] ), +cxreal( V1[1] ), +cximag( V1[1] ), +cximag( V1[0] ) }; + const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; + const fptype_sv P3[4] = { +cxreal( V3[0] ), +cxreal( V3[1] ), +cximag( V3[1] ), +cximag( V3[0] ) }; + const cxtype_sv TMP0 = ( V3[2] * P1[0] - V3[3] * P1[1] - V3[4] * P1[2] - V3[5] * P1[3] ); + const cxtype_sv TMP1 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] ); + const cxtype_sv TMP2 = ( V3[2] * P2[0] - V3[3] * P2[1] - V3[4] * P2[2] - V3[5] * P2[3] ); + const cxtype_sv TMP3 = ( V3[2] * V1[2] - V3[3] * V1[3] - V3[4] * V1[4] - V3[5] * V1[5] ); + const cxtype_sv TMP4 = ( P1[0] * V2[2] - P1[1] * V2[3] - P1[2] * V2[4] - P1[3] * V2[5] ); + const cxtype_sv TMP5 = ( V2[2] * P3[0] - V2[3] * P3[1] - V2[4] * P3[2] - V2[5] * P3[3] ); + const cxtype_sv TMP6 = ( V3[2] * V2[2] - V3[3] * V2[3] - V3[4] * V2[4] - V3[5] * V2[5] ); + const cxtype_sv TMP7 = ( V1[2] * P2[0] - V1[3] * P2[1] - V1[4] * P2[2] - V1[5] * P2[3] ); + const cxtype_sv TMP8 = ( V1[2] * P3[0] - V1[3] * P3[1] - V1[4] * P3[2] - V1[5] * P3[3] ); + ( *vertex ) = COUP * ( TMP1 * ( -cI * TMP0 + cI * TMP2 ) + ( TMP3 * ( +cI * TMP4 - cI * TMP5 ) + TMP6 * ( -cI * TMP7 + cI * TMP8 ) ) ); + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] + template + __device__ void + VVV5P0_1( const fptype allV2[], + const fptype allV3[], + const fptype allCOUP[], + const fptype M1, + const fptype W1, + fptype allV1[] ) + { + mgDebug( 0, __FUNCTION__ ); + const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); + const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); + const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); + const cxtype cI = cxmake( 0., 1. 
); + const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; + const fptype_sv P3[4] = { +cxreal( V3[0] ), +cxreal( V3[1] ), +cximag( V3[1] ), +cximag( V3[0] ) }; + V1[0] = +V2[0] + V3[0]; + V1[1] = +V2[1] + V3[1]; + const fptype_sv P1[4] = { -cxreal( V1[0] ), -cxreal( V1[1] ), -cximag( V1[1] ), -cximag( V1[0] ) }; + const cxtype_sv TMP0 = ( V3[2] * P1[0] - V3[3] * P1[1] - V3[4] * P1[2] - V3[5] * P1[3] ); + const cxtype_sv TMP2 = ( V3[2] * P2[0] - V3[3] * P2[1] - V3[4] * P2[2] - V3[5] * P2[3] ); + const cxtype_sv TMP4 = ( P1[0] * V2[2] - P1[1] * V2[3] - P1[2] * V2[4] - P1[3] * V2[5] ); + const cxtype_sv TMP5 = ( V2[2] * P3[0] - V2[3] * P3[1] - V2[4] * P3[2] - V2[5] * P3[3] ); + const cxtype_sv TMP6 = ( V3[2] * V2[2] - V3[3] * V2[3] - V3[4] * V2[4] - V3[5] * V2[5] ); + const cxtype_sv denom = COUP / ( ( P1[0] * P1[0] ) - ( P1[1] * P1[1] ) - ( P1[2] * P1[2] ) - ( P1[3] * P1[3] ) - M1 * ( M1 - cI * W1 ) ); + V1[2] = denom * ( TMP6 * ( -cI * P2[0] + cI * P3[0] ) + ( V2[2] * ( -cI * TMP0 + cI * TMP2 ) + V3[2] * ( +cI * TMP4 - cI * TMP5 ) ) ); + V1[3] = denom * ( TMP6 * ( -cI * P2[1] + cI * P3[1] ) + ( V2[3] * ( -cI * TMP0 + cI * TMP2 ) + V3[3] * ( +cI * TMP4 - cI * TMP5 ) ) ); + V1[4] = denom * ( TMP6 * ( -cI * P2[2] + cI * P3[2] ) + ( V2[4] * ( -cI * TMP0 + cI * TMP2 ) + V3[4] * ( +cI * TMP4 - cI * TMP5 ) ) ); + V1[5] = denom * ( TMP6 * ( -cI * P2[3] + cI * P3[3] ) + ( V2[5] * ( -cI * TMP0 + cI * TMP2 ) + V3[5] * ( +cI * TMP4 - cI * TMP5 ) ) ); + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] + template + __device__ void + FFV1_0( const fptype allF1[], + const fptype allF2[], + const fptype allV3[], + const fptype allCOUP[], + fptype allvertexes[] ) + { + mgDebug( 0, __FUNCTION__ ); + const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); + const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); + const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); + const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); + const cxtype cI = cxmake( 0., 1. ); + const cxtype_sv TMP9 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); + ( *vertex ) = COUP * -cI * TMP9; + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] + template + __device__ void + FFV1_1( const fptype allF2[], + const fptype allV3[], + const fptype allCOUP[], + const fptype M1, + const fptype W1, + fptype allF1[] ) + { + mgDebug( 0, __FUNCTION__ ); + const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); + const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); + const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); + const cxtype cI = cxmake( 0., 1. ); + F1[0] = +F2[0] + V3[0]; + F1[1] = +F2[1] + V3[1]; + const fptype_sv P1[4] = { -cxreal( F1[0] ), -cxreal( F1[1] ), -cximag( F1[1] ), -cximag( F1[0] ) }; + constexpr fptype one( 1. 
); + const cxtype_sv denom = COUP / ( ( P1[0] * P1[0] ) - ( P1[1] * P1[1] ) - ( P1[2] * P1[2] ) - ( P1[3] * P1[3] ) - M1 * ( M1 - cI * W1 ) ); + F1[2] = denom * cI * ( F2[2] * ( P1[0] * ( -V3[2] + V3[5] ) + ( P1[1] * ( V3[3] - cI * V3[4] ) + ( P1[2] * ( +cI * V3[3] + V3[4] ) + P1[3] * ( -V3[2] + V3[5] ) ) ) ) + ( F2[3] * ( P1[0] * ( V3[3] + cI * V3[4] ) + ( P1[1] * ( -one ) * ( V3[2] + V3[5] ) + ( P1[2] * ( -one ) * ( +cI * ( V3[2] + V3[5] ) ) + P1[3] * ( V3[3] + cI * V3[4] ) ) ) ) + M1 * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) ) ); + F1[3] = denom * ( -cI ) * ( F2[2] * ( P1[0] * ( -V3[3] + cI * V3[4] ) + ( P1[1] * ( V3[2] - V3[5] ) + ( P1[2] * ( -cI * V3[2] + cI * V3[5] ) + P1[3] * ( V3[3] - cI * V3[4] ) ) ) ) + ( F2[3] * ( P1[0] * ( V3[2] + V3[5] ) + ( P1[1] * ( -one ) * ( V3[3] + cI * V3[4] ) + ( P1[2] * ( +cI * V3[3] - V3[4] ) - P1[3] * ( V3[2] + V3[5] ) ) ) ) + M1 * ( F2[4] * ( -V3[3] + cI * V3[4] ) + F2[5] * ( -V3[2] + V3[5] ) ) ) ); + F1[4] = denom * ( -cI ) * ( F2[4] * ( P1[0] * ( V3[2] + V3[5] ) + ( P1[1] * ( -V3[3] + cI * V3[4] ) + ( P1[2] * ( -one ) * ( +cI * V3[3] + V3[4] ) - P1[3] * ( V3[2] + V3[5] ) ) ) ) + ( F2[5] * ( P1[0] * ( V3[3] + cI * V3[4] ) + ( P1[1] * ( -V3[2] + V3[5] ) + ( P1[2] * ( -cI * V3[2] + cI * V3[5] ) - P1[3] * ( V3[3] + cI * V3[4] ) ) ) ) + M1 * ( F2[2] * ( -V3[2] + V3[5] ) + F2[3] * ( V3[3] + cI * V3[4] ) ) ) ); + F1[5] = denom * cI * ( F2[4] * ( P1[0] * ( -V3[3] + cI * V3[4] ) + ( P1[1] * ( V3[2] + V3[5] ) + ( P1[2] * ( -one ) * ( +cI * ( V3[2] + V3[5] ) ) + P1[3] * ( -V3[3] + cI * V3[4] ) ) ) ) + ( F2[5] * ( P1[0] * ( -V3[2] + V3[5] ) + ( P1[1] * ( V3[3] + cI * V3[4] ) + ( P1[2] * ( -cI * V3[3] + V3[4] ) + P1[3] * ( -V3[2] + V3[5] ) ) ) ) + M1 * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ); + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] + template + __device__ void + FFV1_2( const fptype allF1[], + const fptype allV3[], + const fptype allCOUP[], + const fptype M2, + const fptype W2, + fptype allF2[] ) + { + mgDebug( 0, __FUNCTION__ ); + const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); + const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); + const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); + const cxtype cI = cxmake( 0., 1. ); + F2[0] = +F1[0] + V3[0]; + F2[1] = +F1[1] + V3[1]; + const fptype_sv P2[4] = { -cxreal( F2[0] ), -cxreal( F2[1] ), -cximag( F2[1] ), -cximag( F2[0] ) }; + constexpr fptype one( 1. 
); + const cxtype_sv denom = COUP / ( ( P2[0] * P2[0] ) - ( P2[1] * P2[1] ) - ( P2[2] * P2[2] ) - ( P2[3] * P2[3] ) - M2 * ( M2 - cI * W2 ) ); + F2[2] = denom * cI * ( F1[2] * ( P2[0] * ( V3[2] + V3[5] ) + ( P2[1] * ( -one ) * ( V3[3] + cI * V3[4] ) + ( P2[2] * ( +cI * V3[3] - V3[4] ) - P2[3] * ( V3[2] + V3[5] ) ) ) ) + ( F1[3] * ( P2[0] * ( V3[3] - cI * V3[4] ) + ( P2[1] * ( -V3[2] + V3[5] ) + ( P2[2] * ( +cI * V3[2] - cI * V3[5] ) + P2[3] * ( -V3[3] + cI * V3[4] ) ) ) ) + M2 * ( F1[4] * ( V3[2] - V3[5] ) + F1[5] * ( -V3[3] + cI * V3[4] ) ) ) ); + F2[3] = denom * ( -cI ) * ( F1[2] * ( P2[0] * ( -one ) * ( V3[3] + cI * V3[4] ) + ( P2[1] * ( V3[2] + V3[5] ) + ( P2[2] * ( +cI * ( V3[2] + V3[5] ) ) - P2[3] * ( V3[3] + cI * V3[4] ) ) ) ) + ( F1[3] * ( P2[0] * ( -V3[2] + V3[5] ) + ( P2[1] * ( V3[3] - cI * V3[4] ) + ( P2[2] * ( +cI * V3[3] + V3[4] ) + P2[3] * ( -V3[2] + V3[5] ) ) ) ) + M2 * ( F1[4] * ( V3[3] + cI * V3[4] ) - F1[5] * ( V3[2] + V3[5] ) ) ) ); + F2[4] = denom * ( -cI ) * ( F1[4] * ( P2[0] * ( -V3[2] + V3[5] ) + ( P2[1] * ( V3[3] + cI * V3[4] ) + ( P2[2] * ( -cI * V3[3] + V3[4] ) + P2[3] * ( -V3[2] + V3[5] ) ) ) ) + ( F1[5] * ( P2[0] * ( V3[3] - cI * V3[4] ) + ( P2[1] * ( -one ) * ( V3[2] + V3[5] ) + ( P2[2] * ( +cI * ( V3[2] + V3[5] ) ) + P2[3] * ( V3[3] - cI * V3[4] ) ) ) ) + M2 * ( F1[2] * ( -one ) * ( V3[2] + V3[5] ) + F1[3] * ( -V3[3] + cI * V3[4] ) ) ) ); + F2[5] = denom * cI * ( F1[4] * ( P2[0] * ( -one ) * ( V3[3] + cI * V3[4] ) + ( P2[1] * ( V3[2] - V3[5] ) + ( P2[2] * ( +cI * V3[2] - cI * V3[5] ) + P2[3] * ( V3[3] + cI * V3[4] ) ) ) ) + ( F1[5] * ( P2[0] * ( V3[2] + V3[5] ) + ( P2[1] * ( -V3[3] + cI * V3[4] ) + ( P2[2] * ( -one ) * ( +cI * V3[3] + V3[4] ) - P2[3] * ( V3[2] + V3[5] ) ) ) ) + M2 * ( F1[2] * ( V3[3] + cI * V3[4] ) + F1[3] * ( V3[2] - V3[5] ) ) ) ); + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] + template + __device__ void + FFV1P0_3( const fptype allF1[], + const fptype allF2[], + const fptype allCOUP[], + const fptype M3, + const fptype W3, + fptype allV3[] ) + { + mgDebug( 0, __FUNCTION__ ); + const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); + const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); + const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); + const cxtype cI = cxmake( 0., 1. 
); + V3[0] = +F1[0] + F2[0]; + V3[1] = +F1[1] + F2[1]; + const fptype_sv P3[4] = { -cxreal( V3[0] ), -cxreal( V3[1] ), -cximag( V3[1] ), -cximag( V3[0] ) }; + const cxtype_sv denom = COUP / ( ( P3[0] * P3[0] ) - ( P3[1] * P3[1] ) - ( P3[2] * P3[2] ) - ( P3[3] * P3[3] ) - M3 * ( M3 - cI * W3 ) ); + V3[2] = denom * ( -cI ) * ( F1[2] * F2[4] + F1[3] * F2[5] + F1[4] * F2[2] + F1[5] * F2[3] ); + V3[3] = denom * ( -cI ) * ( -F1[2] * F2[5] - F1[3] * F2[4] + F1[4] * F2[3] + F1[5] * F2[2] ); + V3[4] = denom * ( -cI ) * ( -cI * ( F1[2] * F2[5] + F1[5] * F2[2] ) + cI * ( F1[3] * F2[4] + F1[4] * F2[3] ) ); + V3[5] = denom * ( -cI ) * ( -F1[2] * F2[4] - F1[5] * F2[3] + F1[3] * F2[5] + F1[4] * F2[2] ); + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] + template + __device__ void + VVVV1_0( const fptype allV1[], + const fptype allV2[], + const fptype allV3[], + const fptype allV4[], + const fptype allCOUP[], + fptype allvertexes[] ) + { + mgDebug( 0, __FUNCTION__ ); + const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); + const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); + const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); + const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); + const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); + const cxtype cI = cxmake( 0., 1. ); + const cxtype_sv TMP10 = ( V1[2] * V4[2] - V1[3] * V4[3] - V1[4] * V4[4] - V1[5] * V4[5] ); + const cxtype_sv TMP11 = ( V2[2] * V4[2] - V2[3] * V4[3] - V2[4] * V4[4] - V2[5] * V4[5] ); + const cxtype_sv TMP3 = ( V3[2] * V1[2] - V3[3] * V1[3] - V3[4] * V1[4] - V3[5] * V1[5] ); + const cxtype_sv TMP6 = ( V3[2] * V2[2] - V3[3] * V2[3] - V3[4] * V2[4] - V3[5] * V2[5] ); + ( *vertex ) = COUP * ( -cI * ( TMP6 * TMP10 ) + cI * ( TMP3 * TMP11 ) ); + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] + template + __device__ void + VVVV9_0( const fptype allV1[], + const fptype allV2[], + const fptype allV3[], + const fptype allV4[], + const fptype allCOUP[], + fptype allvertexes[] ) + { + mgDebug( 0, __FUNCTION__ ); + const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); + const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); + const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); + const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); + const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); + const cxtype cI = cxmake( 0., 1. 
); + const cxtype_sv TMP1 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] ); + const cxtype_sv TMP10 = ( V1[2] * V4[2] - V1[3] * V4[3] - V1[4] * V4[4] - V1[5] * V4[5] ); + const cxtype_sv TMP12 = ( V3[2] * V4[2] - V3[3] * V4[3] - V3[4] * V4[4] - V3[5] * V4[5] ); + const cxtype_sv TMP6 = ( V3[2] * V2[2] - V3[3] * V2[3] - V3[4] * V2[4] - V3[5] * V2[5] ); + ( *vertex ) = COUP * ( -cI * ( TMP6 * TMP10 ) + cI * ( TMP1 * TMP12 ) ); + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] + template + __device__ void + VVVV10_0( const fptype allV1[], + const fptype allV2[], + const fptype allV3[], + const fptype allV4[], + const fptype allCOUP[], + fptype allvertexes[] ) + { + mgDebug( 0, __FUNCTION__ ); + const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); + const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); + const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); + const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); + const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); + const cxtype cI = cxmake( 0., 1. ); + const cxtype_sv TMP1 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] ); + const cxtype_sv TMP11 = ( V2[2] * V4[2] - V2[3] * V4[3] - V2[4] * V4[4] - V2[5] * V4[5] ); + const cxtype_sv TMP12 = ( V3[2] * V4[2] - V3[3] * V4[3] - V3[4] * V4[4] - V3[5] * V4[5] ); + const cxtype_sv TMP3 = ( V3[2] * V1[2] - V3[3] * V1[3] - V3[4] * V1[4] - V3[5] * V1[5] ); + ( *vertex ) = COUP * ( -cI * ( TMP3 * TMP11 ) + cI * ( TMP1 * TMP12 ) ); + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + +} // end namespace + +#endif // HelAmps_SMEFTsim_topU3l_MwScheme_UFO_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc b/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc new file mode 100644 index 0000000000..f08a14b80a --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc @@ -0,0 +1,796 @@ +//========================================================================== +// This file has been automatically generated for CUDA/C++ standalone by +// MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-01-26 +// By the MadGraph5_aMC@NLO Development Team +// Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch +//========================================================================== + +#include "Parameters_SMEFTsim_topU3l_MwScheme_UFO.h" + +#include +#include + +#ifndef MGONGPU_HARDCODE_PARAM + +// Initialize static instance +Parameters_SMEFTsim_topU3l_MwScheme_UFO* Parameters_SMEFTsim_topU3l_MwScheme_UFO::instance = 0; + +// Function to get static instance - only one instance per program +Parameters_SMEFTsim_topU3l_MwScheme_UFO* +Parameters_SMEFTsim_topU3l_MwScheme_UFO::getInstance() +{ + if( instance == 0 ) + instance = new Parameters_SMEFTsim_topU3l_MwScheme_UFO(); + return instance; +} + +void +Parameters_SMEFTsim_topU3l_MwScheme_UFO::setIndependentParameters( SLHAReader& slha ) +{ + zero = 0; // define "zero" + ZERO = 0; // define "zero" + //std::vector indices(2, 0); // prepare a vector for indices + mdl_WH = slha.get_block_entry( "decay", 25, 4.070000e - 03 ); + mdl_WW = slha.get_block_entry( "decay", 24, 2.085000e + 00 ); + mdl_WZ = slha.get_block_entry( "decay", 23, 2.495200e + 00 ); + mdl_WT = slha.get_block_entry( "decay", 6, 1.330000e + 00 ); + mdl_ymtau = slha.get_block_entry( "yukawa", 15, 1.777000e + 00 ); + mdl_ymm = slha.get_block_entry( "yukawa", 13, 1.056600e - 01 ); + mdl_yme = slha.get_block_entry( "yukawa", 11, 5.110000e - 04 ); + mdl_ymt = slha.get_block_entry( "yukawa", 6, 1.727600e + 02 ); + mdl_ymb = slha.get_block_entry( "yukawa", 5, 4.180000e + 00 ); + mdl_ymc = slha.get_block_entry( "yukawa", 4, 1.270000e + 00 ); + mdl_yms = slha.get_block_entry( "yukawa", 3, 9.300000e - 02 ); + mdl_ymup = slha.get_block_entry( "yukawa", 2, 2.160000e - 03 ); + mdl_ymdo = slha.get_block_entry( "yukawa", 1, 4.670000e - 03 ); + mdl_linearPropCorrections = slha.get_block_entry( "switches", 1, 0.000000e + 00 ); + //aS = slha.get_block_entry( "sminputs", 3, 1.179000e - 01 ); // now retrieved event-by-event (as G) from Fortran (running alphas #373) + mdl_Gf = slha.get_block_entry( "sminputs", 2, 1.166379e - 05 ); + mdl_MW = slha.get_block_entry( "sminputs", 1, 8.038700e + 01 ); + mdl_LambdaSMEFT = slha.get_block_entry( "smeftcutoff", 1, 1.000000e + 03 ); + mdl_cleQt3Im = slha.get_block_entry( "smeftcpv", 53, 0.000000e + 00 ); + mdl_cleQt1Im = slha.get_block_entry( "smeftcpv", 52, 0.000000e + 00 ); + mdl_cleju3Im = slha.get_block_entry( "smeftcpv", 51, 0.000000e + 00 ); + mdl_cleju1Im = slha.get_block_entry( "smeftcpv", 50, 0.000000e + 00 ); + mdl_clebQIm = slha.get_block_entry( "smeftcpv", 49, 0.000000e + 00 ); + mdl_cledjIm = slha.get_block_entry( "smeftcpv", 48, 0.000000e + 00 ); + mdl_ceBIm = slha.get_block_entry( "smeftcpv", 47, 0.000000e + 00 ); + mdl_ceWIm = slha.get_block_entry( "smeftcpv", 46, 0.000000e + 00 ); + mdl_ceHIm = slha.get_block_entry( "smeftcpv", 45, 0.000000e + 00 ); + mdl_cQtQb8Im = slha.get_block_entry( "smeftcpv", 44, 0.000000e + 00 ); + mdl_cQtQb1Im = slha.get_block_entry( "smeftcpv", 43, 0.000000e + 00 ); + mdl_cjtQd8Im = slha.get_block_entry( "smeftcpv", 42, 0.000000e + 00 ); + mdl_cjtQd1Im = slha.get_block_entry( "smeftcpv", 41, 0.000000e + 00 ); + mdl_cQujb8Im = slha.get_block_entry( "smeftcpv", 40, 0.000000e + 00 ); + mdl_cQujb1Im = slha.get_block_entry( "smeftcpv", 39, 0.000000e + 00 ); + mdl_cjuQb8Im = slha.get_block_entry( "smeftcpv", 38, 0.000000e + 00 ); + mdl_cjuQb1Im = slha.get_block_entry( "smeftcpv", 37, 0.000000e + 00 ); + mdl_cQtjd8Im = slha.get_block_entry( "smeftcpv", 36, 0.000000e + 00 ); + mdl_cQtjd1Im = 
slha.get_block_entry( "smeftcpv", 35, 0.000000e + 00 ); + mdl_cjujd81Im = slha.get_block_entry( "smeftcpv", 34, 0.000000e + 00 ); + mdl_cjujd11Im = slha.get_block_entry( "smeftcpv", 33, 0.000000e + 00 ); + mdl_cjujd8Im = slha.get_block_entry( "smeftcpv", 32, 0.000000e + 00 ); + mdl_cjujd1Im = slha.get_block_entry( "smeftcpv", 31, 0.000000e + 00 ); + mdl_cjQbd8Im = slha.get_block_entry( "smeftcpv", 30, 0.000000e + 00 ); + mdl_cjQbd1Im = slha.get_block_entry( "smeftcpv", 29, 0.000000e + 00 ); + mdl_cjQtu8Im = slha.get_block_entry( "smeftcpv", 28, 0.000000e + 00 ); + mdl_cjQtu1Im = slha.get_block_entry( "smeftcpv", 27, 0.000000e + 00 ); + mdl_cutbd8Im = slha.get_block_entry( "smeftcpv", 26, 0.000000e + 00 ); + mdl_cutbd1Im = slha.get_block_entry( "smeftcpv", 25, 0.000000e + 00 ); + mdl_cHtbIm = slha.get_block_entry( "smeftcpv", 24, 0.000000e + 00 ); + mdl_cHudIm = slha.get_block_entry( "smeftcpv", 23, 0.000000e + 00 ); + mdl_cbHIm = slha.get_block_entry( "smeftcpv", 22, 0.000000e + 00 ); + mdl_cdHIm = slha.get_block_entry( "smeftcpv", 21, 0.000000e + 00 ); + mdl_ctHIm = slha.get_block_entry( "smeftcpv", 20, 0.000000e + 00 ); + mdl_cuHIm = slha.get_block_entry( "smeftcpv", 19, 0.000000e + 00 ); + mdl_cbBIm = slha.get_block_entry( "smeftcpv", 18, 0.000000e + 00 ); + mdl_cdBIm = slha.get_block_entry( "smeftcpv", 17, 0.000000e + 00 ); + mdl_cbWIm = slha.get_block_entry( "smeftcpv", 16, 0.000000e + 00 ); + mdl_cdWIm = slha.get_block_entry( "smeftcpv", 15, 0.000000e + 00 ); + mdl_cbGIm = slha.get_block_entry( "smeftcpv", 14, 0.000000e + 00 ); + mdl_cdGIm = slha.get_block_entry( "smeftcpv", 13, 0.000000e + 00 ); + mdl_ctBIm = slha.get_block_entry( "smeftcpv", 12, 0.000000e + 00 ); + mdl_cuBIm = slha.get_block_entry( "smeftcpv", 11, 0.000000e + 00 ); + mdl_ctWIm = slha.get_block_entry( "smeftcpv", 10, 0.000000e + 00 ); + mdl_cuWIm = slha.get_block_entry( "smeftcpv", 9, 0.000000e + 00 ); + mdl_ctGIm = slha.get_block_entry( "smeftcpv", 8, 0.000000e + 00 ); + mdl_cuGIm = slha.get_block_entry( "smeftcpv", 7, 0.000000e + 00 ); + mdl_cHWBtil = slha.get_block_entry( "smeftcpv", 6, 0.000000e + 00 ); + mdl_cHBtil = slha.get_block_entry( "smeftcpv", 5, 0.000000e + 00 ); + mdl_cHWtil = slha.get_block_entry( "smeftcpv", 4, 0.000000e + 00 ); + mdl_cHGtil = slha.get_block_entry( "smeftcpv", 3, 0.000000e + 00 ); + mdl_cWtil = slha.get_block_entry( "smeftcpv", 2, 0.000000e + 00 ); + mdl_cGtil = slha.get_block_entry( "smeftcpv", 1, 0.000000e + 00 ); + mdl_cleQt3Re = slha.get_block_entry( "smeft", 129, 0.000000e + 00 ); + mdl_cleju3Re = slha.get_block_entry( "smeft", 128, 0.000000e + 00 ); + mdl_cleQt1Re = slha.get_block_entry( "smeft", 127, 0.000000e + 00 ); + mdl_cleju1Re = slha.get_block_entry( "smeft", 126, 0.000000e + 00 ); + mdl_clebQRe = slha.get_block_entry( "smeft", 125, 0.000000e + 00 ); + mdl_cledjRe = slha.get_block_entry( "smeft", 124, 0.000000e + 00 ); + mdl_cle = slha.get_block_entry( "smeft", 123, 0.000000e + 00 ); + mdl_cbl = slha.get_block_entry( "smeft", 122, 0.000000e + 00 ); + mdl_cld = slha.get_block_entry( "smeft", 121, 0.000000e + 00 ); + mdl_ctl = slha.get_block_entry( "smeft", 120, 0.000000e + 00 ); + mdl_clu = slha.get_block_entry( "smeft", 119, 0.000000e + 00 ); + mdl_cQe = slha.get_block_entry( "smeft", 118, 0.000000e + 00 ); + mdl_cje = slha.get_block_entry( "smeft", 117, 0.000000e + 00 ); + mdl_cbe = slha.get_block_entry( "smeft", 116, 0.000000e + 00 ); + mdl_ced = slha.get_block_entry( "smeft", 115, 0.000000e + 00 ); + mdl_cte = slha.get_block_entry( "smeft", 114, 0.000000e + 00 ); + 
mdl_ceu = slha.get_block_entry( "smeft", 113, 0.000000e + 00 ); + mdl_cee = slha.get_block_entry( "smeft", 112, 0.000000e + 00 ); + mdl_cQl3 = slha.get_block_entry( "smeft", 111, 0.000000e + 00 ); + mdl_cQl1 = slha.get_block_entry( "smeft", 110, 0.000000e + 00 ); + mdl_clj3 = slha.get_block_entry( "smeft", 109, 0.000000e + 00 ); + mdl_clj1 = slha.get_block_entry( "smeft", 108, 0.000000e + 00 ); + mdl_cll1 = slha.get_block_entry( "smeft", 107, 0.000000e + 00 ); + mdl_cll = slha.get_block_entry( "smeft", 106, 0.000000e + 00 ); + mdl_cHe = slha.get_block_entry( "smeft", 105, 0.000000e + 00 ); + mdl_cHl3 = slha.get_block_entry( "smeft", 104, 0.000000e + 00 ); + mdl_cHl1 = slha.get_block_entry( "smeft", 103, 0.000000e + 00 ); + mdl_ceBRe = slha.get_block_entry( "smeft", 102, 0.000000e + 00 ); + mdl_ceWRe = slha.get_block_entry( "smeft", 101, 0.000000e + 00 ); + mdl_ceHRe = slha.get_block_entry( "smeft", 100, 0.000000e + 00 ); + mdl_cQtQb8Re = slha.get_block_entry( "smeft", 99, 0.000000e + 00 ); + mdl_cQtQb1Re = slha.get_block_entry( "smeft", 98, 0.000000e + 00 ); + mdl_cjtQd8Re = slha.get_block_entry( "smeft", 97, 0.000000e + 00 ); + mdl_cjtQd1Re = slha.get_block_entry( "smeft", 96, 0.000000e + 00 ); + mdl_cQujb8Re = slha.get_block_entry( "smeft", 95, 0.000000e + 00 ); + mdl_cQujb1Re = slha.get_block_entry( "smeft", 94, 0.000000e + 00 ); + mdl_cjuQb8Re = slha.get_block_entry( "smeft", 93, 0.000000e + 00 ); + mdl_cjuQb1Re = slha.get_block_entry( "smeft", 92, 0.000000e + 00 ); + mdl_cQtjd8Re = slha.get_block_entry( "smeft", 91, 0.000000e + 00 ); + mdl_cQtjd1Re = slha.get_block_entry( "smeft", 90, 0.000000e + 00 ); + mdl_cjujd81Re = slha.get_block_entry( "smeft", 89, 0.000000e + 00 ); + mdl_cjujd11Re = slha.get_block_entry( "smeft", 88, 0.000000e + 00 ); + mdl_cjujd8Re = slha.get_block_entry( "smeft", 87, 0.000000e + 00 ); + mdl_cjujd1Re = slha.get_block_entry( "smeft", 86, 0.000000e + 00 ); + mdl_cjQbd8Re = slha.get_block_entry( "smeft", 85, 0.000000e + 00 ); + mdl_cjQbd1Re = slha.get_block_entry( "smeft", 84, 0.000000e + 00 ); + mdl_cjQtu8Re = slha.get_block_entry( "smeft", 83, 0.000000e + 00 ); + mdl_cjQtu1Re = slha.get_block_entry( "smeft", 82, 0.000000e + 00 ); + mdl_cQb8 = slha.get_block_entry( "smeft", 81, 0.000000e + 00 ); + mdl_cQb1 = slha.get_block_entry( "smeft", 80, 0.000000e + 00 ); + mdl_cbj8 = slha.get_block_entry( "smeft", 79, 0.000000e + 00 ); + mdl_cbj1 = slha.get_block_entry( "smeft", 78, 0.000000e + 00 ); + mdl_cQd8 = slha.get_block_entry( "smeft", 77, 0.000000e + 00 ); + mdl_cQd1 = slha.get_block_entry( "smeft", 76, 0.000000e + 00 ); + mdl_cjd8 = slha.get_block_entry( "smeft", 75, 0.000000e + 00 ); + mdl_cjd1 = slha.get_block_entry( "smeft", 74, 0.000000e + 00 ); + mdl_cQt8 = slha.get_block_entry( "smeft", 73, 0.000000e + 00 ); + mdl_cQt1 = slha.get_block_entry( "smeft", 72, 0.000000e + 00 ); + mdl_ctj8 = slha.get_block_entry( "smeft", 71, 0.000000e + 00 ); + mdl_ctj1 = slha.get_block_entry( "smeft", 70, 0.000000e + 00 ); + mdl_cQu8 = slha.get_block_entry( "smeft", 69, 0.000000e + 00 ); + mdl_cju8 = slha.get_block_entry( "smeft", 68, 0.000000e + 00 ); + mdl_cQu1 = slha.get_block_entry( "smeft", 67, 0.000000e + 00 ); + mdl_cju1 = slha.get_block_entry( "smeft", 66, 0.000000e + 00 ); + mdl_cutbd8Re = slha.get_block_entry( "smeft", 65, 0.000000e + 00 ); + mdl_cutbd1Re = slha.get_block_entry( "smeft", 64, 0.000000e + 00 ); + mdl_cbu8 = slha.get_block_entry( "smeft", 63, 0.000000e + 00 ); + mdl_ctd8 = slha.get_block_entry( "smeft", 62, 0.000000e + 00 ); + mdl_ctb8 = 
slha.get_block_entry( "smeft", 61, 0.000000e + 00 ); + mdl_cud8 = slha.get_block_entry( "smeft", 60, 0.000000e + 00 ); + mdl_cbu1 = slha.get_block_entry( "smeft", 59, 0.000000e + 00 ); + mdl_ctd1 = slha.get_block_entry( "smeft", 58, 0.000000e + 00 ); + mdl_ctb1 = slha.get_block_entry( "smeft", 57, 0.000000e + 00 ); + mdl_cud1 = slha.get_block_entry( "smeft", 56, 0.000000e + 00 ); + mdl_cbd8 = slha.get_block_entry( "smeft", 55, 0.000000e + 00 ); + mdl_cbd1 = slha.get_block_entry( "smeft", 54, 0.000000e + 00 ); + mdl_cbb = slha.get_block_entry( "smeft", 53, 0.000000e + 00 ); + mdl_cdd8 = slha.get_block_entry( "smeft", 52, 0.000000e + 00 ); + mdl_cdd1 = slha.get_block_entry( "smeft", 51, 0.000000e + 00 ); + mdl_ctu8 = slha.get_block_entry( "smeft", 50, 0.000000e + 00 ); + mdl_ctu1 = slha.get_block_entry( "smeft", 49, 0.000000e + 00 ); + mdl_ctt = slha.get_block_entry( "smeft", 48, 0.000000e + 00 ); + mdl_cuu8 = slha.get_block_entry( "smeft", 47, 0.000000e + 00 ); + mdl_cuu1 = slha.get_block_entry( "smeft", 46, 0.000000e + 00 ); + mdl_cQQ8 = slha.get_block_entry( "smeft", 45, 0.000000e + 00 ); + mdl_cQQ1 = slha.get_block_entry( "smeft", 44, 0.000000e + 00 ); + mdl_cQj38 = slha.get_block_entry( "smeft", 43, 0.000000e + 00 ); + mdl_cQj31 = slha.get_block_entry( "smeft", 42, 0.000000e + 00 ); + mdl_cQj18 = slha.get_block_entry( "smeft", 41, 0.000000e + 00 ); + mdl_cQj11 = slha.get_block_entry( "smeft", 40, 0.000000e + 00 ); + mdl_cjj38 = slha.get_block_entry( "smeft", 39, 0.000000e + 00 ); + mdl_cjj31 = slha.get_block_entry( "smeft", 38, 0.000000e + 00 ); + mdl_cjj18 = slha.get_block_entry( "smeft", 37, 0.000000e + 00 ); + mdl_cjj11 = slha.get_block_entry( "smeft", 36, 0.000000e + 00 ); + mdl_cHtbRe = slha.get_block_entry( "smeft", 35, 0.000000e + 00 ); + mdl_cHudRe = slha.get_block_entry( "smeft", 34, 0.000000e + 00 ); + mdl_cHbq = slha.get_block_entry( "smeft", 33, 0.000000e + 00 ); + mdl_cHd = slha.get_block_entry( "smeft", 32, 0.000000e + 00 ); + mdl_cHt = slha.get_block_entry( "smeft", 31, 0.000000e + 00 ); + mdl_cHu = slha.get_block_entry( "smeft", 30, 0.000000e + 00 ); + mdl_cHQ3 = slha.get_block_entry( "smeft", 29, 0.000000e + 00 ); + mdl_cHj3 = slha.get_block_entry( "smeft", 28, 0.000000e + 00 ); + mdl_cHQ1 = slha.get_block_entry( "smeft", 27, 0.000000e + 00 ); + mdl_cHj1 = slha.get_block_entry( "smeft", 26, 0.000000e + 00 ); + mdl_cbBRe = slha.get_block_entry( "smeft", 25, 0.000000e + 00 ); + mdl_cdBRe = slha.get_block_entry( "smeft", 24, 0.000000e + 00 ); + mdl_cbWRe = slha.get_block_entry( "smeft", 23, 0.000000e + 00 ); + mdl_cdWRe = slha.get_block_entry( "smeft", 22, 0.000000e + 00 ); + mdl_cbGRe = slha.get_block_entry( "smeft", 21, 0.000000e + 00 ); + mdl_cdGRe = slha.get_block_entry( "smeft", 20, 0.000000e + 00 ); + mdl_ctBRe = slha.get_block_entry( "smeft", 19, 0.000000e + 00 ); + mdl_cuBRe = slha.get_block_entry( "smeft", 18, 0.000000e + 00 ); + mdl_ctWRe = slha.get_block_entry( "smeft", 17, 0.000000e + 00 ); + mdl_cuWRe = slha.get_block_entry( "smeft", 16, 0.000000e + 00 ); + mdl_ctGRe = slha.get_block_entry( "smeft", 15, 0.000000e + 00 ); + mdl_cuGRe = slha.get_block_entry( "smeft", 14, 0.000000e + 00 ); + mdl_cbHRe = slha.get_block_entry( "smeft", 13, 0.000000e + 00 ); + mdl_cdHRe = slha.get_block_entry( "smeft", 12, 0.000000e + 00 ); + mdl_ctHRe = slha.get_block_entry( "smeft", 11, 0.000000e + 00 ); + mdl_cuHRe = slha.get_block_entry( "smeft", 10, 0.000000e + 00 ); + mdl_cHWB = slha.get_block_entry( "smeft", 9, 0.000000e + 00 ); + mdl_cHB = slha.get_block_entry( "smeft", 8, 
0.000000e + 00 ); + mdl_cHW = slha.get_block_entry( "smeft", 7, 0.000000e + 00 ); + mdl_cHG = slha.get_block_entry( "smeft", 6, 0.000000e + 00 ); + mdl_cHDD = slha.get_block_entry( "smeft", 5, 0.000000e + 00 ); + mdl_cHbox = slha.get_block_entry( "smeft", 4, 0.000000e + 00 ); + mdl_cH = slha.get_block_entry( "smeft", 3, 0.000000e + 00 ); + mdl_cW = slha.get_block_entry( "smeft", 2, 0.000000e + 00 ); + mdl_cG = slha.get_block_entry( "smeft", 1, 0.000000e + 00 ); + mdl_MH = slha.get_block_entry( "mass", 25, 1.250900e + 02 ); + mdl_MZ = slha.get_block_entry( "mass", 23, 9.118760e + 01 ); + mdl_MTA = slha.get_block_entry( "mass", 15, 1.777000e + 00 ); + mdl_MMU = slha.get_block_entry( "mass", 13, 1.056600e - 01 ); + mdl_Me = slha.get_block_entry( "mass", 11, 5.110000e - 04 ); + mdl_MT = slha.get_block_entry( "mass", 6, 1.727600e + 02 ); + mdl_MB = slha.get_block_entry( "mass", 5, 4.180000e + 00 ); + mdl_MC = slha.get_block_entry( "mass", 4, 1.270000e + 00 ); + mdl_MS = slha.get_block_entry( "mass", 3, 9.300000e - 02 ); + mdl_MU = slha.get_block_entry( "mass", 2, 2.160000e - 03 ); + mdl_MD = slha.get_block_entry( "mass", 1, 4.670000e - 03 ); + mdl_complexi = cxsmpl( 0., 1. ); + mdl_cuH = mdl_cuHRe + mdl_cuHIm * mdl_complexi; + mdl_ctHH = mdl_ctHRe + mdl_ctHIm * mdl_complexi; + mdl_cdH = mdl_cdHRe + mdl_cdHIm * mdl_complexi; + mdl_cbH = mdl_cbHRe + mdl_cbHIm * mdl_complexi; + mdl_cuG = mdl_cuGRe + mdl_cuGIm * mdl_complexi; + mdl_ctG = mdl_ctGRe + mdl_ctGIm * mdl_complexi; + mdl_cuW = mdl_cuWRe + mdl_cuWIm * mdl_complexi; + mdl_ctW = mdl_ctWRe + mdl_ctWIm * mdl_complexi; + mdl_cuB = mdl_cuBRe + mdl_cuBIm * mdl_complexi; + mdl_ctB = mdl_ctBRe + mdl_ctBIm * mdl_complexi; + mdl_cdG = mdl_cdGRe + mdl_cdGIm * mdl_complexi; + mdl_cbG = mdl_cbGRe + mdl_cbGIm * mdl_complexi; + mdl_cdW = mdl_cdWRe + mdl_cdWIm * mdl_complexi; + mdl_cbW = mdl_cbWRe + mdl_cbWIm * mdl_complexi; + mdl_cdB = mdl_cdBRe + mdl_cdBIm * mdl_complexi; + mdl_cbBB = mdl_cbBRe + mdl_cbBIm * mdl_complexi; + mdl_cHud = mdl_cHudRe + mdl_cHudIm * mdl_complexi; + mdl_cHtb = mdl_cHtbRe + mdl_cHtbIm * mdl_complexi; + mdl_cutbd1 = mdl_cutbd1Re + mdl_cutbd1Im * mdl_complexi; + mdl_cutbd8 = mdl_cutbd8Re + mdl_cutbd8Im * mdl_complexi; + mdl_cjQtu1 = mdl_cjQtu1Re + mdl_cjQtu1Im * mdl_complexi; + mdl_cjQtu8 = mdl_cjQtu8Re + mdl_cjQtu8Im * mdl_complexi; + mdl_cjQbd1 = mdl_cjQbd1Re + mdl_cjQbd1Im * mdl_complexi; + mdl_cjQbd8 = mdl_cjQbd8Re + mdl_cjQbd8Im * mdl_complexi; + mdl_cjujd1 = mdl_cjujd1Re + mdl_cjujd1Im * mdl_complexi; + mdl_cjujd8 = mdl_cjujd8Re + mdl_cjujd8Im * mdl_complexi; + mdl_cjujd11 = mdl_cjujd11Re + mdl_cjujd11Im * mdl_complexi; + mdl_cjujd81 = mdl_cjujd81Re + mdl_cjujd81Im * mdl_complexi; + mdl_cQtjd1 = mdl_cQtjd1Re + mdl_cQtjd1Im * mdl_complexi; + mdl_cQtjd8 = mdl_cQtjd8Re + mdl_cQtjd8Im * mdl_complexi; + mdl_cjuQb1 = mdl_cjuQb1Re + mdl_cjuQb1Im * mdl_complexi; + mdl_cjuQb8 = mdl_cjuQb8Re + mdl_cjuQb8Im * mdl_complexi; + mdl_cQujb1 = mdl_cQujb1Re + mdl_cQujb1Im * mdl_complexi; + mdl_cQujb8 = mdl_cQujb8Re + mdl_cQujb8Im * mdl_complexi; + mdl_cjtQd1 = mdl_cjtQd1Re + mdl_cjtQd1Im * mdl_complexi; + mdl_cjtQd8 = mdl_cjtQd8Re + mdl_cjtQd8Im * mdl_complexi; + mdl_cQtQb1 = mdl_cQtQb1Re + mdl_cQtQb1Im * mdl_complexi; + mdl_cQtQb8 = mdl_cQtQb8Re + mdl_cQtQb8Im * mdl_complexi; + mdl_ceH = mdl_ceHRe + mdl_ceHIm * mdl_complexi; + mdl_ceW = mdl_ceWRe + mdl_ceWIm * mdl_complexi; + mdl_ceB = mdl_ceBRe + mdl_ceBIm * mdl_complexi; + mdl_cledj = mdl_cledjRe + mdl_cledjIm * mdl_complexi; + mdl_clebQ = mdl_clebQRe + mdl_clebQIm * mdl_complexi; + 
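// The remaining complex Wilson coefficients are assembled in the same way, c = cRe + cIm * mdl_complexi,
// i.e. c = Re(c) + i Im(c). The derived quantities computed just below implement the MW input scheme
// (Gf, MW, MZ as electroweak inputs), as can be read off the assignments that follow:
//   sth2   = 1 - MW^2 / MZ^2                           (sine squared of the weak mixing angle)
//   aEW    = sqrt(2) * Gf * MW^2 * ( 1 - MW^2/MZ^2 ) / pi
//   vevhat = 1 / ( 2^(1/4) * sqrt(Gf) )                (Higgs vev from Gf)
//   ee     = 2 * sqrt( pi * aEW )                      (electromagnetic coupling)
//   y_f    = sqrt(2) * ym_f / vevhat                   (Yukawa couplings from the input masses)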
mdl_cleju1 = mdl_cleju1Re + mdl_cleju1Im * mdl_complexi; + mdl_cleju3 = mdl_cleju3Re + mdl_cleju3Im * mdl_complexi; + mdl_cleQt1 = mdl_cleQt1Re + mdl_cleQt1Im * mdl_complexi; + mdl_cleQt3 = mdl_cleQt3Re + mdl_cleQt3Im * mdl_complexi; + mdl_MWsm = mdl_MW; + mdl_MW__exp__2 = ( ( mdl_MW ) * ( mdl_MW ) ); + mdl_MZ__exp__2 = ( ( mdl_MZ ) * ( mdl_MZ ) ); + mdl_sqrt__2 = sqrt( 2. ); + mdl_nb__2__exp__0_25 = pow( 2., 0.25 ); + mdl_MH__exp__2 = ( ( mdl_MH ) * ( mdl_MH ) ); + mdl_sth2 = 1. - mdl_MW__exp__2 / mdl_MZ__exp__2; + mdl_nb__10__exp___m_40 = pow( 10., -40. ); + mdl_propCorr = ABS( mdl_linearPropCorrections ) / ( ABS( mdl_linearPropCorrections ) + mdl_nb__10__exp___m_40 ); + mdl_MZ1 = mdl_MZ; + mdl_MH1 = mdl_MH; + mdl_MT1 = mdl_MT; + mdl_WZ1 = mdl_WZ; + mdl_WW1 = mdl_WW; + mdl_WH1 = mdl_WH; + mdl_WT1 = mdl_WT; + mdl_cth = sqrt( 1. - mdl_sth2 ); + mdl_MW1 = mdl_MWsm; + mdl_sqrt__sth2 = sqrt( mdl_sth2 ); + mdl_sth = mdl_sqrt__sth2; + mdl_LambdaSMEFT__exp__2 = ( ( mdl_LambdaSMEFT ) * ( mdl_LambdaSMEFT ) ); + mdl_conjg__cbH = conj( mdl_cbH ); + mdl_conjg__ctHH = conj( mdl_ctHH ); + mdl_MT__exp__2 = ( ( mdl_MT ) * ( mdl_MT ) ); + mdl_MH__exp__6 = pow( mdl_MH, 6. ); + mdl_MWsm__exp__6 = pow( mdl_MWsm, 6. ); + mdl_MH__exp__4 = ( ( mdl_MH ) * ( mdl_MH ) * ( mdl_MH ) * ( mdl_MH ) ); + mdl_MWsm__exp__4 = ( ( mdl_MWsm ) * ( mdl_MWsm ) * ( mdl_MWsm ) * ( mdl_MWsm ) ); + mdl_MWsm__exp__2 = ( ( mdl_MWsm ) * ( mdl_MWsm ) ); + mdl_MZ__exp__4 = ( ( mdl_MZ ) * ( mdl_MZ ) * ( mdl_MZ ) * ( mdl_MZ ) ); + mdl_MZ__exp__6 = pow( mdl_MZ, 6. ); + mdl_cth__exp__2 = ( ( mdl_cth ) * ( mdl_cth ) ); + mdl_sth__exp__2 = ( ( mdl_sth ) * ( mdl_sth ) ); + mdl_MB__exp__2 = ( ( mdl_MB ) * ( mdl_MB ) ); + mdl_MZ__exp__3 = ( ( mdl_MZ ) * ( mdl_MZ ) * ( mdl_MZ ) ); + mdl_sth__exp__4 = ( ( mdl_sth ) * ( mdl_sth ) * ( mdl_sth ) * ( mdl_sth ) ); + mdl_sth__exp__6 = pow( mdl_sth, 6. ); + mdl_sth__exp__3 = ( ( mdl_sth ) * ( mdl_sth ) * ( mdl_sth ) ); + mdl_sth__exp__5 = pow( mdl_sth, 5. ); + mdl_propCorr__exp__2 = ( ( mdl_propCorr ) * ( mdl_propCorr ) ); + mdl_propCorr__exp__3 = ( ( mdl_propCorr ) * ( mdl_propCorr ) * ( mdl_propCorr ) ); + mdl_propCorr__exp__4 = ( ( mdl_propCorr ) * ( mdl_propCorr ) * ( mdl_propCorr ) * ( mdl_propCorr ) ); + mdl_cth__exp__3 = ( ( mdl_cth ) * ( mdl_cth ) * ( mdl_cth ) ); + mdl_aEW = ( mdl_Gf * mdl_MW__exp__2 * ( 1. - mdl_MW__exp__2 / mdl_MZ__exp__2 ) * mdl_sqrt__2 ) / M_PI; + mdl_sqrt__Gf = sqrt( mdl_Gf ); + mdl_vevhat = 1. / ( mdl_nb__2__exp__0_25 * mdl_sqrt__Gf ); + mdl_lam = ( mdl_Gf * mdl_MH__exp__2 ) / mdl_sqrt__2; + mdl_sqrt__aEW = sqrt( mdl_aEW ); + mdl_ee = 2. * mdl_sqrt__aEW * sqrt( M_PI ); + mdl_yb = ( mdl_ymb * mdl_sqrt__2 ) / mdl_vevhat; + mdl_yc = ( mdl_ymc * mdl_sqrt__2 ) / mdl_vevhat; + mdl_ydo = ( mdl_ymdo * mdl_sqrt__2 ) / mdl_vevhat; + mdl_ye = ( mdl_yme * mdl_sqrt__2 ) / mdl_vevhat; + mdl_ym = ( mdl_ymm * mdl_sqrt__2 ) / mdl_vevhat; + mdl_ys = ( mdl_yms * mdl_sqrt__2 ) / mdl_vevhat; + mdl_yt = ( mdl_ymt * mdl_sqrt__2 ) / mdl_vevhat; + mdl_ytau = ( mdl_ymtau * mdl_sqrt__2 ) / mdl_vevhat; + mdl_yup = ( mdl_ymup * mdl_sqrt__2 ) / mdl_vevhat; + mdl_vevhat__exp__2 = ( ( mdl_vevhat ) * ( mdl_vevhat ) ); + mdl_dGf = ( ( 2. * mdl_cHl3 - mdl_cll1 ) * mdl_vevhat__exp__2 ) / mdl_LambdaSMEFT__exp__2; + mdl_dkH = ( ( mdl_cHbox - mdl_cHDD / 4. ) * mdl_vevhat__exp__2 ) / mdl_LambdaSMEFT__exp__2; + mdl_vevT = ( 1. + mdl_dGf / 2. ) * mdl_vevhat; + mdl_g1 = mdl_ee / mdl_cth; + mdl_gw = mdl_ee / mdl_sth; + mdl_yb0 = ( 1. - mdl_dGf / 2. ) * mdl_yb + ( mdl_vevhat__exp__2 * mdl_conjg__cbH ) / ( 2. 
* mdl_LambdaSMEFT__exp__2 ); + mdl_yt0 = ( 1. - mdl_dGf / 2. ) * mdl_yt + ( mdl_vevhat__exp__2 * mdl_conjg__ctHH ) / ( 2. * mdl_LambdaSMEFT__exp__2 ); + mdl_ee__exp__2 = ( ( mdl_ee ) * ( mdl_ee ) ); + mdl_gHaa = ( mdl_ee__exp__2 * ( -1.75 + ( 4. * ( 0.3333333333333333 + ( 7. * mdl_MH__exp__2 ) / ( 360. * mdl_MT__exp__2 ) ) ) / 3. - ( 29. * mdl_MH__exp__6 ) / ( 16800. * mdl_MWsm__exp__6 ) - ( 19. * mdl_MH__exp__4 ) / ( 1680. * mdl_MWsm__exp__4 ) - ( 11. * mdl_MH__exp__2 ) / ( 120. * mdl_MWsm__exp__2 ) ) ) / ( 8. * ( ( M_PI ) * ( M_PI ) ) ); + mdl_gHza = ( mdl_ee__exp__2 * ( ( ( 0.4583333333333333 + ( 29. * mdl_MH__exp__6 ) / ( 100800. * mdl_MWsm__exp__6 ) + ( 19. * mdl_MH__exp__4 ) / ( 10080. * mdl_MWsm__exp__4 ) + ( 11. * mdl_MH__exp__2 ) / ( 720. * mdl_MWsm__exp__2 ) + ( mdl_MH__exp__4 * mdl_MZ__exp__2 ) / ( 2100. * mdl_MWsm__exp__6 ) + ( mdl_MH__exp__2 * mdl_MZ__exp__2 ) / ( 280. * mdl_MWsm__exp__4 ) + ( 7. * mdl_MZ__exp__2 ) / ( 180. * mdl_MWsm__exp__2 ) + ( 67. * mdl_MH__exp__2 * mdl_MZ__exp__4 ) / ( 100800. * mdl_MWsm__exp__6 ) + ( 53. * mdl_MZ__exp__4 ) / ( 10080. * mdl_MWsm__exp__4 ) + ( 43. * mdl_MZ__exp__6 ) / ( 50400. * mdl_MWsm__exp__6 ) - ( 31. * mdl_cth__exp__2 ) / ( 24. * mdl_sth__exp__2 ) - ( 29. * mdl_cth__exp__2 * mdl_MH__exp__6 ) / ( 20160. * mdl_MWsm__exp__6 * mdl_sth__exp__2 ) - ( 19. * mdl_cth__exp__2 * mdl_MH__exp__4 ) / ( 2016. * mdl_MWsm__exp__4 * mdl_sth__exp__2 ) - ( 11. * mdl_cth__exp__2 * mdl_MH__exp__2 ) / ( 144. * mdl_MWsm__exp__2 * mdl_sth__exp__2 ) - ( mdl_cth__exp__2 * mdl_MH__exp__4 * mdl_MZ__exp__2 ) / ( 560. * mdl_MWsm__exp__6 * mdl_sth__exp__2 ) - ( 31. * mdl_cth__exp__2 * mdl_MH__exp__2 * mdl_MZ__exp__2 ) / ( 2520. * mdl_MWsm__exp__4 * mdl_sth__exp__2 ) - ( mdl_cth__exp__2 * mdl_MZ__exp__2 ) / ( 9. * mdl_MWsm__exp__2 * mdl_sth__exp__2 ) - ( 43. * mdl_cth__exp__2 * mdl_MH__exp__2 * mdl_MZ__exp__4 ) / ( 20160. * mdl_MWsm__exp__6 * mdl_sth__exp__2 ) - ( 17. * mdl_cth__exp__2 * mdl_MZ__exp__4 ) / ( 1120. * mdl_MWsm__exp__4 * mdl_sth__exp__2 ) - ( 5. * mdl_cth__exp__2 * mdl_MZ__exp__6 ) / ( 2016. * mdl_MWsm__exp__6 * mdl_sth__exp__2 ) ) * mdl_sth ) / mdl_cth + ( ( 0.3333333333333333 + ( 7. * mdl_MH__exp__2 ) / ( 360. * mdl_MT__exp__2 ) + ( 11. * mdl_MZ__exp__2 ) / ( 360. * mdl_MT__exp__2 ) ) * ( 0.5 - ( 4. * mdl_sth__exp__2 ) / 3. ) ) / ( mdl_cth * mdl_sth ) ) ) / ( 4. * ( ( M_PI ) * ( M_PI ) ) ); + mdl_dMZ2 = ( ( mdl_cHDD / 2. + 2. * mdl_cHWB * mdl_cth * mdl_sth ) * mdl_vevhat__exp__2 ) / mdl_LambdaSMEFT__exp__2; + mdl_dMH2 = 2. * mdl_dkH - ( 3. * mdl_cH * mdl_vevhat__exp__2 ) / ( 2. * mdl_lam * mdl_LambdaSMEFT__exp__2 ); + mdl_dgw = -mdl_dGf / 2.; + mdl_barlam = ( 1. - mdl_dGf - mdl_dMH2 ) * mdl_lam; + mdl_dWT = 2. * mdl_WT * ( mdl_dgw + ( mdl_vevhat * ( mdl_ee * ( 3. * mdl_cHtbRe * mdl_MB * mdl_MT * mdl_MWsm__exp__2 + mdl_cHQ3 * ( ( ( mdl_MB__exp__2 - mdl_MT__exp__2 ) * ( mdl_MB__exp__2 - mdl_MT__exp__2 ) ) + ( mdl_MB__exp__2 + mdl_MT__exp__2 ) * mdl_MWsm__exp__2 - 2. * mdl_MWsm__exp__4 ) ) * mdl_vevhat + 6. * mdl_MWsm__exp__2 * ( mdl_ctWRe * mdl_MT * ( mdl_MB__exp__2 - mdl_MT__exp__2 + mdl_MWsm__exp__2 ) + mdl_cbWRe * mdl_MB * ( -mdl_MB__exp__2 + mdl_MT__exp__2 + mdl_MWsm__exp__2 ) ) * mdl_sth * mdl_sqrt__2 ) ) / ( mdl_ee * mdl_LambdaSMEFT__exp__2 * ( ( ( mdl_MB__exp__2 - mdl_MT__exp__2 ) * ( mdl_MB__exp__2 - mdl_MT__exp__2 ) ) + ( mdl_MB__exp__2 + mdl_MT__exp__2 ) * mdl_MWsm__exp__2 - 2. * mdl_MWsm__exp__4 ) ) ); + mdl_dWW = ( 2. * mdl_dgw + ( 2. * ( 2. * mdl_cHj3 + mdl_cHl3 ) * mdl_vevhat__exp__2 ) / ( 3. 
* mdl_LambdaSMEFT__exp__2 ) ) * mdl_WW; + mdl_gwsh = ( mdl_ee * ( 1. + mdl_dgw - ( mdl_cHW * mdl_vevhat__exp__2 ) / mdl_LambdaSMEFT__exp__2 ) ) / mdl_sth; + mdl_vev = ( 1. - ( 3. * mdl_cH * mdl_vevhat__exp__2 ) / ( 8. * mdl_lam * mdl_LambdaSMEFT__exp__2 ) ) * mdl_vevT; + mdl_dg1 = ( -mdl_dGf - mdl_dMZ2 / mdl_sth__exp__2 ) / 2.; + mdl_dWHc = mdl_yc / ( mdl_yc + mdl_nb__10__exp___m_40 ) * ( -0.02884 * mdl_dGf + ( ( 0.05768 * mdl_cHbox - 0.01442 * mdl_cHDD - 0.05768 * mdl_cuHRe ) * mdl_vevhat__exp__2 ) / mdl_LambdaSMEFT__exp__2 ); + mdl_dWHb = mdl_yb / ( mdl_yb + mdl_nb__10__exp___m_40 ) * ( mdl_vevhat__exp__2 * ( -1.1618 * mdl_cbHRe ) / ( mdl_LambdaSMEFT__exp__2 * ( mdl_yb + mdl_nb__10__exp___m_40 ) ) - 0.5809 * mdl_dGf + ( mdl_vevhat__exp__2 * ( 1.1618 * mdl_cHbox - 0.29045 * mdl_cHDD ) ) / ( mdl_LambdaSMEFT__exp__2 ) ); + mdl_dWHta = mdl_ytau / ( mdl_ytau + mdl_nb__10__exp___m_40 ) * ( -0.06256 * mdl_dGf + mdl_vevhat__exp__2 * ( -0.12512 * mdl_ceHRe + 0.12512 * mdl_cHbox - 0.03128 * mdl_cHDD ) / ( mdl_LambdaSMEFT__exp__2 ) ); + mdl_dWZ = mdl_WZ * ( -1. + ( 36. * mdl_cth * mdl_MB * mdl_MZ__exp__2 * mdl_sth * ( mdl_cbWRe * mdl_cth + mdl_cbBRe * mdl_sth ) * ( -3. + 4. * mdl_sth__exp__2 ) * mdl_vevhat * mdl_sqrt__2 * sqrt( -4. * mdl_MB__exp__2 + mdl_MZ__exp__2 ) + mdl_ee * mdl_LambdaSMEFT__exp__2 * ( 2. * mdl_MZ__exp__3 * ( 27. + 54. * mdl_dgw - 54. * ( 1. + mdl_dg1 + mdl_dgw ) * mdl_sth__exp__2 + 76. * ( 1. + 4. * mdl_dg1 - 2. * mdl_dgw ) * mdl_sth__exp__4 + 152. * ( -mdl_dg1 + mdl_dgw ) * mdl_sth__exp__6 ) + mdl_MZ__exp__2 * ( 9. + 18. * mdl_dgw - 6. * ( 2. + mdl_dg1 + 3. * mdl_dgw ) * mdl_sth__exp__2 + 8. * ( 1. + 4. * mdl_dg1 - 2. * mdl_dgw ) * mdl_sth__exp__4 + 16. * ( -mdl_dg1 + mdl_dgw ) * mdl_sth__exp__6 ) * sqrt( -4. * mdl_MB__exp__2 + mdl_MZ__exp__2 ) + mdl_MB__exp__2 * ( -9. - 18. * mdl_dgw - 6. * ( 4. + 11. * mdl_dg1 - 3. * mdl_dgw ) * mdl_sth__exp__2 + 16. * ( 1. + 4. * mdl_dg1 - 2. * mdl_dgw ) * mdl_sth__exp__4 + 32. * ( -mdl_dg1 + mdl_dgw ) * mdl_sth__exp__6 ) * sqrt( -4. * mdl_MB__exp__2 + mdl_MZ__exp__2 ) ) + 2. * mdl_ee * mdl_vevhat__exp__2 * ( 36. * mdl_cHj3 * mdl_MZ__exp__3 + 18. * mdl_cHl3 * mdl_MZ__exp__3 + 9. * ( 3. * mdl_cHbq - mdl_cHQ1 - mdl_cHQ3 ) * mdl_MB__exp__2 * sqrt( -4. * mdl_MB__exp__2 + mdl_MZ__exp__2 ) + 9. * mdl_cHQ1 * mdl_MZ__exp__2 * sqrt( -4. * mdl_MB__exp__2 + mdl_MZ__exp__2 ) + 9. * mdl_cHQ3 * mdl_MZ__exp__2 * sqrt( -4. * mdl_MB__exp__2 + mdl_MZ__exp__2 ) + 3. * mdl_cHWB * mdl_cth * ( -7. * mdl_MB__exp__2 + mdl_MZ__exp__2 ) * mdl_sth * sqrt( -4. * mdl_MB__exp__2 + mdl_MZ__exp__2 ) + 8. * mdl_cHWB * mdl_cth * mdl_sth__exp__3 * ( 2. * mdl_MB__exp__2 * sqrt( -4. * mdl_MB__exp__2 + mdl_MZ__exp__2 ) + mdl_MZ__exp__2 * ( 19. * mdl_MZ + sqrt( -4. * mdl_MB__exp__2 + mdl_MZ__exp__2 ) ) ) - 8. * mdl_cHWB * mdl_cth * mdl_sth__exp__5 * ( 2. * mdl_MB__exp__2 * sqrt( -4. * mdl_MB__exp__2 + mdl_MZ__exp__2 ) + mdl_MZ__exp__2 * ( 19. * mdl_MZ + sqrt( -4. * mdl_MB__exp__2 + mdl_MZ__exp__2 ) ) ) - 6. * mdl_sth__exp__2 * ( 2. * ( mdl_cHbq + mdl_cHQ1 + mdl_cHQ3 ) * mdl_MB__exp__2 * sqrt( -4. * mdl_MB__exp__2 + mdl_MZ__exp__2 ) + mdl_MZ__exp__2 * ( ( 2. * mdl_cHd + 3. * mdl_cHe - 2. * mdl_cHj1 + 3. * ( 2. * mdl_cHj3 + mdl_cHl1 + mdl_cHl3 ) - 4. * mdl_cHu ) * mdl_MZ + ( mdl_cHbq + mdl_cHQ1 + mdl_cHQ3 ) * sqrt( -4. * mdl_MB__exp__2 + mdl_MZ__exp__2 ) ) ) ) ) / ( mdl_ee * mdl_LambdaSMEFT__exp__2 * ( 2. * mdl_MZ__exp__3 * ( 27. - 54. * mdl_sth__exp__2 + 76. * mdl_sth__exp__4 ) + mdl_MZ__exp__2 * ( 9. - 12. * mdl_sth__exp__2 + 8. * mdl_sth__exp__4 ) * sqrt( -4. 
* mdl_MB__exp__2 + mdl_MZ__exp__2 ) + mdl_MB__exp__2 * ( -9. - 24. * mdl_sth__exp__2 + 16. * mdl_sth__exp__4 ) * sqrt( -4. * mdl_MB__exp__2 + mdl_MZ__exp__2 ) ) ) ); + mdl_g1sh = ( mdl_ee * ( 1. + mdl_dg1 - ( mdl_cHB * mdl_vevhat__exp__2 ) / mdl_LambdaSMEFT__exp__2 ) ) / mdl_cth; + mdl_ee__exp__3 = ( ( mdl_ee ) * ( mdl_ee ) * ( mdl_ee ) ); + mdl_vevhat__exp__3 = ( ( mdl_vevhat ) * ( mdl_vevhat ) * ( mdl_vevhat ) ); +} + +void +Parameters_SMEFTsim_topU3l_MwScheme_UFO::setIndependentCouplings() +{ + // (none) +} + +/* +void +Parameters_SMEFTsim_topU3l_MwScheme_UFO::setDependentParameters() // now computed event-by-event (running alphas #373) +{ + mdl_sqrt__aS = sqrt( aS ); + G = 2. * mdl_sqrt__aS * sqrt( M_PI ); + mdl_gHgg2 = ( -7. * aS ) / ( 720. * M_PI ); + mdl_gHgg4 = aS / ( 360. * M_PI ); + mdl_gHgg5 = aS / ( 20. * M_PI ); + mdl_G__exp__2 = ( ( G ) * ( G ) ); + mdl_gHgg1 = mdl_G__exp__2 / ( 48. * ( ( M_PI ) * ( M_PI ) ) ); + mdl_gHgg3 = ( aS * G ) / ( 60. * M_PI ); + mdl_G__exp__3 = ( ( G ) * ( G ) * ( G ) ); + mdl_dWH = mdl_WH * ( -0.24161 * mdl_dGf + 0.96644 * mdl_dgw + 0.4832199999999999 * mdl_dkH - 0.11186509426655467 * mdl_dWW + ( 0.36410378449238195 * mdl_cHj3 * mdl_vevhat__exp__2 ) / mdl_LambdaSMEFT__exp__2 + ( 0.17608307708657747 * mdl_cHl3 * mdl_vevhat__exp__2 ) / mdl_LambdaSMEFT__exp__2 + ( 0.1636 * mdl_cHG * mdl_MT__exp__2 * mdl_vevhat__exp__2 ) / ( mdl_LambdaSMEFT__exp__2 * ( -0.5 * mdl_gHgg2 * mdl_MH__exp__2 + mdl_gHgg1 * mdl_MT__exp__2 ) ) + ( mdl_cHW * ( -0.35937785117066967 * mdl_gHaa * mdl_gHza + 0.006164 * mdl_cth * mdl_gHaa * mdl_sth + 0.00454 * mdl_gHza * mdl_sth__exp__2 ) * mdl_vevhat__exp__2 ) / ( mdl_gHaa * mdl_gHza * mdl_LambdaSMEFT__exp__2 ) + ( mdl_cHWB * ( -0.00454 * mdl_cth * mdl_gHza * mdl_sth + mdl_gHaa * ( -0.0030819999999999997 + 0.006163999999999999 * mdl_sth__exp__2 ) ) * mdl_vevhat__exp__2 ) / ( mdl_gHaa * mdl_gHza * mdl_LambdaSMEFT__exp__2 ) + ( mdl_cHB * ( -0.006163999999999999 * mdl_cth * mdl_gHaa * mdl_sth - 0.00454 * mdl_gHza * ( -1. 
+ mdl_sth__exp__2 ) ) * mdl_vevhat__exp__2 ) / ( mdl_gHaa * mdl_gHza * mdl_LambdaSMEFT__exp__2 ) + mdl_dWHc + mdl_dWHb + mdl_dWHta ); +} + +void +Parameters_SMEFTsim_topU3l_MwScheme_UFO::setDependentCouplings() // now computed event-by-event (running alphas #373) +{ + GC_6 = -( mdl_complexi * G ); + GC_7 = G; + GC_8 = mdl_complexi * mdl_G__exp__2; +} +*/ + +#endif + +// Routines for printing out parameters +void +Parameters_SMEFTsim_topU3l_MwScheme_UFO::printIndependentParameters() +{ + std::cout << "SMEFTsim_topU3l_MwScheme_UFO model parameters independent of event kinematics:" << std::endl; + std::cout << "(Warning: aS in the runcard is ignored because event-by-event Gs are hardcoded or retrieved from Fortran)" << std::endl; + std::cout << std::setw( 20 ) << "mdl_WH = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_WH << std::endl; + std::cout << std::setw( 20 ) << "mdl_WW = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_WW << std::endl; + std::cout << std::setw( 20 ) << "mdl_WZ = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_WZ << std::endl; + std::cout << std::setw( 20 ) << "mdl_WT = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_WT << std::endl; + std::cout << std::setw( 20 ) << "mdl_ymtau = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ymtau << std::endl; + std::cout << std::setw( 20 ) << "mdl_ymm = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ymm << std::endl; + std::cout << std::setw( 20 ) << "mdl_yme = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_yme << std::endl; + std::cout << std::setw( 20 ) << "mdl_ymt = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ymt << std::endl; + std::cout << std::setw( 20 ) << "mdl_ymb = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ymb << std::endl; + std::cout << std::setw( 20 ) << "mdl_ymc = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ymc << std::endl; + std::cout << std::setw( 20 ) << "mdl_yms = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_yms << std::endl; + std::cout << std::setw( 20 ) << "mdl_ymup = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ymup << std::endl; + std::cout << std::setw( 20 ) << "mdl_ymdo = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ymdo << std::endl; + std::cout << std::setw( 20 ) << "mdl_linearPropCorrections = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_linearPropCorrections << std::endl; + //std::cout << std::setw( 20 ) << "aS = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << aS << std::endl; // now retrieved event-by-event (as G) from Fortran (running alphas #373) + std::cout << std::setw( 20 ) << "mdl_Gf = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Gf << std::endl; + std::cout << std::setw( 20 ) << "mdl_MW = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MW << std::endl; + std::cout << std::setw( 20 ) << "mdl_LambdaSMEFT = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_LambdaSMEFT << std::endl; + std::cout << std::setw( 20 ) << "mdl_cleQt3Im = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cleQt3Im << std::endl; + std::cout << std::setw( 20 ) << "mdl_cleQt1Im = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << 
mdl_cleQt1Im << std::endl; + std::cout << std::setw( 20 ) << "mdl_cleju3Im = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cleju3Im << std::endl; + std::cout << std::setw( 20 ) << "mdl_cleju1Im = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cleju1Im << std::endl; + std::cout << std::setw( 20 ) << "mdl_clebQIm = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_clebQIm << std::endl; + std::cout << std::setw( 20 ) << "mdl_cledjIm = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cledjIm << std::endl; + std::cout << std::setw( 20 ) << "mdl_ceBIm = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ceBIm << std::endl; + std::cout << std::setw( 20 ) << "mdl_ceWIm = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ceWIm << std::endl; + std::cout << std::setw( 20 ) << "mdl_ceHIm = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ceHIm << std::endl; + std::cout << std::setw( 20 ) << "mdl_cQtQb8Im = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQtQb8Im << std::endl; + std::cout << std::setw( 20 ) << "mdl_cQtQb1Im = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQtQb1Im << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjtQd8Im = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjtQd8Im << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjtQd1Im = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjtQd1Im << std::endl; + std::cout << std::setw( 20 ) << "mdl_cQujb8Im = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQujb8Im << std::endl; + std::cout << std::setw( 20 ) << "mdl_cQujb1Im = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQujb1Im << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjuQb8Im = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjuQb8Im << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjuQb1Im = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjuQb1Im << std::endl; + std::cout << std::setw( 20 ) << "mdl_cQtjd8Im = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQtjd8Im << std::endl; + std::cout << std::setw( 20 ) << "mdl_cQtjd1Im = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQtjd1Im << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjujd81Im = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjujd81Im << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjujd11Im = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjujd11Im << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjujd8Im = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjujd8Im << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjujd1Im = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjujd1Im << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjQbd8Im = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjQbd8Im << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjQbd1Im = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjQbd1Im << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjQtu8Im = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjQtu8Im << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjQtu1Im = " << 
std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjQtu1Im << std::endl; + std::cout << std::setw( 20 ) << "mdl_cutbd8Im = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cutbd8Im << std::endl; + std::cout << std::setw( 20 ) << "mdl_cutbd1Im = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cutbd1Im << std::endl; + std::cout << std::setw( 20 ) << "mdl_cHtbIm = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cHtbIm << std::endl; + std::cout << std::setw( 20 ) << "mdl_cHudIm = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cHudIm << std::endl; + std::cout << std::setw( 20 ) << "mdl_cbHIm = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cbHIm << std::endl; + std::cout << std::setw( 20 ) << "mdl_cdHIm = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cdHIm << std::endl; + std::cout << std::setw( 20 ) << "mdl_ctHIm = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ctHIm << std::endl; + std::cout << std::setw( 20 ) << "mdl_cuHIm = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cuHIm << std::endl; + std::cout << std::setw( 20 ) << "mdl_cbBIm = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cbBIm << std::endl; + std::cout << std::setw( 20 ) << "mdl_cdBIm = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cdBIm << std::endl; + std::cout << std::setw( 20 ) << "mdl_cbWIm = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cbWIm << std::endl; + std::cout << std::setw( 20 ) << "mdl_cdWIm = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cdWIm << std::endl; + std::cout << std::setw( 20 ) << "mdl_cbGIm = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cbGIm << std::endl; + std::cout << std::setw( 20 ) << "mdl_cdGIm = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cdGIm << std::endl; + std::cout << std::setw( 20 ) << "mdl_ctBIm = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ctBIm << std::endl; + std::cout << std::setw( 20 ) << "mdl_cuBIm = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cuBIm << std::endl; + std::cout << std::setw( 20 ) << "mdl_ctWIm = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ctWIm << std::endl; + std::cout << std::setw( 20 ) << "mdl_cuWIm = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cuWIm << std::endl; + std::cout << std::setw( 20 ) << "mdl_ctGIm = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ctGIm << std::endl; + std::cout << std::setw( 20 ) << "mdl_cuGIm = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cuGIm << std::endl; + std::cout << std::setw( 20 ) << "mdl_cHWBtil = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cHWBtil << std::endl; + std::cout << std::setw( 20 ) << "mdl_cHBtil = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cHBtil << std::endl; + std::cout << std::setw( 20 ) << "mdl_cHWtil = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cHWtil << std::endl; + std::cout << std::setw( 20 ) << "mdl_cHGtil = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cHGtil << std::endl; + std::cout << std::setw( 20 ) << "mdl_cWtil = " << std::setiosflags( std::ios::scientific ) << 
std::setw( 10 ) << mdl_cWtil << std::endl; + std::cout << std::setw( 20 ) << "mdl_cGtil = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cGtil << std::endl; + std::cout << std::setw( 20 ) << "mdl_cleQt3Re = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cleQt3Re << std::endl; + std::cout << std::setw( 20 ) << "mdl_cleju3Re = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cleju3Re << std::endl; + std::cout << std::setw( 20 ) << "mdl_cleQt1Re = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cleQt1Re << std::endl; + std::cout << std::setw( 20 ) << "mdl_cleju1Re = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cleju1Re << std::endl; + std::cout << std::setw( 20 ) << "mdl_clebQRe = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_clebQRe << std::endl; + std::cout << std::setw( 20 ) << "mdl_cledjRe = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cledjRe << std::endl; + std::cout << std::setw( 20 ) << "mdl_cle = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cle << std::endl; + std::cout << std::setw( 20 ) << "mdl_cbl = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cbl << std::endl; + std::cout << std::setw( 20 ) << "mdl_cld = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cld << std::endl; + std::cout << std::setw( 20 ) << "mdl_ctl = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ctl << std::endl; + std::cout << std::setw( 20 ) << "mdl_clu = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_clu << std::endl; + std::cout << std::setw( 20 ) << "mdl_cQe = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQe << std::endl; + std::cout << std::setw( 20 ) << "mdl_cje = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cje << std::endl; + std::cout << std::setw( 20 ) << "mdl_cbe = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cbe << std::endl; + std::cout << std::setw( 20 ) << "mdl_ced = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ced << std::endl; + std::cout << std::setw( 20 ) << "mdl_cte = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cte << std::endl; + std::cout << std::setw( 20 ) << "mdl_ceu = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ceu << std::endl; + std::cout << std::setw( 20 ) << "mdl_cee = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cee << std::endl; + std::cout << std::setw( 20 ) << "mdl_cQl3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQl3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cQl1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQl1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_clj3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_clj3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_clj1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_clj1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cll1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cll1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cll = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cll << std::endl; + std::cout << std::setw( 20 ) << "mdl_cHe = " << 
std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cHe << std::endl; + std::cout << std::setw( 20 ) << "mdl_cHl3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cHl3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cHl1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cHl1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_ceBRe = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ceBRe << std::endl; + std::cout << std::setw( 20 ) << "mdl_ceWRe = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ceWRe << std::endl; + std::cout << std::setw( 20 ) << "mdl_ceHRe = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ceHRe << std::endl; + std::cout << std::setw( 20 ) << "mdl_cQtQb8Re = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQtQb8Re << std::endl; + std::cout << std::setw( 20 ) << "mdl_cQtQb1Re = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQtQb1Re << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjtQd8Re = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjtQd8Re << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjtQd1Re = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjtQd1Re << std::endl; + std::cout << std::setw( 20 ) << "mdl_cQujb8Re = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQujb8Re << std::endl; + std::cout << std::setw( 20 ) << "mdl_cQujb1Re = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQujb1Re << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjuQb8Re = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjuQb8Re << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjuQb1Re = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjuQb1Re << std::endl; + std::cout << std::setw( 20 ) << "mdl_cQtjd8Re = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQtjd8Re << std::endl; + std::cout << std::setw( 20 ) << "mdl_cQtjd1Re = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQtjd1Re << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjujd81Re = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjujd81Re << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjujd11Re = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjujd11Re << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjujd8Re = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjujd8Re << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjujd1Re = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjujd1Re << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjQbd8Re = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjQbd8Re << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjQbd1Re = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjQbd1Re << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjQtu8Re = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjQtu8Re << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjQtu1Re = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjQtu1Re << std::endl; + std::cout << std::setw( 20 ) << "mdl_cQb8 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQb8 << std::endl; + std::cout << std::setw( 
20 ) << "mdl_cQb1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQb1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cbj8 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cbj8 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cbj1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cbj1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cQd8 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQd8 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cQd1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQd1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjd8 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjd8 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjd1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjd1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cQt8 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQt8 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cQt1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQt1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_ctj8 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ctj8 << std::endl; + std::cout << std::setw( 20 ) << "mdl_ctj1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ctj1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cQu8 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQu8 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cju8 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cju8 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cQu1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQu1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cju1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cju1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cutbd8Re = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cutbd8Re << std::endl; + std::cout << std::setw( 20 ) << "mdl_cutbd1Re = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cutbd1Re << std::endl; + std::cout << std::setw( 20 ) << "mdl_cbu8 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cbu8 << std::endl; + std::cout << std::setw( 20 ) << "mdl_ctd8 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ctd8 << std::endl; + std::cout << std::setw( 20 ) << "mdl_ctb8 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ctb8 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cud8 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cud8 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cbu1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cbu1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_ctd1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ctd1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_ctb1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ctb1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cud1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cud1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cbd8 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cbd8 << std::endl; + 
std::cout << std::setw( 20 ) << "mdl_cbd1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cbd1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cbb = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cbb << std::endl; + std::cout << std::setw( 20 ) << "mdl_cdd8 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cdd8 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cdd1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cdd1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_ctu8 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ctu8 << std::endl; + std::cout << std::setw( 20 ) << "mdl_ctu1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ctu1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_ctt = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ctt << std::endl; + std::cout << std::setw( 20 ) << "mdl_cuu8 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cuu8 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cuu1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cuu1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cQQ8 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQQ8 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cQQ1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQQ1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cQj38 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQj38 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cQj31 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQj31 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cQj18 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQj18 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cQj11 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQj11 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjj38 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjj38 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjj31 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjj31 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjj18 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjj18 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjj11 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjj11 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cHtbRe = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cHtbRe << std::endl; + std::cout << std::setw( 20 ) << "mdl_cHudRe = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cHudRe << std::endl; + std::cout << std::setw( 20 ) << "mdl_cHbq = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cHbq << std::endl; + std::cout << std::setw( 20 ) << "mdl_cHd = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cHd << std::endl; + std::cout << std::setw( 20 ) << "mdl_cHt = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cHt << std::endl; + std::cout << std::setw( 20 ) << "mdl_cHu = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cHu << std::endl; + std::cout << std::setw( 20 ) << "mdl_cHQ3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << 
mdl_cHQ3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cHj3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cHj3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cHQ1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cHQ1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cHj1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cHj1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cbBRe = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cbBRe << std::endl; + std::cout << std::setw( 20 ) << "mdl_cdBRe = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cdBRe << std::endl; + std::cout << std::setw( 20 ) << "mdl_cbWRe = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cbWRe << std::endl; + std::cout << std::setw( 20 ) << "mdl_cdWRe = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cdWRe << std::endl; + std::cout << std::setw( 20 ) << "mdl_cbGRe = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cbGRe << std::endl; + std::cout << std::setw( 20 ) << "mdl_cdGRe = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cdGRe << std::endl; + std::cout << std::setw( 20 ) << "mdl_ctBRe = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ctBRe << std::endl; + std::cout << std::setw( 20 ) << "mdl_cuBRe = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cuBRe << std::endl; + std::cout << std::setw( 20 ) << "mdl_ctWRe = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ctWRe << std::endl; + std::cout << std::setw( 20 ) << "mdl_cuWRe = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cuWRe << std::endl; + std::cout << std::setw( 20 ) << "mdl_ctGRe = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ctGRe << std::endl; + std::cout << std::setw( 20 ) << "mdl_cuGRe = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cuGRe << std::endl; + std::cout << std::setw( 20 ) << "mdl_cbHRe = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cbHRe << std::endl; + std::cout << std::setw( 20 ) << "mdl_cdHRe = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cdHRe << std::endl; + std::cout << std::setw( 20 ) << "mdl_ctHRe = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ctHRe << std::endl; + std::cout << std::setw( 20 ) << "mdl_cuHRe = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cuHRe << std::endl; + std::cout << std::setw( 20 ) << "mdl_cHWB = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cHWB << std::endl; + std::cout << std::setw( 20 ) << "mdl_cHB = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cHB << std::endl; + std::cout << std::setw( 20 ) << "mdl_cHW = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cHW << std::endl; + std::cout << std::setw( 20 ) << "mdl_cHG = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cHG << std::endl; + std::cout << std::setw( 20 ) << "mdl_cHDD = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cHDD << std::endl; + std::cout << std::setw( 20 ) << "mdl_cHbox = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cHbox << std::endl; + std::cout << std::setw( 20 ) << "mdl_cH = " << std::setiosflags( 
std::ios::scientific ) << std::setw( 10 ) << mdl_cH << std::endl; + std::cout << std::setw( 20 ) << "mdl_cW = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cW << std::endl; + std::cout << std::setw( 20 ) << "mdl_cG = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cG << std::endl; + std::cout << std::setw( 20 ) << "mdl_MH = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MH << std::endl; + std::cout << std::setw( 20 ) << "mdl_MZ = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MZ << std::endl; + std::cout << std::setw( 20 ) << "mdl_MTA = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MTA << std::endl; + std::cout << std::setw( 20 ) << "mdl_MMU = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MMU << std::endl; + std::cout << std::setw( 20 ) << "mdl_Me = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Me << std::endl; + std::cout << std::setw( 20 ) << "mdl_MT = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MT << std::endl; + std::cout << std::setw( 20 ) << "mdl_MB = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MB << std::endl; + std::cout << std::setw( 20 ) << "mdl_MC = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MC << std::endl; + std::cout << std::setw( 20 ) << "mdl_MS = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MS << std::endl; + std::cout << std::setw( 20 ) << "mdl_MU = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MU << std::endl; + std::cout << std::setw( 20 ) << "mdl_MD = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MD << std::endl; + std::cout << std::setw( 20 ) << "mdl_complexi = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_complexi << std::endl; + std::cout << std::setw( 20 ) << "mdl_cuH = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cuH << std::endl; + std::cout << std::setw( 20 ) << "mdl_ctHH = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ctHH << std::endl; + std::cout << std::setw( 20 ) << "mdl_cdH = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cdH << std::endl; + std::cout << std::setw( 20 ) << "mdl_cbH = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cbH << std::endl; + std::cout << std::setw( 20 ) << "mdl_cuG = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cuG << std::endl; + std::cout << std::setw( 20 ) << "mdl_ctG = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ctG << std::endl; + std::cout << std::setw( 20 ) << "mdl_cuW = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cuW << std::endl; + std::cout << std::setw( 20 ) << "mdl_ctW = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ctW << std::endl; + std::cout << std::setw( 20 ) << "mdl_cuB = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cuB << std::endl; + std::cout << std::setw( 20 ) << "mdl_ctB = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ctB << std::endl; + std::cout << std::setw( 20 ) << "mdl_cdG = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cdG << std::endl; + std::cout << std::setw( 20 ) << "mdl_cbG = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cbG << 
std::endl; + std::cout << std::setw( 20 ) << "mdl_cdW = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cdW << std::endl; + std::cout << std::setw( 20 ) << "mdl_cbW = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cbW << std::endl; + std::cout << std::setw( 20 ) << "mdl_cdB = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cdB << std::endl; + std::cout << std::setw( 20 ) << "mdl_cbBB = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cbBB << std::endl; + std::cout << std::setw( 20 ) << "mdl_cHud = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cHud << std::endl; + std::cout << std::setw( 20 ) << "mdl_cHtb = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cHtb << std::endl; + std::cout << std::setw( 20 ) << "mdl_cutbd1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cutbd1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cutbd8 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cutbd8 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjQtu1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjQtu1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjQtu8 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjQtu8 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjQbd1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjQbd1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjQbd8 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjQbd8 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjujd1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjujd1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjujd8 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjujd8 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjujd11 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjujd11 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjujd81 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjujd81 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cQtjd1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQtjd1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cQtjd8 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQtjd8 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjuQb1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjuQb1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjuQb8 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjuQb8 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cQujb1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQujb1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cQujb8 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQujb8 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjtQd1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjtQd1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cjtQd8 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cjtQd8 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cQtQb1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQtQb1 << std::endl; + std::cout << std::setw( 20 ) << 
"mdl_cQtQb8 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cQtQb8 << std::endl; + std::cout << std::setw( 20 ) << "mdl_ceH = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ceH << std::endl; + std::cout << std::setw( 20 ) << "mdl_ceW = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ceW << std::endl; + std::cout << std::setw( 20 ) << "mdl_ceB = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ceB << std::endl; + std::cout << std::setw( 20 ) << "mdl_cledj = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cledj << std::endl; + std::cout << std::setw( 20 ) << "mdl_clebQ = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_clebQ << std::endl; + std::cout << std::setw( 20 ) << "mdl_cleju1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cleju1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cleju3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cleju3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cleQt1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cleQt1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cleQt3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cleQt3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_MWsm = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MWsm << std::endl; + std::cout << std::setw( 20 ) << "mdl_MW__exp__2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MW__exp__2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_MZ__exp__2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MZ__exp__2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_sqrt__2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_sqrt__2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_nb__2__exp__0_25 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_nb__2__exp__0_25 << std::endl; + std::cout << std::setw( 20 ) << "mdl_MH__exp__2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MH__exp__2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_sth2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_sth2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_nb__10__exp___m_40 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_nb__10__exp___m_40 << std::endl; + std::cout << std::setw( 20 ) << "mdl_propCorr = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_propCorr << std::endl; + std::cout << std::setw( 20 ) << "mdl_MZ1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MZ1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_MH1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MH1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_MT1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MT1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_WZ1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_WZ1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_WW1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_WW1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_WH1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_WH1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_WT1 = " << 
std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_WT1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cth = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cth << std::endl; + std::cout << std::setw( 20 ) << "mdl_MW1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MW1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_sqrt__sth2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_sqrt__sth2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_sth = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_sth << std::endl; + std::cout << std::setw( 20 ) << "mdl_LambdaSMEFT__exp__2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_LambdaSMEFT__exp__2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__cbH = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__cbH << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__ctHH = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__ctHH << std::endl; + std::cout << std::setw( 20 ) << "mdl_MT__exp__2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MT__exp__2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_MH__exp__6 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MH__exp__6 << std::endl; + std::cout << std::setw( 20 ) << "mdl_MWsm__exp__6 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MWsm__exp__6 << std::endl; + std::cout << std::setw( 20 ) << "mdl_MH__exp__4 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MH__exp__4 << std::endl; + std::cout << std::setw( 20 ) << "mdl_MWsm__exp__4 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MWsm__exp__4 << std::endl; + std::cout << std::setw( 20 ) << "mdl_MWsm__exp__2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MWsm__exp__2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_MZ__exp__4 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MZ__exp__4 << std::endl; + std::cout << std::setw( 20 ) << "mdl_MZ__exp__6 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MZ__exp__6 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cth__exp__2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cth__exp__2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_sth__exp__2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_sth__exp__2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_MB__exp__2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MB__exp__2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_MZ__exp__3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MZ__exp__3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_sth__exp__4 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_sth__exp__4 << std::endl; + std::cout << std::setw( 20 ) << "mdl_sth__exp__6 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_sth__exp__6 << std::endl; + std::cout << std::setw( 20 ) << "mdl_sth__exp__3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_sth__exp__3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_sth__exp__5 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_sth__exp__5 << std::endl; + std::cout << std::setw( 20 ) << 
"mdl_propCorr__exp__2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_propCorr__exp__2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_propCorr__exp__3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_propCorr__exp__3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_propCorr__exp__4 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_propCorr__exp__4 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cth__exp__3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cth__exp__3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_aEW = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_aEW << std::endl; + std::cout << std::setw( 20 ) << "mdl_sqrt__Gf = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_sqrt__Gf << std::endl; + std::cout << std::setw( 20 ) << "mdl_vevhat = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_vevhat << std::endl; + std::cout << std::setw( 20 ) << "mdl_lam = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_lam << std::endl; + std::cout << std::setw( 20 ) << "mdl_sqrt__aEW = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_sqrt__aEW << std::endl; + std::cout << std::setw( 20 ) << "mdl_ee = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ee << std::endl; + std::cout << std::setw( 20 ) << "mdl_yb = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_yb << std::endl; + std::cout << std::setw( 20 ) << "mdl_yc = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_yc << std::endl; + std::cout << std::setw( 20 ) << "mdl_ydo = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ydo << std::endl; + std::cout << std::setw( 20 ) << "mdl_ye = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ye << std::endl; + std::cout << std::setw( 20 ) << "mdl_ym = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ym << std::endl; + std::cout << std::setw( 20 ) << "mdl_ys = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ys << std::endl; + std::cout << std::setw( 20 ) << "mdl_yt = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_yt << std::endl; + std::cout << std::setw( 20 ) << "mdl_ytau = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ytau << std::endl; + std::cout << std::setw( 20 ) << "mdl_yup = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_yup << std::endl; + std::cout << std::setw( 20 ) << "mdl_vevhat__exp__2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_vevhat__exp__2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_dGf = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_dGf << std::endl; + std::cout << std::setw( 20 ) << "mdl_dkH = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_dkH << std::endl; + std::cout << std::setw( 20 ) << "mdl_vevT = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_vevT << std::endl; + std::cout << std::setw( 20 ) << "mdl_g1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_g1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_gw = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_gw << std::endl; + std::cout << std::setw( 20 ) << "mdl_yb0 = " << std::setiosflags( 
std::ios::scientific ) << std::setw( 10 ) << mdl_yb0 << std::endl; + std::cout << std::setw( 20 ) << "mdl_yt0 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_yt0 << std::endl; + std::cout << std::setw( 20 ) << "mdl_ee__exp__2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ee__exp__2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_gHaa = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_gHaa << std::endl; + std::cout << std::setw( 20 ) << "mdl_gHza = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_gHza << std::endl; + std::cout << std::setw( 20 ) << "mdl_dMZ2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_dMZ2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_dMH2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_dMH2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_dgw = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_dgw << std::endl; + std::cout << std::setw( 20 ) << "mdl_barlam = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_barlam << std::endl; + std::cout << std::setw( 20 ) << "mdl_dWT = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_dWT << std::endl; + std::cout << std::setw( 20 ) << "mdl_dWW = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_dWW << std::endl; + std::cout << std::setw( 20 ) << "mdl_gwsh = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_gwsh << std::endl; + std::cout << std::setw( 20 ) << "mdl_vev = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_vev << std::endl; + std::cout << std::setw( 20 ) << "mdl_dg1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_dg1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_dWHc = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_dWHc << std::endl; + std::cout << std::setw( 20 ) << "mdl_dWHb = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_dWHb << std::endl; + std::cout << std::setw( 20 ) << "mdl_dWHta = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_dWHta << std::endl; + std::cout << std::setw( 20 ) << "mdl_dWZ = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_dWZ << std::endl; + std::cout << std::setw( 20 ) << "mdl_g1sh = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_g1sh << std::endl; + std::cout << std::setw( 20 ) << "mdl_ee__exp__3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ee__exp__3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_vevhat__exp__3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_vevhat__exp__3 << std::endl; +} + +void +Parameters_SMEFTsim_topU3l_MwScheme_UFO::printIndependentCouplings() +{ + std::cout << "SMEFTsim_topU3l_MwScheme_UFO model couplings independent of event kinematics:" << std::endl; + // (none) +} + +/* +void +Parameters_SMEFTsim_topU3l_MwScheme_UFO::printDependentParameters() // now computed event-by-event (running alphas #373) +{ + std::cout << "SMEFTsim_topU3l_MwScheme_UFO model parameters dependent on event kinematics:" << std::endl; + std::cout << std::setw( 20 ) << "mdl_sqrt__aS = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_sqrt__aS << std::endl; + std::cout << std::setw( 20 ) << "G = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << G << 
std::endl; + std::cout << std::setw( 20 ) << "mdl_gHgg2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_gHgg2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_gHgg4 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_gHgg4 << std::endl; + std::cout << std::setw( 20 ) << "mdl_gHgg5 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_gHgg5 << std::endl; + std::cout << std::setw( 20 ) << "mdl_G__exp__2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_G__exp__2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_gHgg1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_gHgg1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_gHgg3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_gHgg3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_G__exp__3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_G__exp__3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_dWH = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_dWH << std::endl; +} + +void +Parameters_SMEFTsim_topU3l_MwScheme_UFO::printDependentCouplings() // now computed event-by-event (running alphas #373) +{ + std::cout << "SMEFTsim_topU3l_MwScheme_UFO model couplings dependent on event kinematics:" << std::endl; + std::cout << std::setw( 20 ) << "GC_6 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << GC_6 << std::endl; + std::cout << std::setw( 20 ) << "GC_7 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << GC_7 << std::endl; + std::cout << std::setw( 20 ) << "GC_8 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << GC_8 << std::endl; +} +*/ diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h b/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h new file mode 100644 index 0000000000..fd5e4ee1f4 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h @@ -0,0 +1,643 @@ +//========================================================================== +// This file has been automatically generated for CUDA/C++ standalone by +// MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-01-26 +// By the MadGraph5_aMC@NLO Development Team +// Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch +//========================================================================== + +#ifndef Parameters_SMEFTsim_topU3l_MwScheme_UFO_H +#define Parameters_SMEFTsim_topU3l_MwScheme_UFO_H + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuCxtypes.h" +#include "mgOnGpuVectors.h" + +//========================================================================== + +#ifndef MGONGPU_HARDCODE_PARAM // this is only supported in SM processes (e.g. 
not in EFT models) for the moment (#439) +#error This non-SM physics process only supports MGONGPU_HARDCODE_PARAM builds (#439): please run "make HRDCOD=1" + +#include "read_slha.h" + +class Parameters_SMEFTsim_topU3l_MwScheme_UFO +{ +public: + + static Parameters_SMEFTsim_topU3l_MwScheme_UFO* getInstance(); + + // Define "zero" + double zero, ZERO; + + // Model parameters independent of aS + //double aS; // now retrieved event-by-event (as G) from Fortran (running alphas #373) + double mdl_WH, mdl_WW, mdl_WZ, mdl_WT, mdl_ymtau, mdl_ymm, mdl_yme, mdl_ymt, mdl_ymb, mdl_ymc, mdl_yms, mdl_ymup, mdl_ymdo, mdl_linearPropCorrections, mdl_Gf, mdl_MW, mdl_LambdaSMEFT, mdl_cleQt3Im, mdl_cleQt1Im, mdl_cleju3Im, mdl_cleju1Im, mdl_clebQIm, mdl_cledjIm, mdl_ceBIm, mdl_ceWIm, mdl_ceHIm, mdl_cQtQb8Im, mdl_cQtQb1Im, mdl_cjtQd8Im, mdl_cjtQd1Im, mdl_cQujb8Im, mdl_cQujb1Im, mdl_cjuQb8Im, mdl_cjuQb1Im, mdl_cQtjd8Im, mdl_cQtjd1Im, mdl_cjujd81Im, mdl_cjujd11Im, mdl_cjujd8Im, mdl_cjujd1Im, mdl_cjQbd8Im, mdl_cjQbd1Im, mdl_cjQtu8Im, mdl_cjQtu1Im, mdl_cutbd8Im, mdl_cutbd1Im, mdl_cHtbIm, mdl_cHudIm, mdl_cbHIm, mdl_cdHIm, mdl_ctHIm, mdl_cuHIm, mdl_cbBIm, mdl_cdBIm, mdl_cbWIm, mdl_cdWIm, mdl_cbGIm, mdl_cdGIm, mdl_ctBIm, mdl_cuBIm, mdl_ctWIm, mdl_cuWIm, mdl_ctGIm, mdl_cuGIm, mdl_cHWBtil, mdl_cHBtil, mdl_cHWtil, mdl_cHGtil, mdl_cWtil, mdl_cGtil, mdl_cleQt3Re, mdl_cleju3Re, mdl_cleQt1Re, mdl_cleju1Re, mdl_clebQRe, mdl_cledjRe, mdl_cle, mdl_cbl, mdl_cld, mdl_ctl, mdl_clu, mdl_cQe, mdl_cje, mdl_cbe, mdl_ced, mdl_cte, mdl_ceu, mdl_cee, mdl_cQl3, mdl_cQl1, mdl_clj3, mdl_clj1, mdl_cll1, mdl_cll, mdl_cHe, mdl_cHl3, mdl_cHl1, mdl_ceBRe, mdl_ceWRe, mdl_ceHRe, mdl_cQtQb8Re, mdl_cQtQb1Re, mdl_cjtQd8Re, mdl_cjtQd1Re, mdl_cQujb8Re, mdl_cQujb1Re, mdl_cjuQb8Re, mdl_cjuQb1Re, mdl_cQtjd8Re, mdl_cQtjd1Re, mdl_cjujd81Re, mdl_cjujd11Re, mdl_cjujd8Re, mdl_cjujd1Re, mdl_cjQbd8Re, mdl_cjQbd1Re, mdl_cjQtu8Re, mdl_cjQtu1Re, mdl_cQb8, mdl_cQb1, mdl_cbj8, mdl_cbj1, mdl_cQd8, mdl_cQd1, mdl_cjd8, mdl_cjd1, mdl_cQt8, mdl_cQt1, mdl_ctj8, mdl_ctj1, mdl_cQu8, mdl_cju8, mdl_cQu1, mdl_cju1, mdl_cutbd8Re, mdl_cutbd1Re, mdl_cbu8, mdl_ctd8, mdl_ctb8, mdl_cud8, mdl_cbu1, mdl_ctd1, mdl_ctb1, mdl_cud1, mdl_cbd8, mdl_cbd1, mdl_cbb, mdl_cdd8, mdl_cdd1, mdl_ctu8, mdl_ctu1, mdl_ctt, mdl_cuu8, mdl_cuu1, mdl_cQQ8, mdl_cQQ1, mdl_cQj38, mdl_cQj31, mdl_cQj18, mdl_cQj11, mdl_cjj38, mdl_cjj31, mdl_cjj18, mdl_cjj11, mdl_cHtbRe, mdl_cHudRe, mdl_cHbq, mdl_cHd, mdl_cHt, mdl_cHu, mdl_cHQ3, mdl_cHj3, mdl_cHQ1, mdl_cHj1, mdl_cbBRe, mdl_cdBRe, mdl_cbWRe, mdl_cdWRe, mdl_cbGRe, mdl_cdGRe, mdl_ctBRe, mdl_cuBRe, mdl_ctWRe, mdl_cuWRe, mdl_ctGRe, mdl_cuGRe, mdl_cbHRe, mdl_cdHRe, mdl_ctHRe, mdl_cuHRe, mdl_cHWB, mdl_cHB, mdl_cHW, mdl_cHG, mdl_cHDD, mdl_cHbox, mdl_cH, mdl_cW, mdl_cG, mdl_MH, mdl_MZ, mdl_MTA, mdl_MMU, mdl_Me, mdl_MT, mdl_MB, mdl_MC, mdl_MS, mdl_MU, mdl_MD, mdl_MWsm, mdl_MW__exp__2, mdl_MZ__exp__2, mdl_sqrt__2, mdl_nb__2__exp__0_25, mdl_MH__exp__2, mdl_sth2, mdl_nb__10__exp___m_40, mdl_propCorr, mdl_MZ1, mdl_MH1, mdl_MT1, mdl_WZ1, mdl_WW1, mdl_WH1, mdl_WT1, mdl_cth, mdl_MW1, mdl_sqrt__sth2, mdl_sth, mdl_LambdaSMEFT__exp__2, mdl_MT__exp__2, mdl_MH__exp__6, mdl_MWsm__exp__6, mdl_MH__exp__4, mdl_MWsm__exp__4, mdl_MWsm__exp__2, mdl_MZ__exp__4, mdl_MZ__exp__6, mdl_cth__exp__2, mdl_sth__exp__2, mdl_MB__exp__2, mdl_MZ__exp__3, mdl_sth__exp__4, mdl_sth__exp__6, mdl_sth__exp__3, mdl_sth__exp__5, mdl_propCorr__exp__2, mdl_propCorr__exp__3, mdl_propCorr__exp__4, mdl_cth__exp__3, mdl_aEW, mdl_sqrt__Gf, mdl_vevhat, mdl_lam, mdl_sqrt__aEW, mdl_ee, mdl_yb, mdl_yc, mdl_ydo, mdl_ye, 
mdl_ym, mdl_ys, mdl_yt, mdl_ytau, mdl_yup, mdl_vevhat__exp__2, mdl_dGf, mdl_dkH, mdl_vevT, mdl_g1, mdl_gw, mdl_ee__exp__2, mdl_gHaa, mdl_gHza, mdl_dMZ2, mdl_dMH2, mdl_dgw, mdl_barlam, mdl_dWT, mdl_dWW, mdl_gwsh, mdl_vev, mdl_dg1, mdl_dWHc, mdl_dWHb, mdl_dWHta, mdl_dWZ, mdl_g1sh, mdl_ee__exp__3, mdl_vevhat__exp__3; + cxsmpl<double> mdl_complexi, mdl_cuH, mdl_ctHH, mdl_cdH, mdl_cbH, mdl_cuG, mdl_ctG, mdl_cuW, mdl_ctW, mdl_cuB, mdl_ctB, mdl_cdG, mdl_cbG, mdl_cdW, mdl_cbW, mdl_cdB, mdl_cbBB, mdl_cHud, mdl_cHtb, mdl_cutbd1, mdl_cutbd8, mdl_cjQtu1, mdl_cjQtu8, mdl_cjQbd1, mdl_cjQbd8, mdl_cjujd1, mdl_cjujd8, mdl_cjujd11, mdl_cjujd81, mdl_cQtjd1, mdl_cQtjd8, mdl_cjuQb1, mdl_cjuQb8, mdl_cQujb1, mdl_cQujb8, mdl_cjtQd1, mdl_cjtQd8, mdl_cQtQb1, mdl_cQtQb8, mdl_ceH, mdl_ceW, mdl_ceB, mdl_cledj, mdl_clebQ, mdl_cleju1, mdl_cleju3, mdl_cleQt1, mdl_cleQt3, mdl_conjg__cbH, mdl_conjg__ctHH, mdl_yb0, mdl_yt0; + + // Model couplings independent of aS + // (none) + + // Model parameters dependent on aS + //double mdl_sqrt__aS, G, mdl_gHgg2, mdl_gHgg4, mdl_gHgg5, mdl_G__exp__2, mdl_gHgg1, mdl_gHgg3, mdl_dWH; // now computed event-by-event (running alphas #373) + //cxsmpl<double> mdl_G__exp__3; // now computed event-by-event (running alphas #373) + + // Model couplings dependent on aS + //cxsmpl<double> GC_6, GC_7, GC_8; // now computed event-by-event (running alphas #373) + + // Set parameters that are unchanged during the run + void setIndependentParameters( SLHAReader& slha ); + + // Set couplings that are unchanged during the run + void setIndependentCouplings(); + + // Set parameters that are changed event by event + //void setDependentParameters(); // now computed event-by-event (running alphas #373) + + // Set couplings that are changed event by event + //void setDependentCouplings(); // now computed event-by-event (running alphas #373) + + // Print parameters that are unchanged during the run + void printIndependentParameters(); + + // Print couplings that are unchanged during the run + void printIndependentCouplings(); + + // Print parameters that are changed event by event + //void printDependentParameters(); // now computed event-by-event (running alphas #373) + + // Print couplings that are changed event by event + //void printDependentCouplings(); // now computed event-by-event (running alphas #373) + +private: + + static Parameters_SMEFTsim_topU3l_MwScheme_UFO* instance; +}; + +#else + +#include <cassert> +#include <limits> + +// Hardcoded constexpr physics parameters +namespace Parameters_SMEFTsim_topU3l_MwScheme_UFO // keep the same name rather than HardcodedParameters_SMEFTsim_topU3l_MwScheme_UFO for simplicity +{ + // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) + double constexpr sqrtNewtonRaphson( double x, double curr, double prev ) + { + return curr == prev ? curr : sqrtNewtonRaphson( x, 0.5 * ( curr + x / curr ), curr ); + } + double constexpr constexpr_sqrt( double x ) + { + return x >= 0 // && x < std::numeric_limits<double>::infinity() // avoid -Wtautological-constant-compare warning in fast math + ? sqrtNewtonRaphson( x, x, 0 ) + : std::numeric_limits<double>::quiet_NaN(); + } + + // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) + constexpr int constexpr_floor( double d ) + { + const int i = static_cast<int>( d ); + return d < i ?
i - 1 : i; + } + + // Constexpr implementation of pow + constexpr double constexpr_pow( double base, double exp ) + { + // NB(1): this implementation of constexpr_pow requires exponent >= 0 + assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // NB(2): this implementation of constexpr_pow requires an integer exponent + const int iexp = constexpr_floor( exp ); + assert( static_cast<double>( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // Iterative implementation of pow if exp is a non negative integer + return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); + } + + // Model parameters independent of aS + constexpr double zero = 0; + constexpr double ZERO = 0; + constexpr double mdl_WH = 4.070000e-03; + constexpr double mdl_WW = 2.085000e+00; + constexpr double mdl_WZ = 2.495200e+00; + constexpr double mdl_WT = 1.330000e+00; + constexpr double mdl_ymtau = 1.777000e+00; + constexpr double mdl_ymm = 1.056600e-01; + constexpr double mdl_yme = 5.110000e-04; + constexpr double mdl_ymt = 1.727600e+02; + constexpr double mdl_ymb = 4.180000e+00; + constexpr double mdl_ymc = 1.270000e+00; + constexpr double mdl_yms = 9.300000e-02; + constexpr double mdl_ymup = 2.160000e-03; + constexpr double mdl_ymdo = 4.670000e-03; + constexpr double mdl_linearPropCorrections = 0.000000e+00; + //constexpr double aS = 1.179000e-01; // now retrieved event-by-event (as G) from Fortran (running alphas #373) + constexpr double mdl_Gf = 1.166379e-05; + constexpr double mdl_MW = 8.038700e+01; + constexpr double mdl_LambdaSMEFT = 1.000000e+03; + constexpr double mdl_cleQt3Im = 0.000000e+00; + constexpr double mdl_cleQt1Im = 0.000000e+00; + constexpr double mdl_cleju3Im = 0.000000e+00; + constexpr double mdl_cleju1Im = 0.000000e+00; + constexpr double mdl_clebQIm = 0.000000e+00; + constexpr double mdl_cledjIm = 0.000000e+00; + constexpr double mdl_ceBIm = 0.000000e+00; + constexpr double mdl_ceWIm = 0.000000e+00; + constexpr double mdl_ceHIm = 0.000000e+00; + constexpr double mdl_cQtQb8Im = 0.000000e+00; + constexpr double mdl_cQtQb1Im = 0.000000e+00; + constexpr double mdl_cjtQd8Im = 0.000000e+00; + constexpr double mdl_cjtQd1Im = 0.000000e+00; + constexpr double mdl_cQujb8Im = 0.000000e+00; + constexpr double mdl_cQujb1Im = 0.000000e+00; + constexpr double mdl_cjuQb8Im = 0.000000e+00; + constexpr double mdl_cjuQb1Im = 0.000000e+00; + constexpr double mdl_cQtjd8Im = 0.000000e+00; + constexpr double mdl_cQtjd1Im = 0.000000e+00; + constexpr double mdl_cjujd81Im = 0.000000e+00; + constexpr double mdl_cjujd11Im = 0.000000e+00; + constexpr double mdl_cjujd8Im = 0.000000e+00; + constexpr double mdl_cjujd1Im = 0.000000e+00; + constexpr double mdl_cjQbd8Im = 0.000000e+00; + constexpr double mdl_cjQbd1Im = 0.000000e+00; + constexpr double mdl_cjQtu8Im = 0.000000e+00; + constexpr double mdl_cjQtu1Im = 0.000000e+00; + constexpr double mdl_cutbd8Im = 0.000000e+00; + constexpr double mdl_cutbd1Im = 0.000000e+00; + constexpr double mdl_cHtbIm = 0.000000e+00; + constexpr double mdl_cHudIm = 0.000000e+00; + constexpr double mdl_cbHIm = 0.000000e+00; + constexpr double mdl_cdHIm = 0.000000e+00; + constexpr double mdl_ctHIm = 0.000000e+00; + constexpr double mdl_cuHIm = 0.000000e+00; + constexpr double mdl_cbBIm = 0.000000e+00; + constexpr double mdl_cdBIm = 0.000000e+00; + constexpr
double mdl_cbWIm = 0.000000e+00; + constexpr double mdl_cdWIm = 0.000000e+00; + constexpr double mdl_cbGIm = 0.000000e+00; + constexpr double mdl_cdGIm = 0.000000e+00; + constexpr double mdl_ctBIm = 0.000000e+00; + constexpr double mdl_cuBIm = 0.000000e+00; + constexpr double mdl_ctWIm = 0.000000e+00; + constexpr double mdl_cuWIm = 0.000000e+00; + constexpr double mdl_ctGIm = 0.000000e+00; + constexpr double mdl_cuGIm = 0.000000e+00; + constexpr double mdl_cHWBtil = 0.000000e+00; + constexpr double mdl_cHBtil = 0.000000e+00; + constexpr double mdl_cHWtil = 0.000000e+00; + constexpr double mdl_cHGtil = 0.000000e+00; + constexpr double mdl_cWtil = 0.000000e+00; + constexpr double mdl_cGtil = 0.000000e+00; + constexpr double mdl_cleQt3Re = 0.000000e+00; + constexpr double mdl_cleju3Re = 0.000000e+00; + constexpr double mdl_cleQt1Re = 0.000000e+00; + constexpr double mdl_cleju1Re = 0.000000e+00; + constexpr double mdl_clebQRe = 0.000000e+00; + constexpr double mdl_cledjRe = 0.000000e+00; + constexpr double mdl_cle = 0.000000e+00; + constexpr double mdl_cbl = 0.000000e+00; + constexpr double mdl_cld = 0.000000e+00; + constexpr double mdl_ctl = 0.000000e+00; + constexpr double mdl_clu = 0.000000e+00; + constexpr double mdl_cQe = 0.000000e+00; + constexpr double mdl_cje = 0.000000e+00; + constexpr double mdl_cbe = 0.000000e+00; + constexpr double mdl_ced = 0.000000e+00; + constexpr double mdl_cte = 0.000000e+00; + constexpr double mdl_ceu = 0.000000e+00; + constexpr double mdl_cee = 0.000000e+00; + constexpr double mdl_cQl3 = 0.000000e+00; + constexpr double mdl_cQl1 = 0.000000e+00; + constexpr double mdl_clj3 = 0.000000e+00; + constexpr double mdl_clj1 = 0.000000e+00; + constexpr double mdl_cll1 = 0.000000e+00; + constexpr double mdl_cll = 0.000000e+00; + constexpr double mdl_cHe = 0.000000e+00; + constexpr double mdl_cHl3 = 0.000000e+00; + constexpr double mdl_cHl1 = 0.000000e+00; + constexpr double mdl_ceBRe = 0.000000e+00; + constexpr double mdl_ceWRe = 0.000000e+00; + constexpr double mdl_ceHRe = 0.000000e+00; + constexpr double mdl_cQtQb8Re = 0.000000e+00; + constexpr double mdl_cQtQb1Re = 0.000000e+00; + constexpr double mdl_cjtQd8Re = 0.000000e+00; + constexpr double mdl_cjtQd1Re = 0.000000e+00; + constexpr double mdl_cQujb8Re = 0.000000e+00; + constexpr double mdl_cQujb1Re = 0.000000e+00; + constexpr double mdl_cjuQb8Re = 0.000000e+00; + constexpr double mdl_cjuQb1Re = 0.000000e+00; + constexpr double mdl_cQtjd8Re = 0.000000e+00; + constexpr double mdl_cQtjd1Re = 0.000000e+00; + constexpr double mdl_cjujd81Re = 0.000000e+00; + constexpr double mdl_cjujd11Re = 0.000000e+00; + constexpr double mdl_cjujd8Re = 0.000000e+00; + constexpr double mdl_cjujd1Re = 0.000000e+00; + constexpr double mdl_cjQbd8Re = 0.000000e+00; + constexpr double mdl_cjQbd1Re = 0.000000e+00; + constexpr double mdl_cjQtu8Re = 0.000000e+00; + constexpr double mdl_cjQtu1Re = 0.000000e+00; + constexpr double mdl_cQb8 = 0.000000e+00; + constexpr double mdl_cQb1 = 0.000000e+00; + constexpr double mdl_cbj8 = 0.000000e+00; + constexpr double mdl_cbj1 = 0.000000e+00; + constexpr double mdl_cQd8 = 0.000000e+00; + constexpr double mdl_cQd1 = 0.000000e+00; + constexpr double mdl_cjd8 = 0.000000e+00; + constexpr double mdl_cjd1 = 0.000000e+00; + constexpr double mdl_cQt8 = 0.000000e+00; + constexpr double mdl_cQt1 = 0.000000e+00; + constexpr double mdl_ctj8 = 0.000000e+00; +
constexpr double mdl_ctj1 = 0.000000e+00; + constexpr double mdl_cQu8 = 0.000000e+00; + constexpr double mdl_cju8 = 0.000000e+00; + constexpr double mdl_cQu1 = 0.000000e+00; + constexpr double mdl_cju1 = 0.000000e+00; + constexpr double mdl_cutbd8Re = 0.000000e+00; + constexpr double mdl_cutbd1Re = 0.000000e+00; + constexpr double mdl_cbu8 = 0.000000e+00; + constexpr double mdl_ctd8 = 0.000000e+00; + constexpr double mdl_ctb8 = 0.000000e+00; + constexpr double mdl_cud8 = 0.000000e+00; + constexpr double mdl_cbu1 = 0.000000e+00; + constexpr double mdl_ctd1 = 0.000000e+00; + constexpr double mdl_ctb1 = 0.000000e+00; + constexpr double mdl_cud1 = 0.000000e+00; + constexpr double mdl_cbd8 = 0.000000e+00; + constexpr double mdl_cbd1 = 0.000000e+00; + constexpr double mdl_cbb = 0.000000e+00; + constexpr double mdl_cdd8 = 0.000000e+00; + constexpr double mdl_cdd1 = 0.000000e+00; + constexpr double mdl_ctu8 = 0.000000e+00; + constexpr double mdl_ctu1 = 0.000000e+00; + constexpr double mdl_ctt = 0.000000e+00; + constexpr double mdl_cuu8 = 0.000000e+00; + constexpr double mdl_cuu1 = 0.000000e+00; + constexpr double mdl_cQQ8 = 0.000000e+00; + constexpr double mdl_cQQ1 = 0.000000e+00; + constexpr double mdl_cQj38 = 0.000000e+00; + constexpr double mdl_cQj31 = 0.000000e+00; + constexpr double mdl_cQj18 = 0.000000e+00; + constexpr double mdl_cQj11 = 0.000000e+00; + constexpr double mdl_cjj38 = 0.000000e+00; + constexpr double mdl_cjj31 = 0.000000e+00; + constexpr double mdl_cjj18 = 0.000000e+00; + constexpr double mdl_cjj11 = 0.000000e+00; + constexpr double mdl_cHtbRe = 0.000000e+00; + constexpr double mdl_cHudRe = 0.000000e+00; + constexpr double mdl_cHbq = 0.000000e+00; + constexpr double mdl_cHd = 0.000000e+00; + constexpr double mdl_cHt = 0.000000e+00; + constexpr double mdl_cHu = 0.000000e+00; + constexpr double mdl_cHQ3 = 0.000000e+00; + constexpr double mdl_cHj3 = 0.000000e+00; + constexpr double mdl_cHQ1 = 0.000000e+00; + constexpr double mdl_cHj1 = 0.000000e+00; + constexpr double mdl_cbBRe = 0.000000e+00; + constexpr double mdl_cdBRe = 0.000000e+00; + constexpr double mdl_cbWRe = 0.000000e+00; + constexpr double mdl_cdWRe = 0.000000e+00; + constexpr double mdl_cbGRe = 0.000000e+00; + constexpr double mdl_cdGRe = 0.000000e+00; + constexpr double mdl_ctBRe = 0.000000e+00; + constexpr double mdl_cuBRe = 0.000000e+00; + constexpr double mdl_ctWRe = 0.000000e+00; + constexpr double mdl_cuWRe = 0.000000e+00; + constexpr double mdl_ctGRe = 0.000000e+00; + constexpr double mdl_cuGRe = 0.000000e+00; + constexpr double mdl_cbHRe = 0.000000e+00; + constexpr double mdl_cdHRe = 0.000000e+00; + constexpr double mdl_ctHRe = 0.000000e+00; + constexpr double mdl_cuHRe = 0.000000e+00; + constexpr double mdl_cHWB = 0.000000e+00; + constexpr double mdl_cHB = 0.000000e+00; + constexpr double mdl_cHW = 0.000000e+00; + constexpr double mdl_cHG = 0.000000e+00; + constexpr double mdl_cHDD = 0.000000e+00; + constexpr double mdl_cHbox = 0.000000e+00; + constexpr double mdl_cH = 0.000000e+00; + constexpr double mdl_cW = 0.000000e+00; + constexpr double mdl_cG = 0.000000e+00; + constexpr double mdl_MH = 1.250900e+02; + constexpr double mdl_MZ = 9.118760e+01; + constexpr double mdl_MTA = 1.777000e+00; + constexpr double mdl_MMU = 1.056600e-01; + constexpr double mdl_Me = 5.110000e-04; + constexpr double mdl_MT = 1.727600e+02; + constexpr double mdl_MB = 4.180000e+00; +
constexpr double mdl_MC = 1.270000e+00; + constexpr double mdl_MS = 9.300000e-02; + constexpr double mdl_MU = 2.160000e-03; + constexpr double mdl_MD = 4.670000e-03; + constexpr cxsmpl<double> mdl_complexi = cxsmpl<double>( 0., 1. ); + constexpr cxsmpl<double> mdl_cuH = mdl_cuHRe + mdl_cuHIm * mdl_complexi; + constexpr cxsmpl<double> mdl_ctHH = mdl_ctHRe + mdl_ctHIm * mdl_complexi; + constexpr cxsmpl<double> mdl_cdH = mdl_cdHRe + mdl_cdHIm * mdl_complexi; + constexpr cxsmpl<double> mdl_cbH = mdl_cbHRe + mdl_cbHIm * mdl_complexi; + constexpr cxsmpl<double> mdl_cuG = mdl_cuGRe + mdl_cuGIm * mdl_complexi; + constexpr cxsmpl<double> mdl_ctG = mdl_ctGRe + mdl_ctGIm * mdl_complexi; + constexpr cxsmpl<double> mdl_cuW = mdl_cuWRe + mdl_cuWIm * mdl_complexi; + constexpr cxsmpl<double> mdl_ctW = mdl_ctWRe + mdl_ctWIm * mdl_complexi; + constexpr cxsmpl<double> mdl_cuB = mdl_cuBRe + mdl_cuBIm * mdl_complexi; + constexpr cxsmpl<double> mdl_ctB = mdl_ctBRe + mdl_ctBIm * mdl_complexi; + constexpr cxsmpl<double> mdl_cdG = mdl_cdGRe + mdl_cdGIm * mdl_complexi; + constexpr cxsmpl<double> mdl_cbG = mdl_cbGRe + mdl_cbGIm * mdl_complexi; + constexpr cxsmpl<double> mdl_cdW = mdl_cdWRe + mdl_cdWIm * mdl_complexi; + constexpr cxsmpl<double> mdl_cbW = mdl_cbWRe + mdl_cbWIm * mdl_complexi; + constexpr cxsmpl<double> mdl_cdB = mdl_cdBRe + mdl_cdBIm * mdl_complexi; + constexpr cxsmpl<double> mdl_cbBB = mdl_cbBRe + mdl_cbBIm * mdl_complexi; + constexpr cxsmpl<double> mdl_cHud = mdl_cHudRe + mdl_cHudIm * mdl_complexi; + constexpr cxsmpl<double> mdl_cHtb = mdl_cHtbRe + mdl_cHtbIm * mdl_complexi; + constexpr cxsmpl<double> mdl_cutbd1 = mdl_cutbd1Re + mdl_cutbd1Im * mdl_complexi; + constexpr cxsmpl<double> mdl_cutbd8 = mdl_cutbd8Re + mdl_cutbd8Im * mdl_complexi; + constexpr cxsmpl<double> mdl_cjQtu1 = mdl_cjQtu1Re + mdl_cjQtu1Im * mdl_complexi; + constexpr cxsmpl<double> mdl_cjQtu8 = mdl_cjQtu8Re + mdl_cjQtu8Im * mdl_complexi; + constexpr cxsmpl<double> mdl_cjQbd1 = mdl_cjQbd1Re + mdl_cjQbd1Im * mdl_complexi; + constexpr cxsmpl<double> mdl_cjQbd8 = mdl_cjQbd8Re + mdl_cjQbd8Im * mdl_complexi; + constexpr cxsmpl<double> mdl_cjujd1 = mdl_cjujd1Re + mdl_cjujd1Im * mdl_complexi; + constexpr cxsmpl<double> mdl_cjujd8 = mdl_cjujd8Re + mdl_cjujd8Im * mdl_complexi; + constexpr cxsmpl<double> mdl_cjujd11 = mdl_cjujd11Re + mdl_cjujd11Im * mdl_complexi; + constexpr cxsmpl<double> mdl_cjujd81 = mdl_cjujd81Re + mdl_cjujd81Im * mdl_complexi; + constexpr cxsmpl<double> mdl_cQtjd1 = mdl_cQtjd1Re + mdl_cQtjd1Im * mdl_complexi; + constexpr cxsmpl<double> mdl_cQtjd8 = mdl_cQtjd8Re + mdl_cQtjd8Im * mdl_complexi; + constexpr cxsmpl<double> mdl_cjuQb1 = mdl_cjuQb1Re + mdl_cjuQb1Im * mdl_complexi; + constexpr cxsmpl<double> mdl_cjuQb8 = mdl_cjuQb8Re + mdl_cjuQb8Im * mdl_complexi; + constexpr cxsmpl<double> mdl_cQujb1 = mdl_cQujb1Re + mdl_cQujb1Im * mdl_complexi; + constexpr cxsmpl<double> mdl_cQujb8 = mdl_cQujb8Re + mdl_cQujb8Im * mdl_complexi; + constexpr cxsmpl<double> mdl_cjtQd1 = mdl_cjtQd1Re + mdl_cjtQd1Im * mdl_complexi; + constexpr cxsmpl<double> mdl_cjtQd8 = mdl_cjtQd8Re + mdl_cjtQd8Im * mdl_complexi; + constexpr cxsmpl<double> mdl_cQtQb1 = mdl_cQtQb1Re + mdl_cQtQb1Im * mdl_complexi; + constexpr cxsmpl<double> mdl_cQtQb8 = mdl_cQtQb8Re + mdl_cQtQb8Im * mdl_complexi; + constexpr cxsmpl<double> mdl_ceH = mdl_ceHRe + mdl_ceHIm * mdl_complexi; + constexpr cxsmpl<double> mdl_ceW = mdl_ceWRe + mdl_ceWIm * mdl_complexi; + constexpr cxsmpl<double> mdl_ceB = mdl_ceBRe + mdl_ceBIm * mdl_complexi; + constexpr cxsmpl<double> mdl_cledj = mdl_cledjRe + mdl_cledjIm * mdl_complexi; + constexpr cxsmpl<double> mdl_clebQ = mdl_clebQRe + mdl_clebQIm * mdl_complexi; + constexpr cxsmpl<double> mdl_cleju1 = mdl_cleju1Re + mdl_cleju1Im * mdl_complexi; + constexpr cxsmpl<double> mdl_cleju3 = mdl_cleju3Re + mdl_cleju3Im * mdl_complexi; + constexpr cxsmpl<double> mdl_cleQt1 = mdl_cleQt1Re + mdl_cleQt1Im * mdl_complexi; + constexpr
cxsmpl<double> mdl_cleQt3 = mdl_cleQt3Re + mdl_cleQt3Im * mdl_complexi; + constexpr double mdl_MWsm = mdl_MW; + constexpr double mdl_MW__exp__2 = ( ( mdl_MW ) * ( mdl_MW ) ); + constexpr double mdl_MZ__exp__2 = ( ( mdl_MZ ) * ( mdl_MZ ) ); + constexpr double mdl_sqrt__2 = constexpr_sqrt( 2. ); + constexpr double mdl_nb__2__exp__0_25 = constexpr_pow( 2., 0.25 ); + constexpr double mdl_MH__exp__2 = ( ( mdl_MH ) * ( mdl_MH ) ); + constexpr double mdl_sth2 = 1. - mdl_MW__exp__2 / mdl_MZ__exp__2; + constexpr double mdl_nb__10__exp___m_40 = constexpr_pow( 10., -40. ); + constexpr double mdl_propCorr = ABS( mdl_linearPropCorrections ) / ( ABS( mdl_linearPropCorrections ) + mdl_nb__10__exp___m_40 ); + constexpr double mdl_MZ1 = mdl_MZ; + constexpr double mdl_MH1 = mdl_MH; + constexpr double mdl_MT1 = mdl_MT; + constexpr double mdl_WZ1 = mdl_WZ; + constexpr double mdl_WW1 = mdl_WW; + constexpr double mdl_WH1 = mdl_WH; + constexpr double mdl_WT1 = mdl_WT; + constexpr double mdl_cth = constexpr_sqrt( 1. - mdl_sth2 ); + constexpr double mdl_MW1 = mdl_MWsm; + constexpr double mdl_sqrt__sth2 = constexpr_sqrt( mdl_sth2 ); + constexpr double mdl_sth = mdl_sqrt__sth2; + constexpr double mdl_LambdaSMEFT__exp__2 = ( ( mdl_LambdaSMEFT ) * ( mdl_LambdaSMEFT ) ); + constexpr cxsmpl<double> mdl_conjg__cbH = conj( mdl_cbH ); + constexpr cxsmpl<double> mdl_conjg__ctHH = conj( mdl_ctHH ); + constexpr double mdl_MT__exp__2 = ( ( mdl_MT ) * ( mdl_MT ) ); + constexpr double mdl_MH__exp__6 = constexpr_pow( mdl_MH, 6. ); + constexpr double mdl_MWsm__exp__6 = constexpr_pow( mdl_MWsm, 6. ); + constexpr double mdl_MH__exp__4 = ( ( mdl_MH ) * ( mdl_MH ) * ( mdl_MH ) * ( mdl_MH ) ); + constexpr double mdl_MWsm__exp__4 = ( ( mdl_MWsm ) * ( mdl_MWsm ) * ( mdl_MWsm ) * ( mdl_MWsm ) ); + constexpr double mdl_MWsm__exp__2 = ( ( mdl_MWsm ) * ( mdl_MWsm ) ); + constexpr double mdl_MZ__exp__4 = ( ( mdl_MZ ) * ( mdl_MZ ) * ( mdl_MZ ) * ( mdl_MZ ) ); + constexpr double mdl_MZ__exp__6 = constexpr_pow( mdl_MZ, 6. ); + constexpr double mdl_cth__exp__2 = ( ( mdl_cth ) * ( mdl_cth ) ); + constexpr double mdl_sth__exp__2 = ( ( mdl_sth ) * ( mdl_sth ) ); + constexpr double mdl_MB__exp__2 = ( ( mdl_MB ) * ( mdl_MB ) ); + constexpr double mdl_MZ__exp__3 = ( ( mdl_MZ ) * ( mdl_MZ ) * ( mdl_MZ ) ); + constexpr double mdl_sth__exp__4 = ( ( mdl_sth ) * ( mdl_sth ) * ( mdl_sth ) * ( mdl_sth ) ); + constexpr double mdl_sth__exp__6 = constexpr_pow( mdl_sth, 6. ); + constexpr double mdl_sth__exp__3 = ( ( mdl_sth ) * ( mdl_sth ) * ( mdl_sth ) ); + constexpr double mdl_sth__exp__5 = constexpr_pow( mdl_sth, 5. ); + constexpr double mdl_propCorr__exp__2 = ( ( mdl_propCorr ) * ( mdl_propCorr ) ); + constexpr double mdl_propCorr__exp__3 = ( ( mdl_propCorr ) * ( mdl_propCorr ) * ( mdl_propCorr ) ); + constexpr double mdl_propCorr__exp__4 = ( ( mdl_propCorr ) * ( mdl_propCorr ) * ( mdl_propCorr ) * ( mdl_propCorr ) ); + constexpr double mdl_cth__exp__3 = ( ( mdl_cth ) * ( mdl_cth ) * ( mdl_cth ) ); + constexpr double mdl_aEW = ( mdl_Gf * mdl_MW__exp__2 * ( 1. - mdl_MW__exp__2 / mdl_MZ__exp__2 ) * mdl_sqrt__2 ) / M_PI; + constexpr double mdl_sqrt__Gf = constexpr_sqrt( mdl_Gf ); + constexpr double mdl_vevhat = 1. / ( mdl_nb__2__exp__0_25 * mdl_sqrt__Gf ); + constexpr double mdl_lam = ( mdl_Gf * mdl_MH__exp__2 ) / mdl_sqrt__2; + constexpr double mdl_sqrt__aEW = constexpr_sqrt( mdl_aEW ); + constexpr double mdl_ee = 2.
* mdl_sqrt__aEW * constexpr_sqrt( M_PI ); + constexpr double mdl_yb = ( mdl_ymb * mdl_sqrt__2 ) / mdl_vevhat; + constexpr double mdl_yc = ( mdl_ymc * mdl_sqrt__2 ) / mdl_vevhat; + constexpr double mdl_ydo = ( mdl_ymdo * mdl_sqrt__2 ) / mdl_vevhat; + constexpr double mdl_ye = ( mdl_yme * mdl_sqrt__2 ) / mdl_vevhat; + constexpr double mdl_ym = ( mdl_ymm * mdl_sqrt__2 ) / mdl_vevhat; + constexpr double mdl_ys = ( mdl_yms * mdl_sqrt__2 ) / mdl_vevhat; + constexpr double mdl_yt = ( mdl_ymt * mdl_sqrt__2 ) / mdl_vevhat; + constexpr double mdl_ytau = ( mdl_ymtau * mdl_sqrt__2 ) / mdl_vevhat; + constexpr double mdl_yup = ( mdl_ymup * mdl_sqrt__2 ) / mdl_vevhat; + constexpr double mdl_vevhat__exp__2 = ( ( mdl_vevhat ) * ( mdl_vevhat ) ); + constexpr double mdl_dGf = ( ( 2. * mdl_cHl3 - mdl_cll1 ) * mdl_vevhat__exp__2 ) / mdl_LambdaSMEFT__exp__2; + constexpr double mdl_dkH = ( ( mdl_cHbox - mdl_cHDD / 4. ) * mdl_vevhat__exp__2 ) / mdl_LambdaSMEFT__exp__2; + constexpr double mdl_vevT = ( 1. + mdl_dGf / 2. ) * mdl_vevhat; + constexpr double mdl_g1 = mdl_ee / mdl_cth; + constexpr double mdl_gw = mdl_ee / mdl_sth; + constexpr cxsmpl<double> mdl_yb0 = ( 1. - mdl_dGf / 2. ) * mdl_yb + ( mdl_vevhat__exp__2 * mdl_conjg__cbH ) / ( 2. * mdl_LambdaSMEFT__exp__2 ); + constexpr cxsmpl<double> mdl_yt0 = ( 1. - mdl_dGf / 2. ) * mdl_yt + ( mdl_vevhat__exp__2 * mdl_conjg__ctHH ) / ( 2. * mdl_LambdaSMEFT__exp__2 ); + constexpr double mdl_ee__exp__2 = ( ( mdl_ee ) * ( mdl_ee ) ); + constexpr double mdl_gHaa = ( mdl_ee__exp__2 * ( -1.75 + ( 4. * ( 0.3333333333333333 + ( 7. * mdl_MH__exp__2 ) / ( 360. * mdl_MT__exp__2 ) ) ) / 3. - ( 29. * mdl_MH__exp__6 ) / ( 16800. * mdl_MWsm__exp__6 ) - ( 19. * mdl_MH__exp__4 ) / ( 1680. * mdl_MWsm__exp__4 ) - ( 11. * mdl_MH__exp__2 ) / ( 120. * mdl_MWsm__exp__2 ) ) ) / ( 8. * ( ( M_PI ) * ( M_PI ) ) ); + constexpr double mdl_gHza = ( mdl_ee__exp__2 * ( ( ( 0.4583333333333333 + ( 29. * mdl_MH__exp__6 ) / ( 100800. * mdl_MWsm__exp__6 ) + ( 19. * mdl_MH__exp__4 ) / ( 10080. * mdl_MWsm__exp__4 ) + ( 11. * mdl_MH__exp__2 ) / ( 720. * mdl_MWsm__exp__2 ) + ( mdl_MH__exp__4 * mdl_MZ__exp__2 ) / ( 2100. * mdl_MWsm__exp__6 ) + ( mdl_MH__exp__2 * mdl_MZ__exp__2 ) / ( 280. * mdl_MWsm__exp__4 ) + ( 7. * mdl_MZ__exp__2 ) / ( 180. * mdl_MWsm__exp__2 ) + ( 67. * mdl_MH__exp__2 * mdl_MZ__exp__4 ) / ( 100800. * mdl_MWsm__exp__6 ) + ( 53. * mdl_MZ__exp__4 ) / ( 10080. * mdl_MWsm__exp__4 ) + ( 43. * mdl_MZ__exp__6 ) / ( 50400. * mdl_MWsm__exp__6 ) - ( 31. * mdl_cth__exp__2 ) / ( 24. * mdl_sth__exp__2 ) - ( 29. * mdl_cth__exp__2 * mdl_MH__exp__6 ) / ( 20160. * mdl_MWsm__exp__6 * mdl_sth__exp__2 ) - ( 19. * mdl_cth__exp__2 * mdl_MH__exp__4 ) / ( 2016. * mdl_MWsm__exp__4 * mdl_sth__exp__2 ) - ( 11. * mdl_cth__exp__2 * mdl_MH__exp__2 ) / ( 144. * mdl_MWsm__exp__2 * mdl_sth__exp__2 ) - ( mdl_cth__exp__2 * mdl_MH__exp__4 * mdl_MZ__exp__2 ) / ( 560. * mdl_MWsm__exp__6 * mdl_sth__exp__2 ) - ( 31. * mdl_cth__exp__2 * mdl_MH__exp__2 * mdl_MZ__exp__2 ) / ( 2520. * mdl_MWsm__exp__4 * mdl_sth__exp__2 ) - ( mdl_cth__exp__2 * mdl_MZ__exp__2 ) / ( 9. * mdl_MWsm__exp__2 * mdl_sth__exp__2 ) - ( 43. * mdl_cth__exp__2 * mdl_MH__exp__2 * mdl_MZ__exp__4 ) / ( 20160. * mdl_MWsm__exp__6 * mdl_sth__exp__2 ) - ( 17. * mdl_cth__exp__2 * mdl_MZ__exp__4 ) / ( 1120. * mdl_MWsm__exp__4 * mdl_sth__exp__2 ) - ( 5. * mdl_cth__exp__2 * mdl_MZ__exp__6 ) / ( 2016. * mdl_MWsm__exp__6 * mdl_sth__exp__2 ) ) * mdl_sth ) / mdl_cth + ( ( 0.3333333333333333 + ( 7. * mdl_MH__exp__2 ) / ( 360. * mdl_MT__exp__2 ) + ( 11. * mdl_MZ__exp__2 ) / ( 360.
* mdl_MT__exp__2 ) ) * ( 0.5 - ( 4. * mdl_sth__exp__2 ) / 3. ) ) / ( mdl_cth * mdl_sth ) ) ) / ( 4. * ( ( M_PI ) * ( M_PI ) ) ); + constexpr double mdl_dMZ2 = ( ( mdl_cHDD / 2. + 2. * mdl_cHWB * mdl_cth * mdl_sth ) * mdl_vevhat__exp__2 ) / mdl_LambdaSMEFT__exp__2; + constexpr double mdl_dMH2 = 2. * mdl_dkH - ( 3. * mdl_cH * mdl_vevhat__exp__2 ) / ( 2. * mdl_lam * mdl_LambdaSMEFT__exp__2 ); + constexpr double mdl_dgw = -mdl_dGf / 2.; + constexpr double mdl_barlam = ( 1. - mdl_dGf - mdl_dMH2 ) * mdl_lam; + constexpr double mdl_dWT = 2. * mdl_WT * ( mdl_dgw + ( mdl_vevhat * ( mdl_ee * ( 3. * mdl_cHtbRe * mdl_MB * mdl_MT * mdl_MWsm__exp__2 + mdl_cHQ3 * ( ( ( mdl_MB__exp__2 - mdl_MT__exp__2 ) * ( mdl_MB__exp__2 - mdl_MT__exp__2 ) ) + ( mdl_MB__exp__2 + mdl_MT__exp__2 ) * mdl_MWsm__exp__2 - 2. * mdl_MWsm__exp__4 ) ) * mdl_vevhat + 6. * mdl_MWsm__exp__2 * ( mdl_ctWRe * mdl_MT * ( mdl_MB__exp__2 - mdl_MT__exp__2 + mdl_MWsm__exp__2 ) + mdl_cbWRe * mdl_MB * ( -mdl_MB__exp__2 + mdl_MT__exp__2 + mdl_MWsm__exp__2 ) ) * mdl_sth * mdl_sqrt__2 ) ) / ( mdl_ee * mdl_LambdaSMEFT__exp__2 * ( ( ( mdl_MB__exp__2 - mdl_MT__exp__2 ) * ( mdl_MB__exp__2 - mdl_MT__exp__2 ) ) + ( mdl_MB__exp__2 + mdl_MT__exp__2 ) * mdl_MWsm__exp__2 - 2. * mdl_MWsm__exp__4 ) ) ); + constexpr double mdl_dWW = ( 2. * mdl_dgw + ( 2. * ( 2. * mdl_cHj3 + mdl_cHl3 ) * mdl_vevhat__exp__2 ) / ( 3. * mdl_LambdaSMEFT__exp__2 ) ) * mdl_WW; + constexpr double mdl_gwsh = ( mdl_ee * ( 1. + mdl_dgw - ( mdl_cHW * mdl_vevhat__exp__2 ) / mdl_LambdaSMEFT__exp__2 ) ) / mdl_sth; + constexpr double mdl_vev = ( 1. - ( 3. * mdl_cH * mdl_vevhat__exp__2 ) / ( 8. * mdl_lam * mdl_LambdaSMEFT__exp__2 ) ) * mdl_vevT; + constexpr double mdl_dg1 = ( -mdl_dGf - mdl_dMZ2 / mdl_sth__exp__2 ) / 2.; + constexpr double mdl_dWHc = mdl_yc / ( mdl_yc + mdl_nb__10__exp___m_40 ) * ( -0.02884 * mdl_dGf + ( ( 0.05768 * mdl_cHbox - 0.01442 * mdl_cHDD - 0.05768 * mdl_cuHRe ) * mdl_vevhat__exp__2 ) / mdl_LambdaSMEFT__exp__2 ); + constexpr double mdl_dWHb = mdl_yb / ( mdl_yb + mdl_nb__10__exp___m_40 ) * ( mdl_vevhat__exp__2 * ( -1.1618 * mdl_cbHRe ) / ( mdl_LambdaSMEFT__exp__2 * ( mdl_yb + mdl_nb__10__exp___m_40 ) ) - 0.5809 * mdl_dGf + ( mdl_vevhat__exp__2 * ( 1.1618 * mdl_cHbox - 0.29045 * mdl_cHDD ) ) / ( mdl_LambdaSMEFT__exp__2 ) ); + constexpr double mdl_dWHta = mdl_ytau / ( mdl_ytau + mdl_nb__10__exp___m_40 ) * ( -0.06256 * mdl_dGf + mdl_vevhat__exp__2 * ( -0.12512 * mdl_ceHRe + 0.12512 * mdl_cHbox - 0.03128 * mdl_cHDD ) / ( mdl_LambdaSMEFT__exp__2 ) ); + constexpr double mdl_dWZ = mdl_WZ * ( -1. + ( 36. * mdl_cth * mdl_MB * mdl_MZ__exp__2 * mdl_sth * ( mdl_cbWRe * mdl_cth + mdl_cbBRe * mdl_sth ) * ( -3. + 4. * mdl_sth__exp__2 ) * mdl_vevhat * mdl_sqrt__2 * constexpr_sqrt( -4. * mdl_MB__exp__2 + mdl_MZ__exp__2 ) + mdl_ee * mdl_LambdaSMEFT__exp__2 * ( 2. * mdl_MZ__exp__3 * ( 27. + 54. * mdl_dgw - 54. * ( 1. + mdl_dg1 + mdl_dgw ) * mdl_sth__exp__2 + 76. * ( 1. + 4. * mdl_dg1 - 2. * mdl_dgw ) * mdl_sth__exp__4 + 152. * ( -mdl_dg1 + mdl_dgw ) * mdl_sth__exp__6 ) + mdl_MZ__exp__2 * ( 9. + 18. * mdl_dgw - 6. * ( 2. + mdl_dg1 + 3. * mdl_dgw ) * mdl_sth__exp__2 + 8. * ( 1. + 4. * mdl_dg1 - 2. * mdl_dgw ) * mdl_sth__exp__4 + 16. * ( -mdl_dg1 + mdl_dgw ) * mdl_sth__exp__6 ) * constexpr_sqrt( -4. * mdl_MB__exp__2 + mdl_MZ__exp__2 ) + mdl_MB__exp__2 * ( -9. - 18. * mdl_dgw - 6. * ( 4. + 11. * mdl_dg1 - 3. * mdl_dgw ) * mdl_sth__exp__2 + 16. * ( 1. + 4. * mdl_dg1 - 2. * mdl_dgw ) * mdl_sth__exp__4 + 32. * ( -mdl_dg1 + mdl_dgw ) * mdl_sth__exp__6 ) * constexpr_sqrt( -4. 
* mdl_MB__exp__2 + mdl_MZ__exp__2 ) ) + 2. * mdl_ee * mdl_vevhat__exp__2 * ( 36. * mdl_cHj3 * mdl_MZ__exp__3 + 18. * mdl_cHl3 * mdl_MZ__exp__3 + 9. * ( 3. * mdl_cHbq - mdl_cHQ1 - mdl_cHQ3 ) * mdl_MB__exp__2 * constexpr_sqrt( -4. * mdl_MB__exp__2 + mdl_MZ__exp__2 ) + 9. * mdl_cHQ1 * mdl_MZ__exp__2 * constexpr_sqrt( -4. * mdl_MB__exp__2 + mdl_MZ__exp__2 ) + 9. * mdl_cHQ3 * mdl_MZ__exp__2 * constexpr_sqrt( -4. * mdl_MB__exp__2 + mdl_MZ__exp__2 ) + 3. * mdl_cHWB * mdl_cth * ( -7. * mdl_MB__exp__2 + mdl_MZ__exp__2 ) * mdl_sth * constexpr_sqrt( -4. * mdl_MB__exp__2 + mdl_MZ__exp__2 ) + 8. * mdl_cHWB * mdl_cth * mdl_sth__exp__3 * ( 2. * mdl_MB__exp__2 * constexpr_sqrt( -4. * mdl_MB__exp__2 + mdl_MZ__exp__2 ) + mdl_MZ__exp__2 * ( 19. * mdl_MZ + constexpr_sqrt( -4. * mdl_MB__exp__2 + mdl_MZ__exp__2 ) ) ) - 8. * mdl_cHWB * mdl_cth * mdl_sth__exp__5 * ( 2. * mdl_MB__exp__2 * constexpr_sqrt( -4. * mdl_MB__exp__2 + mdl_MZ__exp__2 ) + mdl_MZ__exp__2 * ( 19. * mdl_MZ + constexpr_sqrt( -4. * mdl_MB__exp__2 + mdl_MZ__exp__2 ) ) ) - 6. * mdl_sth__exp__2 * ( 2. * ( mdl_cHbq + mdl_cHQ1 + mdl_cHQ3 ) * mdl_MB__exp__2 * constexpr_sqrt( -4. * mdl_MB__exp__2 + mdl_MZ__exp__2 ) + mdl_MZ__exp__2 * ( ( 2. * mdl_cHd + 3. * mdl_cHe - 2. * mdl_cHj1 + 3. * ( 2. * mdl_cHj3 + mdl_cHl1 + mdl_cHl3 ) - 4. * mdl_cHu ) * mdl_MZ + ( mdl_cHbq + mdl_cHQ1 + mdl_cHQ3 ) * constexpr_sqrt( -4. * mdl_MB__exp__2 + mdl_MZ__exp__2 ) ) ) ) ) / ( mdl_ee * mdl_LambdaSMEFT__exp__2 * ( 2. * mdl_MZ__exp__3 * ( 27. - 54. * mdl_sth__exp__2 + 76. * mdl_sth__exp__4 ) + mdl_MZ__exp__2 * ( 9. - 12. * mdl_sth__exp__2 + 8. * mdl_sth__exp__4 ) * constexpr_sqrt( -4. * mdl_MB__exp__2 + mdl_MZ__exp__2 ) + mdl_MB__exp__2 * ( -9. - 24. * mdl_sth__exp__2 + 16. * mdl_sth__exp__4 ) * constexpr_sqrt( -4. * mdl_MB__exp__2 + mdl_MZ__exp__2 ) ) ) ); + constexpr double mdl_g1sh = ( mdl_ee * ( 1. + mdl_dg1 - ( mdl_cHB * mdl_vevhat__exp__2 ) / mdl_LambdaSMEFT__exp__2 ) ) / mdl_cth; + constexpr double mdl_ee__exp__3 = ( ( mdl_ee ) * ( mdl_ee ) * ( mdl_ee ) ); + constexpr double mdl_vevhat__exp__3 = ( ( mdl_vevhat ) * ( mdl_vevhat ) * ( mdl_vevhat ) ); + + // Model couplings independent of aS + // (none) + + // Model parameters dependent on aS + //constexpr double mdl_sqrt__aS = //constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) + //constexpr double G = 2. * mdl_sqrt__aS * //constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) + //constexpr double mdl_gHgg2 = ( -7. * aS ) / ( 720. * M_PI ); // now computed event-by-event (running alphas #373) + //constexpr double mdl_gHgg4 = aS / ( 360. * M_PI ); // now computed event-by-event (running alphas #373) + //constexpr double mdl_gHgg5 = aS / ( 20. * M_PI ); // now computed event-by-event (running alphas #373) + //constexpr double mdl_G__exp__2 = ( ( G ) * ( G ) ); // now computed event-by-event (running alphas #373) + //constexpr double mdl_gHgg1 = mdl_G__exp__2 / ( 48. * ( ( M_PI ) * ( M_PI ) ) ); // now computed event-by-event (running alphas #373) + //constexpr double mdl_gHgg3 = ( aS * G ) / ( 60. 
* M_PI ); // now computed event-by-event (running alphas #373) + //constexpr cxsmpl mdl_G__exp__3 = ( ( G ) * ( G ) * ( G ) ); // now computed event-by-event (running alphas #373) + //constexpr double mdl_dWH = mdl_WH * ( -0.24161 * mdl_dGf + 0.96644 * mdl_dgw + 0.4832199999999999 * mdl_dkH - 0.11186509426655467 * mdl_dWW + ( 0.36410378449238195 * mdl_cHj3 * mdl_vevhat__exp__2 ) / mdl_LambdaSMEFT__exp__2 + ( 0.17608307708657747 * mdl_cHl3 * mdl_vevhat__exp__2 ) / mdl_LambdaSMEFT__exp__2 + ( 0.1636 * mdl_cHG * mdl_MT__exp__2 * mdl_vevhat__exp__2 ) / ( mdl_LambdaSMEFT__exp__2 * ( -0.5 * mdl_gHgg2 * mdl_MH__exp__2 + mdl_gHgg1 * mdl_MT__exp__2 ) ) + ( mdl_cHW * ( -0.35937785117066967 * mdl_gHaa * mdl_gHza + 0.006164 * mdl_cth * mdl_gHaa * mdl_sth + 0.00454 * mdl_gHza * mdl_sth__exp__2 ) * mdl_vevhat__exp__2 ) / ( mdl_gHaa * mdl_gHza * mdl_LambdaSMEFT__exp__2 ) + ( mdl_cHWB * ( -0.00454 * mdl_cth * mdl_gHza * mdl_sth + mdl_gHaa * ( -0.0030819999999999997 + 0.006163999999999999 * mdl_sth__exp__2 ) ) * mdl_vevhat__exp__2 ) / ( mdl_gHaa * mdl_gHza * mdl_LambdaSMEFT__exp__2 ) + ( mdl_cHB * ( -0.006163999999999999 * mdl_cth * mdl_gHaa * mdl_sth - 0.00454 * mdl_gHza * ( -1. + mdl_sth__exp__2 ) ) * mdl_vevhat__exp__2 ) / ( mdl_gHaa * mdl_gHza * mdl_LambdaSMEFT__exp__2 ) + mdl_dWHc + mdl_dWHb + mdl_dWHta ); // now computed event-by-event (running alphas #373) + + // Model couplings dependent on aS + //constexpr cxsmpl GC_6 = -( mdl_complexi * G ); // now computed event-by-event (running alphas #373) + //constexpr cxsmpl GC_7 = G; // now computed event-by-event (running alphas #373) + //constexpr cxsmpl GC_8 = mdl_complexi * mdl_G__exp__2; // now computed event-by-event (running alphas #373) + + // Print parameters that are unchanged during the run + void printIndependentParameters(); + + // Print couplings that are unchanged during the run + void printIndependentCouplings(); + + // Print parameters that are changed event by event + //void printDependentParameters(); // now computed event-by-event (running alphas #373) + + // Print couplings that are changed event by event + //void printDependentCouplings(); // now computed event-by-event (running alphas #373) +} + +#endif + +//========================================================================== + +namespace Parameters_SMEFTsim_topU3l_MwScheme_UFO_dependentCouplings +{ + constexpr size_t ndcoup = 3; // #couplings that vary event by event because they depend on the running alphas QCD + constexpr size_t idcoup_GC_6 = 0; + constexpr size_t idcoup_GC_7 = 1; + constexpr size_t idcoup_GC_8 = 2; + struct DependentCouplings_sv + { + cxtype_sv GC_6; + cxtype_sv GC_7; + cxtype_sv GC_8; + }; +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#ifdef __CUDACC__ +#pragma nv_diagnostic push +#pragma nv_diag_suppress 177 // e.g. <> +#endif + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv ) + { +#ifdef MGONGPU_HARDCODE_PARAM + using namespace Parameters_SMEFTsim_topU3l_MwScheme_UFO; +#endif + // NB: hardcode cxtype cI(0,1) instead of cxtype (or hardcoded cxsmpl) mdl_complexi (which exists in Parameters_SMEFTsim_topU3l_MwScheme_UFO) because: + // (1) mdl_complexi is always (0,1); (2) mdl_complexi is undefined in device code; (3) need cxsmpl conversion to cxtype in code below + const cxtype cI( 0., 1. ); + DependentCouplings_sv out; + // Begin non-SM (e.g. 
EFT) implementation - special handling of vectors of floats (#439) +#if not( defined MGONGPU_CPPSIMD && defined MGONGPU_FPTYPE_FLOAT ) + { + const fptype_sv& G = G_sv; + // Model parameters dependent on aS + //const fptype_sv mdl_sqrt__aS = constexpr_sqrt( aS ); + //const fptype_sv G = 2. * mdl_sqrt__aS * constexpr_sqrt( M_PI ); + const fptype_sv mdl_gHgg2 = ( -7. * aS ) / ( 720. * M_PI ); + const fptype_sv mdl_gHgg4 = aS / ( 360. * M_PI ); + const fptype_sv mdl_gHgg5 = aS / ( 20. * M_PI ); + const fptype_sv mdl_G__exp__2 = ( ( G ) * ( G ) ); + const fptype_sv mdl_gHgg1 = mdl_G__exp__2 / ( 48. * ( ( M_PI ) * ( M_PI ) ) ); + const fptype_sv mdl_gHgg3 = ( aS * G ) / ( 60. * M_PI ); + constexpr cxsmpl mdl_G__exp__3 = ( ( G ) * ( G ) * ( G ) ); + const fptype_sv mdl_dWH = mdl_WH * ( -0.24161 * mdl_dGf + 0.96644 * mdl_dgw + 0.4832199999999999 * mdl_dkH - 0.11186509426655467 * mdl_dWW + ( 0.36410378449238195 * mdl_cHj3 * mdl_vevhat__exp__2 ) / mdl_LambdaSMEFT__exp__2 + ( 0.17608307708657747 * mdl_cHl3 * mdl_vevhat__exp__2 ) / mdl_LambdaSMEFT__exp__2 + ( 0.1636 * mdl_cHG * mdl_MT__exp__2 * mdl_vevhat__exp__2 ) / ( mdl_LambdaSMEFT__exp__2 * ( -0.5 * mdl_gHgg2 * mdl_MH__exp__2 + mdl_gHgg1 * mdl_MT__exp__2 ) ) + ( mdl_cHW * ( -0.35937785117066967 * mdl_gHaa * mdl_gHza + 0.006164 * mdl_cth * mdl_gHaa * mdl_sth + 0.00454 * mdl_gHza * mdl_sth__exp__2 ) * mdl_vevhat__exp__2 ) / ( mdl_gHaa * mdl_gHza * mdl_LambdaSMEFT__exp__2 ) + ( mdl_cHWB * ( -0.00454 * mdl_cth * mdl_gHza * mdl_sth + mdl_gHaa * ( -0.0030819999999999997 + 0.006163999999999999 * mdl_sth__exp__2 ) ) * mdl_vevhat__exp__2 ) / ( mdl_gHaa * mdl_gHza * mdl_LambdaSMEFT__exp__2 ) + ( mdl_cHB * ( -0.006163999999999999 * mdl_cth * mdl_gHaa * mdl_sth - 0.00454 * mdl_gHza * ( -1. + mdl_sth__exp__2 ) ) * mdl_vevhat__exp__2 ) / ( mdl_gHaa * mdl_gHza * mdl_LambdaSMEFT__exp__2 ) + mdl_dWHc + mdl_dWHb + mdl_dWHta ); + // Model couplings dependent on aS + out.GC_6 = -( cI * G ); + out.GC_7 = G; + out.GC_8 = cI * mdl_G__exp__2; + } +#else + // ** NB #439: special handling is necessary ONLY FOR VECTORS OF FLOATS (variable Gs are vector floats, fixed parameters are scalar doubles) + // Use an explicit loop to avoid <> + // Problems may come e.g. in EFTs from multiplying a vector float (related to aS-dependent G) by a scalar double (aS-independent parameters) + fptype_v GC_6r_v; + fptype_v GC_6i_v; + fptype_v GC_7r_v; + fptype_v GC_7i_v; + fptype_v GC_8r_v; + fptype_v GC_8i_v; + for( int i = 0; i < neppV; i++ ) + { + const fptype& G = G_sv[i]; + // Model parameters dependent on aS + //const fptype mdl_sqrt__aS = constexpr_sqrt( aS ); + //const fptype G = 2. * mdl_sqrt__aS * constexpr_sqrt( M_PI ); + const fptype mdl_gHgg2 = ( -7. * aS ) / ( 720. * M_PI ); + const fptype mdl_gHgg4 = aS / ( 360. * M_PI ); + const fptype mdl_gHgg5 = aS / ( 20. * M_PI ); + const fptype mdl_G__exp__2 = ( ( G ) * ( G ) ); + const fptype mdl_gHgg1 = mdl_G__exp__2 / ( 48. * ( ( M_PI ) * ( M_PI ) ) ); + const fptype mdl_gHgg3 = ( aS * G ) / ( 60. 
* M_PI ); + constexpr cxsmpl mdl_G__exp__3 = ( ( G ) * ( G ) * ( G ) ); + const fptype mdl_dWH = mdl_WH * ( -0.24161 * mdl_dGf + 0.96644 * mdl_dgw + 0.4832199999999999 * mdl_dkH - 0.11186509426655467 * mdl_dWW + ( 0.36410378449238195 * mdl_cHj3 * mdl_vevhat__exp__2 ) / mdl_LambdaSMEFT__exp__2 + ( 0.17608307708657747 * mdl_cHl3 * mdl_vevhat__exp__2 ) / mdl_LambdaSMEFT__exp__2 + ( 0.1636 * mdl_cHG * mdl_MT__exp__2 * mdl_vevhat__exp__2 ) / ( mdl_LambdaSMEFT__exp__2 * ( -0.5 * mdl_gHgg2 * mdl_MH__exp__2 + mdl_gHgg1 * mdl_MT__exp__2 ) ) + ( mdl_cHW * ( -0.35937785117066967 * mdl_gHaa * mdl_gHza + 0.006164 * mdl_cth * mdl_gHaa * mdl_sth + 0.00454 * mdl_gHza * mdl_sth__exp__2 ) * mdl_vevhat__exp__2 ) / ( mdl_gHaa * mdl_gHza * mdl_LambdaSMEFT__exp__2 ) + ( mdl_cHWB * ( -0.00454 * mdl_cth * mdl_gHza * mdl_sth + mdl_gHaa * ( -0.0030819999999999997 + 0.006163999999999999 * mdl_sth__exp__2 ) ) * mdl_vevhat__exp__2 ) / ( mdl_gHaa * mdl_gHza * mdl_LambdaSMEFT__exp__2 ) + ( mdl_cHB * ( -0.006163999999999999 * mdl_cth * mdl_gHaa * mdl_sth - 0.00454 * mdl_gHza * ( -1. + mdl_sth__exp__2 ) ) * mdl_vevhat__exp__2 ) / ( mdl_gHaa * mdl_gHza * mdl_LambdaSMEFT__exp__2 ) + mdl_dWHc + mdl_dWHb + mdl_dWHta ); + // Model couplings dependent on aS + const cxtype GC_6 = -( cI * G ); + const cxtype GC_7 = G; + const cxtype GC_8 = cI * mdl_G__exp__2; + GC_6r_v[i] = cxreal( GC_6 ); + GC_6i_v[i] = cximag( GC_6 ); + GC_7r_v[i] = cxreal( GC_7 ); + GC_7i_v[i] = cximag( GC_7 ); + GC_8r_v[i] = cxreal( GC_8 ); + GC_8i_v[i] = cximag( GC_8 ); + } + out.GC_6 = cxtype_v( GC_6r_v, GC_6i_v ); + out.GC_7 = cxtype_v( GC_7r_v, GC_7i_v ); + out.GC_8 = cxtype_v( GC_8r_v, GC_8i_v ); +#endif + // End non-SM (e.g. EFT) implementation - special handling of vectors of floats (#439) + return out; + } +#ifdef __CUDACC__ +#pragma GCC diagnostic pop +#pragma nv_diagnostic pop +#endif +} + +//========================================================================== + +namespace Parameters_SMEFTsim_topU3l_MwScheme_UFO_independentCouplings +{ + constexpr size_t nicoup = 0; // #couplings that are fixed for all events because they do not depend on the running alphas QCD + // NB: there are no aS-independent couplings in this physics process +} + +//========================================================================== + +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ +#pragma GCC diagnostic push +#ifndef __clang__ +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> +#endif + // Compute the output couplings (e.g. 
gc10 and gc11) from the input gs + template + __device__ inline void + G2COUP( const fptype gs[], + fptype couplings[] ) + { + mgDebug( 0, __FUNCTION__ ); + using namespace Parameters_SMEFTsim_topU3l_MwScheme_UFO_dependentCouplings; + const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); + DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); + fptype* GC_6s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_6 ); + fptype* GC_7s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_7 ); + fptype* GC_8s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_8 ); + cxtype_sv_ref GC_6s_sv = C_ACCESS::kernelAccess( GC_6s ); + cxtype_sv_ref GC_7s_sv = C_ACCESS::kernelAccess( GC_7s ); + cxtype_sv_ref GC_8s_sv = C_ACCESS::kernelAccess( GC_8s ); + GC_6s_sv = couplings_sv.GC_6; + GC_7s_sv = couplings_sv.GC_7; + GC_8s_sv = couplings_sv.GC_8; + mgDebug( 1, __FUNCTION__ ); + return; + } +#pragma GCC diagnostic pop +} + +//========================================================================== + +#endif // Parameters_SMEFTsim_topU3l_MwScheme_UFO_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/cudacpp_src.mk b/epochX/cudacpp/smeft_gg_tttt.sa/src/cudacpp_src.mk new file mode 100644 index 0000000000..87e1ae946d --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/cudacpp_src.mk @@ -0,0 +1,268 @@ +#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) +#=== NB: assume that the same name (e.g. cudacpp.mk, Makefile...) is used in the Subprocess and src directories + +THISMK = $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) + +#------------------------------------------------------------------------------- + +#=== Use bash in the Makefile (https://www.gnu.org/software/make/manual/html_node/Choosing-the-Shell.html) + +SHELL := /bin/bash + +#------------------------------------------------------------------------------- + +#=== Configure common compiler flags for CUDA and C++ + +INCFLAGS = -I. 
+OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here + +#------------------------------------------------------------------------------- + +#=== Configure the C++ compiler + +CXXFLAGS = $(OPTFLAGS) -std=c++17 $(INCFLAGS) $(USE_NVTX) -fPIC -Wall -Wshadow -Wextra +ifeq ($(shell $(CXX) --version | grep ^nvc++),) +CXXFLAGS+= -ffast-math # see issue #117 +endif +###CXXFLAGS+= -Ofast # performance is not different from --fast-math +###CXXFLAGS+= -g # FOR DEBUGGING ONLY + +# Note: AR, CXX and FC are implicitly defined if not set externally +# See https://www.gnu.org/software/make/manual/html_node/Implicit-Variables.html +###RANLIB = ranlib + +#------------------------------------------------------------------------------- + +#=== Configure ccache for CUDA and C++ builds + +# Enable ccache if USECCACHE=1 +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX:=ccache $(CXX) +endif +#ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) +# override AR:=ccache $(AR) +#endif +#ifneq ($(NVCC),) +# ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) +# override NVCC:=ccache $(NVCC) +# endif +#endif + +#------------------------------------------------------------------------------- + +#=== Configure PowerPC-specific compiler flags for CUDA and C++ + +# Assuming uname is available, detect if architecture is PowerPC +UNAME_P := $(shell uname -p) + +# PowerPC-specific CXX compiler flags (being reviewed) +ifeq ($(UNAME_P),ppc64le) + CXXFLAGS+= -mcpu=power9 -mtune=power9 # gains ~2-3% both for none and sse4 + # Throughput references without the extra flags below: none=1.41-1.42E6, sse4=2.15-2.19E6 + ###CXXFLAGS+= -DNO_WARN_X86_INTRINSICS # no change + ###CXXFLAGS+= -fpeel-loops # no change + ###CXXFLAGS+= -funroll-loops # gains ~1% for none, loses ~1% for sse4 + ###CXXFLAGS+= -ftree-vectorize # no change + ###CXXFLAGS+= -flto # BUILD ERROR IF THIS ADDED IN SRC?! 
+else + ###AR=gcc-ar # needed by -flto + ###RANLIB=gcc-ranlib # needed by -flto + ###CXXFLAGS+= -flto # NB: build error from src/Makefile unless gcc-ar and gcc-ranlib are used + ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) +endif + +#------------------------------------------------------------------------------- + +#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN + +# Set the build flags appropriate to OMPFLAGS +###$(info OMPFLAGS=$(OMPFLAGS)) +CXXFLAGS += $(OMPFLAGS) + +# Set the build flags appropriate to each AVX choice (example: "make AVX=none") +# [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] +# [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] +$(info AVX=$(AVX)) +ifeq ($(UNAME_P),ppc64le) + ifeq ($(AVX),sse4) + override AVXFLAGS = -D__SSE4_2__ # Power9 VSX with 128 width (VSR registers) + else ifneq ($(AVX),none) + $(error Unknown AVX='$(AVX)': only 'none' and 'sse4' are supported on PowerPC for the moment) + endif +else ifeq ($(UNAME_P),arm) + ifeq ($(AVX),sse4) + override AVXFLAGS = -D__SSE4_2__ # ARM NEON with 128 width (Q/quadword registers) + else ifneq ($(AVX),none) + $(error Unknown AVX='$(AVX)': only 'none' and 'sse4' are supported on ARM for the moment) + endif +else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 + ifeq ($(AVX),none) + override AVXFLAGS = -mno-sse3 # no SIMD + else ifeq ($(AVX),sse4) + override AVXFLAGS = -mno-avx # SSE4.2 with 128 width (xmm registers) + else ifeq ($(AVX),avx2) + override AVXFLAGS = -march=haswell # AVX2 with 256 width (ymm registers) [DEFAULT for clang] + else ifeq ($(AVX),512y) + override AVXFLAGS = -march=skylake -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] + else ifeq ($(AVX),512z) + override AVXFLAGS = -march=skylake -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) + else + $(error Unknown AVX='$(AVX)': only 'none', 'sse4', 'avx2', '512y' and '512z' are supported) + endif +else + ifeq ($(AVX),sse4) + override AVXFLAGS = -march=nehalem # SSE4.2 with 128 width (xmm registers) + else ifeq ($(AVX),avx2) + override AVXFLAGS = -march=haswell # AVX2 with 256 width (ymm registers) [DEFAULT for clang] + else ifeq ($(AVX),512y) + override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] + else ifeq ($(AVX),512z) + override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) + else ifneq ($(AVX),none) + $(error Unknown AVX='$(AVX)': only 'none', 'sse4', 'avx2', '512y' and '512z' are supported) + endif +endif +# For the moment, use AVXFLAGS everywhere: eventually, use them only in encapsulated implementations? 
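+# Illustrative mapping (not part of the generated logic): on a plain x86-64 gcc/clang build the choices above
+# translate for example into "make AVX=sse4" -> -march=nehalem, "make AVX=avx2" -> -march=haswell,
+# "make AVX=512y" -> -march=skylake-avx512 -mprefer-vector-width=256, "make AVX=512z" -> -march=skylake-avx512 -DMGONGPU_PVW512.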
+CXXFLAGS+= $(AVXFLAGS) + +# Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") +###$(info FPTYPE=$(FPTYPE)) +ifeq ($(FPTYPE),d) + CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE +else ifeq ($(FPTYPE),f) + CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT +else ifeq ($(FPTYPE),m) + CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT +else + $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) +endif + +# Set the build flags appropriate to each HELINL choice (example: "make HELINL=1") +###$(info HELINL=$(HELINL)) +ifeq ($(HELINL),1) + CXXFLAGS += -DMGONGPU_INLINE_HELAMPS +else ifneq ($(HELINL),0) + $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) +endif + +# Set the build flags appropriate to each HRDCOD choice (example: "make HRDCOD=1") +###$(info HRDCOD=$(HRDCOD)) +ifeq ($(HRDCOD),1) + CXXFLAGS += -DMGONGPU_HARDCODE_PARAM +else ifneq ($(HRDCOD),0) + $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) +endif + +# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") +###$(info RNDGEN=$(RNDGEN)) +ifeq ($(RNDGEN),hasNoCurand) + CXXFLAGS += -DMGONGPU_HAS_NO_CURAND +else ifneq ($(RNDGEN),hasCurand) + $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) +endif + +#------------------------------------------------------------------------------- + +#=== Configure build directories and build lockfiles === + +# Build directory "short" tag (defines target and path to the optional build directory) +# (Rationale: keep directory names shorter, e.g. do not include random number generator choice) +override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) + +# Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) +# (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) + +# Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 +###$(info Current directory is $(shell pwd)) +ifeq ($(USEBUILDDIR),1) + override BUILDDIR = build.$(DIRTAG) + override LIBDIRREL = ../lib/$(BUILDDIR) + ###$(info Building in BUILDDIR=$(BUILDDIR) for tag=$(TAG) (USEBUILDDIR=1 is set)) +else + override BUILDDIR = . + override LIBDIRREL = ../lib + ###$(info Building in BUILDDIR=$(BUILDDIR) for tag=$(TAG) (USEBUILDDIR is not set)) +endif +######$(info Building in BUILDDIR=$(BUILDDIR) for tag=$(TAG)) + +# Workaround for Mac #375 (I did not manage to fix rpath with @executable_path): use absolute paths for LIBDIR +# (NB: this is quite ugly because it creates the directory if it does not exist - to avoid removing src by mistake) +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),Darwin) +override LIBDIR = $(shell mkdir -p $(LIBDIRREL); cd $(LIBDIRREL); pwd) +ifeq ($(wildcard $(LIBDIR)),) +$(error Directory LIBDIR="$(LIBDIR)" should have been created by now) +endif +else +override LIBDIR = $(LIBDIRREL) +endif + +#=============================================================================== +#=== Makefile TARGETS and build rules below +#=============================================================================== + +# NB1: there are no CUDA targets in src as we avoid RDC! +# NB2: CUDA includes for curand.h are no longer needed in the C++ code anywhere in src! 
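+# Illustrative example (not part of the generated makefile): with "make USEBUILDDIR=1 AVX=avx2 FPTYPE=f HELINL=0 HRDCOD=0 RNDGEN=hasCurand"
+# the definitions above give DIRTAG=avx2_f_inl0_hrd0 and TAG=avx2_f_inl0_hrd0_hasCurand, i.e. objects are built in
+# build.avx2_f_inl0_hrd0 and the common library is installed in ../lib/build.avx2_f_inl0_hrd0.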
+ +MG5AMC_COMMONLIB = mg5amc_common + +# First target (default goal) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so + +# Target (and build options): debug +debug: OPTFLAGS = -g -O0 -DDEBUG2 +debug: all.$(TAG) + +# Target: tag-specific build lockfiles +override oldtagsb=`if [ -d $(BUILDDIR) ]; then find $(BUILDDIR) -maxdepth 1 -name '.build.*' ! -name '.build.$(TAG)' -exec echo $(shell pwd)/{} \; ; fi` +override oldtagsl=`if [ -d $(LIBDIR) ]; then find $(LIBDIR) -maxdepth 1 -name '.build.*' ! -name '.build.$(TAG)' -exec echo $(shell pwd)/{} \; ; fi` + +$(BUILDDIR)/.build.$(TAG): $(LIBDIR)/.build.$(TAG) + +$(LIBDIR)/.build.$(TAG): + @if [ "$(oldtagsl)" != "" ]; then echo -e "Cannot build for tag=$(TAG) as old builds exist in $(LIBDIR) for other tags:\n$(oldtagsl)\nPlease run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi + @if [ "$(oldtagsb)" != "" ]; then echo -e "Cannot build for tag=$(TAG) as old builds exist in $(BUILDDIR) for other tags:\n$(oldtagsb)\nPlease run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi + @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi + @touch $(LIBDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + @touch $(BUILDDIR)/.build.$(TAG) + +#------------------------------------------------------------------------------- + +# Generic target and build rules: objects from C++ compilation +$(BUILDDIR)/%.o : %.cc *.h + @if [ ! -d $(BUILDDIR) ]; then mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $< -o $@ + +#------------------------------------------------------------------------------- + +cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_SMEFTsim_topU3l_MwScheme_UFO.o read_slha.o) + +# Target (and build rules): common (src) library +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) + @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi + $(CXX) -shared -o$@ $(cxx_objects) + +#------------------------------------------------------------------------------- + +# Target: clean the builds +.PHONY: clean + +clean: +ifeq ($(USEBUILDDIR),1) + rm -rf $(LIBDIR) + rm -rf $(BUILDDIR) +else + rm -f $(LIBDIR)/.build.* $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so + rm -f $(BUILDDIR)/.build.* $(BUILDDIR)/*.o $(BUILDDIR)/*.exe +endif + +cleanall: + @echo + $(MAKE) clean -f $(THISMK) + @echo + rm -rf $(LIBDIR)/build.* + rm -rf build.* + +#------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h new file mode 100644 index 0000000000..e11e8ec53b --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h @@ -0,0 +1,234 @@ +#ifndef MGONGPUCONFIG_H +#define MGONGPUCONFIG_H 1 + +// HARDCODED AT CODE GENERATION TIME: DO NOT MODIFY (#473) +// There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) +#undef MGONGPU_SUPPORTS_MULTICHANNEL + +// ** NB1 Throughputs (e.g. 
6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" +// ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) + +// Choose if curand is supported for generating random numbers +// For C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND +#ifdef __CUDACC__ +#undef MGONGPU_HAS_NO_CURAND +#else +//#undef MGONGPU_HAS_NO_CURAND // default +////#define MGONGPU_HAS_NO_CURAND 1 +#endif + +// Choose floating point precision (for everything but color algebra #537) +// If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) +#if not defined MGONGPU_FPTYPE_DOUBLE and not defined MGONGPU_FPTYPE_FLOAT +// Floating point precision (CHOOSE ONLY ONE) +#define MGONGPU_FPTYPE_DOUBLE 1 // default +//#define MGONGPU_FPTYPE_FLOAT 1 // 2x faster +#endif + +// Choose floating point precision (for color algebra alone #537) +// If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE2_FLOAT, nothing happens (issue #167) +#if not defined MGONGPU_FPTYPE2_DOUBLE and not defined MGONGPU_FPTYPE2_FLOAT +// Floating point precision (CHOOSE ONLY ONE) +#define MGONGPU_FPTYPE2_DOUBLE 1 // default +//#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster +#endif + +// Choose whether to inline all HelAmps functions +// This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) +// By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS +//#undef MGONGPU_INLINE_HELAMPS // default +////#define MGONGPU_INLINE_HELAMPS 1 + +// Choose whether to hardcode the cIPD physics parameters rather than reading them from user cards +// This optimization can gain 20% in CUDA in eemumu (issue #39) +// By default, do not hardcode, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HARDCODE_PARAM +// ** NB: The option to use hardcoded cIPD physics parameters is supported again even now when alphas is running (#373) +// ** NB: Note however that it now only refers to cIPD parameters (cIPC parameters are always accessed through global memory) +//#undef MGONGPU_HARDCODE_PARAM // default +////#define MGONGPU_HARDCODE_PARAM 1 + +// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#ifndef __CUDACC__ +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) +#endif + +// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +#ifdef __CUDACC__ +#define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) +//#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) +//#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) +#endif + +// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +#ifdef __CUDACC__ +#undef MGONGPU_NSIGHT_DEBUG // default +//#define MGONGPU_NSIGHT_DEBUG 1 +#endif + +// SANITY CHECKS (floating point precision for everything but color algebra #537) +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE_FLOAT +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_FPTYPE_DOUBLE or defined MGONGPU_FPTYPE_FLOAT +#endif + +// SANITY CHECKS (floating point precision for color algebra alone #537) +#if defined MGONGPU_FPTYPE2_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_FPTYPE2_DOUBLE or defined MGONGPU_FPTYPE2_FLOAT +#endif +#if defined MGONGPU_FPTYPE2_DOUBLE and defined MGONGPU_FPTYPE_FLOAT +#error You cannot use double precision for color algebra and single precision elsewhere +#endif + +// SANITY CHECKS (c++ complex number implementation) +#ifndef __CUDACC__ +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +#endif +#endif + +// SANITY CHECKS (cuda complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +#endif +#endif + +namespace mgOnGpu +{ + + // --- Type definitions + + // Floating point type (for everything but color algebra #537): fptype +#if defined MGONGPU_FPTYPE_DOUBLE + typedef double fptype; // double precision (8 bytes, fp64) +#elif defined MGONGPU_FPTYPE_FLOAT + typedef float fptype; // single precision (4 bytes, fp32) +#endif + + // Floating point type (for color algebra alone #537): fptype2 +#if defined MGONGPU_FPTYPE2_DOUBLE + typedef double fptype2; // double precision (8 bytes, fp64) +#elif defined MGONGPU_FPTYPE2_FLOAT + typedef float fptype2; // single precision (4 bytes, fp32) +#endif + + // --- Physics process-specific constants that are best declared at compile time + + const int np4 = 4; // dimensions of 4-momenta (E,px,py,pz) + + const int npari = 2; // #particles in the initial state (incoming): e.g. 2 (e+ e-) for e+ e- -> mu+ mu- + + const int nparf = 4; // #particles in the final state (outgoing): e.g. 2 (mu+ mu-) for e+ e- -> mu+ mu- + + const int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 
4 for e+ e- -> mu+ mu- + + const int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + + const int nw6 = 6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + + const int nwf = 18; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + + // --- Platform-specific software implementation details + + // Maximum number of blocks per grid + // ** NB Some arrays of pointers will be allocated statically to fit all these blocks + // ** (the actual memory for each block will then be allocated dynamically only for existing blocks) + //const int nbpgMAX = 2048; + + // Maximum number of threads per block + //const int ntpbMAX = 256; // AV Apr2021: why had I set this to 256? + const int ntpbMAX = 1024; // NB: 512 is ok, but 1024 does fail with "too many resources requested for launch" + + // Alignment requirement for using reinterpret_cast with SIMD vectorized code + // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) + // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) +#ifndef __CUDACC__ + constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit) +#endif + +} + +// Expose typedefs and operators outside the namespace +using mgOnGpu::fptype; +using mgOnGpu::fptype2; + +// C++ SIMD vectorization width (this will be used to set neppV) +#ifdef __CUDACC__ // CUDA implementation has no SIMD +#undef MGONGPU_CPPSIMD +#elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) +#ifdef MGONGPU_FPTYPE_DOUBLE +#define MGONGPU_CPPSIMD 8 +#else +#define MGONGPU_CPPSIMD 16 +#endif +#elif defined __AVX512VL__ // C++ "512y" AVX512 with 256 width (256-bit ie 32-byte): 4 (DOUBLE) or 8 (FLOAT) [gcc DEFAULT] +#ifdef MGONGPU_FPTYPE_DOUBLE +#define MGONGPU_CPPSIMD 4 +#else +#define MGONGPU_CPPSIMD 8 +#endif +#elif defined __AVX2__ // C++ "avx2" AVX2 (256-bit ie 32-byte): 4 (DOUBLE) or 8 (FLOAT) [clang DEFAULT] +#ifdef MGONGPU_FPTYPE_DOUBLE +#define MGONGPU_CPPSIMD 4 +#else +#define MGONGPU_CPPSIMD 8 +#endif +#elif defined __SSE4_2__ // C++ "sse4" SSE4.2 (128-bit ie 16-byte): 2 (DOUBLE) or 4 (FLOAT) [Power9 and ARM default] +#ifdef MGONGPU_FPTYPE_DOUBLE +#define MGONGPU_CPPSIMD 2 +#else +#define MGONGPU_CPPSIMD 4 +#endif +#else // C++ "none" i.e. 
no SIMD +#undef MGONGPU_CPPSIMD +#endif + +// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +#else +#define mgDebugDeclare() /*noop*/ +#define mgDebugInitialise() { /*noop*/ } +#define mgDebug( code, text ) { /*noop*/ } +#define mgDebugFinalise() { /*noop*/ } +#endif /* clang-format on */ + +// Define empty CUDA declaration specifiers for C++ +#ifndef __CUDACC__ +#define __global__ +#define __host__ +#define __device__ +#endif + +// For SANITY CHECKS: check that neppR, neppM, neppV... are powers of two (https://stackoverflow.com/a/108360) +inline constexpr bool +ispoweroftwo( int n ) +{ + return ( n > 0 ) && !( n & ( n - 1 ) ); +} + +// Compiler version support (#96): require nvcc from CUDA >= 11.2, e.g. to use C++17 (see #333) +#ifdef __NVCC__ +#if( __CUDACC_VER_MAJOR__ < 11 ) || ( __CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ < 2 ) +#error Unsupported CUDA version: please use CUDA >= 11.2 +#endif +#endif + +// Compiler version support (#96): require clang >= 11 +#if defined __clang__ +#if( __clang_major__ < 11 ) +#error Unsupported clang version: please use clang >= 11 +#endif +// Compiler version support (#96): require gcc >= 9.3, e.g. for some OMP issues (see #269) +// [NB skip this check for the gcc toolchain below clang or icx (TEMPORARY? 
#355)] +#elif defined __GNUC__ +#if( __GNUC__ < 9 ) || ( __GNUC__ == 9 && __GNUC_MINOR__ < 3 ) +#error Unsupported gcc version: please gcc >= 9.3 +#endif +#endif + +#endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuCxtypes.h new file mode 100644 index 0000000000..caff927311 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuCxtypes.h @@ -0,0 +1,633 @@ +#ifndef MGONGPUCXTYPES_H +#define MGONGPUCXTYPES_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuFptypes.h" + +#include + +//========================================================================== +// COMPLEX TYPES: (PLATFORM-SPECIFIC) HEADERS +//========================================================================== + +#include + +// Complex type in cuda: thrust or cucomplex or cxsmpl +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) +#include +#pragma clang diagnostic pop +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX +#include +#elif not defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +#endif +#else +// Complex type in c++: std::complex or cxsmpl +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#include +#elif not defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +#endif +#endif + +//========================================================================== +// COMPLEX TYPES: SIMPLE COMPLEX CLASS (cxsmpl) +//========================================================================== + +namespace mgOnGpu /* clang-format off */ +{ + // --- Type definition (simple complex type derived from cxtype_v) + template + class cxsmpl + { + public: + __host__ __device__ constexpr cxsmpl() : m_real( 0 ), m_imag( 0 ) {} + cxsmpl( const cxsmpl& ) = default; + cxsmpl( cxsmpl&& ) = default; + __host__ __device__ constexpr cxsmpl( const FP& r, const FP& i = 0 ) : m_real( r ), m_imag( i ) {} + __host__ __device__ constexpr cxsmpl( const std::complex& c ) : m_real( c.real() ), m_imag( c.imag() ) {} + cxsmpl& operator=( const cxsmpl& ) = default; + cxsmpl& operator=( cxsmpl&& ) = default; + __host__ __device__ constexpr cxsmpl& operator+=( const cxsmpl& c ) { m_real += c.real(); m_imag += c.imag(); return *this; } + __host__ __device__ constexpr cxsmpl& operator-=( const cxsmpl& c ) { m_real -= c.real(); m_imag -= c.imag(); return *this; } + __host__ __device__ constexpr const FP& real() const { return m_real; } + __host__ __device__ constexpr const FP& imag() const { return m_imag; } + //constexpr operator std::complex() const { return std::complex( m_real, m_imag ); } // cxsmpl to std::complex (float-to-float or double-to-double) + private: + FP m_real, m_imag; // RI + }; + + template + inline __host__ __device__ cxsmpl // (NB: cannot be constexpr as a constexpr function cannot have a nonliteral return type "mgOnGpu::cxsmpl") + conj( const cxsmpl& c ) + { + return cxsmpl( c.real(), -c.imag() ); + } +} /* clang-format on */ + +// Expose the cxsmpl class outside the namespace +using mgOnGpu::cxsmpl; + +// Printout to stream for user defined types +template +inline __host__ __device__ std::ostream& +operator<<( std::ostream& out, const cxsmpl& c ) +{ + out << std::complex( c.real(), c.imag() ); + return out; +} + 
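+// Hypothetical usage sketch (illustration only, not part of the generated code): cxsmpl is a minimal
+// std::complex-like value type usable in both host and device code, e.g.
+//   cxsmpl<double> a( 1., 2. );   // a = 1 + 2i
+//   cxsmpl<double> b = conj( a ); // b = 1 - 2i (free function defined above)
+//   a += b;                       // a = 2 + 0i (member operator+= defined above)
+//   std::cout << a << std::endl;  // prints "(2,0)" via the operator<< defined above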
+// Operators for cxsmpl +template +inline __host__ __device__ constexpr cxsmpl +operator+( const cxsmpl a ) +{ + return a; +} + +template +inline __host__ __device__ constexpr cxsmpl +operator-( const cxsmpl& a ) +{ + return cxsmpl( -a.real(), -a.imag() ); +} + +template +inline __host__ __device__ constexpr cxsmpl +operator+( const cxsmpl& a, const cxsmpl& b ) +{ + return cxsmpl( a.real() + b.real(), a.imag() + b.imag() ); +} + +template +inline __host__ __device__ constexpr cxsmpl +operator+( const FP& a, const cxsmpl& b ) +{ + return cxsmpl( a, 0 ) + b; +} + +template +inline __host__ __device__ constexpr cxsmpl +operator-( const cxsmpl& a, const cxsmpl& b ) +{ + return cxsmpl( a.real() - b.real(), a.imag() - b.imag() ); +} + +template +inline __host__ __device__ constexpr cxsmpl +operator-( const FP& a, const cxsmpl& b ) +{ + return cxsmpl( a, 0 ) - b; +} + +template +inline __host__ __device__ constexpr cxsmpl +operator*( const cxsmpl& a, const cxsmpl& b ) +{ + return cxsmpl( a.real() * b.real() - a.imag() * b.imag(), a.imag() * b.real() + a.real() * b.imag() ); +} + +template +inline __host__ __device__ constexpr cxsmpl +operator*( const FP& a, const cxsmpl& b ) +{ + return cxsmpl( a, 0 ) * b; +} + +inline __host__ __device__ constexpr cxsmpl +operator*( const double& a, const cxsmpl& b ) +{ + return cxsmpl( a, 0 ) * b; +} + +template +inline __host__ __device__ constexpr cxsmpl +operator/( const cxsmpl& a, const cxsmpl& b ) +{ + FP bnorm = b.real() * b.real() + b.imag() * b.imag(); + return cxsmpl( ( a.real() * b.real() + a.imag() * b.imag() ) / bnorm, + ( a.imag() * b.real() - a.real() * b.imag() ) / bnorm ); +} + +template +inline __host__ __device__ constexpr cxsmpl +operator/( const FP& a, const cxsmpl& b ) +{ + return cxsmpl( a, 0 ) / b; +} + +template +inline __host__ __device__ constexpr cxsmpl +operator+( const cxsmpl& a, const FP& b ) +{ + return a + cxsmpl( b, 0 ); +} + +template +inline __host__ __device__ constexpr cxsmpl +operator-( const cxsmpl& a, const FP& b ) +{ + return a - cxsmpl( b, 0 ); +} + +template +inline __host__ __device__ constexpr cxsmpl +operator*( const cxsmpl& a, const FP& b ) +{ + return a * cxsmpl( b, 0 ); +} + +template +inline __host__ __device__ constexpr cxsmpl +operator/( const cxsmpl& a, const FP& b ) +{ + return a / cxsmpl( b, 0 ); +} + +//========================================================================== +// COMPLEX TYPES: (PLATFORM-SPECIFIC) TYPEDEFS +//========================================================================== + +namespace mgOnGpu +{ + + // --- Type definitions (complex type: cxtype) +#ifdef __CUDACC__ // cuda +#if defined MGONGPU_CUCXTYPE_THRUST + typedef thrust::complex cxtype; +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX +#if defined MGONGPU_FPTYPE_DOUBLE + typedef cuDoubleComplex cxtype; +#elif defined MGONGPU_FPTYPE_FLOAT + typedef cuFloatComplex cxtype; +#endif +#else + typedef cxsmpl cxtype; +#endif +#else // c++ +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX + typedef std::complex cxtype; +#else + typedef cxsmpl cxtype; +#endif +#endif + + // The number of floating point types in a complex type (real, imaginary) + constexpr int nx2 = 2; + + // SANITY CHECK: memory access may be based on casts of fptype[2] to cxtype (e.g. 
for wavefunctions) + static_assert( sizeof( cxtype ) == nx2 * sizeof( fptype ), "sizeof(cxtype) is not 2*sizeof(fptype)" ); +} + +// Expose typedefs and operators outside the namespace +using mgOnGpu::cxtype; + +//========================================================================== +// COMPLEX TYPES: (PLATFORM-SPECIFIC) FUNCTIONS AND OPERATORS +//========================================================================== + +#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + +//------------------------------ +// CUDA or C++ - using cxsmpl +//------------------------------ + +inline __host__ __device__ cxtype +cxmake( const fptype& r, const fptype& i ) +{ + return cxtype( r, i ); // cxsmpl constructor +} + +inline __host__ __device__ fptype +cxreal( const cxtype& c ) +{ + return c.real(); // cxsmpl::real() +} + +inline __host__ __device__ fptype +cximag( const cxtype& c ) +{ + return c.imag(); // cxsmpl::imag() +} + +inline __host__ __device__ cxtype +cxconj( const cxtype& c ) +{ + return conj( c ); // conj( cxsmpl ) +} + +inline __host__ cxtype // NOT __device__ +cxmake( const std::complex& c ) // std::complex to cxsmpl (float-to-float or float-to-double) +{ + return cxmake( c.real(), c.imag() ); +} + +inline __host__ cxtype // NOT __device__ +cxmake( const std::complex& c ) // std::complex to cxsmpl (double-to-float or double-to-double) +{ + return cxmake( c.real(), c.imag() ); +} + +#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + +//========================================================================== + +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust + +//------------------------------ +// CUDA - using thrust::complex +//------------------------------ + +inline __host__ __device__ cxtype +cxmake( const fptype& r, const fptype& i ) +{ + return cxtype( r, i ); // thrust::complex constructor +} + +inline __host__ __device__ fptype +cxreal( const cxtype& c ) +{ + return c.real(); // thrust::complex::real() +} + +inline __host__ __device__ fptype +cximag( const cxtype& c ) +{ + return c.imag(); // thrust::complex::imag() +} + +inline __host__ __device__ cxtype +cxconj( const cxtype& c ) +{ + return conj( c ); // conj( thrust::complex ) +} + +inline __host__ __device__ const cxtype& +cxmake( const cxtype& c ) +{ + return c; +} + +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST + +//========================================================================== + +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex + +//------------------------------ +// CUDA - using cuComplex +//------------------------------ + +#if defined MGONGPU_FPTYPE_DOUBLE // cuda + cucomplex + double + +//+++++++++++++++++++++++++ +// cuDoubleComplex ONLY +//+++++++++++++++++++++++++ + +inline __host__ __device__ cxtype +cxmake( const fptype& r, const fptype& i ) +{ + return make_cuDoubleComplex( r, i ); +} + +inline __host__ __device__ fptype +cxreal( const cxtype& c ) +{ + return cuCreal( c ); // returns by value +} + +inline __host__ __device__ fptype +cximag( const cxtype& c ) +{ + return cuCimag( c ); // returns by value +} + +inline __host__ __device__ cxtype +operator+( const cxtype& a, const cxtype& b ) +{ + return cuCadd( a, b ); +} + +inline __host__ __device__ cxtype& +operator+=( cxtype& a, const cxtype& b ) +{ + a = cuCadd( a, b ); + return a; +} + +inline __host__ __device__ cxtype +operator-( const cxtype& a, const cxtype& b ) +{ + return cuCsub( a, b ); +} + 
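+// Hypothetical usage note (illustration only): together with the operators below, these wrappers let kernel
+// code write natural complex arithmetic on cuDoubleComplex values, e.g. "cxtype z = cxmake( 1., 2. ); z = z * z + z;"
+// instead of spelling out nested cuCmul/cuCadd calls.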
+inline __host__ __device__ cxtype& +operator-=( cxtype& a, const cxtype& b ) +{ + a = cuCsub( a, b ); + return a; +} + +inline __host__ __device__ cxtype +operator*( const cxtype& a, const cxtype& b ) +{ + return cuCmul( a, b ); +} + +inline __host__ __device__ cxtype +operator/( const cxtype& a, const cxtype& b ) +{ + return cuCdiv( a, b ); +} + +#elif defined MGONGPU_FPTYPE_FLOAT // cuda + cucomplex + float + +//+++++++++++++++++++++++++ +// cuFloatComplex ONLY +//+++++++++++++++++++++++++ + +inline __host__ __device__ cxtype +cxmake( const fptype& r, const fptype& i ) +{ + return make_cuFloatComplex( r, i ); +} + +inline __host__ __device__ fptype +cxreal( const cxtype& c ) +{ + return cuCrealf( c ); // returns by value +} + +inline __host__ __device__ fptype +cximag( const cxtype& c ) +{ + return cuCimagf( c ); // returns by value +} + +inline __host__ __device__ cxtype +operator+( const cxtype& a, const cxtype& b ) +{ + return cuCaddf( a, b ); +} + +inline __host__ __device__ cxtype& +operator+=( cxtype& a, const cxtype& b ) +{ + a = cuCaddf( a, b ); + return a; +} + +inline __host__ __device__ cxtype +operator-( const cxtype& a, const cxtype& b ) +{ + return cuCsubf( a, b ); +} + +inline __host__ __device__ cxtype& +operator-=( cxtype& a, const cxtype& b ) +{ + a = cuCsubf( a, b ); + return a; +} + +inline __host__ __device__ cxtype +operator*( const cxtype& a, const cxtype& b ) +{ + return cuCmulf( a, b ); +} + +inline __host__ __device__ cxtype +operator/( const cxtype& a, const cxtype& b ) +{ + return cuCdivf( a, b ); +} + +inline __host__ cxtype // NOT __device__ +cxmake( const std::complex& c ) // std::complex to cucomplex (cast double-to-float) +{ + return cxmake( (fptype)c.real(), (fptype)c.imag() ); +} + +#endif + +//+++++++++++++++++++++++++ +// cuDoubleComplex OR +// cuFloatComplex +//+++++++++++++++++++++++++ + +inline __host__ __device__ cxtype +operator+( const cxtype a ) +{ + return a; +} + +inline __host__ __device__ cxtype +operator-( const cxtype& a ) +{ + return cxmake( -cxreal( a ), -cximag( a ) ); +} + +inline __host__ __device__ cxtype +operator+( const fptype& a, const cxtype& b ) +{ + return cxmake( a, 0 ) + b; +} + +inline __host__ __device__ cxtype +operator-( const fptype& a, const cxtype& b ) +{ + return cxmake( a, 0 ) - b; +} + +inline __host__ __device__ cxtype +operator*( const fptype& a, const cxtype& b ) +{ + return cxmake( a, 0 ) * b; +} + +inline __host__ __device__ cxtype +operator/( const fptype& a, const cxtype& b ) +{ + return cxmake( a, 0 ) / b; +} + +inline __host__ __device__ cxtype +operator+( const cxtype& a, const fptype& b ) +{ + return a + cxmake( b, 0 ); +} + +inline __host__ __device__ cxtype +operator-( const cxtype& a, const fptype& b ) +{ + return a - cxmake( b, 0 ); +} + +inline __host__ __device__ cxtype +operator*( const cxtype& a, const fptype& b ) +{ + return a * cxmake( b, 0 ); +} + +inline __host__ __device__ cxtype +operator/( const cxtype& a, const fptype& b ) +{ + return a / cxmake( b, 0 ); +} + +inline __host__ __device__ cxtype +cxconj( const cxtype& c ) +{ + return cxmake( cxreal( c ), -cximag( c ) ); +} + +inline __host__ cxtype // NOT __device__ +cxmake( const std::complex& c ) // std::complex to cucomplex (float-to-float or double-to-double) +{ + return cxmake( c.real(), c.imag() ); +} + +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX + +//========================================================================== + +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + 
stdcomplex + +//------------------------------ +// C++ - using std::complex +//------------------------------ + +inline cxtype +cxmake( const fptype& r, const fptype& i ) +{ + return cxtype( r, i ); // std::complex constructor +} + +inline fptype +cxreal( const cxtype& c ) +{ + return c.real(); // std::complex::real() +} + +inline fptype +cximag( const cxtype& c ) +{ + return c.imag(); // std::complex::imag() +} + +inline cxtype +cxconj( const cxtype& c ) +{ + return conj( c ); // conj( std::complex ) +} + +inline const cxtype& +cxmake( const cxtype& c ) // std::complex to std::complex (float-to-float or double-to-double) +{ + return c; +} + +#if defined MGONGPU_FPTYPE_FLOAT +inline cxtype +cxmake( const std::complex& c ) // std::complex to std::complex (cast double-to-float) +{ + return cxmake( (fptype)c.real(), (fptype)c.imag() ); +} +#endif + +#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX + +//========================================================================== + +inline __host__ __device__ const cxtype +cxmake( const cxsmpl& c ) // cxsmpl to cxtype (float-to-float or float-to-double) +{ + return cxmake( c.real(), c.imag() ); +} + +inline __host__ __device__ const cxtype +cxmake( const cxsmpl& c ) // cxsmpl to cxtype (double-to-float or double-to-double) +{ + return cxmake( c.real(), c.imag() ); +} + +//========================================================================== +// COMPLEX TYPES: WRAPPER OVER RI FLOATING POINT PAIR (cxtype_ref) +//========================================================================== + +namespace mgOnGpu /* clang-format off */ +{ + // The cxtype_ref class (a non-const reference to two fp variables) was originally designed for cxtype_v::operator[] + // It used to be included in the code only when MGONGPU_HAS_CPPCXTYPEV_BRK (originally MGONGPU_HAS_CPPCXTYPE_REF) is defined + // It is now always included in the code because it is needed also to access an fptype wavefunction buffer as a cxtype + class cxtype_ref + { + public: + cxtype_ref() = delete; + cxtype_ref( const cxtype_ref& ) = delete; + cxtype_ref( cxtype_ref&& ) = default; // copy refs + __host__ __device__ cxtype_ref( fptype& r, fptype& i ) : m_preal( &r ), m_pimag( &i ) {} // copy refs + cxtype_ref& operator=( const cxtype_ref& ) = delete; + //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? 
No longer needed in cxternary + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; } // copy values + __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } + private: + fptype *m_preal, *m_pimag; // RI + }; +} /* clang-format on */ + +// Printout to stream for user defined types +inline __host__ __device__ std::ostream& +operator<<( std::ostream& out, const mgOnGpu::cxtype_ref& c ) +{ + out << (cxtype)c; + return out; +} + +//========================================================================== + +#endif // MGONGPUCXTYPES_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuFptypes.h new file mode 100644 index 0000000000..b278275f80 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuFptypes.h @@ -0,0 +1,87 @@ +#ifndef MGONGPUFPTYPES_H +#define MGONGPUFPTYPES_H 1 + +#include "mgOnGpuConfig.h" + +#include +#include + +//========================================================================== + +#ifdef __CUDACC__ // cuda + +//------------------------------ +// Floating point types - Cuda +//------------------------------ + +/* +inline __host__ __device__ fptype +fpmax( const fptype& a, const fptype& b ) +{ + return max( a, b ); +} + +inline __host__ __device__ fptype +fpmin( const fptype& a, const fptype& b ) +{ + return min( a, b ); +} +*/ + +inline __host__ __device__ const fptype& +fpmax( const fptype& a, const fptype& b ) +{ + return ( ( b < a ) ? a : b ); +} + +inline __host__ __device__ const fptype& +fpmin( const fptype& a, const fptype& b ) +{ + return ( ( a < b ) ? a : b ); +} + +inline __host__ __device__ fptype +fpsqrt( const fptype& f ) +{ +#if defined MGONGPU_FPTYPE_FLOAT + // See https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__SINGLE.html + return sqrtf( f ); +#else + // See https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__DOUBLE.html + return sqrt( f ); +#endif +} + +#endif // #ifdef __CUDACC__ + +//========================================================================== + +#ifndef __CUDACC__ + +//------------------------------ +// Floating point types - C++ +//------------------------------ + +inline const fptype& +fpmax( const fptype& a, const fptype& b ) +{ + return std::max( a, b ); +} + +inline const fptype& +fpmin( const fptype& a, const fptype& b ) +{ + return std::min( a, b ); +} + +inline fptype +fpsqrt( const fptype& f ) +{ + return std::sqrt( f ); +} + +#endif // #ifndef __CUDACC__ + +//========================================================================== + +#endif // MGONGPUFPTYPES_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuVectors.h new file mode 100644 index 0000000000..0dd4c69bd4 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuVectors.h @@ -0,0 +1,829 @@ +#ifndef MGONGPUVECTORS_H +#define MGONGPUVECTORS_H 1 + +#include "mgOnGpuCxtypes.h" +#include "mgOnGpuFptypes.h" + +#include + +//========================================================================== + +//------------------------------ +// Vector types - C++ +//------------------------------ + +#ifdef __clang__ +// If set: return a pair of (fptype&, fptype&) by non-const reference in cxtype_v::operator[] +// This is forbidden in clang ("non-const reference cannot bind to vector element") +// See also https://stackoverflow.com/questions/26554829 +//#define MGONGPU_HAS_CPPCXTYPEV_BRK 1 // clang test (compilation 
fails also on clang 12.0, issue #182) +#undef MGONGPU_HAS_CPPCXTYPEV_BRK // clang default +#elif defined __INTEL_COMPILER +//#define MGONGPU_HAS_CPPCXTYPEV_BRK 1 // icc default? +#undef MGONGPU_HAS_CPPCXTYPEV_BRK // icc test +#else +#define MGONGPU_HAS_CPPCXTYPEV_BRK 1 // gcc default +//#undef MGONGPU_HAS_CPPCXTYPEV_BRK // gcc test (very slightly slower? issue #172) +#endif + +namespace mgOnGpu /* clang-format off */ +{ +#ifdef MGONGPU_CPPSIMD + + const int neppV = MGONGPU_CPPSIMD; + + // SANITY CHECK: cppAlign must be a multiple of neppV * sizeof(fptype) + static_assert( mgOnGpu::cppAlign % ( neppV * sizeof( fptype ) ) == 0 ); + + // SANITY CHECK: check that neppV is a power of two + static_assert( ispoweroftwo( neppV ), "neppV is not a power of 2" ); + + // --- Type definition (using vector compiler extensions: need -march=...) + // For gcc: https://gcc.gnu.org/onlinedocs/gcc/Vector-Extensions.html + // For clang: https://clang.llvm.org/docs/LanguageExtensions.html#vectors-and-extended-vectors +#ifdef __clang__ + typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR +#else + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR +#endif + + // Mixed fptypes #537: float for color algebra and double elsewhere +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + const int neppV2 = MGONGPU_CPPSIMD * 2; + static_assert( mgOnGpu::cppAlign % ( neppV2 * sizeof( fptype2 ) ) == 0 ); + static_assert( ispoweroftwo( neppV2 ), "neppV2 is not a power of 2" ); +#ifdef __clang__ + typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR +#else + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR +#endif +#else + typedef fptype_v fptype2_v; +#endif + + // --- Type definition (using vector compiler extensions: need -march=...) + class cxtype_v // no need for "class alignas(2*sizeof(fptype_v)) cxtype_v" + { + public: + // Array initialization: zero-out as "{0}" (C and C++) or as "{}" (C++ only) + // See https://en.cppreference.com/w/c/language/array_initialization#Notes + cxtype_v() : m_real{ 0 }, m_imag{ 0 } {} // RRRR=0000 IIII=0000 + cxtype_v( const cxtype_v& ) = default; + cxtype_v( cxtype_v&& ) = default; + cxtype_v( const fptype_v& r, const fptype_v& i ) : m_real( r ), m_imag( i ) {} + cxtype_v( const fptype_v& r ) : m_real( r ), m_imag{ 0 } {} // IIII=0000 + cxtype_v& operator=( const cxtype_v& ) = default; + cxtype_v& operator=( cxtype_v&& ) = default; + cxtype_v& operator+=( const cxtype_v& c ) { m_real += c.real(); m_imag += c.imag(); return *this; } + cxtype_v& operator-=( const cxtype_v& c ) { m_real -= c.real(); m_imag -= c.imag(); return *this; } +#ifdef MGONGPU_HAS_CPPCXTYPEV_BRK + // NB: THIS IS THE FUNDAMENTAL DIFFERENCE BETWEEN MGONGPU_HAS_CPPCXTYPEV_BRK DEFINED AND NOT DEFINED + // NB: the alternative "clang" implementation is simpler: it simply does not have any bracket operator[] + // NB: ** do NOT implement operator[] to return a value: it does not fail the build (why?) and gives unexpected results! ** + cxtype_ref operator[]( size_t i ) const { return cxtype_ref( m_real[i], m_imag[i] ); } +#endif + const fptype_v& real() const { return m_real; } + const fptype_v& imag() const { return m_imag; } + private: + fptype_v m_real, m_imag; // RRRRIIII + }; + + // --- Type definition (using vector compiler extensions: need -march=...) 
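  //--------------------------------------------------------------------------
  // [Editor's note] cxtype_v above stores one SIMD vector of real parts and
  // one of imaginary parts; on gcc its operator[] returns a cxtype_ref proxy,
  // so scalar-looking code can read and write a single complex "lane" of the
  // vector. The standalone sketch below illustrates the same proxy idea with a
  // plain vector_size extension type. It assumes a gcc-style compiler and the
  // demo_* names are hypothetical (not part of this codebase); clang rejects
  // binding references to vector elements, which is exactly why
  // MGONGPU_HAS_CPPCXTYPEV_BRK is left undefined there.
#if 0 // editor's illustrative sketch only, never compiled here
#include <cstdio>
typedef double demo_v __attribute__( ( vector_size( 4 * sizeof( double ) ) ) ); // 4 doubles per SIMD vector
struct demo_cxref // minimal analogue of cxtype_ref: references into one lane of two vectors
{
  double &r, &i;
};
int main()
{
  demo_v re = { 0, 0, 0, 0 }, im = { 0, 0, 0, 0 };
  demo_cxref lane2{ re[2], im[2] }; // gcc allows binding a reference to a vector element
  lane2.r = 1.5;                    // scalar-style writes land directly in the SIMD vectors
  lane2.i = -2.5;
  printf( "re[2]=%f im[2]=%f\n", re[2], im[2] );
  return 0;
}
#endif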
+#ifdef __clang__ // https://clang.llvm.org/docs/LanguageExtensions.html#vectors-and-extended-vectors +#if defined MGONGPU_FPTYPE_DOUBLE + typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb +#elif defined MGONGPU_FPTYPE_FLOAT + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb +#endif +#else // gcc +#if defined MGONGPU_FPTYPE_DOUBLE + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb +#elif defined MGONGPU_FPTYPE_FLOAT + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb +#endif +#endif + +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) + + const int neppV = 1; + +#endif // #ifdef MGONGPU_CPPSIMD + +} /* clang-format on */ + +//-------------------------------------------------------------------------- + +// Expose typedefs outside the namespace +using mgOnGpu::neppV; +#ifdef MGONGPU_CPPSIMD +using mgOnGpu::fptype_v; +using mgOnGpu::fptype2_v; +using mgOnGpu::cxtype_v; +using mgOnGpu::bool_v; +#endif + +//-------------------------------------------------------------------------- + +#ifndef __CUDACC__ + +// Printout to stream for user defined types + +#ifndef MGONGPU_CPPCXTYPE_CXSMPL // operator<< for cxsmpl has already been defined! +inline std::ostream& +operator<<( std::ostream& out, const cxtype& c ) +{ + out << "[" << cxreal( c ) << "," << cximag( c ) << "]"; + //out << cxreal(c) << "+i" << cximag(c); + return out; +} +#endif + +/* +#ifdef MGONGPU_CPPSIMD +inline std::ostream& +operator<<( std::ostream& out, const bool_v& v ) +{ + out << "{ " << v[0]; + for ( int i=1; i +#include +#include + +// Simplified rambo version for 2 to N (with N>=2) processes with massless particles +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + using mgOnGpu::np4; + using mgOnGpu::npari; + using mgOnGpu::nparf; + using mgOnGpu::npar; + + //-------------------------------------------------------------------------- + + // Fill in the momenta of the initial particles + // [NB: the output buffer includes both initial and final momenta, but only initial momenta are filled in] + template + __host__ __device__ void + ramboGetMomentaInitial( const fptype energy, // input: energy + fptype* momenta ) // output: momenta for one event or for a set of events + { + const fptype energy1 = energy / 2; + const fptype energy2 = energy / 2; + const fptype mom = energy / 2; + M_ACCESS::kernelAccessIp4Ipar( momenta, 0, 0 ) = energy1; + M_ACCESS::kernelAccessIp4Ipar( momenta, 1, 0 ) = 0; + M_ACCESS::kernelAccessIp4Ipar( momenta, 2, 0 ) = 0; + M_ACCESS::kernelAccessIp4Ipar( momenta, 3, 0 ) = mom; + M_ACCESS::kernelAccessIp4Ipar( momenta, 0, 1 ) = energy2; + M_ACCESS::kernelAccessIp4Ipar( momenta, 1, 1 ) = 0; + M_ACCESS::kernelAccessIp4Ipar( momenta, 2, 1 ) = 0; + M_ACCESS::kernelAccessIp4Ipar( momenta, 3, 1 ) = -mom; + } + + //-------------------------------------------------------------------------- + + // Fill in the momenta of the final particles using the RAMBO algorithm + // [NB: the output buffer includes both initial and final momenta, but only initial momenta are filled in] + template + __host__ __device__ void + ramboGetMomentaFinal( const fptype energy, // input: energy + const fptype* rndmom, // input: random numbers in [0,1] for one event or for a set of events + fptype* momenta, // output: momenta for one event or for a set of events + fptype* wgts ) // output: weights for one event or for a set of events + { + 
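    // [Editor's note] ramboGetMomentaInitial above fills only the two beams,
    // p1 = ( E/2, 0, 0, +E/2 ) and p2 = ( E/2, 0, 0, -E/2 ), i.e. massless
    // initial momenta back to back along z whose energies add up to the
    // requested sqrt(s) = E. The memory layout is delegated entirely to the
    // accessor class whose static kernelAccessIp4Ipar( momenta, i4, ipar )
    // must return a writable fptype reference. A sketch with a hypothetical
    // plain-AOS accessor for a single event follows (demo_* names are not
    // part of the codebase; it assumes ramboGetMomentaInitial takes a single
    // accessor template parameter, M_ACCESS, as used in its body):
#if 0 // editor's illustrative sketch only, never compiled here
    struct demo_AosAccess
    {
      // one event, np4 = 4 components per particle, laid out as momenta[ipar * np4 + i4]
      static fptype& kernelAccessIp4Ipar( fptype* momenta, const int i4, const int ipar ) { return momenta[ipar * np4 + i4]; }
    };
    fptype demo_momenta[np4 * npar] = { 0 };                         // E, px, py, pz for each of the npar particles
    ramboGetMomentaInitial<demo_AosAccess>( 13000., demo_momenta );  // beams at sqrt(s) = 13 TeV
#endif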
/**************************************************************************** + * rambo * + * ra(ndom) m(omenta) b(eautifully) o(rganized) * + * * + * a democratic multi-particle phase space generator * + * authors: s.d. ellis, r. kleiss, w.j. stirling * + * this is version 1.0 - written by r. kleiss * + * -- adjusted by hans kuijf, weights are logarithmic (1990-08-20) * + * -- adjusted by madgraph@sheffield_gpu_hackathon team (2020-07-29) * + * * + ****************************************************************************/ + + // output weight + fptype& wt = W_ACCESS::kernelAccess( wgts ); + + // AV special case nparf==1 (issue #358) + if constexpr( nparf == 1 ) + { + static bool first = true; + if( first ) + { +#ifdef __CUDACC__ + if constexpr( M_ACCESS::isOnDevice() ) // avoid + { + const int ievt0 = 0; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + if( ievt == ievt0 ) + printf( "WARNING! Rambo called with 1 final particle: random numbers will be ignored\n" ); + } + else +#endif + { + printf( "WARNING! Rambo called with 1 final particle: random numbers will be ignored\n" ); + } + first = false; + } + const int iparf = 0; + for( int i4 = 0; i4 < np4; i4++ ) + { + M_ACCESS::kernelAccessIp4Ipar( momenta, i4, iparf + npari ) = 0; + for( int ipari = 0; ipari < npari; ipari++ ) + { + M_ACCESS::kernelAccessIp4Ipar( momenta, i4, iparf + npari ) += M_ACCESS::kernelAccessIp4Ipar( momenta, i4, ipari ); + } + } + wt = 1; + return; + } + + // initialization step: factorials for the phase space weight + const fptype twopi = 8. * atan( 1. ); + const fptype po2log = log( twopi / 4. ); + fptype z[nparf]; + if constexpr( nparf > 1 ) // avoid build warning on clang (related to #358) + z[1] = po2log; + for( int kpar = 2; kpar < nparf; kpar++ ) z[kpar] = z[kpar - 1] + po2log - 2. * log( fptype( kpar - 1 ) ); + for( int kpar = 2; kpar < nparf; kpar++ ) z[kpar] = ( z[kpar] - log( fptype( kpar ) ) ); + + // generate n massless momenta in infinite phase space + fptype q[nparf][np4]; + for( int iparf = 0; iparf < nparf; iparf++ ) + { + const fptype r1 = R_ACCESS::kernelAccessIp4IparfConst( rndmom, 0, iparf ); + const fptype r2 = R_ACCESS::kernelAccessIp4IparfConst( rndmom, 1, iparf ); + const fptype r3 = R_ACCESS::kernelAccessIp4IparfConst( rndmom, 2, iparf ); + const fptype r4 = R_ACCESS::kernelAccessIp4IparfConst( rndmom, 3, iparf ); + const fptype c = 2. * r1 - 1.; + const fptype s = sqrt( 1. - c * c ); + const fptype f = twopi * r2; + q[iparf][0] = -log( r3 * r4 ); + q[iparf][3] = q[iparf][0] * c; + q[iparf][2] = q[iparf][0] * s * cos( f ); + q[iparf][1] = q[iparf][0] * s * sin( f ); + } + + // calculate the parameters of the conformal transformation + fptype r[np4]; + fptype b[np4 - 1]; + for( int i4 = 0; i4 < np4; i4++ ) r[i4] = 0.; + for( int iparf = 0; iparf < nparf; iparf++ ) + { + for( int i4 = 0; i4 < np4; i4++ ) r[i4] = r[i4] + q[iparf][i4]; + } + const fptype rmas = sqrt( pow( r[0], 2 ) - pow( r[3], 2 ) - pow( r[2], 2 ) - pow( r[1], 2 ) ); + for( int i4 = 1; i4 < np4; i4++ ) b[i4 - 1] = -r[i4] / rmas; + const fptype g = r[0] / rmas; + const fptype a = 1. / ( 1. + g ); + const fptype x0 = energy / rmas; + + // transform the q's conformally into the p's (i.e. 
the 'momenta') + for( int iparf = 0; iparf < nparf; iparf++ ) + { + fptype bq = b[0] * q[iparf][1] + b[1] * q[iparf][2] + b[2] * q[iparf][3]; + for( int i4 = 1; i4 < np4; i4++ ) + { + M_ACCESS::kernelAccessIp4Ipar( momenta, i4, iparf + npari ) = x0 * ( q[iparf][i4] + b[i4 - 1] * ( q[iparf][0] + a * bq ) ); + } + M_ACCESS::kernelAccessIp4Ipar( momenta, 0, iparf + npari ) = x0 * ( g * q[iparf][0] + bq ); + } + + // calculate weight (NB return log of weight) + wt = po2log; + if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; + +#ifndef __CUDACC__ + // issue warnings if weight is too small or too large + static int iwarn[5] = { 0, 0, 0, 0, 0 }; + if( wt < -180. ) + { + if( iwarn[0] <= 5 ) std::cout << "Too small wt, risk for underflow: " << wt << std::endl; + iwarn[0] = iwarn[0] + 1; + } + if( wt > 174. ) + { + if( iwarn[1] <= 5 ) std::cout << "Too large wt, risk for overflow: " << wt << std::endl; + iwarn[1] = iwarn[1] + 1; + } +#endif + + // return for weighted massless momenta + // nothing else to do in this event if all particles are massless (nm==0) + + return; + } + + //-------------------------------------------------------------------------- +} diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/read_slha.cc b/epochX/cudacpp/smeft_gg_tttt.sa/src/read_slha.cc new file mode 100644 index 0000000000..2934e3a476 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/read_slha.cc @@ -0,0 +1,184 @@ +#include "read_slha.h" + +#include +#include +#include +#include +#include + +void +SLHABlock::set_entry( std::vector indices, double value ) +{ + if( _entries.size() == 0 ) + _indices = indices.size(); + else if( indices.size() != _indices ) + throw "Wrong number of indices in set_entry"; + + _entries[indices] = value; +} + +double +SLHABlock::get_entry( std::vector indices, double def_val ) +{ + if( _entries.find( indices ) == _entries.end() ) + { + std::cout << "Warning: No such entry in " << _name << ", using default value " + << def_val << std::endl; + return def_val; + } + return _entries[indices]; +} + +void +SLHAReader::read_slha_file( std::string file_name, bool verbose ) +{ + std::ifstream param_card; + param_card.open( file_name.c_str(), std::ifstream::in ); + if( param_card.good() ) + { + if( verbose ) std::cout << "Opened slha file " << file_name << " for reading" << std::endl; + } + else + { + const char envpath[] = "MG5AMC_CARD_PATH"; + if( !getenv( envpath ) ) + { + std::cout << "ERROR! Card file '" << file_name << "' does not exist" + << " and environment variable '" << envpath << "' is not set" << std::endl; + throw "Error while opening param card"; + } + else + { + std::cout << "WARNING! Card file '" << file_name << "' does not exist:" + << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); + param_card.open( file_name2.c_str(), std::ifstream::in ); + if( param_card.good() ) + { + std::cout << "Opened slha file " << file_name2 << " for reading" << std::endl; + } + else + { + std::cout << "ERROR! 
Card file '" << file_name2 << "' does not exist" << std::endl;
+        throw "Error while opening param card";
+      }
+    }
+  }
+  char buf[200];
+  std::string line;
+  std::string block( "" );
+  while( param_card.good() )
+  {
+    param_card.getline( buf, 200 );
+    line = buf;
+    // Change to lowercase
+    transform( line.begin(), line.end(), line.begin(), (int ( * )( int ))tolower );
+    if( line != "" && line[0] != '#' )
+    {
+      if( block != "" )
+      {
+        // Look for double index blocks
+        double dindex1, dindex2;
+        double value;
+        std::stringstream linestr2( line );
+        if( linestr2 >> dindex1 >> dindex2 >> value &&
+            dindex1 == int( dindex1 ) and dindex2 == int( dindex2 ) )
+        {
+          std::vector<int> indices;
+          indices.push_back( int( dindex1 ) );
+          indices.push_back( int( dindex2 ) );
+          set_block_entry( block, indices, value );
+          // Done with this line, read next
+          continue;
+        }
+        std::stringstream linestr1( line );
+        // Look for single index blocks
+        if( linestr1 >> dindex1 >> value && dindex1 == int( dindex1 ) )
+        {
+          std::vector<int> indices;
+          indices.push_back( int( dindex1 ) );
+          set_block_entry( block, indices, value );
+          // Done with this line, read next
+          continue;
+        }
+      }
+      // Look for block
+      if( line.find( "block " ) != line.npos )
+      {
+        line = line.substr( 6 );
+        // Get rid of spaces between block and block name
+        while( line[0] == ' ' )
+          line = line.substr( 1 );
+        // Now find end of block name
+        size_t space_pos = line.find( ' ' );
+        if( space_pos != std::string::npos )
+          line = line.substr( 0, space_pos );
+        block = line;
+        continue;
+      }
+      // Look for decay
+      if( line.find( "decay " ) == 0 )
+      {
+        line = line.substr( 6 );
+        block = "";
+        std::stringstream linestr( line );
+        int pdg_code;
+        double value;
+        if( linestr >> pdg_code >> value )
+          set_block_entry( "decay", pdg_code, value );
+        else
+          std::cout << "Warning: Wrong format for decay block " << line << std::endl;
+        continue;
+      }
+    }
+  }
+  if( _blocks.size() == 0 )
+    throw "No information read from SLHA card";
+
+  param_card.close();
+}
+
+double
+SLHAReader::get_block_entry( std::string block_name, std::vector<int> indices, double def_val )
+{
+  if( _blocks.find( block_name ) == _blocks.end() )
+  {
+    std::cout << "No such block " << block_name << ", using default value "
+              << def_val << std::endl;
+    return def_val;
+  }
+  return _blocks[block_name].get_entry( indices );
+}
+
+double
+SLHAReader::get_block_entry( std::string block_name, int index, double def_val )
+{
+  std::vector<int> indices;
+  indices.push_back( index );
+  return get_block_entry( block_name, indices, def_val );
+}
+
+void
+SLHAReader::set_block_entry( std::string block_name, std::vector<int> indices, double value )
+{
+  if( _blocks.find( block_name ) == _blocks.end() )
+  {
+    SLHABlock block( block_name );
+    _blocks[block_name] = block;
+  }
+  _blocks[block_name].set_entry( indices, value );
+  /*
+  cout << "Set block " << block_name << " entry ";
+  for (int i=0;i < indices.size();i++)
+    cout << indices[i] << " ";
+  cout << "to " << _blocks[block_name].get_entry(indices) << endl;
+  */
+}
+
+void
+SLHAReader::set_block_entry( std::string block_name, int index, double value )
+{
+  std::vector<int> indices;
+  indices.push_back( index );
+  set_block_entry( block_name, indices, value );
+}
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/read_slha.h b/epochX/cudacpp/smeft_gg_tttt.sa/src/read_slha.h
new file mode 100644
index 0000000000..feb8b43b5a
--- /dev/null
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/read_slha.h
@@ -0,0 +1,41 @@
+#ifndef READ_SLHA_H
+#define READ_SLHA_H 1
+
+#include <iostream>
+#include <map>
+#include <string>
+#include <vector>
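// [Editor's note] The parser in read_slha.cc above lowercases every line and
// then recognizes three kinds of input: a "block <name>" line starts a new
// block, a "decay <pdg> <width>" line fills the special "decay" block, and
// inside a block a data line is accepted either as "<i> <j> <value>" or as
// "<i> <value>", where the indices are read as doubles and kept only if they
// are integral ( dindex == int( dindex ) ). For example, an illustrative card
// fragment such as
//
//   BLOCK SMINPUTS
//     1 1.279000e+02
//     3 1.180000e-01
//   BLOCK DEMOMIX
//     1 2 2.500000e-01
//   DECAY 6 1.350000e+00
//
// is stored with lowercased block names, so that the API declared below gives
// get_block_entry( "sminputs", 1 ) == 127.9,
// get_block_entry( "demomix", { 1, 2 } ) == 0.25 and
// get_block_entry( "decay", 6 ) == 1.35 (the optional def_val argument is
// returned, with a warning, if the block is missing).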
+
+class SLHABlock
+{
+public:
+  SLHABlock( std::string name = "" ) { _name = name; }
+  ~SLHABlock() {}
+  void set_entry( std::vector<int> indices, double value );
+  double get_entry( std::vector<int> indices, double def_val = 0 );
+  void set_name( std::string name ) { _name = name; }
+  std::string get_name() { return _name; }
+  unsigned int get_indices() { return _indices; }
+private:
+  std::string _name;
+  std::map<std::vector<int>, double> _entries;
+  unsigned int _indices;
+};
+
+class SLHAReader
+{
+public:
+  SLHAReader( std::string file_name = "", bool verbose = true )
+  {
+    if( file_name != "" ) read_slha_file( file_name, verbose );
+  }
+  void read_slha_file( std::string file_name, bool verbose );
+  double get_block_entry( std::string block_name, std::vector<int> indices, double def_val = 0 );
+  double get_block_entry( std::string block_name, int index, double def_val = 0 );
+  void set_block_entry( std::string block_name, std::vector<int> indices, double value );
+  void set_block_entry( std::string block_name, int index, double value );
+private:
+  std::map<std::string, SLHABlock> _blocks;
+};
+
+#endif
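//--------------------------------------------------------------------------
// [Editor's note] A minimal standalone sketch of how the SLHAReader API
// declared above could be exercised. The card content, the file name and the
// chosen block/index values are illustrative only (demo_* names are not part
// of the codebase), and the C-string exceptions thrown on errors are not
// caught here.
#include "read_slha.h"
#include <cstdio>
#include <fstream>
int main()
{
  // write a tiny SLHA-style card: one block with two single-index entries and one decay line
  {
    std::ofstream card( "demo_param_card.dat" );
    card << "BLOCK SMINPUTS\n"
         << "  1 1.279000e+02\n"
         << "  3 1.180000e-01\n"
         << "DECAY 6 1.350000e+00\n";
  }
  SLHAReader slha( "demo_param_card.dat", true ); // verbose: prints "Opened slha file ..."
  const double aEWM1 = slha.get_block_entry( "sminputs", 1 ); // 127.9
  const double aS = slha.get_block_entry( "sminputs", 3 );    // 0.118
  const double wTop = slha.get_block_entry( "decay", 6 );     // 1.35
  printf( "aEWM1=%g aS=%g widthTop=%g\n", aEWM1, aS, wTop );
  return 0;
}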